sonatoki 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {sonatoki-0.3.1 → sonatoki-0.3.2}/PKG-INFO +1 -1
  2. {sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml +1 -3
  3. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py +26 -25
  4. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py +58 -12
  5. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py +53 -28
  6. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py +38 -59
  7. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py +31 -11
  8. sonatoki-0.3.2/tests/test_properties.py +78 -0
  9. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py +1 -1
  10. {sonatoki-0.3.1 → sonatoki-0.3.2}/LICENSE +0 -0
  11. {sonatoki-0.3.1 → sonatoki-0.3.2}/README.md +0 -0
  12. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Cleaners.py +0 -0
  13. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Preprocessors.py +0 -0
  14. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Scorers.py +0 -0
  15. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Tokenizers.py +0 -0
  16. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__init__.py +0 -0
  17. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__main__.py +0 -0
  18. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/utils.py +0 -0
  23. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/__init__.py +0 -0
  24. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_cleaners.py +0 -0
  25. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_preprocessors.py +0 -0
  26. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_scorers.py +0 -0
  27. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_tokenize.py +0 -0
  28. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  29. {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0

{sonatoki-0.3.1 → sonatoki-0.3.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.3.1
+ Version: 0.3.2
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later

{sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.3.1"
+ version = "0.3.2"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -16,8 +16,6 @@ readme = "README.md"
  [project.license]
  text = "AGPL-3.0-or-later"

- [project.optional-dependencies]
-
  [build-system]
  requires = [
      "pdm-backend",

{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py
@@ -5,17 +5,17 @@ from typing import List, Type, TypedDict
  # LOCAL
  from sonatoki.Filters import (
      Filter,
-     NimiPu,
      Numeric,
-     OrFilter,
      Syllabic,
      NimiUCSUR,
      Alphabetic,
      ProperName,
-     Phonotactic,
      Punctuation,
+     LongSyllabic,
+     Miscellaneous,
      NimiLinkuCore,
-     NimiPuSynonyms,
+     LongAlphabetic,
+     LongProperName,
      OrMemberFilter,
      NimiLinkuCommon,
      NimiLinkuObscure,
@@ -28,12 +28,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizer
  from sonatoki.Preprocessors import (
      URLs,
+     Backticks,
      Reference,
      Preprocessor,
-     DiscordEmotes,
-     DiscordSpecial,
-     DiscordChannels,
-     DiscordMentions,
      AngleBracketObject,
  )

@@ -62,14 +59,14 @@ BaseConfig: IloConfig = {


  PrefConfig: IloConfig = {
-     "preprocessors": [URLs, Reference],
+     "preprocessors": [Backticks, URLs, Reference],
      "cleaners": [ConsecutiveDuplicates],
-     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+     "ignoring_filters": [Numeric, Punctuation],
      "scoring_filters": [
-         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -77,9 +74,9 @@ PrefConfig: IloConfig = {
  }

  CorpusConfig: IloConfig = {
-     "preprocessors": [URLs, AngleBracketObject, Reference],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
-     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+     "ignoring_filters": [Numeric, Punctuation],
      "scoring_filters": [
          OrMemberFilter(
              NimiLinkuCore,
@@ -88,10 +85,11 @@ CorpusConfig: IloConfig = {
              NimiLinkuObscure,
              NimiLinkuSandbox,
              NimiUCSUR,
+             Miscellaneous,
          ),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -99,25 +97,28 @@ CorpusConfig: IloConfig = {
  }


+ """
+ Mimics the previous implementation of ilo pi toki pona taso
+ """
  LazyConfig: IloConfig = {
-     "preprocessors": [URLs],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation],
-     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
      "scorer": SoftPassFail,
      "passing_score": 0.8,
      "word_tokenizer": WordTokenizer,
  }

  DiscordConfig: IloConfig = {
-     "preprocessors": [URLs, AngleBracketObject, Reference],
+     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
      "scoring_filters": [
          OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-         Syllabic,
-         ProperName,
-         Alphabetic,
+         LongSyllabic,
+         LongProperName,
+         LongAlphabetic,
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,

{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py
@@ -42,6 +42,33 @@ class Filter(ABC):
          raise NotImplementedError


+ class MinLen(Filter):
+     """
+     Meta filter meant to be inherited by another filter to add a length requirement.
+     Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+     You may also construct any other filter with a minimum length filter like so:
+
+     ```
+     MinLen(Alphabetic, 3)
+     ```
+     """
+
+     length = 0
+
+     @classmethod
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         if len(token) < cls.length:
+             return False
+         return super().filter(token)
+
+     def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+         class MinLenFilter(MinLen, Filter):
+             length = length_
+
+         return MinLenFilter
+
+
  class RegexFilter(Filter):
      pattern: "re.Pattern[str]"

@@ -83,11 +110,16 @@ class SubsetFilter(Filter):


  class Miscellaneous(MemberFilter):
-     tokens = set(ALLOWABLES)
+     tokens = prep_dictionary(ALLOWABLES)


  class EnglishIgnorables(MemberFilter):
-     tokens = set(IGNORABLES)
+     """NOTE: Not recommended for use.
+     It is better to use a Long* filter such as LongSyllabic than to use this filter.
+     This filter hides words from scoring rather than scoring them poorly,
+     which is more of a benefit than a loss for a word you would like to omit."""
+
+     tokens = prep_dictionary(IGNORABLES)


  class ProperName(Filter):
@@ -109,6 +141,10 @@ class ProperName(Filter):
      # this will errantly match.


+ class LongProperName(MinLen, ProperName):
+     length = 2  # reject "names" of length 1
+
+
  class NimiPu(MemberFilter):
      tokens = prep_dictionary(NIMI_PU)

@@ -166,6 +202,10 @@ class Phonotactic(RegexFilter):
      )


+ class LongPhonotactic(MinLen, Phonotactic):
+     length = 3
+
+
  class Syllabic(RegexFilter):
      """Determines if a given token is syllabically valid Toki Pona (or `n`).
      Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -179,6 +219,10 @@ class Syllabic(RegexFilter):
      )


+ class LongSyllabic(MinLen, Syllabic):
+     length = 3
+
+
  class Alphabetic(SubsetFilter):
      tokens = set(ALPHABET)

@@ -187,9 +231,8 @@ class AlphabeticRe(RegexFilter):
      pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


- class TwoOrMoreAlphabetic(Filter):
-     # TODO: alphabetic implementation that ignores single characters
-     pass
+ class LongAlphabetic(MinLen, Alphabetic):
+     length = 3


  class Numeric(Filter):
@@ -266,11 +309,9 @@ class OrFilter:
          if not len(filters) >= 2:
              raise ValueError("Provide at least two Filters to OrFilter.")

-         subset_filters = [f for f in filters if issubclass(f, MemberFilter)]
-         if len(subset_filters) >= 2:
-             raise Warning(
-                 "Prefer OrMemberFilter for combining two or more MemberFilters."
-             )
+         member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+         if len(member_filters) >= 2:
+             raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")

          filter = cls.__generic_filter(*filters)

@@ -279,7 +320,7 @@

  class OrMemberFilter:
      @staticmethod
-     def __subset_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+     def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
          all_token_sets: List[Set[str]] = [f.tokens for f in filters]
          all_tokens: Set[str] = set().union(*all_token_sets)

@@ -291,7 +332,7 @@
      def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
          if not len(filters_) >= 2:
              raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-         filter = cls.__subset_filter(*filters_)
+         filter = cls.__member_filter(*filters_)
          return filter


@@ -323,6 +364,11 @@ __all__ = [
      "Alphabetic",
      "AndFilter",
      "EnglishIgnorables",
+     "LongAlphabetic",
+     "LongPhonotactic",
+     "LongProperName",
+     "LongSyllabic",
+     "MinLen",
      "NimiLinkuCore",
      "NimiLinkuSandbox",
      "NimiPu",

{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py
@@ -1,6 +1,6 @@
  # STL
  import json
- from typing import Dict, List
+ from typing import Set, Dict, List
  from pathlib import Path

  # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK

  """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
  ALLOWABLES = {
-     "cw", # Content Warning
      "x", # ala
      "y", # anu
      "kxk", # ken ala ken
      "wxw", # wile ala wile
  }

- IGNORABLES = {
-     # o, e, n are not here bc they're not frequently problematic in english messages
-     "a",
-     "am",
-     "an",
-     "i",
-     "in",
-     "is",
-     "l", # they'll
-     "m", # i'm
-     "me",
-     "no",
-     "s", # let's
-     "so",
-     "t", # don't
-     "to",
-     "u", # you
-     "we",
-     "un", # un-
-     "use",
+ PHONOMATCHES = {
+     # "a", # ignore
+     # "an", # against
+     # "i", # against
+     # "in", # against
      "some",
-     "like",
+     "like", # against
+     # "me", # against
+     # "no", # against
+     # "on", # against
+     # "se", # against
+     # "so", # against
+     # "some", # against
+     "to", # ignore
+     # "u", # against
+     # "un", # against
+     "use", # against
+     # "we", # against
  }

+ ALPHABETIC_MATCHES = PHONOMATCHES | {
+     "a",
+     # "am",
+     # "as",
+     # "at",
+     # "aw", # aww
+     # "ek", # eek
+     # "ew",
+     # "ik",
+     # "il", # ill
+     # "im",
+     # "im",
+     # "ip",
+     # "is",
+     # "it",
+     # "l", # they'll
+     # "m", # i'm
+     # "ok",
+     # "op",
+     # "ow",
+     # "s", # let's
+     # "t", # don't
+     # "up",
+     # "us",
+     # "ut",
+     # "uw",
+ }
+
+ IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
  UCSUR_RANGES = [
      "\\U000F1900-\\U000F1977", # pu
      "\\U000F1978-\\U000F1988", # ku suli
@@ -426,14 +451,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
-     return [d["word"] for d in data.values() if d[key] == value]
+ def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+     return {d["word"] for d in data.values() if d[key] == value}


  with open(LINKU) as f:
      linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-     NIMI_PU: List[str] = category_helper(linku, "book", "pu")
-     NIMI_PU_SYNONYMS: List[str] = ["namako", "kin", "oko"]
+     NIMI_PU = category_helper(linku, "book", "pu")
+     NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

      NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
      NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
@@ -445,7 +470,7 @@ with open(LINKU) as f:

  with open(SANDBOX) as f:
      sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in sandbox.values()]
+     NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

  del linku
  del sandbox
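
For reference, the reworked category_helper is a straight set comprehension over linku-shaped data. A self-contained sketch with made-up entries follows; the real input is the linku.json shipped with the package, and only the "word" and "book" keys used above are assumed.

```python
# Illustrative only: category_helper as changed in this diff, run on fake data
# shaped like linku.json.
from typing import Dict, Set

def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
    return {d["word"] for d in data.values() if d[key] == value}

sample = {
    "toki": {"word": "toki", "book": "pu"},
    "pona": {"word": "pona", "book": "pu"},
    "kijetesantakalu": {"word": "kijetesantakalu", "book": "ku suli"},
}
assert category_helper(sample, "book", "pu") == {"toki", "pona"}
```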

{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py
@@ -18,11 +18,13 @@ from sonatoki.Filters import (
      Phonotactic,
      Punctuation,
      AlphabeticRe,
+     LongSyllabic,
      NimiLinkuCore,
      PunctuationRe,
-     NimiPuSynonyms,
+     LongAlphabetic,
      OrMemberFilter,
      PunctuationRe1,
+     LongPhonotactic,
      NimiLinkuCommon,
      NimiLinkuObscure,
      NimiLinkuSandbox,
@@ -34,7 +36,6 @@ from sonatoki.constants import (
      NIMI_KU_LILI,
      NIMI_KU_SULI,
      NIMI_LINKU_CORE,
-     NIMI_PU_SYNONYMS,
      NIMI_LINKU_COMMON,
      NIMI_LINKU_OBSCURE,
      NIMI_LINKU_SANDBOX,
@@ -45,7 +46,7 @@ from sonatoki.constants import (
  from .test_utils import PROPER_NAME_RE


- @given(st.sampled_from(NIMI_PU))
+ @given(st.sampled_from(list(NIMI_PU)))
  @example("lukin")
  @example("selo")
  @example("li")
@@ -54,14 +55,14 @@ def test_NimiPu(s: str):
      assert res, repr(s)


- @given(st.sampled_from(NIMI_LINKU_CORE))
+ @given(st.sampled_from(list(NIMI_LINKU_CORE)))
  @example("pona")
  def test_NimiLinkuCore(s: str):
      res = NimiLinkuCore.filter(s)
      assert res, repr(s)


- @given(st.sampled_from(NIMI_LINKU_COMMON))
+ @given(st.sampled_from(list(NIMI_LINKU_COMMON)))
  @example("n")
  @example("tonsi")
  @example("kipisi")
@@ -70,19 +71,19 @@ def test_NimiLinkuCommon(s: str):
      assert res, repr(s)


- @given(st.sampled_from(NIMI_LINKU_UNCOMMON))
+ @given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
  def test_NimiLinkuUncommon(s: str):
      res = NimiLinkuUncommon.filter(s)
      assert res, repr(s)


- @given(st.sampled_from(NIMI_LINKU_OBSCURE))
+ @given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
  def test_NimiLinkuObscure(s: str):
      res = NimiLinkuObscure.filter(s)
      assert res, repr(s)


- @given(st.sampled_from(NIMI_LINKU_SANDBOX))
+ @given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
  @example("kalamARR")
  @example("Pingo")
  def test_NimiLinkuSandbox(s: str):
@@ -101,6 +102,13 @@ def test_Phonotactic(s: str):
      assert res, repr(s)


+ @given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+ def test_LongPhonotactic(s: str):
+     len_ok = len(s) >= LongPhonotactic.length
+     res = LongPhonotactic.filter(s)
+     assert res == len_ok, repr(s)  # will match given fullmatch
+
+
  @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
  @example("wuwojitiwunwonjintinmanna")
  def test_Syllabic(s: str):
@@ -108,6 +116,13 @@ def test_Syllabic(s: str):
      assert res, repr(s)


+ @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+ def test_LongSyllabic(s: str):
+     len_ok = len(s) >= LongSyllabic.length
+     res = LongSyllabic.filter(s)
+     assert res == len_ok
+
+
  @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
  @example("muems")
  @example("mpptp")
@@ -118,6 +133,13 @@ def test_Alphabetic(s: str):
      assert res_fn == res_re, repr(s)


+ @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+ def test_LongAlphabetic(s: str):
+     len_ok = len(s) >= LongAlphabetic.length
+     res = LongAlphabetic.filter(s)
+     assert res == len_ok
+
+
  @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
  def test_AlphabeticRe(s: str):
      res_re = AlphabeticRe.filter(s)
@@ -181,7 +203,7 @@ def test_OrFilter(s: str):
  # NOTE: No subset filter test because A | B is not the same as A combined with B.
  # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
  # but would incorrectly pass a combined filter.
- @given(st.sampled_from(NIMI_PU + NIMI_LINKU_OBSCURE))
+ @given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
  def test_OrMemberFilter(s: str):
      filter = OrMemberFilter(NimiPu, NimiLinkuObscure)
      res = filter.filter(s)
@@ -192,11 +214,13 @@ def test_OrMemberFilter(s: str):

  @given(
      st.sampled_from(
-         NIMI_KU_SULI
-         + NIMI_KU_LILI
-         + NIMI_LINKU_UNCOMMON
-         + NIMI_LINKU_OBSCURE
-         + NIMI_LINKU_SANDBOX,
+         list(
+             NIMI_KU_SULI
+             | NIMI_KU_LILI
+             | NIMI_LINKU_UNCOMMON
+             | NIMI_LINKU_OBSCURE
+             | NIMI_LINKU_SANDBOX
+         ),
      )
  )
  def test_OrMemberFilter_IsipinEpiku(s: str):
@@ -216,48 +240,3 @@ def test_OrMemberFilter_IsipinEpiku(s: str):
      assert res and (
          res_ku_suli or res_ku_lili or res_uncommon or res_obscure or res_sandbox
      )
-
-
- @given(st.sampled_from(NIMI_PU + NIMI_PU_SYNONYMS))
- def test_pu_filters_non_overlap(s: str):
-     res_pu = NimiPu.filter(s)
-     res_synonyms = NimiPuSynonyms.filter(s)
-     assert (res_pu + res_synonyms) == 1
-
-
- @given(st.sampled_from(NIMI_KU_SULI + NIMI_KU_LILI))
- def test_ku_filters_non_overlap(s: str):
-     res_ku_suli = NimiKuSuli.filter(s)
-     res_ku_lili = NimiKuLili.filter(s)
-     assert (res_ku_suli + res_ku_lili) == 1
-
-
- @given(
-     st.sampled_from(
-         NIMI_LINKU_CORE
-         + NIMI_LINKU_COMMON
-         + NIMI_LINKU_UNCOMMON
-         + NIMI_LINKU_OBSCURE
-         + NIMI_LINKU_SANDBOX
-     )
- )
- def test_linku_filters_non_overlap(s: str):
-     s = Lowercase.clean(s)
-     s = ConsecutiveDuplicates.clean(s)
-
-     res_core = NimiLinkuCore.filter(s)
-     res_common = NimiLinkuCommon.filter(s)
-     res_uncommon = NimiLinkuUncommon.filter(s)
-     res_obscure = NimiLinkuObscure.filter(s)
-     res_sandbox = NimiLinkuSandbox.filter(s)
-
-     assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
-
-
- @given(st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON + NIMI_LINKU_UNCOMMON))
- def test_nimi_linku_properties(s: str):
-     assert ConsecutiveDuplicates.clean(s) == s, repr(s)
-     assert Alphabetic.filter(s), repr(s)
-     assert Syllabic.filter(s), repr(s)
-     assert Phonotactic.filter(s), repr(s)
-     # Passing phonotactic implies all of the above
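
The OrMemberFilter tests above exercise the merged token set directly; a compact sketch of the same idea, grounded in test_OrMemberFilter (the variable name is hypothetical):

```python
# Combining two MemberFilters yields a filter whose token set is the union of both.
from sonatoki.Filters import NimiPu, NimiLinkuObscure, OrMemberFilter

PuOrObscure = OrMemberFilter(NimiPu, NimiLinkuObscure)
assert PuOrObscure.filter("toki")       # "toki" is a pu word
assert not PuOrObscure.filter("hello")  # in neither token set
```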

{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py
@@ -59,6 +59,8 @@ SYLLABIC_MATCHES = [
      "mi sona ala e nimi sunopatikuna",
      "kalama wuwojiti li pana e sona",
      "jan Awaja en jan Alasali en jan Akesinu li pona", # syllables match before names here
+     "jan Ke Tami",
+     "kulupu Kuko",
  ]

  ALPHABETIC_MATCHES = [
@@ -85,13 +87,20 @@ SOME_INVALID = [
      "mi tawa ma ohio",
      "sina toki e nimi what pi toki Inli",
      "wawa la o lukin e ni: your mom",
+     "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
  ]

  CORPUS_SPECIFIC = [
-     "ki le konsi si te isipin epiku le pasila to",
+     # "ki le konsi si te isipin epiku le pasila to",
+     "ki konsi te isipin epiku pasila to", # the sandbox has not documented si or le
      'jasima omekapo, ki nimisin "jasima enko nimisin". ki enko alu linluwi Jutu alu epiku ki epiku baba is you. ki likujo "SINtelen pona", ki epiku alu "sitelen pona". ki kepen wawajete isipin, kin ki yupekosi alu lipamanka alu wawajete, kin ki enko isipin lipamanka linluwi alu wawajete',
      "kalamARRRR",
      "Pingo",
+     "we Luke",
+ ]
+ CORPUS_SPECIFIC_XFAIL = [
+     "How to Cut a Kiwi",
+     "a e i o u",
  ]

@@ -103,6 +112,7 @@ EXCESSIVE_SYLLABICS = [
      "I manipulate a passe pile so a ton emulate, akin to intake",
      "a ton of insolate puke. make no amen, no joke.",
      "I elope so, to an elite untaken tune, some unwise tone",
+     "insane asinine lemon awesome atone joke",
  ]

  EXCESSIVE_ALPHABETICS = [
@@ -122,11 +132,13 @@ EXCESSIVE_NAMES = [
      "I Want To Evade The Filter",
      "If You Do This The Bot Can't See You",
      "This Is A Statement In Perfect Toki Pona, I Guarantee",
-     "How to Cut a Kiwi", # previous false positive; fixed by english ignorables
  ]

  EXCESSIVE_ENGLISH = [
      "me when i tawa sike", # previous false positive; fixed by english ignorables
+     "Maybe I’m too nasa", # previous false positive; fixed by LongSyllabic and LongAlphabetic
+     "I see :)",
+     "I wanna see", # same down to here
  ]

  NON_MATCHES = [
@@ -134,6 +146,7 @@ NON_MATCHES = [
      "super bruh moment 64",
      "homestuck",
      "homestuck Homestuck",
+     "what if i went to the store ",
  ]

  KNOWN_GOOD = (
@@ -150,22 +163,23 @@ KNOWN_BAD = (
      + EXCESSIVE_ALPHABETICS
      + EXCESSIVE_NAMES
      + EXCESSIVE_TYPOES
+     + EXCESSIVE_ENGLISH
      + NON_MATCHES
  )

  FALSE_NEGATIVES = [
      # emoticon should not be a problem
-     "lete li ike x.x",
      # a token that is one edit off a known word should be allowed
      "mi pnoa",
      "tok",
      "mut",
      "poan",
      "mtue",
+     "mi nasa B^)", # emoticon
  ]

  FALSE_POSITIVES = [
-     "Maybe I’m too nasa",
+     "insane asinine lemon awesome atone",
  ]


@@ -174,16 +188,16 @@ def test_known_good_pref(ilo: Ilo, text: str):
      assert ilo.is_toki_pona(text), text


+ @pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
+ def test_known_bad_pref(ilo: Ilo, text: str):
+     assert not ilo.is_toki_pona(text), text
+
+
  @pytest.mark.parametrize("text", KNOWN_GOOD + CORPUS_SPECIFIC)
  def test_known_good_corpus(corpus_ilo: Ilo, text: str):
      assert corpus_ilo.is_toki_pona(text), text


- @pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
- def test_known_bad(ilo: Ilo, text: str):
-     assert not ilo.is_toki_pona(text), text
-
-
  @pytest.mark.parametrize("text", KNOWN_BAD)
  def test_known_bad_corpus(corpus_ilo: Ilo, text: str):
      assert not corpus_ilo.is_toki_pona(text), text
@@ -209,11 +223,17 @@ def test_weakness_of_lazy(lazy_ilo: Ilo, text: str):

  @pytest.mark.xfail
  @pytest.mark.parametrize("text", FALSE_POSITIVES)
- def test_false_positives(ilo: Ilo, text: str):
+ def test_false_positives_pref(ilo: Ilo, text: str):
      assert not ilo.is_toki_pona(text)


  @pytest.mark.xfail
  @pytest.mark.parametrize("text", FALSE_NEGATIVES)
- def test_false_negatives(ilo: Ilo, text: str):
+ def test_false_negatives_pref(ilo: Ilo, text: str):
      assert ilo.is_toki_pona(text)
+
+
+ @pytest.mark.xfail
+ @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
+ def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
+     assert not corpus_ilo.is_toki_pona(text)
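
The `ilo`, `corpus_ilo`, and `lazy_ilo` fixtures these tests rely on are not shown in this diff. A hypothetical conftest sketch follows, built from the configs in Configs.py and assuming `Ilo(**config)` is a valid constructor call:

```python
# Hypothetical conftest.py sketch (not part of this diff): fixtures for the
# three configurations exercised by tests/test_ilo.py.
import pytest
from sonatoki.ilo import Ilo
from sonatoki.Configs import CorpusConfig, LazyConfig, PrefConfig


@pytest.fixture
def ilo() -> Ilo:
    return Ilo(**PrefConfig)


@pytest.fixture
def corpus_ilo() -> Ilo:
    return Ilo(**CorpusConfig)


@pytest.fixture
def lazy_ilo() -> Ilo:
    return Ilo(**LazyConfig)
```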

sonatoki-0.3.2/tests/test_properties.py (new file)
@@ -0,0 +1,78 @@
+ # PDM
+ import hypothesis.strategies as st
+ from hypothesis import given
+
+ # LOCAL
+ from sonatoki.Filters import (
+     NimiPu,
+     Syllabic,
+     Alphabetic,
+     NimiKuLili,
+     NimiKuSuli,
+     Phonotactic,
+     NimiLinkuCore,
+     NimiPuSynonyms,
+     NimiLinkuCommon,
+     NimiLinkuObscure,
+     NimiLinkuSandbox,
+     NimiLinkuUncommon,
+ )
+ from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
+ from sonatoki.constants import (
+     NIMI_PU,
+     NIMI_KU_LILI,
+     NIMI_KU_SULI,
+     NIMI_LINKU_CORE,
+     NIMI_PU_SYNONYMS,
+     NIMI_LINKU_COMMON,
+     NIMI_LINKU_OBSCURE,
+     NIMI_LINKU_SANDBOX,
+     NIMI_LINKU_UNCOMMON,
+ )
+
+
+ @given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+ def test_pu_filters_non_overlap(s: str):
+     res_pu = NimiPu.filter(s)
+     res_synonyms = NimiPuSynonyms.filter(s)
+     assert (res_pu + res_synonyms) == 1
+
+
+ @given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+ def test_ku_filters_non_overlap(s: str):
+     res_ku_suli = NimiKuSuli.filter(s)
+     res_ku_lili = NimiKuLili.filter(s)
+     assert (res_ku_suli + res_ku_lili) == 1
+
+
+ @given(
+     st.sampled_from(
+         list(
+             NIMI_LINKU_CORE
+             | NIMI_LINKU_COMMON
+             | NIMI_LINKU_UNCOMMON
+             | NIMI_LINKU_OBSCURE
+             | NIMI_LINKU_SANDBOX
+         )
+     )
+ )
+ def test_linku_filters_non_overlap(s: str):
+     s = Lowercase.clean(s)
+     s = ConsecutiveDuplicates.clean(s)
+
+     res_core = NimiLinkuCore.filter(s)
+     res_common = NimiLinkuCommon.filter(s)
+     res_uncommon = NimiLinkuUncommon.filter(s)
+     res_obscure = NimiLinkuObscure.filter(s)
+     res_sandbox = NimiLinkuSandbox.filter(s)
+
+     assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
+
+
+ @given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+ def test_nimi_linku_properties(s: str):
+     assert ConsecutiveDuplicates.clean(s) == s, repr(s)
+     assert Alphabetic.filter(s), repr(s)
+     assert Syllabic.filter(s), repr(s)
+     assert Phonotactic.filter(s), repr(s)
+     # Passing phonotactic implies all of the above

{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py
@@ -11,7 +11,7 @@ from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
  PROPER_NAME_RE = r"[A-Z][a-z]*"

  token_strategy = (
-     st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON)
+     st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
      | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
      | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
      | st.from_regex(PROPER_NAME_RE, fullmatch=True)
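
The list(...) wrappers added throughout the tests follow from the NIMI_* constants becoming sets: to my understanding, hypothesis' sampled_from expects an ordered collection and rejects or warns about unordered ones, depending on version. A one-line illustration, with a made-up word set:

```python
# Why the tests wrap the NIMI_* sets in list(): st.sampled_from wants a sequence,
# not a set. (sorted() would also work and gives a stable order across runs.)
import hypothesis.strategies as st

words = {"toki", "pona", "mi"}          # a set, like the new NIMI_* constants
strategy = st.sampled_from(list(words))
```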