sonatoki 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -5,17 +5,17 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-    NimiPuSynonyms,
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
@@ -28,12 +28,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )

@@ -62,14 +59,14 @@ BaseConfig: IloConfig = {


 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +74,9 @@ PrefConfig: IloConfig = {
 }

 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,10 +85,11 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -99,25 +97,28 @@ CorpusConfig: IloConfig = {
 }


+"""
+Mimics the previous implementation of ilo pi toki pona taso
+"""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }

 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
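The Configs.py changes swap the bare scoring filters for their Long* counterparts, fold `Miscellaneous` into the member-filter unions, and drop `EnglishIgnorables` from every ignoring list except `DiscordConfig`'s. The net effect is easiest to see in use. A minimal sketch, assuming the `Ilo` constructor accepts the `IloConfig` keys as keyword arguments and exposes `is_toki_pona`, as the project README demonstrates; the example sentences and expected results are illustrative, not taken from the package tests:

```python
# Sketch: exercising the updated PrefConfig.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)

# Short non-Toki-Pona tokens like "to" and "use" now count against the
# score (nothing matches them) instead of being hidden by EnglishIgnorables.
print(ilo.is_toki_pona("mi olin e sina"))  # expected: True
print(ilo.is_toki_pona("I have to use it"))  # expected: False
```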
sonatoki/Filters.py CHANGED
@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError


+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"

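`MinLen` supports both of the patterns its docstring names. Because the dynamically built `MinLenFilter` subclasses the passed-in `filter`, `super().filter` falls through to the wrapped filter after the length check, so the two forms behave identically. A short sketch, assuming the documented behavior; the token results follow from `Alphabetic` and the length threshold:

```python
from sonatoki.Filters import Alphabetic, MinLen

# Inheritance form: MinLen first, so super().filter resolves to Alphabetic.
class AtLeast3Alphabetic(MinLen, Alphabetic):
    length = 3

# Construction form: __new__ builds an equivalent class at runtime.
AtLeast3AlphabeticAlt = MinLen(Alphabetic, 3)

assert not AtLeast3Alphabetic.filter("to")  # alphabetic, but too short
assert AtLeast3Alphabetic.filter("toki")
assert AtLeast3AlphabeticAlt.filter("toki")  # same behavior as above
```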
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):


 class Miscellaneous(MemberFilter):
-    tokens = set(ALLOWABLES)
+    tokens = prep_dictionary(ALLOWABLES)


 class EnglishIgnorables(MemberFilter):
-    tokens = set(IGNORABLES)
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)


 class ProperName(Filter):
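The note above is concrete in practice: a token like "to" passes `Syllabic`, so an ignoring filter would merely hide it, while `LongSyllabic` rejects it and lets it count against the message's score. A sketch, assuming the filters' documented behavior:

```python
from sonatoki.Filters import LongSyllabic, Syllabic

assert Syllabic.filter("to")  # syllabically valid Toki Pona
assert not LongSyllabic.filter("to")  # rejected: fewer than 3 characters
assert LongSyllabic.filter("toki")
```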
@@ -109,6 +141,10 @@ class ProperName(Filter):
     # this will errantly match.


+class LongProperName(MinLen, ProperName):
+    length = 2 # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)

@@ -166,6 +202,10 @@ class Phonotactic(RegexFilter):
     )


+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
     Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -179,6 +219,10 @@ class Syllabic(RegexFilter):
     )


+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)

@@ -187,9 +231,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


-class TwoOrMoreAlphabetic(Filter):
-    # TODO: alphabetic implementation that ignores single characters
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3


 class Numeric(Filter):
@@ -266,11 +309,9 @@ class OrFilter:
         if not len(filters) >= 2:
             raise ValueError("Provide at least two Filters to OrFilter.")

-        subset_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(subset_filters) >= 2:
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")

         filter = cls.__generic_filter(*filters)

@@ -279,7 +320,7 @@

 class OrMemberFilter:
     @staticmethod
-    def __subset_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)

@@ -291,7 +332,7 @@
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.__subset_filter(*filters_)
+        filter = cls.__member_filter(*filters_)
         return filter

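The rename from `__subset_filter` to `__member_filter` tracks the class name; the behavior is unchanged: the constructed filter tests membership in the union of the input filters' token sets. A sketch of that behavior; the membership results assume the bundled dictionaries, and that `prep_dictionary` preserves the `ALLOWABLES` entries as-is:

```python
from sonatoki.Filters import Miscellaneous, NimiLinkuCore, OrMemberFilter

# One MemberFilter over the union of both token sets.
Combined = OrMemberFilter(NimiLinkuCore, Miscellaneous)
assert Combined.filter("toki")  # a core Linku word
assert Combined.filter("kxk")  # "ken ala ken", from ALLOWABLES
```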
@@ -323,6 +364,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
sonatoki/constants.py CHANGED
@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path

 # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK

 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
-    "cw", # Content Warning
     "x", # ala
     "y", # anu
     "kxk", # ken ala ken
     "wxw", # wile ala wile
 }

-IGNORABLES = {
-    # o, e, n are not here bc they're not frequently problematic in english messages
-    "a",
-    "am",
-    "an",
-    "i",
-    "in",
-    "is",
-    "l", # they'll
-    "m", # i'm
-    "me",
-    "no",
-    "s", # let's
-    "so",
-    "t", # don't
-    "to",
-    "u", # you
-    "we",
-    "un", # un-
-    "use",
+PHONOMATCHES = {
+    # "a", # ignore
+    # "an", # against
+    # "i", # against
+    # "in", # against
     "some",
-    "like",
+    "like", # against
+    # "me", # against
+    # "no", # against
+    # "on", # against
+    # "se", # against
+    # "so", # against
+    # "some", # against
+    "to", # ignore
+    # "u", # against
+    # "un", # against
+    "use", # against
+    # "we", # against
 }

+ALPHABETIC_MATCHES = PHONOMATCHES | {
+    "a",
+    # "am",
+    # "as",
+    # "at",
+    # "aw", # aww
+    # "ek", # eek
+    # "ew",
+    # "ik",
+    # "il", # ill
+    # "im",
+    # "im",
+    # "ip",
+    # "is",
+    # "it",
+    # "l", # they'll
+    # "m", # i'm
+    # "ok",
+    # "op",
+    # "ow",
+    # "s", # let's
+    # "t", # don't
+    # "up",
+    # "us",
+    # "ut",
+    # "uw",
+}
+
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977", # pu
     "\\U000F1978-\\U000F1988", # ku suli
@@ -426,14 +451,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
-    return [d["word"] for d in data.values() if d[key] == value]
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}


 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU: List[str] = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS: List[str] = ["namako", "kin", "oko"]
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

     NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
     NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
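With `category_helper` returning a set, every `NIMI_*` constant becomes a set as well, which is what `prep_dictionary` and the set unions in `OrMemberFilter` consume. A self-contained toy example, using hypothetical entries shaped like `linku.json`:

```python
from typing import Dict, Set

def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
    return {d["word"] for d in data.values() if d[key] == value}

# Hypothetical entries mimicking sonatoki/linku.json's shape.
data = {
    "toki": {"word": "toki", "book": "pu"},
    "kijetesantakalu": {"word": "kijetesantakalu", "book": "ku suli"},
}
assert category_helper(data, "book", "pu") == {"toki"}
```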
@@ -445,7 +470,7 @@ with open(LINKU) as f:

 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in sandbox.values()]
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

 del linku
 del sandbox
{sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.1
+Version: 0.3.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
{sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/RECORD CHANGED
@@ -1,18 +1,18 @@
-sonatoki-0.3.1.dist-info/METADATA,sha256=nWomuM-AeE98VwnWen7qffNclw8emxAf-oFtXwba8wI,6341
-sonatoki-0.3.1.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
-sonatoki-0.3.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.3.2.dist-info/METADATA,sha256=9cnhaaYFLxN3uaubD0jfTAU_CC9wUGtzho4fs1UGLFc,6341
+sonatoki-0.3.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.3.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=m0j1a1vs9Mdqp724r9Xfh1Y_tyP6GYCkihv8rH8m7lA,1871
-sonatoki/Configs.py,sha256=NS1_esoDNna8LyH_9bPMkxbo2sMSilYhG1PwYLdq6L8,3402
-sonatoki/Filters.py,sha256=-j5xSZ8URjqalQVGMBabMvJ5ofZWazfN7YPfXkM_4uQ,9429
+sonatoki/Configs.py,sha256=o_uFp-Z6sbhbMi8drgQTkdu8S5LaTr0Xnns6Cg0cHSY,3548
+sonatoki/Filters.py,sha256=-7zIV_IBsbASR7pF5WuoABNtBW5a7L135Ev_Rrn35o4,10664
 sonatoki/Preprocessors.py,sha256=aMXXuFBDlJudvzvukvCa7BixuROXXEb62un7I-TGOGs,4441
 sonatoki/Scorers.py,sha256=W-1uYiqjsDejJzoe592ixs7wHazjJXPhuo-41zuJ26U,3643
 sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
-sonatoki/constants.py,sha256=XTFmEcnLBXwdYXjTq_EuW9e_TWLtnNLz2vFCf8m-sz0,12844
+sonatoki/constants.py,sha256=qq1_ZTsVKG_d7nqlJv3a-KS6ZvYwfUSHWA--e0BuyXc,13268
 sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
 sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
 sonatoki/utils.py,sha256=OMaRyoNvKGKYQCBDjQyaCI58-wMpQ0wrrNjTJKsEZ9Y,3550
-sonatoki-0.3.1.dist-info/RECORD,,
+sonatoki-0.3.2.dist-info/RECORD,,