sonatoki-0.3.1-py3-none-any.whl → sonatoki-0.3.2-py3-none-any.whl
This diff compares the contents of two publicly available package versions, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- sonatoki/Configs.py +26 -25
- sonatoki/Filters.py +58 -12
- sonatoki/constants.py +53 -28
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/METADATA +1 -1
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/RECORD +7 -7
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/WHEEL +0 -0
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -5,17 +5,17 @@ from typing import List, Type, TypedDict
|
|
5
5
|
# LOCAL
|
6
6
|
from sonatoki.Filters import (
|
7
7
|
Filter,
|
8
|
-
NimiPu,
|
9
8
|
Numeric,
|
10
|
-
OrFilter,
|
11
9
|
Syllabic,
|
12
10
|
NimiUCSUR,
|
13
11
|
Alphabetic,
|
14
12
|
ProperName,
|
15
|
-
Phonotactic,
|
16
13
|
Punctuation,
|
14
|
+
LongSyllabic,
|
15
|
+
Miscellaneous,
|
17
16
|
NimiLinkuCore,
|
18
|
-
|
17
|
+
LongAlphabetic,
|
18
|
+
LongProperName,
|
19
19
|
OrMemberFilter,
|
20
20
|
NimiLinkuCommon,
|
21
21
|
NimiLinkuObscure,
|
@@ -28,12 +28,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
|
|
28
28
|
from sonatoki.Tokenizers import Tokenizer, WordTokenizer
|
29
29
|
from sonatoki.Preprocessors import (
|
30
30
|
URLs,
|
31
|
+
Backticks,
|
31
32
|
Reference,
|
32
33
|
Preprocessor,
|
33
|
-
DiscordEmotes,
|
34
|
-
DiscordSpecial,
|
35
|
-
DiscordChannels,
|
36
|
-
DiscordMentions,
|
37
34
|
AngleBracketObject,
|
38
35
|
)
|
39
36
|
|
@@ -62,14 +59,14 @@ BaseConfig: IloConfig = {
|
|
62
59
|
|
63
60
|
|
64
61
|
PrefConfig: IloConfig = {
|
65
|
-
"preprocessors": [URLs, Reference],
|
62
|
+
"preprocessors": [Backticks, URLs, Reference],
|
66
63
|
"cleaners": [ConsecutiveDuplicates],
|
67
|
-
"ignoring_filters": [Numeric, Punctuation
|
64
|
+
"ignoring_filters": [Numeric, Punctuation],
|
68
65
|
"scoring_filters": [
|
69
|
-
OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
|
70
|
-
|
71
|
-
|
72
|
-
|
66
|
+
OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
|
67
|
+
LongSyllabic,
|
68
|
+
LongProperName,
|
69
|
+
LongAlphabetic,
|
73
70
|
],
|
74
71
|
"scorer": SoftScaling,
|
75
72
|
"passing_score": 0.8,
|
@@ -77,9 +74,9 @@ PrefConfig: IloConfig = {
|
|
77
74
|
}
|
78
75
|
|
79
76
|
CorpusConfig: IloConfig = {
|
80
|
-
"preprocessors": [URLs, AngleBracketObject, Reference],
|
77
|
+
"preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
|
81
78
|
"cleaners": [ConsecutiveDuplicates],
|
82
|
-
"ignoring_filters": [Numeric, Punctuation
|
79
|
+
"ignoring_filters": [Numeric, Punctuation],
|
83
80
|
"scoring_filters": [
|
84
81
|
OrMemberFilter(
|
85
82
|
NimiLinkuCore,
|
@@ -88,10 +85,11 @@ CorpusConfig: IloConfig = {
|
|
88
85
|
NimiLinkuObscure,
|
89
86
|
NimiLinkuSandbox,
|
90
87
|
NimiUCSUR,
|
88
|
+
Miscellaneous,
|
91
89
|
),
|
92
|
-
|
93
|
-
|
94
|
-
|
90
|
+
LongSyllabic,
|
91
|
+
LongProperName,
|
92
|
+
LongAlphabetic,
|
95
93
|
],
|
96
94
|
"scorer": SoftScaling,
|
97
95
|
"passing_score": 0.8,
|
@@ -99,25 +97,28 @@ CorpusConfig: IloConfig = {
|
|
99
97
|
}
|
100
98
|
|
101
99
|
|
100
|
+
"""
|
101
|
+
Mimics the previous implementation of ilo pi toki pona taso
|
102
|
+
"""
|
102
103
|
LazyConfig: IloConfig = {
|
103
|
-
"preprocessors": [URLs],
|
104
|
+
"preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
|
104
105
|
"cleaners": [ConsecutiveDuplicates],
|
105
106
|
"ignoring_filters": [Numeric, Punctuation],
|
106
|
-
"scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
|
107
|
+
"scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
|
107
108
|
"scorer": SoftPassFail,
|
108
109
|
"passing_score": 0.8,
|
109
110
|
"word_tokenizer": WordTokenizer,
|
110
111
|
}
|
111
112
|
|
112
113
|
DiscordConfig: IloConfig = {
|
113
|
-
"preprocessors": [URLs, AngleBracketObject, Reference],
|
114
|
+
"preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
|
114
115
|
"cleaners": [ConsecutiveDuplicates],
|
115
116
|
"ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
|
116
117
|
"scoring_filters": [
|
117
118
|
OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
|
118
|
-
|
119
|
-
|
120
|
-
|
119
|
+
LongSyllabic,
|
120
|
+
LongProperName,
|
121
|
+
LongAlphabetic,
|
121
122
|
],
|
122
123
|
"scorer": SoftScaling,
|
123
124
|
"passing_score": 0.8,
|
sonatoki/Filters.py
CHANGED
@@ -42,6 +42,33 @@ class Filter(ABC):
|
|
42
42
|
raise NotImplementedError
|
43
43
|
|
44
44
|
|
45
|
+
class MinLen(Filter):
|
46
|
+
"""
|
47
|
+
Meta filter meant to be inherited by another filter to add a length requirement.
|
48
|
+
Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
|
49
|
+
You may also construct any other filter with a minimum length filter like so:
|
50
|
+
|
51
|
+
```
|
52
|
+
MinLen(Alphabetic, 3)
|
53
|
+
```
|
54
|
+
"""
|
55
|
+
|
56
|
+
length = 0
|
57
|
+
|
58
|
+
@classmethod
|
59
|
+
@cache(maxsize=None)
|
60
|
+
def filter(cls, token: str) -> bool:
|
61
|
+
if len(token) < cls.length:
|
62
|
+
return False
|
63
|
+
return super().filter(token)
|
64
|
+
|
65
|
+
def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
|
66
|
+
class MinLenFilter(MinLen, Filter):
|
67
|
+
length = length_
|
68
|
+
|
69
|
+
return MinLenFilter
|
70
|
+
|
71
|
+
|
45
72
|
class RegexFilter(Filter):
|
46
73
|
pattern: "re.Pattern[str]"
|
47
74
|
|
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):
|
|
83
110
|
|
84
111
|
|
85
112
|
class Miscellaneous(MemberFilter):
|
86
|
-
tokens =
|
113
|
+
tokens = prep_dictionary(ALLOWABLES)
|
87
114
|
|
88
115
|
|
89
116
|
class EnglishIgnorables(MemberFilter):
|
90
|
-
|
117
|
+
"""NOTE: Not recommended for use.
|
118
|
+
It is better to use a Long* filter such as LongSyllabic than to use this filter.
|
119
|
+
This filter hides words from scoring rather than scoring them poorly,
|
120
|
+
which is more of a benefit than a loss for a word you would like to omit."""
|
121
|
+
|
122
|
+
tokens = prep_dictionary(IGNORABLES)
|
91
123
|
|
92
124
|
|
93
125
|
class ProperName(Filter):
|
@@ -109,6 +141,10 @@ class ProperName(Filter):
|
|
109
141
|
# this will errantly match.
|
110
142
|
|
111
143
|
|
144
|
+
class LongProperName(MinLen, ProperName):
|
145
|
+
length = 2 # reject "names" of length 1
|
146
|
+
|
147
|
+
|
112
148
|
class NimiPu(MemberFilter):
|
113
149
|
tokens = prep_dictionary(NIMI_PU)
|
114
150
|
|
@@ -166,6 +202,10 @@ class Phonotactic(RegexFilter):
|
|
166
202
|
)
|
167
203
|
|
168
204
|
|
205
|
+
class LongPhonotactic(MinLen, Phonotactic):
|
206
|
+
length = 3
|
207
|
+
|
208
|
+
|
169
209
|
class Syllabic(RegexFilter):
|
170
210
|
"""Determines if a given token is syllabically valid Toki Pona (or `n`).
|
171
211
|
Words must have correctly ordered vowels and consonants, but the phonotactic
|
@@ -179,6 +219,10 @@ class Syllabic(RegexFilter):
|
|
179
219
|
)
|
180
220
|
|
181
221
|
|
222
|
+
class LongSyllabic(MinLen, Syllabic):
|
223
|
+
length = 3
|
224
|
+
|
225
|
+
|
182
226
|
class Alphabetic(SubsetFilter):
|
183
227
|
tokens = set(ALPHABET)
|
184
228
|
|
@@ -187,9 +231,8 @@ class AlphabeticRe(RegexFilter):
|
|
187
231
|
pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
|
188
232
|
|
189
233
|
|
190
|
-
class
|
191
|
-
|
192
|
-
pass
|
234
|
+
class LongAlphabetic(MinLen, Alphabetic):
|
235
|
+
length = 3
|
193
236
|
|
194
237
|
|
195
238
|
class Numeric(Filter):
|
@@ -266,11 +309,9 @@ class OrFilter:
|
|
266
309
|
if not len(filters) >= 2:
|
267
310
|
raise ValueError("Provide at least two Filters to OrFilter.")
|
268
311
|
|
269
|
-
|
270
|
-
if len(
|
271
|
-
raise Warning(
|
272
|
-
"Prefer OrMemberFilter for combining two or more MemberFilters."
|
273
|
-
)
|
312
|
+
member_filters = [f for f in filters if issubclass(f, MemberFilter)]
|
313
|
+
if len(member_filters) >= 2:
|
314
|
+
raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
|
274
315
|
|
275
316
|
filter = cls.__generic_filter(*filters)
|
276
317
|
|
@@ -279,7 +320,7 @@ class OrFilter:
|
|
279
320
|
|
280
321
|
class OrMemberFilter:
|
281
322
|
@staticmethod
|
282
|
-
def
|
323
|
+
def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
|
283
324
|
all_token_sets: List[Set[str]] = [f.tokens for f in filters]
|
284
325
|
all_tokens: Set[str] = set().union(*all_token_sets)
|
285
326
|
|
@@ -291,7 +332,7 @@ class OrMemberFilter:
|
|
291
332
|
def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
|
292
333
|
if not len(filters_) >= 2:
|
293
334
|
raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
|
294
|
-
filter = cls.
|
335
|
+
filter = cls.__member_filter(*filters_)
|
295
336
|
return filter
|
296
337
|
|
297
338
|
|
@@ -323,6 +364,11 @@ __all__ = [
|
|
323
364
|
"Alphabetic",
|
324
365
|
"AndFilter",
|
325
366
|
"EnglishIgnorables",
|
367
|
+
"LongAlphabetic",
|
368
|
+
"LongPhonotactic",
|
369
|
+
"LongProperName",
|
370
|
+
"LongSyllabic",
|
371
|
+
"MinLen",
|
326
372
|
"NimiLinkuCore",
|
327
373
|
"NimiLinkuSandbox",
|
328
374
|
"NimiPu",
|
sonatoki/constants.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
import json
|
3
|
-
from typing import Dict, List
|
3
|
+
from typing import Set, Dict, List
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
6
|
# LOCAL
|
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK
|
|
383
383
|
|
384
384
|
"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
|
385
385
|
ALLOWABLES = {
|
386
|
-
"cw", # Content Warning
|
387
386
|
"x", # ala
|
388
387
|
"y", # anu
|
389
388
|
"kxk", # ken ala ken
|
390
389
|
"wxw", # wile ala wile
|
391
390
|
}
|
392
391
|
|
393
|
-
|
394
|
-
#
|
395
|
-
"
|
396
|
-
"
|
397
|
-
"
|
398
|
-
"i",
|
399
|
-
"in",
|
400
|
-
"is",
|
401
|
-
"l", # they'll
|
402
|
-
"m", # i'm
|
403
|
-
"me",
|
404
|
-
"no",
|
405
|
-
"s", # let's
|
406
|
-
"so",
|
407
|
-
"t", # don't
|
408
|
-
"to",
|
409
|
-
"u", # you
|
410
|
-
"we",
|
411
|
-
"un", # un-
|
412
|
-
"use",
|
392
|
+
PHONOMATCHES = {
|
393
|
+
# "a", # ignore
|
394
|
+
# "an", # against
|
395
|
+
# "i", # against
|
396
|
+
# "in", # against
|
413
397
|
"some",
|
414
|
-
"like",
|
398
|
+
"like", # against
|
399
|
+
# "me", # against
|
400
|
+
# "no", # against
|
401
|
+
# "on", # against
|
402
|
+
# "se", # against
|
403
|
+
# "so", # against
|
404
|
+
# "some", # against
|
405
|
+
"to", # ignore
|
406
|
+
# "u", # against
|
407
|
+
# "un", # against
|
408
|
+
"use", # against
|
409
|
+
# "we", # against
|
415
410
|
}
|
416
411
|
|
412
|
+
ALPHABETIC_MATCHES = PHONOMATCHES | {
|
413
|
+
"a",
|
414
|
+
# "am",
|
415
|
+
# "as",
|
416
|
+
# "at",
|
417
|
+
# "aw", # aww
|
418
|
+
# "ek", # eek
|
419
|
+
# "ew",
|
420
|
+
# "ik",
|
421
|
+
# "il", # ill
|
422
|
+
# "im",
|
423
|
+
# "im",
|
424
|
+
# "ip",
|
425
|
+
# "is",
|
426
|
+
# "it",
|
427
|
+
# "l", # they'll
|
428
|
+
# "m", # i'm
|
429
|
+
# "ok",
|
430
|
+
# "op",
|
431
|
+
# "ow",
|
432
|
+
# "s", # let's
|
433
|
+
# "t", # don't
|
434
|
+
# "up",
|
435
|
+
# "us",
|
436
|
+
# "ut",
|
437
|
+
# "uw",
|
438
|
+
}
|
439
|
+
|
440
|
+
IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
|
441
|
+
|
417
442
|
UCSUR_RANGES = [
|
418
443
|
"\\U000F1900-\\U000F1977", # pu
|
419
444
|
"\\U000F1978-\\U000F1988", # ku suli
|
@@ -426,14 +451,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
|
|
426
451
|
# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
|
427
452
|
|
428
453
|
|
429
|
-
def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) ->
|
430
|
-
return
|
454
|
+
def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
|
455
|
+
return {d["word"] for d in data.values() if d[key] == value}
|
431
456
|
|
432
457
|
|
433
458
|
with open(LINKU) as f:
|
434
459
|
linku: Dict[str, Dict[str, str]] = json.loads(f.read())
|
435
|
-
NIMI_PU
|
436
|
-
NIMI_PU_SYNONYMS
|
460
|
+
NIMI_PU = category_helper(linku, "book", "pu")
|
461
|
+
NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
|
437
462
|
|
438
463
|
NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
|
439
464
|
NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
|
@@ -445,7 +470,7 @@ with open(LINKU) as f:
|
|
445
470
|
|
446
471
|
with open(SANDBOX) as f:
|
447
472
|
sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
|
448
|
-
NIMI_LINKU_SANDBOX
|
473
|
+
NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
|
449
474
|
|
450
475
|
del linku
|
451
476
|
del sandbox
|
@@ -1,18 +1,18 @@
|
|
1
|
-
sonatoki-0.3.
|
2
|
-
sonatoki-0.3.
|
3
|
-
sonatoki-0.3.
|
1
|
+
sonatoki-0.3.2.dist-info/METADATA,sha256=9cnhaaYFLxN3uaubD0jfTAU_CC9wUGtzho4fs1UGLFc,6341
|
2
|
+
sonatoki-0.3.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
|
3
|
+
sonatoki-0.3.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
4
|
sonatoki/Cleaners.py,sha256=m0j1a1vs9Mdqp724r9Xfh1Y_tyP6GYCkihv8rH8m7lA,1871
|
5
|
-
sonatoki/Configs.py,sha256=
|
6
|
-
sonatoki/Filters.py,sha256=-
|
5
|
+
sonatoki/Configs.py,sha256=o_uFp-Z6sbhbMi8drgQTkdu8S5LaTr0Xnns6Cg0cHSY,3548
|
6
|
+
sonatoki/Filters.py,sha256=-7zIV_IBsbASR7pF5WuoABNtBW5a7L135Ev_Rrn35o4,10664
|
7
7
|
sonatoki/Preprocessors.py,sha256=aMXXuFBDlJudvzvukvCa7BixuROXXEb62un7I-TGOGs,4441
|
8
8
|
sonatoki/Scorers.py,sha256=W-1uYiqjsDejJzoe592ixs7wHazjJXPhuo-41zuJ26U,3643
|
9
9
|
sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
|
10
10
|
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
11
|
sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
|
12
|
-
sonatoki/constants.py,sha256=
|
12
|
+
sonatoki/constants.py,sha256=qq1_ZTsVKG_d7nqlJv3a-KS6ZvYwfUSHWA--e0BuyXc,13268
|
13
13
|
sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
|
14
14
|
sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
|
15
15
|
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
|
17
17
|
sonatoki/utils.py,sha256=OMaRyoNvKGKYQCBDjQyaCI58-wMpQ0wrrNjTJKsEZ9Y,3550
|
18
|
-
sonatoki-0.3.
|
18
|
+
sonatoki-0.3.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|