sonatoki 0.8.4__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +20 -14
- sonatoki/Filters.py +65 -11
- sonatoki/Scorers.py +67 -1
- sonatoki/__main__.py +4 -4
- sonatoki/constants.py +5 -4
- {sonatoki-0.8.4.dist-info → sonatoki-0.9.0.dist-info}/METADATA +1 -1
- {sonatoki-0.8.4.dist-info → sonatoki-0.9.0.dist-info}/RECORD +10 -9
- {sonatoki-0.8.4.dist-info → sonatoki-0.9.0.dist-info}/WHEEL +1 -1
- sonatoki-0.9.0.dist-info/entry_points.txt +4 -0
- {sonatoki-0.8.4.dist-info → sonatoki-0.9.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -9,14 +9,17 @@ from sonatoki.types import Number
|
|
9
9
|
from sonatoki.Filters import (
|
10
10
|
Or,
|
11
11
|
And,
|
12
|
+
Len,
|
12
13
|
Not,
|
13
14
|
Filter,
|
14
15
|
PuName,
|
15
16
|
Numeric,
|
17
|
+
Syllabic,
|
16
18
|
NimiUCSUR,
|
17
19
|
Alphabetic,
|
18
20
|
NimiKuLili,
|
19
21
|
NimiKuSuli,
|
22
|
+
ProperName,
|
20
23
|
Punctuation,
|
21
24
|
LongSyllabic,
|
22
25
|
Miscellaneous,
|
@@ -29,7 +32,7 @@ from sonatoki.Filters import (
|
|
29
32
|
NimiLinkuUncommon,
|
30
33
|
FalsePosAlphabetic,
|
31
34
|
)
|
32
|
-
from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
|
35
|
+
from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
|
33
36
|
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
|
34
37
|
from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
|
35
38
|
from sonatoki.Preprocessors import (
|
@@ -62,8 +65,8 @@ __DICT_PHONOMATCHES = {
|
|
62
65
|
"we", # 1st person plural, english
|
63
66
|
"wi", # wii and discussions of syllables
|
64
67
|
"sole", # singular, of shoe
|
68
|
+
"omen", # ominous
|
65
69
|
# unexplored candidates for removal
|
66
|
-
# "omen", # ominous
|
67
70
|
# "papa", # father
|
68
71
|
# "lo", # "lo" and "loo"
|
69
72
|
# "ewe", # sheep
|
@@ -99,11 +102,11 @@ PrefConfig: IloConfig = {
|
|
99
102
|
"cleaners": [ConsecutiveDuplicates],
|
100
103
|
"ignoring_filters": [Numeric, Punctuation],
|
101
104
|
"scoring_filters": [
|
102
|
-
Or(NimiLinkuByUsage(30), NimiUCSUR),
|
103
|
-
And(
|
105
|
+
Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
|
106
|
+
Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
|
104
107
|
# NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
|
105
|
-
|
106
|
-
And(
|
108
|
+
Len(ProperName, min=2, max=24),
|
109
|
+
Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
|
107
110
|
],
|
108
111
|
"scorer": SoftScaling,
|
109
112
|
"passing_score": 0.8,
|
@@ -114,15 +117,18 @@ CorpusConfig: IloConfig = {
|
|
114
117
|
"cleaners": [ConsecutiveDuplicates],
|
115
118
|
"ignoring_filters": [Numeric, Punctuation],
|
116
119
|
"scoring_filters": [
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
120
|
+
Len(
|
121
|
+
Or(
|
122
|
+
# awkward but efficient syntax
|
123
|
+
NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
|
124
|
+
NimiUCSUR,
|
125
|
+
Miscellaneous,
|
126
|
+
),
|
127
|
+
max=19,
|
122
128
|
),
|
123
|
-
And(
|
124
|
-
|
125
|
-
And(
|
129
|
+
Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
|
130
|
+
Len(ProperName, min=2, max=24),
|
131
|
+
Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
|
126
132
|
],
|
127
133
|
"scorer": SoftScaling,
|
128
134
|
"passing_score": 0.8,
|
sonatoki/Filters.py
CHANGED
@@ -7,7 +7,7 @@ from functools import lru_cache as cache # cache comes in 3.9
|
|
7
7
|
|
8
8
|
# PDM
|
9
9
|
import regex
|
10
|
-
from typing_extensions import override
|
10
|
+
from typing_extensions import override, deprecated
|
11
11
|
|
12
12
|
# LOCAL
|
13
13
|
from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
|
@@ -41,6 +41,7 @@ class Filter(ABC):
|
|
41
41
|
raise NotImplementedError
|
42
42
|
|
43
43
|
|
44
|
+
@deprecated("Use sonatoki.Filters.Len instead")
|
44
45
|
class MinLen(Filter):
|
45
46
|
"""
|
46
47
|
Meta filter meant to be inherited by another filter to add a length requirement.
|
@@ -62,12 +63,54 @@ class MinLen(Filter):
|
|
62
63
|
return super().filter(token)
|
63
64
|
|
64
65
|
def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
|
65
|
-
class MinLenFilter(MinLen,
|
66
|
+
class MinLenFilter(MinLen, filter):
|
66
67
|
length = length_
|
67
68
|
|
68
69
|
return MinLenFilter
|
69
70
|
|
70
71
|
|
72
|
+
class Len(Filter):
|
73
|
+
"""Meta filter to be inherited by another filter to add any length
|
74
|
+
requirement. A bound will only be considered if it is non-zero, so you may
|
75
|
+
omit a minimum length or a maximum length to bound only one of them.
|
76
|
+
|
77
|
+
If inherited when defining a class, `Len` must be the first argument so `super()` resolves correctly.
|
78
|
+
|
79
|
+
To add minimum or maximum length requirements when defining a class:
|
80
|
+
```
|
81
|
+
class LongAlphabetic(Len, Alphabetic):
|
82
|
+
minlen = 3
|
83
|
+
maxlen = 20
|
84
|
+
```
|
85
|
+
|
86
|
+
You may also wrap any other filter with a minimum and/or maximum length requirement like so:
|
87
|
+
```
|
88
|
+
Len(Alphabetic, min=3, max=20)
|
89
|
+
```
|
90
|
+
"""
|
91
|
+
|
92
|
+
minlen = 0
|
93
|
+
maxlen = 0
|
94
|
+
|
95
|
+
@classmethod
|
96
|
+
@cache(maxsize=None)
|
97
|
+
def filter(cls, token: str) -> bool:
|
98
|
+
tokenlen = len(token)
|
99
|
+
|
100
|
+
if cls.minlen and tokenlen < cls.minlen:
|
101
|
+
return False
|
102
|
+
if cls.maxlen and tokenlen > cls.maxlen:
|
103
|
+
return False
|
104
|
+
return super().filter(token)
|
105
|
+
|
106
|
+
def __new__(cls, filter: Type[Filter], min: int = 0, max: int = 0) -> Type[Filter]:
|
107
|
+
class LenFilter(Len, filter):
|
108
|
+
minlen = min
|
109
|
+
maxlen = max
|
110
|
+
|
111
|
+
return LenFilter
|
112
|
+
|
113
|
+
|
71
114
|
class RegexFilter(Filter):
|
72
115
|
pattern: "re.Pattern[str]"
|
73
116
|
|
@@ -183,8 +226,8 @@ class PuName(Filter):
|
|
183
226
|
# this will errantly match.
|
184
227
|
|
185
228
|
|
186
|
-
class LongProperName(
|
187
|
-
|
229
|
+
class LongProperName(Len, ProperName):
|
230
|
+
minlen = 2 # reject "names" of length 1
|
188
231
|
|
189
232
|
|
190
233
|
class NimiLinkuByUsage:
|
@@ -252,8 +295,8 @@ class Phonotactic(RegexFilter):
|
|
252
295
|
)
|
253
296
|
|
254
297
|
|
255
|
-
class LongPhonotactic(
|
256
|
-
|
298
|
+
class LongPhonotactic(Len, Phonotactic):
|
299
|
+
minlen = 3
|
257
300
|
|
258
301
|
|
259
302
|
class Syllabic(RegexFilter):
|
@@ -271,8 +314,8 @@ class Syllabic(RegexFilter):
|
|
271
314
|
)
|
272
315
|
|
273
316
|
|
274
|
-
class LongSyllabic(
|
275
|
-
|
317
|
+
class LongSyllabic(Len, Syllabic):
|
318
|
+
minlen = 3
|
276
319
|
|
277
320
|
|
278
321
|
class Alphabetic(SubsetFilter):
|
@@ -283,8 +326,8 @@ class AlphabeticRe(RegexFilter):
|
|
283
326
|
pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
|
284
327
|
|
285
328
|
|
286
|
-
class LongAlphabetic(
|
287
|
-
|
329
|
+
class LongAlphabetic(Len, Alphabetic):
|
330
|
+
minlen = 3
|
288
331
|
|
289
332
|
|
290
333
|
class Numeric(Filter):
|
@@ -448,15 +491,26 @@ class Not(Filter):
|
|
448
491
|
return NotFilter
|
449
492
|
|
450
493
|
|
494
|
+
class Pass(Filter):
|
495
|
+
@classmethod
|
496
|
+
@override
|
497
|
+
@cache(maxsize=None)
|
498
|
+
def filter(cls, token: str) -> bool:
|
499
|
+
return True
|
500
|
+
|
501
|
+
|
502
|
+
class Fail(Not, Pass): ...
|
503
|
+
|
504
|
+
|
451
505
|
__all__ = [
|
452
506
|
"Alphabetic",
|
453
507
|
"And",
|
454
508
|
"FalsePosSyllabic",
|
509
|
+
"Len",
|
455
510
|
"LongAlphabetic",
|
456
511
|
"LongPhonotactic",
|
457
512
|
"LongProperName",
|
458
513
|
"LongSyllabic",
|
459
|
-
"MinLen",
|
460
514
|
"NimiLinkuCore",
|
461
515
|
"NimiLinkuSandbox",
|
462
516
|
"NimiPu",
|
sonatoki/Scorers.py
CHANGED
@@ -8,7 +8,7 @@ from typing_extensions import override
|
|
8
8
|
|
9
9
|
# LOCAL
|
10
10
|
from sonatoki.types import Number, Scorecard
|
11
|
-
from sonatoki.Filters import Filter
|
11
|
+
from sonatoki.Filters import Pass, Filter
|
12
12
|
|
13
13
|
|
14
14
|
class Scorer(ABC):
|
@@ -112,6 +112,67 @@ class Scaling(Scorer):
|
|
112
112
|
return total_score / max_score if max_score else 0
|
113
113
|
|
114
114
|
|
115
|
+
class Voting(Scaling):
|
116
|
+
"""Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
|
117
|
+
a filter matches, with the first filter scoring a 1. However, after all
|
118
|
+
scores are derived, each token scoring 0 is given an opportunity
|
119
|
+
to score based on its nearest 3 neighbors.
|
120
|
+
|
121
|
+
If created with a Filter, tokens must also pass that filter to be
|
122
|
+
considered for voting.
|
123
|
+
"""
|
124
|
+
|
125
|
+
prereq: Type[Filter] = Pass
|
126
|
+
threshold: int = 0
|
127
|
+
|
128
|
+
def __new__(cls, filter: Type[Filter], threshold_: int):
|
129
|
+
class AnonVoting(Voting):
|
130
|
+
prereq = filter
|
131
|
+
threshold = threshold_
|
132
|
+
|
133
|
+
return AnonVoting
|
134
|
+
|
135
|
+
@classmethod
|
136
|
+
@override
|
137
|
+
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
138
|
+
if not tokens:
|
139
|
+
return 1
|
140
|
+
|
141
|
+
if len(tokens) < 4:
|
142
|
+
return super().score(tokens, filters)
|
143
|
+
|
144
|
+
len_filters = len(filters)
|
145
|
+
max_score = len(tokens) * len_filters
|
146
|
+
|
147
|
+
# score_token only emits ints
|
148
|
+
# but the averaging emits floats
|
149
|
+
# it doesn't really matter as long as no score exceeds len_filters
|
150
|
+
scores: List[Number] = []
|
151
|
+
for token in tokens:
|
152
|
+
score = cls.score_token(token, filters, len_filters)
|
153
|
+
scores.append(score)
|
154
|
+
|
155
|
+
# only consider scores from before voting
|
156
|
+
copied_scores = scores[:]
|
157
|
+
for i, (token, score) in enumerate(zip(tokens, copied_scores)):
|
158
|
+
if score > cls.threshold:
|
159
|
+
continue
|
160
|
+
if not cls.prereq.filter(token):
|
161
|
+
continue
|
162
|
+
|
163
|
+
# TODO: this is kinda dumb.
|
164
|
+
# we want to get exactly 3 neighbors, favoring 2 before and 1 after
|
165
|
+
# the way i'm doing this is both bad and slow as hell
|
166
|
+
start = max(i - 2, 0)
|
167
|
+
end = min(i + 1, len(scores) - 1)
|
168
|
+
neighbors = copied_scores[start:i] + copied_scores[i + 1 : end + 1]
|
169
|
+
scores[i] = sum(neighbors) / len(neighbors)
|
170
|
+
|
171
|
+
total_score = sum(scores)
|
172
|
+
|
173
|
+
return total_score / max_score if max_score else 0
|
174
|
+
|
175
|
+
|
115
176
|
class SoftPassFail(Soften, PassFail):
|
116
177
|
"""Same as `PassFail`, but shorter messages are subject to less harsh
|
117
178
|
scoring."""
|
@@ -122,6 +183,11 @@ class SoftScaling(Soften, Scaling):
|
|
122
183
|
scoring."""
|
123
184
|
|
124
185
|
|
186
|
+
class SoftVoting(Soften, Voting):
|
187
|
+
"""Same as `Voting`, but shorter messages are subject to less harsh
|
188
|
+
scoring."""
|
189
|
+
|
190
|
+
|
125
191
|
class SentenceScorer(ABC):
|
126
192
|
@classmethod
|
127
193
|
@abstractmethod
|
sonatoki/__main__.py
CHANGED
@@ -60,11 +60,11 @@ def download_json(url: str) -> Dict[str, Any]:
|
|
60
60
|
|
61
61
|
def regen_linku_data():
|
62
62
|
data = download_json(LINKU_WORDS)
|
63
|
-
with open(os.path.join(HERE, "linku.json"), "w") as f:
|
63
|
+
with open(os.path.join(HERE, "linku.json"), "w", encoding="utf-8") as f:
|
64
64
|
_ = f.write(json.dumps(data))
|
65
65
|
|
66
66
|
data = download_json(LINKU_SANDBOX)
|
67
|
-
with open(os.path.join(HERE, "sandbox.json"), "w") as f:
|
67
|
+
with open(os.path.join(HERE, "sandbox.json"), "w", encoding="utf-8") as f:
|
68
68
|
_ = f.write(json.dumps(data))
|
69
69
|
|
70
70
|
|
@@ -96,11 +96,11 @@ def regen_false_negatives():
|
|
96
96
|
continue
|
97
97
|
|
98
98
|
# TODO: include short matches or no?
|
99
|
-
with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
|
99
|
+
with open(os.path.join(HERE, "syllabic.txt"), "w", encoding="utf-8") as f:
|
100
100
|
syllabic_final = sorted([word + "\n" for word in syllabic_matches])
|
101
101
|
f.writelines(syllabic_final)
|
102
102
|
|
103
|
-
with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
|
103
|
+
with open(os.path.join(HERE, "alphabetic.txt"), "w", encoding="utf-8") as f:
|
104
104
|
alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
|
105
105
|
f.writelines(alphabetic_final)
|
106
106
|
|
sonatoki/constants.py
CHANGED
@@ -648,6 +648,7 @@ FALSE_POS_SYLLABIC = {
|
|
648
648
|
"insolate",
|
649
649
|
"insulate",
|
650
650
|
"intense",
|
651
|
+
"saluton",
|
651
652
|
# "june",
|
652
653
|
"lemon",
|
653
654
|
"manipulate",
|
@@ -698,9 +699,9 @@ def linku_data() -> Dict[str, LinkuWord]:
|
|
698
699
|
# NOTE: this does open+read+parse two files each time you construct a filter
|
699
700
|
# but i expect users to construct filters only at the start of runtime
|
700
701
|
# there is no reason to waste your RAM by leaving the linku data in it
|
701
|
-
with open(LINKU) as f:
|
702
|
+
with open(LINKU, "r", encoding="utf-8") as f:
|
702
703
|
linku: Dict[str, LinkuWord] = json.loads(f.read())
|
703
|
-
with open(SANDBOX) as f:
|
704
|
+
with open(SANDBOX, "r", encoding="utf-8") as f:
|
704
705
|
sandbox: Dict[str, LinkuWord] = json.loads(f.read())
|
705
706
|
|
706
707
|
return {**linku, **sandbox}
|
@@ -731,10 +732,10 @@ def words_by_usage(
|
|
731
732
|
NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
|
732
733
|
|
733
734
|
|
734
|
-
# with open(SYLLABICS) as f:
|
735
|
+
# with open(SYLLABICS, "r", encoding="utf-8") as f:
|
735
736
|
# FALSE_POS_SYLLABIC = {line.strip() for line in f}
|
736
737
|
#
|
737
|
-
# with open(ALPHABETICS) as f:
|
738
|
+
# with open(ALPHABETICS, "r", encoding="utf-8") as f:
|
738
739
|
# FALSE_POS_ALPHABETIC = {line.strip() for line in f}
|
739
740
|
|
740
741
|
__all__ = [
|
@@ -1,16 +1,17 @@
|
|
1
|
-
sonatoki-0.
|
2
|
-
sonatoki-0.
|
3
|
-
sonatoki-0.
|
1
|
+
sonatoki-0.9.0.dist-info/METADATA,sha256=fg-GTxWTWA71w87R_dRWY07pBioj_034nEGHgxE9_EU,6893
|
2
|
+
sonatoki-0.9.0.dist-info/WHEEL,sha256=pM0IBB6ZwH3nkEPhtcp50KvKNX-07jYtnb1g1m6Z4Co,90
|
3
|
+
sonatoki-0.9.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
4
|
+
sonatoki-0.9.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
5
|
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
5
|
-
sonatoki/Configs.py,sha256
|
6
|
-
sonatoki/Filters.py,sha256=
|
6
|
+
sonatoki/Configs.py,sha256=dwWe1kmgXA9CmiF2KBKhkQfPYOs_TQHjXb5DW9wuMBA,4899
|
7
|
+
sonatoki/Filters.py,sha256=8HAtR6_Rk6GPboaS_MHwSjZBJxYnAA8kYbRPI0eR6sM,14823
|
7
8
|
sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
|
8
|
-
sonatoki/Scorers.py,sha256=
|
9
|
+
sonatoki/Scorers.py,sha256=qoceRK14-D-xgi5DM290f-PJNOViEzOsRXBdsBbNIWM,7441
|
9
10
|
sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
|
10
11
|
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
sonatoki/__main__.py,sha256=
|
12
|
+
sonatoki/__main__.py,sha256=QIWRLYS1jb7OBUBK5s8kYoeiMv6MLBlt_I7H7tIVjpU,5745
|
12
13
|
sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
|
13
|
-
sonatoki/constants.py,sha256=
|
14
|
+
sonatoki/constants.py,sha256=Akqt8Kw3F9PPdpZ4UbcrTy2BGfIafNTf2YH0D35cO7k,19431
|
14
15
|
sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
|
15
16
|
sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
|
16
17
|
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -18,4 +19,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
|
|
18
19
|
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
19
20
|
sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
|
20
21
|
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
21
|
-
sonatoki-0.
|
22
|
+
sonatoki-0.9.0.dist-info/RECORD,,
|
File without changes
|