sonatoki 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- sonatoki/Configs.py +2 -2
- sonatoki/Filters.py +25 -2
- {sonatoki-0.7.0.dist-info → sonatoki-0.8.0.dist-info}/METADATA +1 -1
- {sonatoki-0.7.0.dist-info → sonatoki-0.8.0.dist-info}/RECORD +6 -6
- {sonatoki-0.7.0.dist-info → sonatoki-0.8.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.7.0.dist-info → sonatoki-0.8.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -11,12 +11,12 @@ from sonatoki.Filters import (
|
|
11
11
|
And,
|
12
12
|
Not,
|
13
13
|
Filter,
|
14
|
+
PuName,
|
14
15
|
Numeric,
|
15
16
|
NimiUCSUR,
|
16
17
|
Alphabetic,
|
17
18
|
NimiKuLili,
|
18
19
|
NimiKuSuli,
|
19
|
-
ProperName,
|
20
20
|
Punctuation,
|
21
21
|
LongSyllabic,
|
22
22
|
Miscellaneous,
|
@@ -131,7 +131,7 @@ LazyConfig: IloConfig = {
|
|
131
131
|
"preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
|
132
132
|
"cleaners": [ConsecutiveDuplicates],
|
133
133
|
"ignoring_filters": [Numeric, Punctuation],
|
134
|
-
"scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
|
134
|
+
"scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
|
135
135
|
"scorer": SoftPassFail,
|
136
136
|
"passing_score": 0.8,
|
137
137
|
"word_tokenizer": WordTokenizerRe, # mimics old tokenizer
|
sonatoki/Filters.py
CHANGED
@@ -141,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):
|
|
141
141
|
|
142
142
|
|
143
143
|
class ProperName(Filter):
|
144
|
-
"""
|
145
|
-
|
144
|
+
"""Determine if a given token is a valid name based on a reasonable weakening of
|
145
|
+
the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
|
146
|
+
letter at its start and is **not** fully capitalized.
|
147
|
+
|
148
|
+
This corrects an issue with PuName, where scripts lacking a case distinction are
|
149
|
+
errantly counted"""
|
150
|
+
|
151
|
+
@classmethod
|
152
|
+
@override
|
153
|
+
@cache(maxsize=None)
|
154
|
+
def filter(cls, token: str) -> bool:
|
155
|
+
first_capitalized = token[0].isupper()
|
156
|
+
all_caps = token.isupper()
|
157
|
+
|
158
|
+
return first_capitalized and not all_caps
|
159
|
+
|
160
|
+
|
161
|
+
class PuName(Filter):
|
162
|
+
"""Determine if a given token is a valid name (also called a loan word) based on
|
163
|
+
the rules given in Toki Pona: The Language of Good.
|
164
|
+
When Toki Pona is written with the Latin alphabet, names are
|
146
165
|
capitalized at their start. This filter identifies those tokens.
|
147
166
|
|
148
167
|
Note that this alone cannot determine if a token is a valid name,
|
@@ -156,6 +175,9 @@ class ProperName(Filter):
|
|
156
175
|
@override
|
157
176
|
@cache(maxsize=None)
|
158
177
|
def filter(cls, token: str) -> bool:
|
178
|
+
# first_capitalized = token[0].isupper()
|
179
|
+
# rest_capitalized = token[1:] == token[1:].upper()
|
180
|
+
# return first_capitalized and not rest_capitalized
|
159
181
|
return token == token.capitalize()
|
160
182
|
# TODO: If the token is in a script which doesn't have a case distinction,
|
161
183
|
# this will errantly match.
|
@@ -445,6 +467,7 @@ __all__ = [
|
|
445
467
|
"Or",
|
446
468
|
"Phonotactic",
|
447
469
|
"ProperName",
|
470
|
+
"PuName",
|
448
471
|
"Punctuation",
|
449
472
|
"Syllabic",
|
450
473
|
]
|
@@ -1,9 +1,9 @@
|
|
1
|
-
sonatoki-0.
|
2
|
-
sonatoki-0.
|
3
|
-
sonatoki-0.
|
1
|
+
sonatoki-0.8.0.dist-info/METADATA,sha256=52aTVq7ljGFzYm1Pdh9tKaRN3IVfXruRJZbwIiAPi9w,6517
|
2
|
+
sonatoki-0.8.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
|
3
|
+
sonatoki-0.8.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
4
|
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
5
|
-
sonatoki/Configs.py,sha256=
|
6
|
-
sonatoki/Filters.py,sha256=
|
5
|
+
sonatoki/Configs.py,sha256=h6-igZbhbYoYA0gJLrd3YCa5annTqacsAMGB1dX3v9A,4758
|
6
|
+
sonatoki/Filters.py,sha256=rBEJrY_R6koFpoYl4yfo_9UR-i21HbvlUF0ORg1g0WE,13411
|
7
7
|
sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
|
8
8
|
sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
|
9
9
|
sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
|
@@ -18,4 +18,4 @@ sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
|
|
18
18
|
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
19
19
|
sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
|
20
20
|
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
21
|
-
sonatoki-0.
|
21
|
+
sonatoki-0.8.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|