sonatoki 0.7.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.7.0 → sonatoki-0.8.0}/PKG-INFO +1 -1
- {sonatoki-0.7.0 → sonatoki-0.8.0}/pyproject.toml +1 -1
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Configs.py +2 -2
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Filters.py +25 -2
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_filters.py +3 -3
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_ilo.py +4 -1
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_scorers.py +2 -2
- {sonatoki-0.7.0 → sonatoki-0.8.0}/LICENSE +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/README.md +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/types.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/__init__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_properties.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_tokenize.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_utils.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Configs.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     Filter,
+    PuName,
     Numeric,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -131,7 +131,7 @@ LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
+    "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizerRe, # mimics old tokenizer
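For orientation, a hedged sketch of how this config change could be consumed. It assumes sonatoki.ilo exposes an Ilo class that accepts an IloConfig's entries as keyword arguments and offers an is_toki_pona method; neither is shown in this diff, only the config keys above are.

    # Minimal sketch (not from this diff): building an Ilo from LazyConfig and,
    # optionally, restoring the weakened ProperName filter in a copied config.
    # Assumes Ilo(**config) and Ilo.is_toki_pona(...) exist as described above.
    from sonatoki.ilo import Ilo            # assumed entry point
    from sonatoki.Configs import LazyConfig
    from sonatoki.Filters import ProperName, PuName

    ilo = Ilo(**LazyConfig)                 # 0.8.0: LazyConfig scores names with PuName
    print(ilo.is_toki_pona("poki li nasin SQLite"))  # assumed method

    # To use the weakened 0.8.0 ProperName semantics instead, swap the filter
    # in a copy of the config before constructing the Ilo.
    custom = dict(LazyConfig)
    custom["scoring_filters"] = [
        ProperName if f is PuName else f for f in LazyConfig["scoring_filters"]
    ]
    relaxed_ilo = Ilo(**custom)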
{sonatoki-0.7.0 → sonatoki-0.8.0}/src/sonatoki/Filters.py

@@ -141,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):


 class ProperName(Filter):
-    """
-
+    """Determine if a given token is a valid name based on a reasonable weakening of
+    the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+    letter at its start and is **not** fully capitalized.
+
+    This corrects an issue with PuName, where scripts lacking a case distinction are
+    errantly counted"""
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        first_capitalized = token[0].isupper()
+        all_caps = token.isupper()
+
+        return first_capitalized and not all_caps
+
+
+class PuName(Filter):
+    """Determine if a given token is a valid name (also called a loan word) based on
+    the rules given in Toki Pona: The Language of Good.
+    When Toki Pona is written with the Latin alphabet, names are
     capitalized at their start. This filter identifies those tokens.

     Note that this alone cannot determine if a token is a valid name,
@@ -156,6 +175,9 @@ class ProperName(Filter):
     @override
     @cache(maxsize=None)
     def filter(cls, token: str) -> bool:
+        # first_capitalized = token[0].isupper()
+        # rest_capitalized = token[1:] == token[1:].upper()
+        # return first_capitalized and not rest_capitalized
         return token == token.capitalize()
         # TODO: If the token is in a script which doesn't have a case distinction,
         # this will errantly match.
@@ -445,6 +467,7 @@ __all__ = [
     "Or",
     "Phonotactic",
     "ProperName",
+    "PuName",
     "Punctuation",
     "Syllabic",
 ]
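To make the behavioral difference concrete, here is a minimal sketch (not part of the package's tests) exercising the two filters as defined in the hunks above; the expected results follow directly from the filter bodies. "USA" is an illustrative token of ours, while the other tokens appear in the new test data later in this diff.

    # Sketch: the weakened ProperName vs. the stricter PuName, per the code above.
    from sonatoki.Filters import ProperName, PuName

    # ProperName: leading capital letter and not fully capitalized.
    assert ProperName.filter("Firefox")      # capitalized normally
    assert ProperName.filter("SQLite")       # mixed case now counts as a name
    assert ProperName.filter("FaceBook")     # likewise
    assert not ProperName.filter("USA")      # fully capitalized is rejected
    assert not ProperName.filter("21st")     # no leading capital letter

    # PuName (the old ProperName behavior): token must equal token.capitalize().
    assert PuName.filter("Firefox")
    assert not PuName.filter("SQLite")       # "SQLite" != "Sqlite"
    assert PuName.filter("21st")             # digits carry no case, so it matches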
{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_filters.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Phonotactic,
     Punctuation,
     AlphabeticRe,
@@ -144,7 +144,7 @@ def test_AlphabeticRe(s: str):

 @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
 def test_ProperName(s: str):
-    res = ProperName.filter(s)
+    res = PuName.filter(s)
     assert res, repr(s)


@@ -247,7 +247,7 @@ def test_OrFilter_IsipinEpiku(s: str):
 @given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_AndFilter(s: str):
     s = s.capitalize()
-    f = And(ProperName, NimiPu)
+    f = And(PuName, NimiPu)
     assert f.filter(s)


{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_ilo.py

@@ -94,6 +94,9 @@ NAME_MATCHES = [
     "toki Kanse li lon",
     "toki Lojban li nasa e lawa mi",
     "ilo Firefox",
+    "ilo FaceBook li nasa",
+    "mi kepeken ilo MySQL",
+    "poki li nasin SQLite",
     "mi musi Space Station 13",
     "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
 ]
@@ -135,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
 ]

 EXCESSIVE_ALPHABETICS = [
-    "21st", # candidate for xfails?
     "wen i tok usin onli notes in toki pona i look silli. ",
     "I wait, I sulk, as a tool I make stoops to ineptness.",
     "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
@@ -161,6 +163,7 @@ EXCESSIVE_ENGLISH = [
     "i'm online all the time",
     "How to Cut a Kiwi",
     "a e i o u",
+    "21st", # previous false positive; fixed by ProperName change
 ]

 NON_MATCHES = [
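The relocated "21st" case illustrates the fix: under the old check (now PuName) a digit-leading token equals its own str.capitalize() and so counted as a name, while the weakened ProperName demands an actual leading capital. A tiny sketch:

    # Sketch of why "21st" no longer passes the weakened name filter.
    token = "21st"
    print(token == token.capitalize())  # True  -> old ProperName / new PuName matched it
    print(token[0].isupper())           # False -> the weakened ProperName rejects it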
{sonatoki-0.7.0 → sonatoki-0.8.0}/tests/test_scorers.py

@@ -10,10 +10,10 @@ from hypothesis import given, example
 from sonatoki.Filters import (
     Filter,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
-    ProperName,
     Phonotactic,
     NimiLinkuCore,
     PunctuationRe,
@@ -31,7 +31,7 @@ FILTERS = [
     NimiLinkuCore,
     NimiLinkuCommon,
     Alphabetic,
-    ProperName,
+    PuName,
     Phonotactic,
     PunctuationRe,
 ]