sonatoki 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +41 -30
- sonatoki/Filters.py +121 -24
- sonatoki/constants.py +74 -38
- sonatoki/utils.py +14 -1
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/METADATA +28 -17
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/RECORD +8 -8
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/WHEEL +0 -0
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED

```diff
@@ -1,36 +1,36 @@
 # STL
 from copy import deepcopy
-from typing import List, Type,
+from typing import List, Type, TypedDict
 
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
-    NimiLinku,
-    NimiPuAle,
     NimiUCSUR,
     Alphabetic,
     ProperName,
-    Phonotactic,
     Punctuation,
-
+    LongSyllabic,
+    Miscellaneous,
+    NimiLinkuCore,
+    LongAlphabetic,
+    LongProperName,
+    OrMemberFilter,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
     NimiLinkuSandbox,
     EnglishIgnorables,
+    NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )
 
@@ -59,14 +59,14 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -74,14 +74,22 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(
+            NimiLinkuCore,
+            NimiLinkuCommon,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+            NimiUCSUR,
+            Miscellaneous,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -89,25 +97,28 @@ CorpusConfig: IloConfig = {
 }
 
 
+"""
+Mimics the previous implementation of ilo pi toki pona taso
+"""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
    ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
```
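The practical effect of these config changes: every non-Base config now runs `Backticks` first and scores with a merged member filter plus the new `Long*` filters. A minimal sketch of exercising the updated `PrefConfig`; the `Ilo(**config)` pattern is from the README quoted in the METADATA diff below, while `is_toki_pona` and its result here are assumptions, not verified against this release:

```python
# Minimal sketch, assuming the Ilo(**config) pattern from the project README.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# Backticks is now the first preprocessor, so inline code spans are stripped
# before tokenization and scoring.
print(ilo.is_toki_pona("mi `unrelated_code()` sona ala"))
```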
sonatoki/Filters.py
CHANGED

```diff
@@ -9,6 +9,7 @@ import regex
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
@@ -17,13 +18,17 @@ from sonatoki.constants import (
     ALLOWABLES,
     CONSONANTS,
     IGNORABLES,
-    NIMI_LINKU,
     NIMI_UCSUR,
-
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
     UCSUR_PUNCT_RANGES,
+    NIMI_LINKU_UNCOMMON,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -37,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError
 
 
+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"
 
```
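Both spellings of a length-gated filter named in `MinLen`'s docstring produce the same kind of class; a short sketch, with illustrative tokens:

```python
# Sketch: both forms gate on length, then defer to Alphabetic.
# LongAlphabetic is the prebuilt subclass added in this release; MinLen(...)
# builds an equivalent class at runtime via MinLen.__new__.
from sonatoki.Filters import Alphabetic, LongAlphabetic, MinLen

Min3Alphabetic = MinLen(Alphabetic, 3)

assert not LongAlphabetic.filter("mi")  # shorter than 3: rejected outright
assert LongAlphabetic.filter("sitelen")
assert Min3Alphabetic.filter("sitelen") == LongAlphabetic.filter("sitelen")
```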
```diff
@@ -78,11 +110,16 @@ class SubsetFilter(Filter):
 
 
 class Miscellaneous(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(ALLOWABLES)
 
 
 class EnglishIgnorables(MemberFilter):
-
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)
 
 
 class ProperName(Filter):
@@ -104,28 +141,48 @@ class ProperName(Filter):
     # this will errantly match.
 
 
+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_PU)
+
+
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+
+
+class NimiKuSuli(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_SULI)
+
 
+class NimiKuLili(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_LILI)
 
-class NimiPuAle(MemberFilter):
-    tokens = set(NIMI_PU + NIMI_PU_SYNONYMS)
 
+class NimiLinkuCore(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_CORE)
 
-class NimiLinku(MemberFilter):
-    tokens = set(NIMI_LINKU)
 
+class NimiLinkuCommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_COMMON)
 
-
-
+
+class NimiLinkuUncommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+
+
+class NimiLinkuObscure(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
 
 
 class NimiLinkuSandbox(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
 
 
 class NimiUCSUR(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_UCSUR)
 
 
 class Phonotactic(RegexFilter):
@@ -145,6 +202,10 @@ class Phonotactic(RegexFilter):
     )
 
 
+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
     Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -158,6 +219,10 @@ class Syllabic(RegexFilter):
     )
 
 
+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)
 
@@ -166,9 +231,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3
 
 
 class Numeric(Filter):
@@ -224,11 +288,10 @@ class OrFilter:
     Instead, the user is responsible for building an OrFilter out of their desired filters.
     """
 
-
-
-        raise ValueError("Must provide at least two Filters to OrFilter.")
+    @staticmethod
+    def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
 
-        class
+        class CombinedFilter(Filter):
             filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
             @classmethod
@@ -240,7 +303,37 @@ class OrFilter:
                         return True
                 return False
 
-        return
+        return CombinedFilter
+
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
+
+        filter = cls.__generic_filter(*filters)
+
+        return filter
+
+
+class OrMemberFilter:
+    @staticmethod
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+        all_token_sets: List[Set[str]] = [f.tokens for f in filters]
+        all_tokens: Set[str] = set().union(*all_token_sets)
+
+        class CombinedFilter(MemberFilter):
+            tokens = all_tokens
+
+        return CombinedFilter
+
+    def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
+        filter = cls.__member_filter(*filters_)
+        return filter
 
 
 class AndFilter(Filter):
```
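The difference between the two combinators, in short: `OrMemberFilter` unions the member filters' token sets once at class-creation time, so the combined filter is still a single set lookup per token, while `OrFilter` tries each wrapped filter in turn. A sketch, with illustrative tokens:

```python
# Sketch of the new combinator: the token sets are merged up front, so the
# combined filter costs one set-membership check per token.
from sonatoki.Filters import NimiLinkuCommon, NimiLinkuCore, NimiUCSUR, OrMemberFilter

CommonWords = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR)

assert CommonWords.filter("toki")       # a core word, found in the merged set
assert not CommonWords.filter("toki3")  # in none of the merged sets
```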
```diff
@@ -271,11 +364,15 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
-    "
-    "
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
+    "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
-    "
+    "NimiPuSynonyms",
     "NimiUCSUR",
     "Numeric",
     "OrFilter",
```
sonatoki/constants.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path
 
 # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK
 
 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
-    "cw",  # Content Warning
     "x",  # ala
     "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
 }
 
-
-#
-"
-"
-"
-    "i",
-    "in",
-    "is",
-    "l",  # they'll
-    "m",  # i'm
-    "me",
-    "no",
-    "s",  # let's
-    "so",
-    "t",  # don't
-    "to",
-    "u",  # you
-    "we",
-    "un",  # un-
-    "use",
+PHONOMATCHES = {
+    # "a",  # ignore
+    # "an",  # against
+    # "i",  # against
+    # "in",  # against
     "some",
-    "like",
+    "like",  # against
+    # "me",  # against
+    # "no",  # against
+    # "on",  # against
+    # "se",  # against
+    # "so",  # against
+    # "some",  # against
+    "to",  # ignore
+    # "u",  # against
+    # "un",  # against
+    "use",  # against
+    # "we",  # against
+}
+
+ALPHABETIC_MATCHES = PHONOMATCHES | {
+    "a",
+    # "am",
+    # "as",
+    # "at",
+    # "aw",  # aww
+    # "ek",  # eek
+    # "ew",
+    # "ik",
+    # "il",  # ill
+    # "im",
+    # "im",
+    # "ip",
+    # "is",
+    # "it",
+    # "l",  # they'll
+    # "m",  # i'm
+    # "ok",
+    # "op",
+    # "ow",
+    # "s",  # let's
+    # "t",  # don't
+    # "up",
+    # "us",
+    # "ut",
+    # "uw",
 }
 
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977",  # pu
     "\\U000F1978-\\U000F1988",  # ku suli
```
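Note that `PHONOMATCHES` and the literal it is combined with are sets, so `|` here is plain set union; with most entries commented out, the new constants reduce to a handful of words. A sketch of what actually survives the comments above:

```python
# Sketch: the unions reduce to small sets once the commented entries are gone.
PHONOMATCHES = {"some", "like", "to", "use"}
ALPHABETIC_MATCHES = PHONOMATCHES | {"a"}  # set union, not a dict merge
IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES

assert IGNORABLES == {"a", "like", "some", "to", "use"}
```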
```diff
@@ -421,24 +446,31 @@ UCSUR_RANGES = [
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 
+
+# NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
+# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
+
+
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}
+
+
 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU
-    NIMI_PU_SYNONYMS
-
-
-
-
-
-
-
-
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
+
+    NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
+    NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+
+    NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
+    NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
+    NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
+    NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
 
 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX
-        d["word"] for d in sandbox.values()
-    ]
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 del linku
 del sandbox
```
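For the shape of the data `category_helper` consumes, here is a sketch against a hand-built two-entry dict; the sample entries are invented, but they use the same `word`, `book`, and `usage_category` fields the loader reads above:

```python
# Sketch with invented sample data mirroring the fields used above.
def category_helper(data, key, value):
    return {d["word"] for d in data.values() if d[key] == value}

linku_sample = {
    "toki": {"word": "toki", "book": "pu", "usage_category": "core"},
    "kipisi": {"word": "kipisi", "book": "none", "usage_category": "uncommon"},
}

assert category_helper(linku_sample, "book", "pu") == {"toki"}
assert category_helper(linku_sample, "usage_category", "uncommon") == {"kipisi"}
```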
```diff
@@ -449,9 +481,13 @@ __all__ = [
     "ALL_PUNCT_RANGES",
     "ALPHABET",
     "CONSONANTS",
-    "
-    "
+    "NIMI_KU_LILI",
+    "NIMI_KU_SULI",
+    "NIMI_LINKU_COMMON",
+    "NIMI_LINKU_CORE",
+    "NIMI_LINKU_OBSCURE",
     "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_UNCOMMON",
     "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
```
sonatoki/utils.py
CHANGED

```diff
@@ -1,10 +1,23 @@
 # STL
 import re
-from typing import List
+from typing import Set, List, Iterable
+
+# LOCAL
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
 
 TO_ESCAPE = ["\\", "^", "[", "]", "-"]
 
 
+def prep_dictionary(words: Iterable[str]) -> Set[str]:
+    out: Set[str] = set()
+    cleaners = [Lowercase, ConsecutiveDuplicates]
+    for word in words:
+        for c in cleaners:
+            word = c.clean(word)
+        out.add(word)
+    return out
+
+
 def regex_escape(s: str) -> str:
     """Escape all characters which must be escaped when embedded in a character class."""
     for c in TO_ESCAPE:
```
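`prep_dictionary` pushes every dictionary entry through the same cleaners that incoming tokens go through, so `MemberFilter` lookups compare like with like. A sketch; the "manna" to "mana" collapse matches the `ConsecutiveDuplicates` example in the README below:

```python
# Sketch: entries are normalized the way tokens are, so a MemberFilter built
# from this set can actually match cleaned input.
from sonatoki.utils import prep_dictionary

assert prep_dictionary(["Namako", "manna"]) == {"namako", "mana"}
```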
{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.
+Version: 0.3.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -12,15 +12,22 @@ Description-Content-Type: text/markdown
 
 # sona toki
 
+<div align="center">
+
+![Test workflow for this library]
+[![Version number for this library]](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -53,12 +60,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
@@ -88,24 +95,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly caused by applying Latin alphabet filters to non-Latin text. Working on it!
```
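The last FAQ bullet is easy to trip over when extending the library; a sketch of the failure mode and the fix, with an invented word list for illustration:

```python
# Sketch: raw entries with capitals or doubled letters would never match
# cleaned tokens; prep_dictionary normalizes them first.
from sonatoki.Filters import MemberFilter
from sonatoki.utils import prep_dictionary

class MyWords(MemberFilter):
    tokens = prep_dictionary({"Namako", "manna"})  # stored as {"namako", "mana"}

assert MyWords.filter("namako")
```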
{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/RECORD
CHANGED

```diff
@@ -1,18 +1,18 @@
-sonatoki-0.3.
-sonatoki-0.3.
-sonatoki-0.3.
+sonatoki-0.3.2.dist-info/METADATA,sha256=9cnhaaYFLxN3uaubD0jfTAU_CC9wUGtzho4fs1UGLFc,6341
+sonatoki-0.3.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.3.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=m0j1a1vs9Mdqp724r9Xfh1Y_tyP6GYCkihv8rH8m7lA,1871
-sonatoki/Configs.py,sha256=
-sonatoki/Filters.py,sha256
+sonatoki/Configs.py,sha256=o_uFp-Z6sbhbMi8drgQTkdu8S5LaTr0Xnns6Cg0cHSY,3548
+sonatoki/Filters.py,sha256=-7zIV_IBsbASR7pF5WuoABNtBW5a7L135Ev_Rrn35o4,10664
 sonatoki/Preprocessors.py,sha256=aMXXuFBDlJudvzvukvCa7BixuROXXEb62un7I-TGOGs,4441
 sonatoki/Scorers.py,sha256=W-1uYiqjsDejJzoe592ixs7wHazjJXPhuo-41zuJ26U,3643
 sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
-sonatoki/constants.py,sha256=
+sonatoki/constants.py,sha256=qq1_ZTsVKG_d7nqlJv3a-KS6ZvYwfUSHWA--e0BuyXc,13268
 sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
 sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
-sonatoki/utils.py,sha256=
-sonatoki-0.3.
+sonatoki/utils.py,sha256=OMaRyoNvKGKYQCBDjQyaCI58-wMpQ0wrrNjTJKsEZ9Y,3550
+sonatoki-0.3.2.dist-info/RECORD,,
```

{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/WHEEL
File without changes

{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/licenses/LICENSE
File without changes