sonatoki 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.3.1 → sonatoki-0.3.2}/PKG-INFO +1 -1
- {sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml +1 -3
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py +26 -25
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py +58 -12
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py +53 -28
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py +38 -59
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py +31 -11
- sonatoki-0.3.2/tests/test_properties.py +78 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py +1 -1
- {sonatoki-0.3.1 → sonatoki-0.3.2}/LICENSE +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/README.md +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/__init__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_cleaners.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.3.1"
+version = "0.3.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -16,8 +16,6 @@ readme = "README.md"
 [project.license]
 text = "AGPL-3.0-or-later"

-[project.optional-dependencies]
-
 [build-system]
 requires = [
     "pdm-backend",
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py

@@ -5,17 +5,17 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
@@ -28,12 +28,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )

@@ -62,14 +59,14 @@ BaseConfig: IloConfig = {


 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +74,9 @@ PrefConfig: IloConfig = {
 }

 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,10 +85,11 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-
-
-
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -99,25 +97,28 @@ CorpusConfig: IloConfig = {
 }


+"""
+Mimics the previous implementation of ilo pi toki pona taso
+"""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }

 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-
-
-
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
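Context for the configs above (an illustration, not part of the released diff): each `IloConfig` is a TypedDict of pipeline stages, and the test files further down build checkers by unpacking one into `Ilo`. A minimal usage sketch, assuming `Ilo` lives in `sonatoki.ilo` and accepts the config fields as keyword arguments, the same way the test fixtures construct it; the example strings are taken from the test data in this release:

```python
# Sketch only: mirrors how the tests below construct their `ilo` fixtures.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)  # unpack the TypedDict into the checker

# is_toki_pona() runs preprocessors, cleaners, filters, and the scorer in order.
print(ilo.is_toki_pona("mi sona ala e nimi sunopatikuna"))  # expected: True (from KNOWN_GOOD)
print(ilo.is_toki_pona("super bruh moment 64"))             # expected: False (from NON_MATCHES)
```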
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py

@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError


+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, Filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"

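To illustrate the new meta filter (a sketch based on the docstring above, not code from the release): constructing `MinLen(SomeFilter, n)` yields a filter that first enforces the length floor and then defers to the wrapped filter, which is how the `Long*` classes further down are built.

```python
# Sketch: the name AtLeastThreeAlphabetic is made up for illustration.
from sonatoki.Filters import Alphabetic, MinLen

AtLeastThreeAlphabetic = MinLen(Alphabetic, 3)

print(AtLeastThreeAlphabetic.filter("ijo"))  # expected: True  (3+ chars, all in the toki pona alphabet)
print(AtLeastThreeAlphabetic.filter("mi"))   # expected: False (rejected by the length floor alone)
```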
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):


 class Miscellaneous(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(ALLOWABLES)


 class EnglishIgnorables(MemberFilter):
-
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)


 class ProperName(Filter):
@@ -109,6 +141,10 @@ class ProperName(Filter):
     # this will errantly match.


+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)

@@ -166,6 +202,10 @@ class Phonotactic(RegexFilter):
     )


+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
     Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -179,6 +219,10 @@ class Syllabic(RegexFilter):
     )


+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)

@@ -187,9 +231,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3


 class Numeric(Filter):
@@ -266,11 +309,9 @@ class OrFilter:
         if not len(filters) >= 2:
             raise ValueError("Provide at least two Filters to OrFilter.")

-
-        if len(
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")

         filter = cls.__generic_filter(*filters)

@@ -279,7 +320,7 @@ class OrFilter:

 class OrMemberFilter:
     @staticmethod
-    def
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)

@@ -291,7 +332,7 @@ class OrMemberFilter:
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.
+        filter = cls.__member_filter(*filters_)
         return filter


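For context on the renamed private helper (again a sketch, not part of the diff): `OrMemberFilter` returns a single `MemberFilter` whose token set is the union of its arguments, so one set lookup covers every combined dictionary. The pairing used in the tests below would behave like this:

```python
# Sketch mirroring test_OrMemberFilter in tests/test_filters.py below.
from sonatoki.Filters import NimiPu, NimiLinkuObscure, OrMemberFilter

combined = OrMemberFilter(NimiPu, NimiLinkuObscure)

print(combined.filter("toki"))  # expected: True, "toki" is in the pu dictionary
print(combined.filter("asdf"))  # expected: False, in neither token set
```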
@@ -323,6 +364,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py

@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path

 # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK

 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
-    "cw", # Content Warning
     "x", # ala
     "y", # anu
     "kxk", # ken ala ken
     "wxw", # wile ala wile
 }

-
-    #
-    "
-    "
-    "
-    "i",
-    "in",
-    "is",
-    "l", # they'll
-    "m", # i'm
-    "me",
-    "no",
-    "s", # let's
-    "so",
-    "t", # don't
-    "to",
-    "u", # you
-    "we",
-    "un", # un-
-    "use",
+PHONOMATCHES = {
+    # "a", # ignore
+    # "an", # against
+    # "i", # against
+    # "in", # against
     "some",
-    "like",
+    "like", # against
+    # "me", # against
+    # "no", # against
+    # "on", # against
+    # "se", # against
+    # "so", # against
+    # "some", # against
+    "to", # ignore
+    # "u", # against
+    # "un", # against
+    "use", # against
+    # "we", # against
 }

+ALPHABETIC_MATCHES = PHONOMATCHES | {
+    "a",
+    # "am",
+    # "as",
+    # "at",
+    # "aw", # aww
+    # "ek", # eek
+    # "ew",
+    # "ik",
+    # "il", # ill
+    # "im",
+    # "im",
+    # "ip",
+    # "is",
+    # "it",
+    # "l", # they'll
+    # "m", # i'm
+    # "ok",
+    # "op",
+    # "ow",
+    # "s", # let's
+    # "t", # don't
+    # "up",
+    # "us",
+    # "ut",
+    # "uw",
+}
+
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977", # pu
     "\\U000F1978-\\U000F1988", # ku suli
@@ -426,14 +451,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) ->
-    return
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}


 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU
-    NIMI_PU_SYNONYMS
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

     NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
     NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
@@ -445,7 +470,7 @@ with open(LINKU) as f:

 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

 del linku
 del sandbox
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py

@@ -18,11 +18,13 @@ from sonatoki.Filters import (
     Phonotactic,
     Punctuation,
     AlphabeticRe,
+    LongSyllabic,
     NimiLinkuCore,
     PunctuationRe,
-
+    LongAlphabetic,
     OrMemberFilter,
     PunctuationRe1,
+    LongPhonotactic,
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
@@ -34,7 +36,6 @@ from sonatoki.constants import (
     NIMI_KU_LILI,
     NIMI_KU_SULI,
     NIMI_LINKU_CORE,
-    NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
@@ -45,7 +46,7 @@ from sonatoki.constants import (
 from .test_utils import PROPER_NAME_RE


-@given(st.sampled_from(NIMI_PU))
+@given(st.sampled_from(list(NIMI_PU)))
 @example("lukin")
 @example("selo")
 @example("li")
@@ -54,14 +55,14 @@ def test_NimiPu(s: str):
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_CORE))
+@given(st.sampled_from(list(NIMI_LINKU_CORE)))
 @example("pona")
 def test_NimiLinkuCore(s: str):
     res = NimiLinkuCore.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_COMMON))
+@given(st.sampled_from(list(NIMI_LINKU_COMMON)))
 @example("n")
 @example("tonsi")
 @example("kipisi")
@@ -70,19 +71,19 @@ def test_NimiLinkuCommon(s: str):
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_UNCOMMON))
+@given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
 def test_NimiLinkuUncommon(s: str):
     res = NimiLinkuUncommon.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_OBSCURE))
+@given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
 def test_NimiLinkuObscure(s: str):
     res = NimiLinkuObscure.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_SANDBOX))
+@given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
 @example("kalamARR")
 @example("Pingo")
 def test_NimiLinkuSandbox(s: str):
@@ -101,6 +102,13 @@ def test_Phonotactic(s: str):
     assert res, repr(s)


+@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+def test_LongPhonotactic(s: str):
+    len_ok = len(s) >= LongPhonotactic.length
+    res = LongPhonotactic.filter(s)
+    assert res == len_ok, repr(s)  # will match given fullmatch
+
+
 @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
 @example("wuwojitiwunwonjintinmanna")
 def test_Syllabic(s: str):
@@ -108,6 +116,13 @@ def test_Syllabic(s: str):
     assert res, repr(s)


+@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+def test_LongSyllabic(s: str):
+    len_ok = len(s) >= LongSyllabic.length
+    res = LongSyllabic.filter(s)
+    assert res == len_ok
+
+
 @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
 @example("muems")
 @example("mpptp")
@@ -118,6 +133,13 @@ def test_Alphabetic(s: str):
     assert res_fn == res_re, repr(s)


+@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+def test_LongAlphabetic(s: str):
+    len_ok = len(s) >= LongAlphabetic.length
+    res = LongAlphabetic.filter(s)
+    assert res == len_ok
+
+
 @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
 def test_AlphabeticRe(s: str):
     res_re = AlphabeticRe.filter(s)
@@ -181,7 +203,7 @@ def test_OrFilter(s: str):
 # NOTE: No subset filter test because A | B is not the same as A combined with B.
 # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
 # but would incorrectly pass a combined filter.
-@given(st.sampled_from(NIMI_PU
+@given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
 def test_OrMemberFilter(s: str):
     filter = OrMemberFilter(NimiPu, NimiLinkuObscure)
     res = filter.filter(s)
@@ -192,11 +214,13 @@ def test_OrMemberFilter(s: str):

 @given(
     st.sampled_from(
-
-
-
-
-
+        list(
+            NIMI_KU_SULI
+            | NIMI_KU_LILI
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        ),
     )
 )
 def test_OrMemberFilter_IsipinEpiku(s: str):
@@ -216,48 +240,3 @@ def test_OrMemberFilter_IsipinEpiku(s: str):
     assert res and (
         res_ku_suli or res_ku_lili or res_uncommon or res_obscure or res_sandbox
     )
-
-
-@given(st.sampled_from(NIMI_PU + NIMI_PU_SYNONYMS))
-def test_pu_filters_non_overlap(s: str):
-    res_pu = NimiPu.filter(s)
-    res_synonyms = NimiPuSynonyms.filter(s)
-    assert (res_pu + res_synonyms) == 1
-
-
-@given(st.sampled_from(NIMI_KU_SULI + NIMI_KU_LILI))
-def test_ku_filters_non_overlap(s: str):
-    res_ku_suli = NimiKuSuli.filter(s)
-    res_ku_lili = NimiKuLili.filter(s)
-    assert (res_ku_suli + res_ku_lili) == 1
-
-
-@given(
-    st.sampled_from(
-        NIMI_LINKU_CORE
-        + NIMI_LINKU_COMMON
-        + NIMI_LINKU_UNCOMMON
-        + NIMI_LINKU_OBSCURE
-        + NIMI_LINKU_SANDBOX
-    )
-)
-def test_linku_filters_non_overlap(s: str):
-    s = Lowercase.clean(s)
-    s = ConsecutiveDuplicates.clean(s)
-
-    res_core = NimiLinkuCore.filter(s)
-    res_common = NimiLinkuCommon.filter(s)
-    res_uncommon = NimiLinkuUncommon.filter(s)
-    res_obscure = NimiLinkuObscure.filter(s)
-    res_sandbox = NimiLinkuSandbox.filter(s)
-
-    assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
-
-
-@given(st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON + NIMI_LINKU_UNCOMMON))
-def test_nimi_linku_properties(s: str):
-    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
-    assert Alphabetic.filter(s), repr(s)
-    assert Syllabic.filter(s), repr(s)
-    assert Phonotactic.filter(s), repr(s)
-    # Passing phonotactic implies all of the above
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py

@@ -59,6 +59,8 @@ SYLLABIC_MATCHES = [
     "mi sona ala e nimi sunopatikuna",
     "kalama wuwojiti li pana e sona",
     "jan Awaja en jan Alasali en jan Akesinu li pona",  # syllables match before names here
+    "jan Ke Tami",
+    "kulupu Kuko",
 ]

 ALPHABETIC_MATCHES = [
@@ -85,13 +87,20 @@ SOME_INVALID = [
     "mi tawa ma ohio",
     "sina toki e nimi what pi toki Inli",
     "wawa la o lukin e ni: your mom",
+    "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
 ]

 CORPUS_SPECIFIC = [
-    "ki le konsi si te isipin epiku le pasila to",
+    # "ki le konsi si te isipin epiku le pasila to",
+    "ki konsi te isipin epiku pasila to",  # the sandbox has not documented si or le
     'jasima omekapo, ki nimisin "jasima enko nimisin". ki enko alu linluwi Jutu alu epiku ki epiku baba is you. ki likujo "SINtelen pona", ki epiku alu "sitelen pona". ki kepen wawajete isipin, kin ki yupekosi alu lipamanka alu wawajete, kin ki enko isipin lipamanka linluwi alu wawajete',
     "kalamARRRR",
     "Pingo",
+    "we Luke",
+]
+CORPUS_SPECIFIC_XFAIL = [
+    "How to Cut a Kiwi",
+    "a e i o u",
 ]


@@ -103,6 +112,7 @@ EXCESSIVE_SYLLABICS = [
     "I manipulate a passe pile so a ton emulate, akin to intake",
     "a ton of insolate puke. make no amen, no joke.",
     "I elope so, to an elite untaken tune, some unwise tone",
+    "insane asinine lemon awesome atone joke",
 ]

 EXCESSIVE_ALPHABETICS = [
@@ -122,11 +132,13 @@ EXCESSIVE_NAMES = [
     "I Want To Evade The Filter",
     "If You Do This The Bot Can't See You",
     "This Is A Statement In Perfect Toki Pona, I Guarantee",
-    "How to Cut a Kiwi",  # previous false positive; fixed by english ignorables
 ]

 EXCESSIVE_ENGLISH = [
     "me when i tawa sike",  # previous false positive; fixed by english ignorables
+    "Maybe I’m too nasa",  # previous false positive; fixed by LongSyllabic and LongAlphabetic
+    "I see :)",
+    "I wanna see",  # same down to here
 ]

 NON_MATCHES = [
@@ -134,6 +146,7 @@ NON_MATCHES = [
     "super bruh moment 64",
     "homestuck",
     "homestuck Homestuck",
+    "what if i went to the store ",
 ]

 KNOWN_GOOD = (
@@ -150,22 +163,23 @@ KNOWN_BAD = (
     + EXCESSIVE_ALPHABETICS
     + EXCESSIVE_NAMES
     + EXCESSIVE_TYPOES
+    + EXCESSIVE_ENGLISH
     + NON_MATCHES
 )

 FALSE_NEGATIVES = [
     # emoticon should not be a problem
-    "lete li ike x.x",
     # a token that is one edit off a known word should be allowed
     "mi pnoa",
     "tok",
     "mut",
     "poan",
     "mtue",
+    "mi nasa B^)",  # emoticon
 ]

 FALSE_POSITIVES = [
-    "
+    "insane asinine lemon awesome atone",
 ]


@@ -174,16 +188,16 @@ def test_known_good_pref(ilo: Ilo, text: str):
     assert ilo.is_toki_pona(text), text


+@pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
+def test_known_bad_pref(ilo: Ilo, text: str):
+    assert not ilo.is_toki_pona(text), text
+
+
 @pytest.mark.parametrize("text", KNOWN_GOOD + CORPUS_SPECIFIC)
 def test_known_good_corpus(corpus_ilo: Ilo, text: str):
     assert corpus_ilo.is_toki_pona(text), text


-@pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
-def test_known_bad(ilo: Ilo, text: str):
-    assert not ilo.is_toki_pona(text), text
-
-
 @pytest.mark.parametrize("text", KNOWN_BAD)
 def test_known_bad_corpus(corpus_ilo: Ilo, text: str):
     assert not corpus_ilo.is_toki_pona(text), text
@@ -209,11 +223,17 @@ def test_weakness_of_lazy(lazy_ilo: Ilo, text: str):

 @pytest.mark.xfail
 @pytest.mark.parametrize("text", FALSE_POSITIVES)
-def
+def test_false_positives_pref(ilo: Ilo, text: str):
     assert not ilo.is_toki_pona(text)


 @pytest.mark.xfail
 @pytest.mark.parametrize("text", FALSE_NEGATIVES)
-def
+def test_false_negatives_pref(ilo: Ilo, text: str):
     assert ilo.is_toki_pona(text)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
+def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
+    assert not corpus_ilo.is_toki_pona(text)
sonatoki-0.3.2/tests/test_properties.py (new file)

@@ -0,0 +1,78 @@
+# PDM
+import hypothesis.strategies as st
+from hypothesis import given
+
+# LOCAL
+from sonatoki.Filters import (
+    NimiPu,
+    Syllabic,
+    Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
+    Phonotactic,
+    NimiLinkuCore,
+    NimiPuSynonyms,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuSandbox,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
+from sonatoki.constants import (
+    NIMI_PU,
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
+    NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
+    NIMI_LINKU_SANDBOX,
+    NIMI_LINKU_UNCOMMON,
+)
+
+
+@given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+def test_pu_filters_non_overlap(s: str):
+    res_pu = NimiPu.filter(s)
+    res_synonyms = NimiPuSynonyms.filter(s)
+    assert (res_pu + res_synonyms) == 1
+
+
+@given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+def test_ku_filters_non_overlap(s: str):
+    res_ku_suli = NimiKuSuli.filter(s)
+    res_ku_lili = NimiKuLili.filter(s)
+    assert (res_ku_suli + res_ku_lili) == 1
+
+
+@given(
+    st.sampled_from(
+        list(
+            NIMI_LINKU_CORE
+            | NIMI_LINKU_COMMON
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        )
+    )
+)
+def test_linku_filters_non_overlap(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
+
+    res_core = NimiLinkuCore.filter(s)
+    res_common = NimiLinkuCommon.filter(s)
+    res_uncommon = NimiLinkuUncommon.filter(s)
+    res_obscure = NimiLinkuObscure.filter(s)
+    res_sandbox = NimiLinkuSandbox.filter(s)
+
+    assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
+
+
+@given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+def test_nimi_linku_properties(s: str):
+    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
+    assert Alphabetic.filter(s), repr(s)
+    assert Syllabic.filter(s), repr(s)
+    assert Phonotactic.filter(s), repr(s)
+    # Passing phonotactic implies all of the above
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py

@@ -11,7 +11,7 @@ from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
 PROPER_NAME_RE = r"[A-Z][a-z]*"

 token_strategy = (
-    st.sampled_from(NIMI_LINKU_CORE
+    st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
     | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
     | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
     | st.from_regex(PROPER_NAME_RE, fullmatch=True)