sonatoki 0.6.1.tar.gz → 0.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.6.1 → sonatoki-0.6.3}/PKG-INFO +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/pyproject.toml +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Configs.py +31 -39
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Filters.py +16 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/constants.py +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_filters.py +33 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/LICENSE +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/README.md +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/__init__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_cleaners.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_ilo.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_properties.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_scorers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_tokenize.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_utils.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
--- sonatoki-0.6.1/src/sonatoki/Configs.py
+++ sonatoki-0.6.3/src/sonatoki/Configs.py
@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import Set, List, Type, TypedDict, cast
+from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
@@ -12,13 +12,11 @@ from sonatoki.Filters import (
     Not,
     Filter,
     Numeric,
-    Syllabic,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -44,6 +42,34 @@ from sonatoki.Preprocessors import (
     AngleBracketObject,
 )
 
+__DICT_PHONOMATCHES = {
+    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person singular, english
+    "ne",  # "no" in several languages
+    "nu",  # "new" in english, "now" in dutch
+    "se",  # spanish particle, english "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "je",  # 1st person pronoun, french
+    "u",  # no u
+    "we",  # 1st person plural, english
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
+}
+
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
@@ -92,8 +118,8 @@ CorpusConfig: IloConfig = {
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
-            NimiLinkuObscure,
-            NimiLinkuSandbox,
+            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
+            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -104,40 +130,6 @@ CorpusConfig: IloConfig = {
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
-
-# TODO: create a mechanism to omit tokens from a filter with more granularity
-__corpus_tokens_dict: Set[str] = cast(
-    Set[str],
-    CorpusConfig["scoring_filters"][
-        0
-    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
-)
-__corpus_tokens_dict -= {
-    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
-    # In this case, all of these appear more often in English by a factor of at least 10.
-    "aka",  # also known as
-    "an",  # article
-    "api",  # API
-    "i",  # 1st person
-    "kana",  # japanese script
-    "me",  # 1st person
-    "ne",  # "no" in several languages
-    "nu",  # "new", now in dutch
-    "se",  # spanish particle, "see"
-    "take",  # acquire, perhaps forcefully or without permission
-    "ten",  # 10
-    "to",  # to, too
-    "u",  # no u
-    "we",  # 1st person plural
-    "wi",  # wii and discussions of syllables
-    "sole",  # singular, of shoe
-    # unexplored candidates for removal
-    # "omen",  # ominous
-    # "papa",  # father
-    # "lo",  # "lo" and "loo"
-    # "ewe",  # sheep
-    # "pa",  # father- eh?
-}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
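The net effect on CorpusConfig is that the obscure and sandbox filters are now constructed with the English-lookalike words subtracted. A minimal sketch of that behavior, using only names visible in the diff (whether each listed word actually appears in the obscure or sandbox word lists is not re-checked here):

```python
from sonatoki.Filters import NimiLinkuObscure, NimiLinkuSandbox

# A few entries from __DICT_PHONOMATCHES above; sub= guarantees the constructed
# filter rejects them, whatever the underlying word lists contain.
PHONOMATCHES = {"aka", "api", "take", "sole"}

ObscureFiltered = NimiLinkuObscure(sub=PHONOMATCHES)
SandboxFiltered = NimiLinkuSandbox(sub=PHONOMATCHES)

for word in PHONOMATCHES:
    assert not ObscureFiltered.filter(word)
    assert not SandboxFiltered.filter(word)
```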

--- sonatoki-0.6.1/src/sonatoki/Filters.py
+++ sonatoki-0.6.3/src/sonatoki/Filters.py
@@ -1,7 +1,8 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set, List, Type
+from copy import deepcopy
+from typing import Set, List, Type, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
@@ -101,6 +102,20 @@ class MemberFilter(Filter):
     def filter(cls, token: str) -> bool:
         return token.lower() in cls.tokens
 
+    def __new__(
+        cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+    ) -> Type[Filter]:
+        parent_tokens = deepcopy(cls.tokens)
+        if add:
+            parent_tokens = parent_tokens.union(add)
+        if sub:
+            parent_tokens -= sub
+
+        class AnonMemberFilter(MemberFilter):
+            tokens = parent_tokens
+
+        return AnonMemberFilter
+
 
 class SubsetFilter(Filter):
     tokens: Set[str]
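This `__new__` override is what makes the `sub=` calls in Configs.py work: "calling" a `MemberFilter` subclass returns a new anonymous subclass whose token set is a deep copy of the parent's, optionally extended with `add` and reduced with `sub`. A rough usage sketch, assuming the word lists are lowercase as elsewhere in the package:

```python
from sonatoki.Filters import MemberFilter, NimiPu, NimiKuSuli

# NimiPu(add=...) does not mutate NimiPu; it returns a fresh MemberFilter
# subclass with its own copied-and-modified token set.
PuEnKuSuli = NimiPu(add=NimiKuSuli.tokens)

assert issubclass(PuEnKuSuli, MemberFilter)
assert PuEnKuSuli.filter(next(iter(NimiKuSuli.tokens)))  # added words now pass
assert NimiPu.tokens == NimiPu(sub=set()).tokens  # a plain copy equals the original set
```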

--- sonatoki-0.6.1/src/sonatoki/constants.py
+++ sonatoki-0.6.3/src/sonatoki/constants.py
@@ -501,7 +501,7 @@ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 SENTENCE_PUNCT = """.?!:;()[-]·•…"""
 # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
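The one-character change adds the typographic apostrophe (U+2019) to the characters allowed inside a word. The snippet below is not sonatoki's tokenizer, only a hypothetical regex sketch of why that matters for text written with curly quotes:

```python
import re

INTRA_WORD_PUNCT = """-'’"""  # value from the diff above

# Hypothetical word pattern: runs of letters optionally joined by intra-word punctuation.
WORD = re.compile(rf"[^\W\d_]+(?:[{re.escape(INTRA_WORD_PUNCT)}][^\W\d_]+)*")

print(WORD.findall("o’clock isn't split"))  # ['o’clock', "isn't", 'split']
```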

--- sonatoki-0.6.1/tests/test_filters.py
+++ sonatoki-0.6.3/tests/test_filters.py
@@ -280,3 +280,36 @@ def test_AndNotFilter(s: str):
     if res_fp:
         # syl matched- but if fp matches, then the composed filter should not match
         assert not res_composed
+
+
+@given(st.sampled_from(list(NIMI_PU | NIMI_KU_SULI)))
+def test_AddTokensToMemberFilter(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+    assert PuEnKuSuliFilter.filter(s)
+
+
+@given(st.sampled_from(list(NIMI_LINKU_SANDBOX | NIMI_KU_LILI)))
+def test_AddTokensToMemberFilterNegative(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+    assert not PuEnKuSuliFilter.filter(s)
+
+
+@given(
+    st.sampled_from(
+        list(
+            NIMI_PU
+            | NIMI_KU_SULI
+            | NIMI_KU_LILI
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        ),
+    )
+    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+)
+def test_SubTokensFromMemberFilter(s: str):
+    NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
+    # core is a strict subset of pu
+    # if kin becomes core, needs to be corrected
+
+    assert not NimiAlaFilter.filter(s)