sonatoki 0.2.2__tar.gz → 0.3.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {sonatoki-0.2.2 → sonatoki-0.3.0}/PKG-INFO +1 -1
- {sonatoki-0.2.2 → sonatoki-0.3.0}/pyproject.toml +1 -1
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Cleaners.py +7 -0
- sonatoki-0.3.0/src/sonatoki/Configs.py +129 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Filters.py +86 -6
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Tokenizers.py +33 -17
- sonatoki-0.3.0/src/sonatoki/constants.py +462 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/utils.py +26 -1
- sonatoki-0.3.0/tests/__init__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_filters.py +1 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_ilo.py +92 -35
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_tokenize.py +28 -27
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/tokenize_cases/tokenize_words_tok.yml +44 -0
- sonatoki-0.2.2/src/sonatoki/Configs.py +0 -80
- sonatoki-0.2.2/src/sonatoki/constants.py +0 -72
- {sonatoki-0.2.2 → sonatoki-0.3.0}/LICENSE +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/README.md +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/linku.json +0 -0
- /sonatoki-0.2.2/tests/__init__.py → /sonatoki-0.3.0/src/sonatoki/py.typed +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_scorers.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_utils.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
sonatoki-0.3.0/src/sonatoki/Configs.py (new file)
@@ -0,0 +1,129 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, Union, TypedDict
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numeric,
+    OrFilter,
+    Syllabic,
+    NimiLinku,
+    NimiPuAle,
+    NimiUCSUR,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    Punctuation,
+    NimiLinkuAle,
+    NimiLinkuSandbox,
+    EnglishIgnorables,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+from sonatoki.Preprocessors import (
+    URLs,
+    Reference,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+    AngleBracketObject,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+# TODO: branching configs?
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+PrefConfig: IloConfig = {
+    "preprocessors": [URLs, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinku, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+CorpusConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinkuSandbox, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+LazyConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scorer": SoftPassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+DiscordConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinku, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "BaseConfig",
+    "CorpusConfig",
+    "DiscordConfig",
+    "ForumConfig",
+    "IloConfig",
+    "LazyConfig",
+    "PrefConfig",
+    "TelegramConfig",
+]
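For context, each of these configs is meant to be unpacked into the `Ilo` constructor (the `IloConfig` keys mirror its keyword arguments). A minimal usage sketch, assuming `Ilo(**config)` and the `is_toki_pona` method as shown in the package README; actual results depend on the bundled word lists:

    # Sketch: consuming one of the configs added above.
    from copy import deepcopy

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("mi kama sona e toki pona")     # expected: True
    ilo.is_toki_pona("this is an english sentence")  # expected: False

    # TelegramConfig and ForumConfig are deepcopies of PrefConfig, so a custom
    # variant can be derived the same way without mutating the shared config:
    MyConfig = deepcopy(PrefConfig)
    MyConfig["passing_score"] = 0.7  # hypothetical threshold tweak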
{sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Filters.py
@@ -1,7 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set
+from typing import Set, List, Type
 from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
@@ -13,15 +13,17 @@ from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
     ALPHABET,
+    ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
+    IGNORABLES,
     NIMI_LINKU,
-
-    UNICODE_PUNCT,
+    NIMI_UCSUR,
     NIMI_LINKU_LILI,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_SANDBOX,
+    UCSUR_PUNCT_RANGES,
 )

 regex.DEFAULT_VERSION = regex.VERSION1
@@ -79,6 +81,10 @@ class Miscellaneous(MemberFilter):
     tokens = set(ALLOWABLES)


+class EnglishIgnorables(MemberFilter):
+    tokens = set(IGNORABLES)
+
+
 class ProperName(Filter):
     """Determines if a given token is a valid name (also called a loan word).
     When Toki Pona is written with the Latin alphabet, names are generally
@@ -118,6 +124,10 @@ class NimiLinkuSandbox(MemberFilter):
     tokens = set(NIMI_LINKU + NIMI_LINKU_LILI + NIMI_LINKU_SANDBOX)


+class NimiUCSUR(MemberFilter):
+    tokens = set(NIMI_UCSUR)
+
+
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
     Excludes both consecutive nasals and the illegal syllables:
@@ -156,6 +166,11 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


+class TwoOrMoreAlphabetic(Filter):
+    # TODO: alphabetic implementation that ignores single characters
+    pass
+
+
 class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.
@@ -175,12 +190,13 @@ class Numeric(Filter):
 class Punctuation(SubsetFilter):
     """Identify whether a token is entirely punctuation. Fastest implementation."""

-    tokens = set(
+    tokens = set(ALL_PUNCT)


 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
-    Goes out of date compared to the `regex` library if
+    Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
+    """

     pattern = re.compile(rf"[{ALL_PUNCT_RANGES}]+")

@@ -188,17 +204,81 @@ class PunctuationRe(RegexFilter):
 class PunctuationRe1(Regex1Filter):
     """Reference implementation for identifying tokens made entirely of punctuation."""

-    pattern = regex.compile(
+    pattern = regex.compile(
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
+    )
+
+
+class OrFilter:
+    """Instantiate with more than one filter to compose them into one filter,
+    returning True when any individual filter matches or False otherwise.
+    Requires at least two filters.
+
+    OrFilter exists as a compromise between the need to score some filters equally,
+    while not adding custom behavior to scorers.
+    I could have allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested handling of lists.
+    It also would not have been as powerful- I would need another param for the and/or switch,
+    or to not give users the choice.
+
+    Instead, the user is responsible for building an OrFilter out of their desired filters.
+    """
+
+    def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Must provide at least two Filters to OrFilter.")
+
+        class AnonymousOrFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if f.filter(token):
+                        return True
+                return False
+
+        return AnonymousOrFilter
+
+
+class AndFilter(Filter):
+    """Instantiate with more than one filter to compose them into one filter,
+    returning False when any individual filter fails to match or True otherwise.
+    Requires at least two filters."""
+
+    def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Must provide at least two Filters to AndFilter.")
+
+        class AnonymousAndFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if not f.filter(token):
+                        return False
+                return True
+
+        return AnonymousAndFilter


 __all__ = [
     "Alphabetic",
+    "AndFilter",
+    "EnglishIgnorables",
     "NimiLinku",
     "NimiLinkuAle",
     "NimiLinkuSandbox",
     "NimiPu",
     "NimiPuAle",
+    "NimiUCSUR",
     "Numeric",
+    "OrFilter",
     "Phonotactic",
     "ProperName",
     "Punctuation",
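The new OrFilter and AndFilter are factories: called with two or more Filter subclasses, they return a new anonymous Filter whose cached `filter()` classmethod ORs (respectively ANDs) the members. This is exactly how the configs above build `OrFilter(NimiLinku, NimiUCSUR)`. A brief sketch based on the definitions in this diff; the True/False outcomes depend on the bundled word lists:

    # Sketch: composing filters. OrFilter/AndFilter return Filter subclasses (types),
    # not instances, so filter() is called directly on the result.
    from sonatoki.Filters import OrFilter, AndFilter, Syllabic, NimiLinku, NimiUCSUR

    NimiLinkuOrUCSUR = OrFilter(NimiLinku, NimiUCSUR)
    NimiLinkuOrUCSUR.filter("toki")  # True if either member filter accepts the token

    SyllabicNimiLinku = AndFilter(Syllabic, NimiLinku)
    SyllabicNimiLinku.filter("toki")  # True only if every member filter accepts it

    # Passing fewer than two filters raises:
    # OrFilter(NimiLinku)  ->  ValueError: Must provide at least two Filters to OrFilter.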
{sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Tokenizers.py
@@ -5,16 +5,12 @@ from typing import Set, List

 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated

 # LOCAL
 from sonatoki.utils import regex_escape
-from sonatoki.
-
-    UNICODE_PUNCT,
-    SENTENCE_PUNCT,
-    ALL_PUNCT_RANGES,
-)
+from sonatoki.Filters import NimiUCSUR  # seriously this sucks
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES

 regex.DEFAULT_VERSION = regex.VERSION1

@@ -50,7 +46,12 @@ class Regex1Tokenizer(Tokenizer):


 class WordTokenizer(SetTokenizer):
-    delimiters = set(
+    delimiters = set(ALL_PUNCT)
+
+    @classmethod
+    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
+        match = s[last_match:i].split()
+        [tokens.append(t) for t in match if t]

     @classmethod
     @override
@@ -60,32 +61,47 @@ class WordTokenizer(SetTokenizer):

         tokens: List[str] = []

+        i = 0  # ensure i is bound
         last_match = 0
         last_membership = s[0] in cls.delimiters
         for i, char in enumerate(s):
             mem = char in cls.delimiters
-
+            ucsur = NimiUCSUR.filter(char)  # always "changed" means
+            changed = (mem != last_membership) or ucsur
+            # this keeps contiguous words together, but splits UCSUR
+            if not changed:
+                continue
+
+            if ucsur:
+                if i > last_match:
+                    # Add the token before UCSUR character
+                    cls.__helper(s, tokens, last_match, i)
+                # Add UCSUR character itself as a token
+                tokens.append(char)
+                last_match = i + 1
+                last_membership = mem
                 continue

-
-            # TODO: kinda sucks? what about unicode whitespace?
+            cls.__helper(s, tokens, last_match, i)
             last_match = i
             last_membership = mem
-            [tokens.append(t) for t in match if t]
-
-        match = s[last_match:].strip().split()
-        if match:
-            tokens.extend(match)

+        cls.__helper(s, tokens, last_match, i + 1)
         return tokens


+@deprecated(
+    "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe(RegexTokenizer):
     pattern = re.compile(rf"""([{ALL_PUNCT_RANGES}]+|\s+)""")


+@deprecated(
+    "WordTokenizerRe1 is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe1(Regex1Tokenizer):
-    """Reference implementation for
+    """Reference implementation for WordTokenizer."""

     pattern = regex.compile(r"""([\p{posix_punct}\p{Punctuation}]+|\s+)""")

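The reworked WordTokenizer keeps contiguous non-delimiter runs together, emits punctuation runs as their own tokens, and splits every UCSUR character into its own token via `NimiUCSUR.filter`. A short sketch, assuming the public entry point is the `tokenize` classmethod inherited from the `Tokenizer` base class; the exact splits depend on the `ALL_PUNCT` delimiter set:

    # Sketch of the new WordTokenizer behavior (tokenize() assumed from the base class).
    from sonatoki.Tokenizers import WordTokenizer

    WordTokenizer.tokenize("toki, pona!")
    # e.g. ["toki", ",", "pona", "!"] - words and punctuation runs become separate
    # tokens, and the whitespace between them is dropped by __helper's split().

    # A UCSUR toki pona glyph (a private-use codepoint matched by NimiUCSUR) is
    # emitted as its own single-character token even with no surrounding delimiters.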