sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Cleaners.py +4 -1
- sonatoki/Configs.py +52 -31
- sonatoki/Filters.py +96 -33
- sonatoki/Preprocessors.py +12 -6
- sonatoki/Scorers.py +54 -51
- sonatoki/constants.py +21 -29
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/utils.py +23 -5
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/METADATA +1 -1
- sonatoki-0.3.3.dist-info/RECORD +18 -0
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/WHEEL +1 -1
- sonatoki-0.3.1.dist-info/RECORD +0 -18
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Cleaners.py
CHANGED
@@ -10,6 +10,7 @@ class Cleaner(ABC):
     @classmethod
     @abstractmethod
     def clean(cls, token: str) -> str:
+        """Transform a token to remove some undesirable part."""
        raise NotImplementedError
 
 
@@ -33,7 +34,8 @@ class ConsecutiveDuplicates(Cleaner):
     may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".
 
     This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
-    incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""
+    incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+    """
 
     @classmethod
     @override
@@ -69,4 +71,5 @@ class Lowercase(Cleaner):
 
 __all__ = [
     "ConsecutiveDuplicates",
+    "Lowercase",
 ]
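The new `clean` docstring pins down the cleaner contract: a classmethod from token to token. A minimal sketch of the two exported cleaners in use, assuming the behavior their docstrings describe (the sample tokens are illustrative):

```python
from sonatoki.Cleaners import ConsecutiveDuplicates, Lowercase

# Emphasis-lengthened letters collapse, per the ConsecutiveDuplicates docstring
assert ConsecutiveDuplicates.clean("sonaaaa") == "sona"
# Lowercase is trivial, and is now exported via __all__
assert Lowercase.clean("Toki") == "toki"
```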
sonatoki/Configs.py
CHANGED
@@ -5,22 +5,23 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
-    EnglishIgnorables,
     NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
@@ -28,12 +29,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )
 
@@ -48,7 +46,7 @@ class IloConfig(TypedDict):
     passing_score: Number
 
 
-# TODO: branching configs?
+# TODO: branching configs? config builder?
 
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
@@ -62,14 +60,14 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +75,9 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,36 +86,58 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
-
-
+"""Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
+"""This is extremely silly."""
+IsipinEpikuConfig: IloConfig = {
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [
+        OrMemberFilter(
+            NimiKuSuli,
+            NimiKuLili,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -127,6 +147,7 @@ DiscordConfig: IloConfig = {
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
 ForumConfig: IloConfig = deepcopy(PrefConfig)
 
+
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
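Every config above is a plain `IloConfig` dict, so swapping behavior is a matter of splatting a different dict into the `Ilo` constructor. A minimal sketch, assuming the `Ilo(**config)` pattern shown in the package README (the sample sentence is illustrative):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, CorpusConfig

ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi lukin e ni: sina pona"))

# The corpus-oriented config accepts far more vocabulary
corpus_ilo = Ilo(**CorpusConfig)
```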
sonatoki/Filters.py
CHANGED
@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError
 
 
+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"
 
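Because `MinLen.__new__` returns a class rather than an instance, `MinLen` doubles as a factory for length-gated variants of any filter; the `Long*` filters added below are exactly this pattern with a fixed `length`. A sketch of both spellings from the docstring (`Alphabetic3` and `Alphabetic4` are hypothetical names):

```python
from sonatoki.Filters import Alphabetic, MinLen

# Factory form: build a length-gated Alphabetic on the fly
Alphabetic3 = MinLen(Alphabetic, 3)
assert not Alphabetic3.filter("mi")  # shorter than 3: rejected before Alphabetic runs
assert Alphabetic3.filter("sona")    # long enough, so Alphabetic decides

# Inheritance form, MinLen first so super().filter() resolves to Alphabetic
class Alphabetic4(MinLen, Alphabetic):
    length = 4
```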
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):
 
 
 class Miscellaneous(MemberFilter):
-    tokens = set(ALLOWABLES)
+    tokens = prep_dictionary(ALLOWABLES)
 
 
 class EnglishIgnorables(MemberFilter):
-    tokens = set(IGNORABLES)
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)
 
 
 class ProperName(Filter):
@@ -95,9 +127,11 @@ class ProperName(Filter):
     When Toki Pona is written with the Latin alphabet, names are generally
     capitalized at their start. This filter identifies those tokens.
 
-    Note that this alone cannot determine if a token is a valid name,
-    a standalone name is considered invalid in Toki Pona- names
-    This tool only examines one token at a time.
+    Note that this alone cannot determine if a token is a valid name,
+    because a standalone name is considered invalid in Toki Pona- names
+    generally have head nouns. This tool only examines one token at a
+    time, so cannot detect names any better than identifying their
+    capital letter.
     """
 
     @classmethod
@@ -109,6 +143,10 @@ class ProperName(Filter):
     # this will errantly match.
 
 
+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)
 
@@ -151,12 +189,14 @@ class NimiUCSUR(MemberFilter):
 
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+
     Excludes both consecutive nasals and the illegal syllables:
     - "nm", "nn"
     - "wu", "wo", "ji", "ti"
 
     Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
-    "nn" cannot be found."""
+    "nn" cannot be found.
+    """
 
     pattern = re.compile(
         rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
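The gap between the two regex filters shows up with a token that is syllabically well-formed but phonotactically illegal. A sketch, assuming the regexes match their docstrings; `wuwojiti` is an invented counterexample:

```python
from sonatoki.Filters import Phonotactic, Syllabic

assert Phonotactic.filter("kijetesantakalu")  # every syllable is legal
assert not Phonotactic.filter("wuwojiti")     # "wu", "wo", "ji", "ti" are excluded
assert Syllabic.filter("wuwojiti")            # shape is fine; exceptions not checked
```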
@@ -166,10 +206,16 @@ class Phonotactic(RegexFilter):
     )
 
 
+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
-
-
+
+    Words must have correctly ordered vowels and consonants, but the
+    phonotactic exceptions are not considered.
+    """
 
     # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
     # Alterative I was exploring takes ~15% more steps
@@ -179,6 +225,10 @@ class Syllabic(RegexFilter):
     )
 
 
+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)
 
@@ -187,19 +237,19 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3
 
 
 class Numeric(Filter):
-    """Determine if a given token is entirely numeric.
-
+    """Determine if a given token is entirely numeric. Covers all numeric
+    symbols in Unicode.
 
     This will fail to find numeric tokens such as "1.111" or "-42",
     but if used with the aggressive tokenizer designed for `tok`, these will be
     split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
-    numeric tokens will be split from their punctuation."""
+    numeric tokens will be split from their punctuation.
+    """
 
     @classmethod
     @override
@@ -209,13 +259,17 @@ class Numeric(Filter):
 
 
 class Punctuation(SubsetFilter):
-    """Identify whether a token is entirely punctuation. Fastest implementation."""
+    """Identify whether a token is entirely punctuation.
+
+    Fastest implementation.
+    """
 
     tokens = set(ALL_PUNCT)
 
 
 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
+
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
@@ -223,7 +277,8 @@ class PunctuationRe(RegexFilter):
 
 
 class PunctuationRe1(Regex1Filter):
-    """Reference implementation for identifying tokens made entirely of punctuation."""
+    """Reference implementation for identifying tokens made entirely of
+    punctuation."""
 
     pattern = regex.compile(
         rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
@@ -235,14 +290,16 @@ class OrFilter:
     returning True when any individual filter matches or False otherwise.
     Requires at least two filters.
 
-    OrFilter exists as a compromise between the need to score some
-    while not adding custom behavior to scorers.
-
-    but this would require cleaning the user's input, and
-    It also would not have been as powerful- I
-    or to not give users
+    OrFilter exists as a compromise between the need to score some
+    filters equally, while not adding custom behavior to scorers. I
+    could have allowed a position to have a list of filters instead of
+    one filter, but this would require cleaning the user's input, and
+    nested handling of lists. It also would not have been as powerful- I
+    would need another param for the and/or switch, or to not give users
+    the choice.
 
-    Instead, the user is responsible for building an OrFilter out of
+    Instead, the user is responsible for building an OrFilter out of
+    their desired filters.
     """
 
     @staticmethod
@@ -266,11 +323,9 @@ class OrFilter:
         if not len(filters) >= 2:
             raise ValueError("Provide at least two Filters to OrFilter.")
 
-        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(member_filters) >= 2:
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
 
         filter = cls.__generic_filter(*filters)
 
@@ -279,7 +334,7 @@ class OrFilter:
 
 class OrMemberFilter:
     @staticmethod
-    def
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)
 
@@ -291,14 +346,17 @@ class OrMemberFilter:
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.
+        filter = cls.__member_filter(*filters_)
         return filter
 
 
-class AndFilter
+class AndFilter:
     """Instantiate with more than one filter to compose them into one filter,
-    returning False when any individual filter fails to match or True
-    otherwise. Requires at least two filters."""
+    returning False when any individual filter fails to match or True
+    otherwise.
+
+    Requires at least two filters.
+    """
 
     def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
         if not len(filters_) >= 2:
@@ -323,6 +381,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
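The two combinators differ in mechanism: `OrMemberFilter` unions the member filters' token sets into one set-membership test, while `OrFilter` wraps arbitrary filters and calls each in turn. A sketch under that reading (the variable names are hypothetical):

```python
from sonatoki.Filters import (
    NimiLinkuCommon,
    NimiLinkuCore,
    OrFilter,
    OrMemberFilter,
    ProperName,
    Syllabic,
)

# MemberFilters share a token-set shape, so one merged lookup suffices
CoreOrCommon = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon)

# Arbitrary filters need their predicates tried one by one
SyllabicOrName = OrFilter(Syllabic, ProperName)
```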
sonatoki/Preprocessors.py
CHANGED
@@ -2,7 +2,7 @@
 "Preprocessors" are classes which strip content from a given string prior to tokenization.
 There are currently two distinct types of Preprocessor:
 
-- Remove a token from a string which would be difficult to identify after tokenization.
+- Remove a token from a string which would be difficult to identify after tokenization.
     - URLs
     - DiscordEmotes
 - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
@@ -61,21 +61,24 @@ Ignorables are tokens which do not count toward the accepted number of tokens
 or the total number of tokens.
 This is generally because they are considered external to Toki Pona.
 
-It is likely that every user will want to use these.
+It is likely that every user will want to use these.
 Not having them will cause many false negatives, such as when a URL is divided
 into its parts and checked as a token.
 """
 
 
 class URLs(RegexPreprocessor):
-    """Remove http(s) protocol URLs"""
+    """Remove http(s) protocol URLs."""
 
     pattern = re.compile(r"https?:\/\/\S+")
 
 
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
-    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    Often used to fetch articles on Wikipedia, or Magic the Gathering
+    cards.
+    """
 
     pattern = re.compile(r"\[\[.+\]\]")
 
@@ -100,7 +103,10 @@ class DiscordSpecial(RegexPreprocessor):
 
 class AngleBracketObject(RegexPreprocessor):
     """A generalized version of the Discord-specific angle bracket objects.
-    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    Removes any contiguous (not broken by whitespace) text in angle
+    brackets.
+    """
 
     pattern = re.compile(r"<[^<>\s]+>")
 
@@ -111,7 +117,7 @@ The following classes are Containers.
 Containers are a special case of Ignorables, where an entire segment of an input
 may be removed and not counted toward the accepted or total number of tokens.
 
-Some users may prefer to use these so that they may quote third parties who
+Some users may prefer to use these so that they may quote third parties who
 would likely be using a language other than Toki Pona.
 """
 
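Preprocessors compose by simple chaining: each strips its pattern and hands the string on. A sketch, assuming each preprocessor exposes a `process` classmethod taking and returning a string (the method name and sample text are assumptions, not shown in this diff):

```python
from sonatoki.Preprocessors import URLs, Reference, AngleBracketObject

text = "sina toki e ni [[lipu Wikipesija]] <:wave:12345> https://example.com"
for preprocessor in (AngleBracketObject, Reference, URLs):
    text = preprocessor.process(text)
# roughly "sina toki e ni" remains for the tokenizer
```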
sonatoki/Scorers.py
CHANGED
@@ -13,22 +13,52 @@ Number = Union[int, float]
 Weights = Dict[str, Number]
 
 
-def sigmoid(n: int) -> Number:
-    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-    # n-1 makes sigmoid(1) == 0.5
-    # 0.30 softens scaling in favor of short input
-    # return n / (1+abs(n)) # too weak in 0.7+
-
-
 class Scorer(ABC):
     @classmethod
     @abstractmethod
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        """Score a list of tokens using the given `Filter`s, returning a
+        `Number` between 0 and 1 inclusive."""
         raise NotImplementedError
 
 
+class Soften(Scorer):
+    """Meta `Scorer` which scales the scores of short messages to reduce the
+    impact of shortness on scoring.
+
+    The scores of short messages are scaled by mapping the token count
+    to [0.5, 1.0] via the sigmoid function, then raising the score to
+    the resultant power.
+
+    For example, a single token scoring 0.64 will score 0.8 instead.
+    """
+
+    @staticmethod
+    def sigmoid(n: int) -> Number:
+        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+        # n-1 makes sigmoid(1) == 0.5
+        # 0.30 softens scaling in favor of short input
+        # return n / (1+abs(n)) # too weak in 0.7+
+
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        percentage = super().score(tokens, filters)  # type: ignore [abstractmethod]
+        len_tokens = len(tokens)
+        percentage **= cls.sigmoid(len_tokens)
+        return percentage
+
+    def __new__(cls, scorer: Type[Scorer]) -> Type[Scorer]:
+        class SoftenedScorer(Soften, scorer): ...
+
+        return SoftenedScorer
+
+
 class PassFail(Scorer):
-    """
+    """If a token matches any filter, it scores 1.
+
+    Otherwise, it scores 0.
+    """
 
     @classmethod
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
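The numbers in the `Soften` docstring follow directly from its sigmoid. A standalone check that restates the curve rather than importing it:

```python
import math

def sigmoid(n: int) -> float:
    # same curve as Soften.sigmoid: n - 1 pins sigmoid(1) at exactly 0.5
    return 1 / (1 + math.exp(-(0.30 * (n - 1))))

assert sigmoid(1) == 0.5                      # one token: take the square root
assert abs(0.64 ** sigmoid(1) - 0.8) < 1e-12  # the docstring's 0.64 -> 0.8 example
assert 0.93 < sigmoid(10) < 0.94              # longer messages are barely softened
```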
@@ -50,28 +80,17 @@ class PassFail(Scorer):
         return total_score / len_tokens if len_tokens else 0
 
 
-class SoftPassFail(PassFail):
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
-
-        total_score = 0
-        len_tokens = len(tokens)
-        for token in tokens:
-            total_score += cls.score_token(token, filters)
-
-        percentage = total_score / len_tokens if len_tokens else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class Scaling(Scorer):
+    """Tokens score 1 for matching the first filter, and a linearly reduced
+    amount for matching later filters based on how many filters there are.
 
+    For example, if there are 4 filters, a token scores 1.0, 0.75, 0.50,
+    and 0.25 for matching each respectively.
 
-
-
-
-
-    This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
+    In other words, filter order matters, weighing earlier listed
+    filters higher than later ones. This is desirable to avoid messages
+    which would only match weaker filters, as these are less likely to
+    be Toki Pona.
     """
 
     @classmethod
@@ -95,33 +114,17 @@ class Scaling(Scorer):
         return total_score / max_score if max_score else 0
 
 
-class SoftScaling(Scaling):
-    """
-
-    then raising the score to the resultant power.
-    For example, a single token scoring 0.64 will now score 0.8.
-    """
+class SoftPassFail(Soften, PassFail):
+    """Same as `PassFail`, but shorter messages are subject to less harsh
+    scoring."""
 
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
 
-
-
-
-
-        max_score = len_tokens * len_filters
-        for token in tokens:
-            total_score += cls.score_token(token, filters, len_filters)
-
-        percentage = total_score / max_score if max_score else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class SoftScaling(Soften, Scaling):
+    """Same as `Scaling`, but shorter messages are subject to less harsh
+    scoring."""
 
 
-class Logarithmic(Scorer): ...
+# class Logarithmic(Scorer): ...
 
 
 __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
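The `Scaling` docstring's example numbers come from a straight linear ramp over filter positions. A hypothetical restatement of that per-token weighting (the real method's signature may differ):

```python
def scale(filter_index: int, len_filters: int) -> float:
    # earlier filters (lower index) weigh more, falling off linearly
    return (len_filters - filter_index) / len_filters

# four filters, as in the docstring: 1.0, 0.75, 0.5, 0.25
assert [scale(i, 4) for i in range(4)] == [1.0, 0.75, 0.5, 0.25]
```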
|