sonatoki 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Cleaners.py +4 -1
- sonatoki/Configs.py +34 -16
- sonatoki/Filters.py +38 -21
- sonatoki/Preprocessors.py +12 -6
- sonatoki/Scorers.py +54 -51
- sonatoki/constants.py +12 -45
- sonatoki/ilo.py +55 -11
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/utils.py +23 -5
- {sonatoki-0.3.2.dist-info → sonatoki-0.4.0.dist-info}/METADATA +1 -1
- sonatoki-0.4.0.dist-info/RECORD +18 -0
- {sonatoki-0.3.2.dist-info → sonatoki-0.4.0.dist-info}/WHEEL +1 -1
- sonatoki-0.3.2.dist-info/RECORD +0 -18
- {sonatoki-0.3.2.dist-info → sonatoki-0.4.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Cleaners.py
CHANGED
@@ -10,6 +10,7 @@ class Cleaner(ABC):
     @classmethod
     @abstractmethod
     def clean(cls, token: str) -> str:
+        """Transform a token to remove some undesirable part."""
         raise NotImplementedError


@@ -33,7 +34,8 @@ class ConsecutiveDuplicates(Cleaner):
     may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".

     This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
-    incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+    incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+    """

     @classmethod
     @override
@@ -69,4 +71,5 @@ class Lowercase(Cleaner):

 __all__ = [
     "ConsecutiveDuplicates",
+    "Lowercase",
 ]
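A minimal sanity-check sketch of the cleaners documented above; the printed values are inferred from the docstrings, not taken from this diff:

```python
# Sketch only: outputs are inferred from the ConsecutiveDuplicates and
# Lowercase docstrings above, not shown verbatim in this diff.
from sonatoki.Cleaners import ConsecutiveDuplicates, Lowercase

print(ConsecutiveDuplicates.clean("sonaaaa"))  # expected: "sona" (emphasis letters collapsed)
print(ConsecutiveDuplicates.clean("わわ"))      # expected: "わ" (the moraic-script caveat noted above)
print(Lowercase.clean("Toki"))                 # expected: "toki"
```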
sonatoki/Configs.py
CHANGED
@@ -2,6 +2,9 @@
 from copy import deepcopy
 from typing import List, Type, TypedDict

+# PDM
+from typing_extensions import NotRequired
+
 # LOCAL
 from sonatoki.Filters import (
     Filter,
@@ -9,6 +12,8 @@ from sonatoki.Filters import (
     Syllabic,
     NimiUCSUR,
     Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
     ProperName,
     Punctuation,
     LongSyllabic,
@@ -20,12 +25,11 @@ from sonatoki.Filters import (
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
-    EnglishIgnorables,
     NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
     Backticks,
@@ -37,15 +41,16 @@ from sonatoki.Preprocessors import (

 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
-    word_tokenizer: Type[Tokenizer]
     cleaners: List[Type[Cleaner]]
     ignoring_filters: List[Type[Filter]]
     scoring_filters: List[Type[Filter]]
     scorer: Type[Scorer]
     passing_score: Number
+    word_tokenizer: NotRequired[Type[Tokenizer]]
+    sent_tokenizer: NotRequired[Type[Tokenizer]]


-# TODO: branching configs?
+# TODO: branching configs? config builder?

 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
@@ -54,7 +59,6 @@ BaseConfig: IloConfig = {
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }


@@ -70,7 +74,6 @@ PrefConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }

 CorpusConfig: IloConfig = {
@@ -93,13 +96,8 @@ CorpusConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
-
-
-"""
-Mimics the previous implementation of ilo pi toki pona taso
-"""
+"""Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
@@ -107,27 +105,47 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
+"""This is extremely silly."""
+IsipinEpikuConfig: IloConfig = {
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [
+        OrMemberFilter(
+            NimiKuSuli,
+            NimiKuLili,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+}
+

 DiscordConfig: IloConfig = {
     "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         LongSyllabic,
         LongProperName,
         LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }

 TelegramConfig: IloConfig = deepcopy(PrefConfig)
 ForumConfig: IloConfig = deepcopy(PrefConfig)

+
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
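Because `word_tokenizer` and `sent_tokenizer` are now `NotRequired`, a config can omit them and rely on the defaults added in `ilo.py` (below). A hypothetical config assembled only from names this file already imports; `MyConfig` itself is illustrative, not part of the package:

```python
# Hypothetical config sketch; every imported name is referenced by Configs.py above.
# word_tokenizer / sent_tokenizer are deliberately omitted and fall back to the
# Ilo defaults introduced in 0.4.0.
from sonatoki.Filters import (
    Numeric,
    Punctuation,
    NimiLinkuCore,
    LongSyllabic,
    LongProperName,
    LongAlphabetic,
)
from sonatoki.Scorers import SoftScaling
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Configs import IloConfig
from sonatoki.Preprocessors import URLs, Backticks

MyConfig: IloConfig = {
    "preprocessors": [Backticks, URLs],
    "cleaners": [ConsecutiveDuplicates],
    "ignoring_filters": [Numeric, Punctuation],
    "scoring_filters": [NimiLinkuCore, LongSyllabic, LongProperName, LongAlphabetic],
    "scorer": SoftScaling,
    "passing_score": 0.8,
}
```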
sonatoki/Filters.py
CHANGED
@@ -127,9 +127,11 @@ class ProperName(Filter):
     When Toki Pona is written with the Latin alphabet, names are generally
     capitalized at their start. This filter identifies those tokens.

-    Note that this alone cannot determine if a token is a valid name,
-    a standalone name is considered invalid in Toki Pona- names
-    This tool only examines one token at a
+    Note that this alone cannot determine if a token is a valid name,
+    because a standalone name is considered invalid in Toki Pona- names
+    generally have head nouns. This tool only examines one token at a
+    time, so cannot detect names any better than identifying their
+    capital letter.
     """

     @classmethod
@@ -187,12 +189,14 @@ class NimiUCSUR(MemberFilter):

 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+
     Excludes both consecutive nasals and the illegal syllables:
     - "nm", "nn"
     - "wu", "wo", "ji", "ti"

     Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
-    "nn" cannot be found.
+    "nn" cannot be found.
+    """

     pattern = re.compile(
         rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
@@ -208,8 +212,10 @@ class LongPhonotactic(MinLen, Phonotactic):

 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
-
-
+
+    Words must have correctly ordered vowels and consonants, but the
+    phonotactic exceptions are not considered.
+    """

     # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
     # Alterative I was exploring takes ~15% more steps
@@ -236,13 +242,14 @@ class LongAlphabetic(MinLen, Alphabetic):


 class Numeric(Filter):
-    """Determine if a given token is entirely numeric.
-
+    """Determine if a given token is entirely numeric. Covers all numeric
+    symbols in Unicode.

     This will fail to find numeric tokens such as "1.111" or "-42",
     but if used with the aggressive tokenizer designed for `tok`, these will be
     split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
-    numeric tokens will be split from their punctuation.
+    numeric tokens will be split from their punctuation.
+    """

     @classmethod
     @override
@@ -252,13 +259,17 @@ class Numeric(Filter):


 class Punctuation(SubsetFilter):
-    """Identify whether a token is entirely punctuation.
+    """Identify whether a token is entirely punctuation.
+
+    Fastest implementation.
+    """

     tokens = set(ALL_PUNCT)


 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
+
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """

@@ -266,7 +277,8 @@ class PunctuationRe(RegexFilter):


 class PunctuationRe1(Regex1Filter):
-    """Reference implementation for identifying tokens made entirely of
+    """Reference implementation for identifying tokens made entirely of
+    punctuation."""

     pattern = regex.compile(
         rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
@@ -278,14 +290,16 @@ class OrFilter:
     returning True when any individual filter matches or False otherwise.
     Requires at least two filters.

-    OrFilter exists as a compromise between the need to score some
-    while not adding custom behavior to scorers.
-
-    but this would require cleaning the user's input, and
-    It also would not have been as powerful- I
-    or to not give users
+    OrFilter exists as a compromise between the need to score some
+    filters equally, while not adding custom behavior to scorers. I
+    could have allowed a position to have a list of filters instead of
+    one filter, but this would require cleaning the user's input, and
+    nested handling of lists. It also would not have been as powerful- I
+    would need another param for the and/or switch, or to not give users
+    the choice.

-    Instead, the user is responsible for building an OrFilter out of
+    Instead, the user is responsible for building an OrFilter out of
+    their desired filters.
     """

     @staticmethod
@@ -336,10 +350,13 @@ class OrMemberFilter:
         return filter


-class AndFilter
+class AndFilter:
     """Instantiate with more than one filter to compose them into one filter,
-    returning False when any individual filter fails to match or True
-
+    returning False when any individual filter fails to match or True
+    otherwise.
+
+    Requires at least two filters.
+    """

     def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
         if not len(filters_) >= 2:
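A sketch of the filter composition the configs above rely on; `NimiRare` is a hypothetical name, and the union-of-member-sets behavior is inferred from how `OrMemberFilter` is used, not spelled out in this diff:

```python
# Hypothetical composition, mirroring how Configs.py uses OrMemberFilter:
# the call returns a new Filter type covering all of the given member filters.
from sonatoki.Filters import (
    NimiKuLili,
    NimiKuSuli,
    NimiLinkuObscure,
    NimiLinkuSandbox,
    NimiLinkuUncommon,
    OrMemberFilter,
)

NimiRare = OrMemberFilter(
    NimiKuSuli, NimiKuLili, NimiLinkuUncommon, NimiLinkuObscure, NimiLinkuSandbox
)
# NimiRare can now sit in a config's "scoring_filters" list like any single filter.
```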
sonatoki/Preprocessors.py
CHANGED
@@ -2,7 +2,7 @@
 "Preprocessors" are classes which strip content from a given string prior to tokenization.
 There are currently two distinct types of Preprocessor:

-- Remove a token from a string which would be difficult to identify after tokenization.
+- Remove a token from a string which would be difficult to identify after tokenization.
     - URLs
     - DiscordEmotes
 - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
@@ -61,21 +61,24 @@ Ignorables are tokens which do not count toward the accepted number of tokens
 or the total number of tokens.
 This is generally because they are considered external to Toki Pona.

-It is likely that every user will want to use these.
+It is likely that every user will want to use these.
 Not having them will cause many false negatives, such as when a URL is divided
 into its parts and checked as a token.
 """


 class URLs(RegexPreprocessor):
-    """Remove http(s) protocol URLs"""
+    """Remove http(s) protocol URLs."""

     pattern = re.compile(r"https?:\/\/\S+")


 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
-
+
+    Often used to fetch articles on Wikipedia, or Magic the Gathering
+    cards.
+    """

     pattern = re.compile(r"\[\[.+\]\]")

@@ -100,7 +103,10 @@ class DiscordSpecial(RegexPreprocessor):

 class AngleBracketObject(RegexPreprocessor):
     """A generalized version of the Discord-specific angle bracket objects.
-
+
+    Removes any contiguous (not broken by whitespace) text in angle
+    brackets.
+    """

     pattern = re.compile(r"<[^<>\s]+>")

@@ -111,7 +117,7 @@ The following classes are Containers.
 Containers are a special case of Ignorables, where an entire segment of an input
 may be removed and not counted toward the accepted or total number of tokens.

-Some users may prefer to use these so that they may quote third parties who
+Some users may prefer to use these so that they may quote third parties who
 would likely be using a language other than Toki Pona.
 """
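For illustration, the three patterns visible in this file can be applied with plain `re.sub`; the Preprocessor classes' own calling convention is not shown in this diff, so this sketch sidesteps it:

```python
# Illustration only: these are the exact patterns from URLs, Reference, and
# AngleBracketObject above, applied directly with re.sub.
import re

msg = "o lukin e lipu <@123456> [[Toki Pona]] https://example.com/wiki"
msg = re.sub(r"https?:\/\/\S+", "", msg)  # URLs
msg = re.sub(r"\[\[.+\]\]", "", msg)      # Reference
msg = re.sub(r"<[^<>\s]+>", "", msg)      # AngleBracketObject
print(msg)  # only "o lukin e lipu" and some leftover whitespace remain
```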
sonatoki/Scorers.py
CHANGED
@@ -13,22 +13,52 @@ Number = Union[int, float]
 Weights = Dict[str, Number]


-def sigmoid(n: int) -> Number:
-    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-    # n-1 makes sigmoid(1) == 0.5
-    # 0.30 softens scaling in favor of short input
-    # return n / (1+abs(n)) # too weak in 0.7+
-
-
 class Scorer(ABC):
     @classmethod
     @abstractmethod
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        """Score a list of tokens using the given `Filter`s, returning a
+        `Number` between 0 and 1 inclusive."""
         raise NotImplementedError


+class Soften(Scorer):
+    """Meta `Scorer` which scales the scores of short messages to reduce the
+    impact of shortness on scoring.
+
+    The scores of short messages are scaled by mapping the token count
+    to [0.5, 1.0] via the sigmoid function, then raising the score to
+    the resultant power.
+
+    For example, a single token scoring 0.64 will score 0.8 instead.
+    """
+
+    @staticmethod
+    def sigmoid(n: int) -> Number:
+        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+        # n-1 makes sigmoid(1) == 0.5
+        # 0.30 softens scaling in favor of short input
+        # return n / (1+abs(n)) # too weak in 0.7+
+
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        percentage = super().score(tokens, filters)  # type: ignore [abstractmethod]
+        len_tokens = len(tokens)
+        percentage **= cls.sigmoid(len_tokens)
+        return percentage
+
+    def __new__(cls, scorer: Type[Scorer]) -> Type[Scorer]:
+        class SoftenedScorer(Soften, scorer): ...
+
+        return SoftenedScorer
+
+
 class PassFail(Scorer):
-    """
+    """If a token matches any filter, it scores 1.
+
+    Otherwise, it scores 0.
+    """

     @classmethod
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
@@ -50,28 +80,17 @@ class PassFail(Scorer):
         return total_score / len_tokens if len_tokens else 0


-class
-
-
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
-
-        total_score = 0
-        len_tokens = len(tokens)
-        for token in tokens:
-            total_score += cls.score_token(token, filters)
-
-        percentage = total_score / len_tokens if len_tokens else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class Scaling(Scorer):
+    """Tokens score 1 for matching the first filter, and a linearly reduced
+    amount for matching later filters based on how many filters there are.

+    For example, if there are 4 filters, a token scores 1.0, 0.75, 0.50,
+    and 0.25 for matching each respectively.

-
-
-
-
-    This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
+    In other words, filter order matters, weighing earlier listed
+    filters higher than later ones. This is desirable to avoid messages
+    which would only match weaker filters, as these are less likely to
+    be Toki Pona.
     """

     @classmethod
@@ -95,33 +114,17 @@ class Scaling(Scorer):
         return total_score / max_score if max_score else 0


-class
-    """
-
-    then raising the score to the resultant power.
-    For example, a single token scoring 0.64 will now score 0.8.
-    """
+class SoftPassFail(Soften, PassFail):
+    """Same as `PassFail`, but shorter messages are subject to less harsh
+    scoring."""

-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1

-
-
-
-
-        max_score = len_tokens * len_filters
-        for token in tokens:
-            total_score += cls.score_token(token, filters, len_filters)
-
-        percentage = total_score / max_score if max_score else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class SoftScaling(Soften, Scaling):
+    """Same as `Scaling`, but shorter messages are subject to less harsh
+    scoring."""


-class Logarithmic(Scorer): ...
+# class Logarithmic(Scorer): ...


 __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
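The `Soften` example can be checked by hand: the token count is pushed through the sigmoid to get an exponent in [0.5, 1.0], and the raw score is raised to that power. A standalone recomputation using the same constants as the code above (not part of the package):

```python
# Standalone recomputation of the Soften scaling; mirrors Soften.sigmoid above.
import math

def sigmoid(n: int) -> float:
    return 1 / (1 + math.exp(-(0.30 * (n - 1))))

raw = 0.64
print(round(raw ** sigmoid(1), 3))   # 0.8: sigmoid(1) == 0.5, and 0.64 ** 0.5 == 0.8
print(round(raw ** sigmoid(10), 3))  # ~0.658: longer input is scaled far less
```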
sonatoki/constants.py
CHANGED
@@ -380,62 +380,29 @@ CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS

 LANGUAGE = "english" # for NLTK
-
-
+"""Commonly occurring strings which are some kind of valid Toki Pona or
+external token."""
 ALLOWABLES = {
     "x", # ala
     "y", # anu
     "kxk", # ken ala ken
     "wxw", # wile ala wile
+    "msa",
 }

 PHONOMATCHES = {
-
-
-    # "i", # against
-    # "in", # against
+    "non",
+    "nope",
     "some",
-    "like",
-
-
-
-
-
-    # "some", # against
-    "to", # ignore
-    # "u", # against
-    # "un", # against
-    "use", # against
-    # "we", # against
+    "like",
+    "use",
+    "imo",
+    "time",
+    "man",
+    "also",
 }

-ALPHABETIC_MATCHES =
-    "a",
-    # "am",
-    # "as",
-    # "at",
-    # "aw", # aww
-    # "ek", # eek
-    # "ew",
-    # "ik",
-    # "il", # ill
-    # "im",
-    # "im",
-    # "ip",
-    # "is",
-    # "it",
-    # "l", # they'll
-    # "m", # i'm
-    # "ok",
-    # "op",
-    # "ow",
-    # "s", # let's
-    # "t", # don't
-    # "up",
-    # "us",
-    # "ut",
-    # "uw",
-}
+ALPHABETIC_MATCHES: Set[str] = set()

 IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
sonatoki/ilo.py
CHANGED
@@ -5,12 +5,17 @@ from typing import List, Type, Tuple
 from sonatoki.Filters import Filter
 from sonatoki.Scorers import Number, Scorer
 from sonatoki.Cleaners import Cleaner
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor

+# tokenized, filtered, cleaned, score, result
+Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
+# TODO: scorecard kinda sucks as a name
+

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __sent_tokenizer: Type[Tokenizer]
     __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
@@ -26,11 +31,13 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
-        word_tokenizer: Type[Tokenizer],
+        word_tokenizer: Type[Tokenizer] = WordTokenizer,
+        sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__sent_tokenizer = sent_tokenizer
         self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
@@ -47,6 +54,9 @@ class Ilo:
         """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__word_tokenizer.tokenize(msg)

+    def sent_tokenize(self, msg: str) -> List[str]:
+        return self.__sent_tokenizer.tokenize(msg)
+
     def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
@@ -83,26 +93,60 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)

-    def _is_toki_pona(
-
-
-
-
+    def _is_toki_pona(self, message: str) -> Scorecard:
+        """Process a message into its tokens, then filters, cleans, and scores
+        them. Returns all parts. Message must already be preprocessed, normally
+        done in `self.is_toki_pona(message)`.
+
+        Returns all components of the processing algorithm except preprocessing:
         - Tokenized message (list[str])
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)
-
-        tokenized = self.word_tokenize(
+        - Result (bool)
+        """
+        tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        return
+        return tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
         """Determines whether a single statement is or is not Toki Pona."""
+        message = self.preprocess(message)
         *_, result = self._is_toki_pona(message)
         return result
+
+    def _are_toki_pona(self, message: str):
+        """Split a message into sentences, then return a list each sentence's
+        results via `self._is_toki_pona()`.
+
+        Message must already be preprocessed, normally done in
+        `self.are_toki_pona(message)`.
+        """
+        results: List[Scorecard] = list()
+        for sentence in self.sent_tokenize(message):
+            result = self._is_toki_pona(sentence)
+            results.append(result)
+        return results
+
+    def are_toki_pona(self, message: str) -> List[bool]:
+        """Splits a statement into sentences, then determines if each is or is not Toki Pona.
+        NOTE: You will need to decide how to score the result. Examples:
+
+        ```
+        def all_must_pass(message: str) -> bool:
+            return all(ILO.are_toki_pona(message))
+
+        def portion_must_pass(message: str, score: Number = 0.8) -> bool:
+            results = ILO.are_toki_pona(message)
+            sent_count = len(results)
+            passing = results.count(True)
+            return (passing / sent_count) >= score
+        ```
+        """
+        message = self.preprocess(message)
+        results = self._are_toki_pona(message)
+        return [res[-1] for res in results]