sonatoki 0.6.3__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.6.3 → sonatoki-0.8.0}/PKG-INFO +1 -1
- {sonatoki-0.6.3 → sonatoki-0.8.0}/pyproject.toml +1 -1
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py +13 -38
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py +60 -36
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py +9 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py +61 -6
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py +38 -27
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py +34 -27
- sonatoki-0.8.0/src/sonatoki/types.py +60 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py +40 -36
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py +54 -5
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py +20 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py +12 -22
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py +2 -2
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py +2 -5
- {sonatoki-0.6.3 → sonatoki-0.8.0}/LICENSE +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/README.md +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/__init__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_tokenize.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py

@@ -1,42 +1,41 @@
 # STL
-from copy import deepcopy
 from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
 
 # LOCAL
+from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
     Not,
     Filter,
+    PuName,
     Numeric,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
-    NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    NimiLinkuCommon,
     FalsePosSyllabic,
+    NimiLinkuByUsage,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
-
+    Codeblock,
     Reference,
     Preprocessor,
     AngleBracketObject,

@@ -95,11 +94,11 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(
+        Or(NimiLinkuByUsage(30), NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,

@@ -110,16 +109,13 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         Or(
-
-
-            NimiLinkuUncommon,
-            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
-            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
+            # awkward but efficient syntax
+            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),

@@ -132,17 +128,17 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR,
+    "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [

@@ -162,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
 }
 
 
-DiscordConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
-    "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
-    ],
-    "scorer": SoftScaling,
-    "passing_score": 0.8,
-}
-
-TelegramConfig: IloConfig = deepcopy(PrefConfig)
-ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
-    "DiscordConfig",
-    "ForumConfig",
     "IloConfig",
     "LazyConfig",
     "PrefConfig",
-    "TelegramConfig",
 ]
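Usage note: TelegramConfig and ForumConfig were plain deepcopy aliases of PrefConfig, and DiscordConfig differed from it only in preprocessor and filter choices, so callers importing the removed configs can generally switch to PrefConfig. A minimal sketch of how a config is consumed, assuming the package's usual Ilo entry point:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    # IloConfig is a TypedDict of Ilo's keyword arguments, so it unpacks directly.
    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("mi olin e sina")  # True for ordinary toki pona text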
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py

@@ -2,37 +2,32 @@
 import re
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import Set, List, Type, Optional
+from typing import Set, List, Type, Union, Literal, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
 from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
-    NIMI_PU,
     ALPHABET,
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
     NIMI_UCSUR,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
     NOT_IN_PUNCT_CLASS,
-    NIMI_LINKU_UNCOMMON,
     ALL_PUNCT_RANGES_STR,
     FALSE_POS_ALPHABETIC,
     UCSUR_PUNCT_RANGES_STR,
     EMOJI_VARIATION_SELECTOR_RANGES_STR,
+    words_by_tag,
+    words_by_usage,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1

@@ -146,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):
 
 
 class ProperName(Filter):
-    """
-
+    """Determine if a given token is a valid name based on a reasonable weakening of
+    the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+    letter at its start and is **not** fully capitalized.
+
+    This corrects an issue with PuName, where scripts lacking a case distinction are
+    errantly counted"""
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        first_capitalized = token[0].isupper()
+        all_caps = token.isupper()
+
+        return first_capitalized and not all_caps
+
+
+class PuName(Filter):
+    """Determine if a given token is a valid name (also called a loan word) based on
+    the rules given in Toki Pona: The Language of Good.
+    When Toki Pona is written with the Latin alphabet, names are
     capitalized at their start. This filter identifies those tokens.
 
     Note that this alone cannot determine if a token is a valid name,

@@ -161,6 +175,9 @@ class ProperName(Filter):
     @override
     @cache(maxsize=None)
     def filter(cls, token: str) -> bool:
+        # first_capitalized = token[0].isupper()
+        # rest_capitalized = token[1:] == token[1:].upper()
+        # return first_capitalized and not rest_capitalized
         return token == token.capitalize()
         # TODO: If the token is in a script which doesn't have a case distinction,
         # this will errantly match.

@@ -170,40 +187,46 @@ class LongProperName(MinLen, ProperName):
     length = 2  # reject "names" of length 1
 
 
-class
-
-
-
-
-
-
-
-class NimiKuSuli(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_SULI)
-
-
-class NimiKuLili(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_LILI)
+class NimiLinkuByUsage:
+    def __new__(
+        cls,
+        usage: int,
+        date: Optional[LinkuUsageDate] = None,
+    ) -> Type[MemberFilter]:
+        words = words_by_usage(usage, date)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_LINKU_CORE)
+        return AnonLinkuMemberFilter
 
 
-class
-
+class NimiLinkuByTag:
+    def __new__(
+        cls,
+        tag: Union[Literal["usage_category"], Literal["book"]],
+        category: Union[LinkuUsageCategory, LinkuBooks],
+    ) -> Type[MemberFilter]:
+        words = words_by_tag(tag, category)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+        return AnonLinkuMemberFilter
 
 
-
-
+NimiPu = NimiLinkuByTag("book", "pu")
+NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+NimiKuLili = NimiLinkuByTag("book", "ku lili")
+NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")
 
 
-class
-    tokens = prep_dictionary(
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
 
 
 class NimiUCSUR(MemberFilter):

@@ -444,6 +467,7 @@ __all__ = [
     "Or",
     "Phonotactic",
     "ProperName",
+    "PuName",
     "Punctuation",
     "Syllabic",
 ]
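The per-dictionary filters (NimiPu, NimiKuSuli, NimiLinkuCore, and so on) are now built by calling NimiLinkuByUsage or NimiLinkuByTag, each of which returns a fresh MemberFilter subclass. A short sketch of that call pattern, using only names visible in this diff:

    from sonatoki.Filters import NimiLinkuByUsage, NimiLinkuByTag

    Usage30 = NimiLinkuByUsage(30)            # words at or above 30% usage
    PuFilter = NimiLinkuByTag("book", "pu")   # the same words NimiPu is built from

    Usage30.filter("toki")      # True: "toki" is far above the threshold
    PuFilter.filter("kipisi")   # False: "kipisi" is not a pu word

Because the returned class is an ordinary MemberFilter, the existing add/sub customization still applies, which is how CorpusConfig can write NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES).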
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py

@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
+class Codeblock(RegexPreprocessor):
+    """Remove codeblocks marked by a set of three backticks on their own lines.
+
+    Subset of what would be removed by Backticks, but may be preferable.
+    """
+
+    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
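Codeblock only strips fenced blocks whose opening ``` is followed by a newline, whereas Backticks pairs up any backticks it finds. A hedged sketch of the difference (the message text is made up):

    from sonatoki.Preprocessors import Backticks, Codeblock

    msg = "o lukin e ni:\n```\nprint('pona')\n```\nnimi `x` li pona"
    Codeblock.process(msg)  # drops only the fenced block; the inline `x` survives
    Backticks.process(msg)  # also removes the inline `x` span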
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py

@@ -1,17 +1,15 @@
 # STL
 import math
 from abc import ABC, abstractmethod
-from typing import
+from typing import List, Type
 
 # PDM
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
 
-Number = Union[int, float]
-Weights = Dict[str, Number]
-
 
 class Scorer(ABC):
     @classmethod

@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
     scoring."""
 
 
-
+class SentenceScorer(ABC):
+    @classmethod
+    @abstractmethod
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        """Re-score a list of sentences (scorecards, sentences with all their
+        metadata) and return them."""
+        raise NotImplementedError
+
+
+class SentNoOp(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return scorecards
 
 
-
+class SentAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        total = sum(card["score"] for card in scorecards)
+        avg = total / len(scorecards)
+        for card in scorecards:
+            card["score"] = avg
+        return scorecards
+
+
+class SentWeightedAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        weighted_total = 0
+        total_len = 0
+        for card in scorecards:
+            cardlen = len(card["cleaned"])
+            cardscore = card["score"]
+
+            weighted_total += cardlen * cardscore
+            total_len += cardlen
+
+        weighted_avg = weighted_total / total_len
+        for card in scorecards:
+            card["score"] = weighted_avg
+        return scorecards
+
+
+__all__ = [
+    "PassFail",
+    "Scaling",
+    "SoftPassFail",
+    "SoftScaling",
+    "Soften",
+    "SentAvg",
+    "SentWeightedAvg",
+]
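The new SentenceScorer hook rewrites per-sentence scores after the fact: SentNoOp leaves them alone (the default), SentAvg assigns every sentence the plain average, and SentWeightedAvg weights that average by cleaned-token count. A sketch of wiring one in, assuming the sentence_scorer keyword shown in the ilo.py changes below:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import CorpusConfig
    from sonatoki.Scorers import SentWeightedAvg

    ilo = Ilo(**CorpusConfig, sentence_scorer=SentWeightedAvg)
    # each sentence now reports the message-wide weighted average,
    # so one short aside no longer fails on its own
    ilo.are_toki_pona("toki! o lukin e ni. brb")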
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py

@@ -1,11 +1,16 @@
 # STL
 import json
-from typing import Set, Dict
+from typing import Set, Dict, Optional
 from pathlib import Path
 
 # LOCAL
+from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
+LATEST_DATE = "2023-09"
+# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
 # `\p{Punctuation}` character class
 # https://www.compart.com/en/unicode/category
 # https://unicode.org/Public/UNIDATA/UnicodeData.txt

@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
     "iluminate",
     "imense",
     "imitate",
+    "inanimate",
     "injoke",
     "insane",
     "insolate",

@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
 
 
-def
-
+def linku_data() -> Dict[str, LinkuWord]:
+    # NOTE: this does open+read+parse two files each time you construct a filter
+    # but i expect users to construct filters only at the start of runtime
+    # there is no reason to waste your RAM by leaving the linku data in it
+    with open(LINKU) as f:
+        linku: Dict[str, LinkuWord] = json.loads(f.read())
+    with open(SANDBOX) as f:
+        sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+    return {**linku, **sandbox}
+
 
+def words_by_tag(tag: str, value: str) -> Set[str]:
+    data = linku_data()
+    return {d["word"] for d in data.values() if d[tag] == value}
 
-with open(LINKU) as f:
-    linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-
-
+def words_by_usage(
+    usage: int,
+    date: Optional[LinkuUsageDate] = None,
+) -> Set[str]:
+    if not date:
+        date = LATEST_DATE
+    data = linku_data()
 
-
-
-
-
+    result: Set[str] = set()
+    for word in data.values():
+        usages = word["usage"]
+        if date in usages and usages[date] >= usage:
+            result.add(word["word"])
+
+    return result
+
+
+NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-with open(SANDBOX) as f:
-    sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 # with open(SYLLABICS) as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}

@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
 # with open(ALPHABETICS) as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
-del linku
-del sandbox
-
 __all__ = [
     "ALLOWABLES",
     "ALL_PUNCT",

@@ -727,14 +746,6 @@ __all__ = [
     "CONSONANTS",
     "EMOJI_VARIATION_SELECTOR_RANGES",
     "EMOJI_VARIATION_SELECTOR_RANGES_STR",
-    "NIMI_KU_LILI",
-    "NIMI_KU_SULI",
-    "NIMI_LINKU_COMMON",
-    "NIMI_LINKU_CORE",
-    "NIMI_LINKU_OBSCURE",
-    "NIMI_LINKU_SANDBOX",
-    "NIMI_LINKU_UNCOMMON",
-    "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
     "POSIX_PUNCT_RANGES",
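The module-level NIMI_* sets are replaced by lookups computed on demand from the bundled Linku data. A small sketch of the new helpers, using the signatures above:

    from sonatoki.constants import words_by_tag, words_by_usage

    pu_words = words_by_tag("book", "pu")    # replaces the old NIMI_PU constant
    frequent = words_by_usage(80)            # usage measured at LATEST_DATE ("2023-09")
    older = words_by_usage(50, "2022-08")    # or at an earlier survey date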
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py

@@ -1,17 +1,14 @@
 # STL
-from typing import List, Type
+from typing import List, Type
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
 from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-# tokenized, filtered, cleaned, score, result
-Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
-# TODO: scorecard kinda sucks as a name
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]

@@ -21,6 +18,7 @@ class Ilo:
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
+    __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number
 
     def __init__(

@@ -31,6 +29,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):

@@ -43,6 +42,7 @@ class Ilo:
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
+        self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score
 
     def preprocess(self, msg: str) -> str:

@@ -55,6 +55,7 @@ class Ilo:
         return self.__word_tokenizer.tokenize(msg)
 
     def sent_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__sent_tokenizer.tokenize(msg)
 
     def clean_token(self, token: str) -> str:

@@ -93,44 +94,50 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
+    def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return self.__sentence_scorer.score(scorecards)
+
     def _is_toki_pona(self, message: str) -> Scorecard:
         """Process a message into its tokens, then filters, cleans, and scores
-        them.
-
-        Returns all
-        - Tokenized message (list[str])
-        - Filtered message (list[str])
-        - Cleaned message (list[str])
-        - Score (float)
-        - Result (bool)
+        them. Message must already be preprocessed, normally done in
+        `self.is_toki_pona(message)`.
+
+        Returns a `Scorecard` with all changes to the input text and a score.
         """
         tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
-        result = score >= self.__passing_score
 
-
+        scorecard: Scorecard = {
+            "text": message,
+            "tokenized": tokenized,
+            "filtered": filtered,
+            "cleaned": cleaned,
+            "score": score,
+        }
+
+        return scorecard
 
     def is_toki_pona(self, message: str) -> bool:
-        """Determines whether a
+        """Determines whether a text is or is not Toki Pona."""
         message = self.preprocess(message)
-
-        return
+        scorecard = self._is_toki_pona(message)
+        return scorecard["score"] >= self.__passing_score
 
     def _are_toki_pona(self, message: str) -> List[Scorecard]:
-        """Split a message into sentences, then return a list each
-
+        """Split a message into sentences, then return a list with each
+        sentence's scorecard from `self._is_toki_pona()`.
 
         Message must already be preprocessed, normally done in
         `self.are_toki_pona(message)`.
         """
-
+        scorecards: List[Scorecard] = list()
         for sentence in self.sent_tokenize(message):
             result = self._is_toki_pona(sentence)
-
-
+            scorecards.append(result)
+        scorecards = self.score_sentences(scorecards)
+        return scorecards
 
     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.

@@ -148,5 +155,5 @@ class Ilo:
         ```
         """
         message = self.preprocess(message)
-
-        return [
+        scorecards = self._are_toki_pona(message)
+        return [card["score"] >= self.__passing_score for card in scorecards]
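Ilo._is_toki_pona now returns a Scorecard dict instead of the old five-element tuple, and is_toki_pona/are_toki_pona simply compare its "score" against the passing score. A sketch of inspecting one, reusing PrefConfig from above:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    msg = ilo.preprocess("toki! ilo Firefox li pona")  # _is_toki_pona expects preprocessed text
    card = ilo._is_toki_pona(msg)
    card["score"], card["cleaned"]  # fields are accessed by name, not tuple position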
sonatoki-0.8.0/src/sonatoki/types.py (new file)

@@ -0,0 +1,60 @@
+# STL
+from typing import Dict, List, Union, Literal, TypedDict
+
+Number = Union[int, float]
+
+
+# TODO: scorecard kinda sucks as a name
+class Scorecard(TypedDict):
+    text: str
+    tokenized: List[str]
+    filtered: List[str]
+    cleaned: List[str]
+    score: Number
+
+
+LinkuUsageDate = Union[
+    Literal["2020-04"],
+    Literal["2021-10"],
+    Literal["2022-08"],
+    Literal["2023-09"],
+    # Literal["2024-09"],
+]
+
+LinkuUsageCategory = Union[
+    Literal["core"],
+    Literal["common"],
+    Literal["uncommon"],
+    Literal["obscure"],
+    Literal["sandbox"],
+]
+
+LinkuBooks = Union[
+    Literal["pu"],
+    Literal["ku suli"],
+    Literal["ku lili"],
+    Literal["none"],
+]
+
+
+class LinkuWord(TypedDict):
+    id: str
+    author_verbatim: str
+    author_verbatim_source: str
+    book: str
+    coined_era: str
+    coined_year: str
+    creator: List[str]
+    ku_data: Dict[str, int]
+    see_also: List[str]
+    resources: Dict[str, str]
+    representations: Dict[str, Union[str, List[str]]]
+    source_language: str
+    usage_category: LinkuUsageCategory
+    word: str
+    deprecated: bool
+    etymology: List[Dict[str, str]]
+    audio: List[Dict[str, str]]
+    pu_verbatim: Dict[str, str]
+    usage: Dict[LinkuUsageDate, int]
+    translations: Dict[str, Dict[str, str]]
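The new types module centralizes the shapes shared by Scorers, constants, and ilo, so downstream code can annotate against them. A hypothetical helper for illustration:

    from sonatoki.types import Number, Scorecard

    def passed(card: Scorecard, threshold: Number) -> bool:
        # mirrors the comparison Ilo.is_toki_pona performs on a Scorecard
        return card["score"] >= threshold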
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Phonotactic,
     Punctuation,
     AlphabeticRe,

@@ -34,23 +34,13 @@ from sonatoki.Filters import (
     NimiLinkuUncommon,
 )
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
-from sonatoki.constants import
-    NIMI_PU,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
-    NIMI_LINKU_COMMON,
-    FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
-    NIMI_LINKU_UNCOMMON,
-)
+from sonatoki.constants import FALSE_POS_SYLLABIC, words_by_tag
 
 # FILESYSTEM
 from .test_utils import PROPER_NAME_RE
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 @example("lukin")
 @example("selo")
 @example("li")

@@ -59,14 +49,14 @@ def test_NimiPu(s: str):
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "core"))))
 @example("pona")
 def test_NimiLinkuCore(s: str):
     res = NimiLinkuCore.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "common"))))
 @example("n")
 @example("tonsi")
 @example("kipisi")

@@ -75,19 +65,21 @@ def test_NimiLinkuCommon(s: str):
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "uncommon"))))
 def test_NimiLinkuUncommon(s: str):
     res = NimiLinkuUncommon.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "obscure"))))
+@example("pake")
+@example("san")
 def test_NimiLinkuObscure(s: str):
     res = NimiLinkuObscure.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "sandbox"))))
 @example("kalamARR")
 @example("Pingo")
 def test_NimiLinkuSandbox(s: str):

@@ -152,7 +144,7 @@ def test_AlphabeticRe(s: str):
 
 @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
 def test_ProperName(s: str):
-    res =
+    res = PuName.filter(s)
     assert res, repr(s)
 
 

@@ -207,7 +199,11 @@ def test_OrFilter(s: str):
 # NOTE: No subset filter test because A | B is not the same as A combined with B.
 # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
 # but would incorrectly pass a combined filter.
-@given(
+@given(
+    st.sampled_from(
+        list(words_by_tag("book", "pu") | words_by_tag("usage_category", "obscure"))
+    )
+)
 def test_MemberFilters_OrFilter(s: str):
     filter = Or(NimiPu, NimiLinkuObscure)
     assert issubclass(filter, MemberFilter)

@@ -221,11 +217,11 @@ def test_MemberFilters_OrFilter(s: str):
 @given(
     st.sampled_from(
         list(
-
-            |
-            |
-            |
-            |
+            words_by_tag("book", "ku suli")
+            | words_by_tag("book", "ku lili")
+            | words_by_tag("usage_category", "uncommon")
+            | words_by_tag("usage_category", "obscure")
+            | words_by_tag("usage_category", "sandbox")
         ),
     )
 )

@@ -248,14 +244,14 @@ def test_OrFilter_IsipinEpiku(s: str):
 )
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_AndFilter(s: str):
     s = s.capitalize()
-    f = And(
+    f = And(PuName, NimiPu)
     assert f.filter(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_NotFilter(s: str):
     f = Not(NimiPu)
     assert not f.filter(s)

@@ -282,13 +278,21 @@ def test_AndNotFilter(s: str):
     assert not res_composed
 
 
-@given(
+@given(
+    st.sampled_from(list(words_by_tag("book", "pu") | words_by_tag("book", "ku suli")))
+)
 def test_AddTokensToMemberFilter(s: str):
     PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
     assert PuEnKuSuliFilter.filter(s)
 
 
-@given(
+@given(
+    st.sampled_from(
+        list(
+            words_by_tag("usage_category", "sandbox") | words_by_tag("book", "ku lili")
+        )
+    )
+)
 def test_AddTokensToMemberFilterNegative(s: str):
     PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
     assert not PuEnKuSuliFilter.filter(s)

@@ -297,12 +301,12 @@ def test_AddTokensToMemberFilterNegative(s: str):
 @given(
     st.sampled_from(
         list(
-
-            |
-            |
-            |
-            |
-            |
+            words_by_tag("book", "pu")
+            | words_by_tag("book", "ku suli")
+            | words_by_tag("book", "ku lili")
+            | words_by_tag("usage_category", "uncommon")
+            | words_by_tag("usage_category", "obscure")
+            | words_by_tag("usage_category", "sandbox")
         ),
     )
     | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py

@@ -1,3 +1,6 @@
+# STL
+from typing import List, Tuple
+
 # PDM
 import pytest
 

@@ -35,6 +38,10 @@ ALL_VALID = [
     "",  # "o toki tawa mi" in UCSUR
     "",
     "",
+    "o lukin, wawa",
+    "ni li sona kiwen",
+    "nimi namako li toki e ale",
+    "mi open mute a",  # mostly eng words
 ]
 
 IGNORABLES = [

@@ -55,10 +62,9 @@ IGNORABLES = [
     "❤️",  # heart
     "😊",
     "👨👩👧👧",  # family emoji with zwj
-    # every non-emoji in
+    # every non-emoji in the writables
     "🄀🄁🄂🄃🄄🄅🄆🄇🄈🄉🄊🄋🄌🄍🄎🄏🄐🄑🄒🄓🄔🄕🄖🄗🄘🄙🄚🄛🄜🄝🄞🄟🄠🄡🄢🄣🄤🄥🄦🄧🄨🄩🄪🄫🄬🄭🄮🄯🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉🅊🅋🅌🅍🅎🅏🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩🅪🅫🅬🅭🅮🅯🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉🆊🆋🆌🆍🆏🆐 🆛🆜🆝🆞🆟🆠🆡🆢🆣🆤🆥🆦🆧🆨🆩🆪🆫🆬🆭🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿",
     "🅰️🅱️🅾️🅱️🅰️",  # blood type emojis
-    # "😃⃢👍", # sincerely, no idea, but it came up
 ]
 
 SYLLABIC_MATCHES = [

@@ -88,6 +94,9 @@ NAME_MATCHES = [
     "toki Kanse li lon",
     "toki Lojban li nasa e lawa mi",
     "ilo Firefox",
+    "ilo FaceBook li nasa",
+    "mi kepeken ilo MySQL",
+    "poki li nasin SQLite",
     "mi musi Space Station 13",
     "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
 ]

@@ -108,7 +117,7 @@ CORPUS_SPECIFIC = [
     "Pingo",
     "we Luke li alente wa",
 ]
-CORPUS_SPECIFIC_XFAIL = []
+CORPUS_SPECIFIC_XFAIL: List[str] = []
 
 
 EXCESSIVE_SYLLABICS = [

@@ -129,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
 ]
 
 EXCESSIVE_ALPHABETICS = [
-    "21st",  # candidate for xfails?
     "wen i tok usin onli notes in toki pona i look silli. ",
     "I wait, I sulk, as a tool I make stoops to ineptness.",
     "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",

@@ -155,6 +163,7 @@ EXCESSIVE_ENGLISH = [
     "i'm online all the time",
     "How to Cut a Kiwi",
     "a e i o u",
+    "21st",  # previous false positive; fixed by ProperName change
 ]
 
 NON_MATCHES = [

@@ -193,10 +202,20 @@ FALSE_NEGATIVES = [
     "mtue",
     "mi nasa B^)",  # emoticon
     "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+    "😃⃢👍",  # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
 ]
 
 FALSE_POSITIVES = [
-    "Knowing a little toki pona",
+    "Knowing a little toki pona",  # name, dict, alphabet, dict, dict- damn, that's hard.
+]
+
+IGNORABLE_PAIRS: List[Tuple[str, str]] = [
+    ("o lukin e ni: https://example.com/", "o lukin e ni:"),
+    ("ni li nasa anu seme <:musiwawa:198591138591>", "ni li nasa anu seme"),
+    ("seme la ni li toki pona ala https://example.com/", "seme la ni li toki pona ala"),
+    ("```\ndef bad():\n pass\n``` o lukin e ni", "o lukin e ni"),
+    ("mi tawa tomo telo 💦💦", "mi tawa tomo telo"),
+    ("o lukin e lipu ni: [[wp:Canvassing]]", "o lukin e lipu ni:"),
 ]
 
 

@@ -254,3 +273,33 @@ def test_false_negatives_pref(ilo: Ilo, text: str):
 @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
 def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
     assert not corpus_ilo.is_toki_pona(text)
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_pref_ignorable_doesnt_change_score(ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = ilo.preprocess(with_ignorable)
+    without_ignorable = ilo.preprocess(without_ignorable)
+    score_with = ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_lazy_ignorable_doesnt_change_score(lazy_ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = lazy_ilo.preprocess(with_ignorable)
+    without_ignorable = lazy_ilo.preprocess(without_ignorable)
+    score_with = lazy_ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = lazy_ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_corpus_ignorable_doesnt_change_score(corpus_ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = corpus_ilo.preprocess(with_ignorable)
+    without_ignorable = corpus_ilo.preprocess(without_ignorable)
+    score_with = corpus_ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = corpus_ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py

@@ -8,6 +8,7 @@ from sonatoki.Preprocessors import (
     Spoilers,
     AllQuotes,
     Backticks,
+    Codeblock,
     Reference,
     ArrowQuote,
     ColonEmotes,

@@ -48,6 +49,25 @@ def test_Backticks(s: str):
     assert res == "", (repr(s), repr(res))
 
 
+@given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+@example(
+    """```
+```"""
+)
+@example(
+    """```
+blocky message
+```
+
+```
+second blocky message
+```"""
+)
+def test_Codeblock(s: str):
+    res = Codeblock.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
 @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
 @example("> base")
 @example("> newline\n> newline")
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py

@@ -19,45 +19,35 @@ from sonatoki.Filters import (
 )
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
 from sonatoki.constants import (
-    NIMI_PU,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
-    NIMI_LINKU_UNCOMMON,
     FALSE_POS_ALPHABETIC,
+    words_by_tag,
+    words_by_usage,
 )
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu") | NIMI_PU_SYNONYMS)))
 def test_pu_filters_non_overlap(s: str):
     res_pu = NimiPu.filter(s)
     res_synonyms = NimiPuSynonyms.filter(s)
     assert (res_pu + res_synonyms) == 1
 
 
-@given(
+@given(
+    st.sampled_from(
+        list(words_by_tag("book", "ku suli") | words_by_tag("book", "ku lili"))
+    )
+)
 def test_ku_filters_non_overlap(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
     res_ku_suli = NimiKuSuli.filter(s)
     res_ku_lili = NimiKuLili.filter(s)
     assert (res_ku_suli + res_ku_lili) == 1
 
 
-@given(
-    st.sampled_from(
-        list(
-            NIMI_LINKU_CORE
-            | NIMI_LINKU_COMMON
-            | NIMI_LINKU_UNCOMMON
-            | NIMI_LINKU_OBSCURE
-            | NIMI_LINKU_SANDBOX
-        )
-    )
-)
+@given(st.sampled_from(list(words_by_usage(0))))
 def test_linku_filters_non_overlap(s: str):
     _ = assume(s != "su")
 

@@ -73,7 +63,7 @@ def test_linku_filters_non_overlap(s: str):
     assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_usage(30))))
 def test_nimi_linku_properties(s: str):
     assert ConsecutiveDuplicates.clean(s) == s, repr(s)
     assert Alphabetic.filter(s), repr(s)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py

@@ -10,10 +10,10 @@ from hypothesis import given, example
 from sonatoki.Filters import (
     Filter,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
-    ProperName,
     Phonotactic,
     NimiLinkuCore,
     PunctuationRe,

@@ -31,7 +31,7 @@ FILTERS = [
     NimiLinkuCore,
     NimiLinkuCommon,
     Alphabetic,
-
+    PuName,
     Phonotactic,
     PunctuationRe,
 ]
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py

@@ -1,17 +1,14 @@
-# STL
-import re
-
 # PDM
 import hypothesis.strategies as st
 
 # LOCAL
 from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
-from sonatoki.constants import
+from sonatoki.constants import words_by_usage
 
 PROPER_NAME_RE = r"[A-Z][a-z]*"
 
 token_strategy = (
-    st.sampled_from(list(
+    st.sampled_from(list(words_by_usage(60)))
     | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
    | st.from_regex(PROPER_NAME_RE, fullmatch=True)

All remaining files listed above with +0 -0 (LICENSE, README.md, Cleaners.py, Tokenizers.py, __init__.py, __main__.py, alphabetic.txt, linku.json, py.typed, sandbox.json, syllabic.txt, utils.py, tests/__init__.py, test_cleaners.py, test_tokenize.py, and the tokenize_cases yml files) are unchanged between 0.6.3 and 0.8.0.
|