sonatoki 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +40 -73
- sonatoki/Filters.py +50 -34
- sonatoki/Preprocessors.py +9 -0
- sonatoki/Scorers.py +61 -6
- sonatoki/constants.py +38 -27
- sonatoki/ilo.py +34 -27
- sonatoki/types.py +60 -0
- {sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/METADATA +1 -1
- {sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/RECORD +11 -10
- {sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -1,49 +1,74 @@
 # STL
-from
-from typing import Set, List, Type, TypedDict, cast
+from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
 
 # LOCAL
+from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
     Not,
     Filter,
     Numeric,
-    Syllabic,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
-    NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    NimiLinkuCommon,
     FalsePosSyllabic,
+    NimiLinkuByUsage,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
-
+    Codeblock,
     Reference,
     Preprocessor,
     AngleBracketObject,
 )
 
+__DICT_PHONOMATCHES = {
+    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person singular, english
+    "ne",  # "no" in several languages
+    "nu",  # "new" in english, "now" in dutch
+    "se",  # spanish particle, english "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "je",  # 1st person pronoun, french
+    "u",  # no u
+    "we",  # 1st person plural, english
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
+}
+
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
@@ -69,11 +94,11 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(
+        Or(NimiLinkuByUsage(30), NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
@@ -84,16 +109,13 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         Or(
-
-
-            NimiLinkuUncommon,
-            NimiLinkuObscure,
-            NimiLinkuSandbox,
+            # awkward but efficient syntax
+            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -104,43 +126,9 @@ CorpusConfig: IloConfig = {
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
-
-# TODO: create a mechanism to omit tokens from a filter with more granularity
-__corpus_tokens_dict: Set[str] = cast(
-    Set[str],
-    CorpusConfig["scoring_filters"][
-        0
-    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
-)
-__corpus_tokens_dict -= {
-    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
-    # In this case, all of these appear more often in English by a factor of at least 10.
-    "aka",  # also known as
-    "an",  # article
-    "api",  # API
-    "i",  # 1st person
-    "kana",  # japanese script
-    "me",  # 1st person
-    "ne",  # "no" in several languages
-    "nu",  # "new", now in dutch
-    "se",  # spanish particle, "see"
-    "take",  # acquire, perhaps forcefully or without permission
-    "ten",  # 10
-    "to",  # to, too
-    "u",  # no u
-    "we",  # 1st person plural
-    "wi",  # wii and discussions of syllables
-    "sole",  # singular, of shoe
-    # unexplored candidates for removal
-    # "omen",  # ominous
-    # "papa",  # father
-    # "lo",  # "lo" and "loo"
-    # "ewe",  # sheep
-    # "pa",  # father- eh?
-}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -150,7 +138,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -170,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
 }
 
 
-DiscordConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
-    "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
-    ],
-    "scorer": SoftScaling,
-    "passing_score": 0.8,
-}
-
-TelegramConfig: IloConfig = deepcopy(PrefConfig)
-ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
-    "DiscordConfig",
-    "ForumConfig",
     "IloConfig",
     "LazyConfig",
     "PrefConfig",
-    "TelegramConfig",
 ]
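The per-platform configs (DiscordConfig, TelegramConfig, ForumConfig) are gone, and the corpus word list is now assembled with the parameterized filters shown above. A minimal sketch of deriving a custom config the same way, assuming the usual `Ilo(**config)` construction; the 60% cutoff and the `sub=` exclusions are illustrative only, not part of the library:

```python
# Sketch only: a derived config built from the new parameterized filters.
from sonatoki.ilo import Ilo
from sonatoki.Configs import CorpusConfig, IloConfig
from sonatoki.Filters import NimiLinkuByUsage

my_config: IloConfig = {
    **CorpusConfig,
    # every Linku word at >=60% usage, minus a couple of English lookalikes
    "scoring_filters": [NimiLinkuByUsage(60)(sub={"to", "we"})],
}

ilo = Ilo(**my_config)  # assumes Ilo accepts the IloConfig keys as kwargs
print(ilo.is_toki_pona("mi lukin e lipu"))
```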
sonatoki/Filters.py
CHANGED
@@ -1,37 +1,33 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from
+from copy import deepcopy
+from typing import Set, List, Type, Union, Literal, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
 from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
-    NIMI_PU,
     ALPHABET,
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
     NIMI_UCSUR,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
     NOT_IN_PUNCT_CLASS,
-    NIMI_LINKU_UNCOMMON,
     ALL_PUNCT_RANGES_STR,
     FALSE_POS_ALPHABETIC,
     UCSUR_PUNCT_RANGES_STR,
     EMOJI_VARIATION_SELECTOR_RANGES_STR,
+    words_by_tag,
+    words_by_usage,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -101,6 +97,20 @@ class MemberFilter(Filter):
     def filter(cls, token: str) -> bool:
         return token.lower() in cls.tokens
 
+    def __new__(
+        cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+    ) -> Type[Filter]:
+        parent_tokens = deepcopy(cls.tokens)
+        if add:
+            parent_tokens = parent_tokens.union(add)
+        if sub:
+            parent_tokens -= sub
+
+        class AnonMemberFilter(MemberFilter):
+            tokens = parent_tokens
+
+        return AnonMemberFilter
+
 
 class SubsetFilter(Filter):
     tokens: Set[str]
@@ -155,40 +165,46 @@ class LongProperName(MinLen, ProperName):
     length = 2  # reject "names" of length 1
 
 
-class
-
-
-
-
-
+class NimiLinkuByUsage:
+    def __new__(
+        cls,
+        usage: int,
+        date: Optional[LinkuUsageDate] = None,
+    ) -> Type[MemberFilter]:
+        words = words_by_usage(usage, date)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_KU_SULI)
+        return AnonLinkuMemberFilter
 
 
-class
-
+class NimiLinkuByTag:
+    def __new__(
+        cls,
+        tag: Union[Literal["usage_category"], Literal["book"]],
+        category: Union[LinkuUsageCategory, LinkuBooks],
+    ) -> Type[MemberFilter]:
+        words = words_by_tag(tag, category)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_LINKU_CORE)
+        return AnonLinkuMemberFilter
 
 
-
-
+NimiPu = NimiLinkuByTag("book", "pu")
+NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+NimiKuLili = NimiLinkuByTag("book", "ku lili")
+NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")
 
 
-class
-    tokens = prep_dictionary(
-
-
-class NimiLinkuObscure(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
-
-
-class NimiLinkuSandbox(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
 
 
 class NimiUCSUR(MemberFilter):
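NimiLinkuByUsage and NimiLinkuByTag return anonymous MemberFilter subclasses, and MemberFilter.__new__ lets an existing filter be re-derived with tokens added or subtracted. A short sketch of that pattern; the specific words and thresholds are examples only:

```python
# Sketch of the new filter factories; word choices below are illustrative.
from sonatoki.Filters import NimiLinkuByTag, NimiLinkuByUsage

KuSuli = NimiLinkuByTag("book", "ku suli")  # what the NimiKuSuli alias is built from
Frequent = NimiLinkuByUsage(80)             # words at >=80% usage on the latest date

print(Frequent.filter("toki"))              # expected True for a core word

# MemberFilter.__new__ derives a new filter with tokens added or removed
Trimmed = Frequent(sub={"we", "to"})
print(Trimmed.filter("we"))                 # expected False
```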
sonatoki/Preprocessors.py
CHANGED
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
+class Codeblock(RegexPreprocessor):
+    """Remove codeblocks marked by a set of three backticks on their own lines.
+
+    Subset of what would be removed by Backticks, but may be preferable.
+    """
+
+    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
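The new Codeblock preprocessor only strips fenced blocks whose triple backticks sit on their own lines, unlike Backticks, which also eats inline spans. A quick standalone check of the regex; the `" "` replacement below is assumed purely for illustration, since this hunk does not show RegexPreprocessor's actual substitution:

```python
import re

# Pattern copied from the Codeblock preprocessor above.
CODEBLOCK = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)

msg = "toki!\n```\nprint('ale li pona')\n```\nsina seme?"
print(CODEBLOCK.sub(" ", msg))                      # fenced block removed
print(CODEBLOCK.sub(" ", "inline `code` is kept"))  # inline backticks untouched
```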
sonatoki/Scorers.py
CHANGED
@@ -1,17 +1,15 @@
 # STL
 import math
 from abc import ABC, abstractmethod
-from typing import
+from typing import List, Type
 
 # PDM
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
 
-Number = Union[int, float]
-Weights = Dict[str, Number]
-
 
 class Scorer(ABC):
     @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
     scoring."""
 
 
-
+class SentenceScorer(ABC):
+    @classmethod
+    @abstractmethod
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        """Re-score a list of sentences (scorecards, sentences with all their
+        metadata) and return them."""
+        raise NotImplementedError
+
+
+class SentNoOp(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return scorecards
 
 
-
+class SentAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        total = sum(card["score"] for card in scorecards)
+        avg = total / len(scorecards)
+        for card in scorecards:
+            card["score"] = avg
+        return scorecards
+
+
+class SentWeightedAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        weighted_total = 0
+        total_len = 0
+        for card in scorecards:
+            cardlen = len(card["cleaned"])
+            cardscore = card["score"]
+
+            weighted_total += cardlen * cardscore
+            total_len += cardlen
+
+        weighted_avg = weighted_total / total_len
+        for card in scorecards:
+            card["score"] = weighted_avg
+        return scorecards
+
+
+__all__ = [
+    "PassFail",
+    "Scaling",
+    "SoftPassFail",
+    "SoftScaling",
+    "Soften",
+    "SentAvg",
+    "SentWeightedAvg",
+]
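SentAvg and SentWeightedAvg rewrite every scorecard's score to a message-wide average, the latter weighted by each sentence's cleaned-token count. A small worked example with hand-built scorecards; the numbers are made up:

```python
from sonatoki.types import Scorecard
from sonatoki.Scorers import SentAvg, SentWeightedAvg


def card(words: list, score: float) -> Scorecard:
    # helper for this example only
    return {"text": " ".join(words), "tokenized": words, "filtered": words,
            "cleaned": words, "score": score}


cards = [card(["mi", "moku"], 1.0), card(["a"] * 8, 0.5)]

# copies are passed to each scorer because the scorers mutate cards in place
print([c["score"] for c in SentAvg.score([dict(c) for c in cards])])
# [0.75, 0.75]  -- plain mean of the two sentence scores
print([c["score"] for c in SentWeightedAvg.score([dict(c) for c in cards])])
# [0.6, 0.6]    -- (2*1.0 + 8*0.5) / 10, weighted by cleaned-token count
```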
sonatoki/constants.py
CHANGED
@@ -1,11 +1,16 @@
 # STL
 import json
-from typing import Set, Dict
+from typing import Set, Dict, Optional
 from pathlib import Path
 
 # LOCAL
+from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
+LATEST_DATE = "2023-09"
+# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
 # `\p{Punctuation}` character class
 # https://www.compart.com/en/unicode/category
 # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
     "iluminate",
     "imense",
     "imitate",
+    "inanimate",
     "injoke",
     "insane",
     "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
 
 
-def
-
+def linku_data() -> Dict[str, LinkuWord]:
+    # NOTE: this does open+read+parse two files each time you construct a filter
+    # but i expect users to construct filters only at the start of runtime
+    # there is no reason to waste your RAM by leaving the linku data in it
+    with open(LINKU) as f:
+        linku: Dict[str, LinkuWord] = json.loads(f.read())
+    with open(SANDBOX) as f:
+        sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+    return {**linku, **sandbox}
+
 
+def words_by_tag(tag: str, value: str) -> Set[str]:
+    data = linku_data()
+    return {d["word"] for d in data.values() if d[tag] == value}
 
-with open(LINKU) as f:
-    linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-
-
+def words_by_usage(
+    usage: int,
+    date: Optional[LinkuUsageDate] = None,
+) -> Set[str]:
+    if not date:
+        date = LATEST_DATE
+    data = linku_data()
 
-
-
-
-
+    result: Set[str] = set()
+    for word in data.values():
+        usages = word["usage"]
+        if date in usages and usages[date] >= usage:
+            result.add(word["word"])
+
+    return result
+
+
+NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-with open(SANDBOX) as f:
-    sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 # with open(SYLLABICS) as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
 # with open(ALPHABETICS) as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
-del linku
-del sandbox
-
 __all__ = [
     "ALLOWABLES",
     "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
     "CONSONANTS",
     "EMOJI_VARIATION_SELECTOR_RANGES",
     "EMOJI_VARIATION_SELECTOR_RANGES_STR",
-    "NIMI_KU_LILI",
-    "NIMI_KU_SULI",
-    "NIMI_LINKU_COMMON",
-    "NIMI_LINKU_CORE",
-    "NIMI_LINKU_OBSCURE",
-    "NIMI_LINKU_SANDBOX",
-    "NIMI_LINKU_UNCOMMON",
-    "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
     "POSIX_PUNCT_RANGES",
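words_by_tag and words_by_usage replace the old module-level NIMI_* sets by querying the bundled Linku data on demand. A brief sketch; the exact set sizes depend on the shipped linku.json and sandbox.json:

```python
from sonatoki.constants import words_by_tag, words_by_usage

pu_words = words_by_tag("book", "pu")                # what NIMI_PU used to contain
sandbox = words_by_tag("usage_category", "sandbox")  # what NIMI_LINKU_SANDBOX held
frequent = words_by_usage(90)                        # >=90% usage at LATEST_DATE
older = words_by_usage(90, "2021-10")                # or at an explicit survey date

print(len(pu_words), len(sandbox), len(frequent), len(older))
```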
sonatoki/ilo.py
CHANGED
@@ -1,17 +1,14 @@
 # STL
-from typing import List, Type
+from typing import List, Type
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
 from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-# tokenized, filtered, cleaned, score, result
-Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
-# TODO: scorecard kinda sucks as a name
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
+    __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number
 
     def __init__(
@@ -31,6 +29,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
@@ -43,6 +42,7 @@ class Ilo:
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
+        self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score
 
     def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@ class Ilo:
         return self.__word_tokenizer.tokenize(msg)
 
     def sent_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__sent_tokenizer.tokenize(msg)
 
     def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
+    def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return self.__sentence_scorer.score(scorecards)
+
     def _is_toki_pona(self, message: str) -> Scorecard:
         """Process a message into its tokens, then filters, cleans, and scores
-        them.
-
-
-        Returns all
-        - Tokenized message (list[str])
-        - Filtered message (list[str])
-        - Cleaned message (list[str])
-        - Score (float)
-        - Result (bool)
+        them. Message must already be preprocessed, normally done in
+        `self.is_toki_pona(message)`.
+
+        Returns a `Scorecard` with all changes to the input text and a score.
         """
         tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
-        result = score >= self.__passing_score
 
-
+        scorecard: Scorecard = {
+            "text": message,
+            "tokenized": tokenized,
+            "filtered": filtered,
+            "cleaned": cleaned,
+            "score": score,
+        }
+
+        return scorecard
 
     def is_toki_pona(self, message: str) -> bool:
-        """Determines whether a
+        """Determines whether a text is or is not Toki Pona."""
         message = self.preprocess(message)
-
-        return
+        scorecard = self._is_toki_pona(message)
+        return scorecard["score"] >= self.__passing_score
 
     def _are_toki_pona(self, message: str) -> List[Scorecard]:
-        """Split a message into sentences, then return a list each
-
+        """Split a message into sentences, then return a list with each
+        sentence's scorecard from `self._is_toki_pona()`.
 
         Message must already be preprocessed, normally done in
         `self.are_toki_pona(message)`.
         """
-
+        scorecards: List[Scorecard] = list()
         for sentence in self.sent_tokenize(message):
             result = self._is_toki_pona(sentence)
-
-
+            scorecards.append(result)
+        scorecards = self.score_sentences(scorecards)
+        return scorecards
 
     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
         ```
         """
         message = self.preprocess(message)
-
-        return [
+        scorecards = self._are_toki_pona(message)
+        return [card["score"] >= self.__passing_score for card in scorecards]
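With the sentence_scorer hook, per-sentence results can be smoothed before the pass/fail check in are_toki_pona. A sketch, assuming Ilo accepts a config unpacked as keyword arguments (as the IloConfig TypedDict suggests) and that PrefConfig carries no sentence_scorer of its own:

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig
from sonatoki.Scorers import SentWeightedAvg

# assumption: Ilo(**config) construction with an extra sentence_scorer kwarg
ilo = Ilo(**PrefConfig, sentence_scorer=SentWeightedAvg)

print(ilo.is_toki_pona("mi olin e sina!"))
# are_toki_pona gives one verdict per sentence; with SentWeightedAvg every
# sentence receives the message-wide weighted average before the threshold check.
print(ilo.are_toki_pona("sina pona. o awen e ni."))
```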
sonatoki/types.py
ADDED
@@ -0,0 +1,60 @@
+# STL
+from typing import Dict, List, Union, Literal, TypedDict
+
+Number = Union[int, float]
+
+
+# TODO: scorecard kinda sucks as a name
+class Scorecard(TypedDict):
+    text: str
+    tokenized: List[str]
+    filtered: List[str]
+    cleaned: List[str]
+    score: Number
+
+
+LinkuUsageDate = Union[
+    Literal["2020-04"],
+    Literal["2021-10"],
+    Literal["2022-08"],
+    Literal["2023-09"],
+    # Literal["2024-09"],
+]
+
+LinkuUsageCategory = Union[
+    Literal["core"],
+    Literal["common"],
+    Literal["uncommon"],
+    Literal["obscure"],
+    Literal["sandbox"],
+]
+
+LinkuBooks = Union[
+    Literal["pu"],
+    Literal["ku suli"],
+    Literal["ku lili"],
+    Literal["none"],
+]
+
+
+class LinkuWord(TypedDict):
+    id: str
+    author_verbatim: str
+    author_verbatim_source: str
+    book: str
+    coined_era: str
+    coined_year: str
+    creator: List[str]
+    ku_data: Dict[str, int]
+    see_also: List[str]
+    resources: Dict[str, str]
+    representations: Dict[str, Union[str, List[str]]]
+    source_language: str
+    usage_category: LinkuUsageCategory
+    word: str
+    deprecated: bool
+    etymology: List[Dict[str, str]]
+    audio: List[Dict[str, str]]
+    pu_verbatim: Dict[str, str]
+    usage: Dict[LinkuUsageDate, int]
+    translations: Dict[str, Dict[str, str]]
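Scorecard replaces the old `(tokenized, filtered, cleaned, score, result)` tuple from ilo.py; at runtime a TypedDict is an ordinary dict. A minimal example with made-up values:

```python
from sonatoki.types import Scorecard

card: Scorecard = {
    "text": "mi toki",
    "tokenized": ["mi", "toki"],
    "filtered": ["mi", "toki"],
    "cleaned": ["mi", "toki"],
    "score": 1.0,
}
print(card["score"] >= 0.8)
```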
{sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/RECORD
CHANGED
@@ -1,20 +1,21 @@
-sonatoki-0.
-sonatoki-0.
-sonatoki-0.
+sonatoki-0.7.0.dist-info/METADATA,sha256=s6w7_WaARQijvFIFIWtg8hL2WzAkj19N7-DsKgfhi3s,6517
+sonatoki-0.7.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.7.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=
-sonatoki/Filters.py,sha256=
-sonatoki/Preprocessors.py,sha256=
-sonatoki/Scorers.py,sha256=
+sonatoki/Configs.py,sha256=rIvrkYjeJeCuWwJIjvmJX6keRZcUJ0pt7h7KdYT5IFI,4766
+sonatoki/Filters.py,sha256=cJ5skX9yeqd4HvjzPxIAswigRWvO0ZV2nepQksFedtk,12575
+sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
+sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=
-sonatoki/ilo.py,sha256=
+sonatoki/constants.py,sha256=BxE_MME2XZUZLg9ZezPirUO2sxw4JkujsrKoENeYORc,19313
+sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
+sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.
+sonatoki-0.7.0.dist-info/RECORD,,
{sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/WHEEL
File without changes
{sonatoki-0.6.2.dist-info → sonatoki-0.7.0.dist-info}/licenses/LICENSE
File without changes