sonatoki 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +6 -0
- sonatoki/ilo.py +20 -2
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/METADATA +4 -3
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/RECORD +7 -7
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/entry_points.txt +0 -0
- {sonatoki-0.10.1.dist-info → sonatoki-0.11.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -81,6 +81,7 @@ class IloConfig(TypedDict):
|
|
81
81
|
scoring_filters: List[Type[Filter]]
|
82
82
|
scorer: Type[Scorer]
|
83
83
|
passing_score: Number
|
84
|
+
empty_passes: bool
|
84
85
|
word_tokenizer: NotRequired[Type[Tokenizer]]
|
85
86
|
sent_tokenizer: NotRequired[Type[Tokenizer]]
|
86
87
|
|
@@ -94,6 +95,7 @@ BaseConfig: IloConfig = {
|
|
94
95
|
"scoring_filters": [],
|
95
96
|
"scorer": PassFail,
|
96
97
|
"passing_score": 0.8,
|
98
|
+
"empty_passes": True,
|
97
99
|
}
|
98
100
|
|
99
101
|
|
@@ -110,6 +112,7 @@ PrefConfig: IloConfig = {
|
|
110
112
|
],
|
111
113
|
"scorer": SoftScaling,
|
112
114
|
"passing_score": 0.8,
|
115
|
+
"empty_passes": True,
|
113
116
|
}
|
114
117
|
|
115
118
|
CorpusConfig: IloConfig = {
|
@@ -132,6 +135,7 @@ CorpusConfig: IloConfig = {
|
|
132
135
|
],
|
133
136
|
"scorer": SoftScaling,
|
134
137
|
"passing_score": 0.8,
|
138
|
+
"empty_passes": True, # my client doesn't fail empty sentences; it just omits them
|
135
139
|
}
|
136
140
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
137
141
|
LazyConfig: IloConfig = {
|
@@ -142,6 +146,7 @@ LazyConfig: IloConfig = {
|
|
142
146
|
"scorer": SoftPassFail,
|
143
147
|
"passing_score": 0.8,
|
144
148
|
"word_tokenizer": WordTokenizerRe, # mimics old tokenizer
|
149
|
+
"empty_passes": True,
|
145
150
|
}
|
146
151
|
"""This is extremely silly."""
|
147
152
|
IsipinEpikuConfig: IloConfig = {
|
@@ -162,6 +167,7 @@ IsipinEpikuConfig: IloConfig = {
|
|
162
167
|
],
|
163
168
|
"scorer": SoftScaling,
|
164
169
|
"passing_score": 0.8,
|
170
|
+
"empty_passes": True,
|
165
171
|
}
|
166
172
|
|
167
173
|
|
sonatoki/ilo.py
CHANGED
@@ -20,6 +20,7 @@ class Ilo:
|
|
20
20
|
__scorer: Type[Scorer]
|
21
21
|
__sentence_scorer: Type[SentenceScorer]
|
22
22
|
__passing_score: Number
|
23
|
+
__empty_passes: bool
|
23
24
|
|
24
25
|
def __init__(
|
25
26
|
self,
|
@@ -29,6 +30,7 @@ class Ilo:
|
|
29
30
|
scoring_filters: List[Type[Filter]],
|
30
31
|
scorer: Type[Scorer],
|
31
32
|
passing_score: Number,
|
33
|
+
empty_passes: bool = True,
|
32
34
|
sentence_scorer: Type[SentenceScorer] = SentNoOp,
|
33
35
|
word_tokenizer: Type[Tokenizer] = WordTokenizer,
|
34
36
|
sent_tokenizer: Type[Tokenizer] = SentTokenizer,
|
@@ -44,6 +46,7 @@ class Ilo:
|
|
44
46
|
self.__scorer = scorer
|
45
47
|
self.__sentence_scorer = sentence_scorer
|
46
48
|
self.__passing_score = passing_score
|
49
|
+
self.__empty_passes = empty_passes
|
47
50
|
|
48
51
|
def preprocess(self, msg: str) -> str:
|
49
52
|
for p in self.__preprocessors:
|
@@ -108,6 +111,10 @@ class Ilo:
|
|
108
111
|
filtered = self.filter_tokens(tokenized)
|
109
112
|
cleaned = self.clean_tokens(filtered)
|
110
113
|
score = self.score_tokens(cleaned)
|
114
|
+
if not self.__empty_passes and not cleaned:
|
115
|
+
# NOTE: filtered will already be empty
|
116
|
+
# but clean_tokens can *technically* omit tokens too
|
117
|
+
score = 0
|
111
118
|
|
112
119
|
scorecard: Scorecard = {
|
113
120
|
"text": message,
|
@@ -119,10 +126,15 @@ class Ilo:
|
|
119
126
|
|
120
127
|
return scorecard
|
121
128
|
|
129
|
+
def make_scorecard(self, message: str) -> Scorecard:
|
130
|
+
"""Preprocess a message, then create and return a `Scorecard` for that
|
131
|
+
message."""
|
132
|
+
message = self.preprocess(message)
|
133
|
+
return self._is_toki_pona(message)
|
134
|
+
|
122
135
|
def is_toki_pona(self, message: str) -> bool:
|
123
136
|
"""Determines whether a text is or is not Toki Pona."""
|
124
|
-
|
125
|
-
scorecard = self._is_toki_pona(message)
|
137
|
+
scorecard = self.make_scorecard(message)
|
126
138
|
return scorecard["score"] >= self.__passing_score
|
127
139
|
|
128
140
|
def _are_toki_pona(self, message: str) -> List[Scorecard]:
|
@@ -139,6 +151,12 @@ class Ilo:
|
|
139
151
|
scorecards = self.score_sentences(scorecards)
|
140
152
|
return scorecards
|
141
153
|
|
154
|
+
def make_scorecards(self, message: str) -> List[Scorecard]:
|
155
|
+
"""Preprocess a message, then create and return a `Scorecard` for each
|
156
|
+
sentence in that message."""
|
157
|
+
message = self.preprocess(message)
|
158
|
+
return self._are_toki_pona(message)
|
159
|
+
|
142
160
|
def are_toki_pona(self, message: str) -> List[bool]:
|
143
161
|
"""Splits a statement into sentences, then determines if each is or is not Toki Pona.
|
144
162
|
NOTE: You will need to decide how to score the result. Examples:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sonatoki
|
3
|
-
Version: 0.10.1
|
3
|
+
Version: 0.11.0
|
4
4
|
Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
|
5
5
|
Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
|
6
6
|
License: AGPL-3.0-or-later
|
@@ -99,9 +99,10 @@ I originally intended to translate this file and library into Toki Pona once Uni
|
|
99
99
|
The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` attempts to tokenize statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
|
100
100
|
This design is highly undesirable for NLTK's English tokenizer because words in languages other than Toki Pona can have punctuation characters in or around them which are part of the word.
|
101
101
|
Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
|
102
|
-
However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of
|
102
|
+
However, this tokenizer doesn't ignore intra-word punctuation entirely. Instead, exactly one of `-`, `'`, or `.` is allowed at a time, so long as both of its neighbors are writing characters. This increases the accuracy of the tokenizer significantly, which increases the scoring accuracy of the library, ultimately making it easier to correctly identify Toki Pona sentences among arbitrary ones.
|
103
|
+
The reason for this is that, while Toki Pona itself does not have intra-word punctuation, Toki Pona may be written in mixed language environments; a more accurate tokenizer is less likely to produce sentence fragments, and accurately captured sentences are more likely to be identified as the actual language they are written in.
|
103
104
|
|
104
|
-
The
|
105
|
+
Another note: The intent of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
|
105
106
|
|
106
107
|
### Aren't there a lot of false positives?
|
107
108
|
|
@@ -1,9 +1,9 @@
|
|
1
|
-
sonatoki-0.
|
2
|
-
sonatoki-0.
|
3
|
-
sonatoki-0.
|
4
|
-
sonatoki-0.
|
1
|
+
sonatoki-0.11.0.dist-info/METADATA,sha256=LXxje9dMJHy-2LGPS0nSRre5jWq4qvGU2bLEpJsUbpo,7303
|
2
|
+
sonatoki-0.11.0.dist-info/WHEEL,sha256=thaaA2w1JzcGC48WYufAs8nrYZjJm8LqNfnXFOFyCC4,90
|
3
|
+
sonatoki-0.11.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
4
|
+
sonatoki-0.11.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
5
5
|
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
6
|
-
sonatoki/Configs.py,sha256=
|
6
|
+
sonatoki/Configs.py,sha256=cQizs-wqgtM9T9F4kkUsDHpIXN91p4FAgSMefDl333s,5114
|
7
7
|
sonatoki/Filters.py,sha256=8HAtR6_Rk6GPboaS_MHwSjZBJxYnAA8kYbRPI0eR6sM,14823
|
8
8
|
sonatoki/Preprocessors.py,sha256=RmzkvPVo6Kdx1rZ5HeR9cTtx6oxpp2iLKrOMCUEqIrM,7107
|
9
9
|
sonatoki/Scorers.py,sha256=zkdWc0hbtCX1HPdhI2tu2mL4Z5_S5sv7T83MefE4Yik,7756
|
@@ -12,11 +12,11 @@ sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
sonatoki/__main__.py,sha256=394ldEB4tFpw1UJLV4S4jJ55NfyLgH8rE7o3VWJoGik,6650
|
13
13
|
sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
|
14
14
|
sonatoki/constants.py,sha256=KhE385XkF-64bGBxkQNisu7vODsjpOfmrL8bnyQP_1k,20572
|
15
|
-
sonatoki/ilo.py,sha256=
|
15
|
+
sonatoki/ilo.py,sha256=MWoONZaYh8h92ZrMlG-MkNktFyqHX8Jb5zOD57800KI,6755
|
16
16
|
sonatoki/linku.json,sha256=U5KVxFJSageQydXXDsQCT8X_QoNAK2OaZhJmbu0eoZo,299939
|
17
17
|
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
sonatoki/sandbox.json,sha256=QAviQZ7_nwstUr1ejKegxiIoYmBL2YJIoiZovDYNFL8,147485
|
19
19
|
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
20
20
|
sonatoki/types.py,sha256=VjYSGAzsbR_d3mg8n-VHg__7LyXpmGdEIMDsbPHyxFw,1265
|
21
21
|
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
22
|
-
sonatoki-0.
|
22
|
+
sonatoki-0.11.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|