sonatoki 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +11 -36
- sonatoki/Filters.py +35 -34
- sonatoki/Preprocessors.py +9 -0
- sonatoki/Scorers.py +61 -6
- sonatoki/constants.py +38 -27
- sonatoki/ilo.py +34 -27
- sonatoki/types.py +60 -0
- {sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/METADATA +1 -1
- {sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/RECORD +11 -10
- {sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
```diff
@@ -1,11 +1,11 @@
 # STL
-from copy import deepcopy
 from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
 
 # LOCAL
+from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
@@ -20,23 +20,22 @@ from sonatoki.Filters import (
     Punctuation,
     LongSyllabic,
     Miscellaneous,
-    NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    NimiLinkuCommon,
     FalsePosSyllabic,
+    NimiLinkuByUsage,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
-    Backticks,
+    Codeblock,
     Reference,
     Preprocessor,
     AngleBracketObject,
@@ -95,11 +94,11 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        Or(NimiLinkuByUsage(30), NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
@@ -110,16 +109,13 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         Or(
-            NimiLinkuCore,
-            NimiLinkuCommon,
-            NimiLinkuUncommon,
-            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
-            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
+            # awkward but efficient syntax
+            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -132,7 +128,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -142,7 +138,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -162,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
 }
 
 
-DiscordConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
-    "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
-    ],
-    "scorer": SoftScaling,
-    "passing_score": 0.8,
-}
-
-TelegramConfig: IloConfig = deepcopy(PrefConfig)
-ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
-    "DiscordConfig",
-    "ForumConfig",
     "IloConfig",
     "LazyConfig",
     "PrefConfig",
-    "TelegramConfig",
 ]
```
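The removed configs lose nothing: `TelegramConfig` and `ForumConfig` were plain deepcopies of `PrefConfig`, and `DiscordConfig`'s filter set is covered by the new usage-based filter. A minimal sketch of constructing an `Ilo` from the surviving config (the sample sentence is illustrative):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

# PrefConfig now carries NimiLinkuByUsage(30) where the removed configs
# combined the separate NimiLinkuCore/NimiLinkuCommon filters.
ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi olin e sina"))  # expected True for plain Toki Pona
```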
sonatoki/Filters.py
CHANGED
```diff
@@ -2,37 +2,32 @@
 import re
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import Set, List, Type, Optional
+from typing import Set, List, Type, Union, Literal, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
 from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
-    NIMI_PU,
     ALPHABET,
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
     NIMI_UCSUR,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
     NOT_IN_PUNCT_CLASS,
-    NIMI_LINKU_UNCOMMON,
     ALL_PUNCT_RANGES_STR,
     FALSE_POS_ALPHABETIC,
     UCSUR_PUNCT_RANGES_STR,
     EMOJI_VARIATION_SELECTOR_RANGES_STR,
+    words_by_tag,
+    words_by_usage,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -170,40 +165,46 @@ class LongProperName(MinLen, ProperName):
     length = 2  # reject "names" of length 1
 
 
-class NimiPu(MemberFilter):
-    tokens = prep_dictionary(NIMI_PU)
-
-
-class NimiPuSynonyms(MemberFilter):
-    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
-
-
-class NimiKuSuli(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_SULI)
-
-
-class NimiKuLili(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_LILI)
+class NimiLinkuByUsage:
+    def __new__(
+        cls,
+        usage: int,
+        date: Optional[LinkuUsageDate] = None,
+    ) -> Type[MemberFilter]:
+        words = words_by_usage(usage, date)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-class NimiLinkuCore(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_CORE)
+        return AnonLinkuMemberFilter
 
 
-class NimiLinkuCommon(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_COMMON)
+class NimiLinkuByTag:
+    def __new__(
+        cls,
+        tag: Union[Literal["usage_category"], Literal["book"]],
+        category: Union[LinkuUsageCategory, LinkuBooks],
+    ) -> Type[MemberFilter]:
+        words = words_by_tag(tag, category)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-class NimiLinkuUncommon(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+        return AnonLinkuMemberFilter
 
 
-class NimiLinkuObscure(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
+NimiPu = NimiLinkuByTag("book", "pu")
+NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+NimiKuLili = NimiLinkuByTag("book", "ku lili")
+NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")
 
 
-class NimiLinkuSandbox(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
 
 
 class NimiUCSUR(MemberFilter):
```
sonatoki/Preprocessors.py
CHANGED
```diff
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
+class Codeblock(RegexPreprocessor):
+    """Remove codeblocks marked by a set of three backticks on their own lines.
+
+    Subset of what would be removed by Backticks, but may be preferable.
+    """
+
+    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
```
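The distinction from `Backticks` matters for messages that mix inline code with prose. A sketch, assuming the `process` classmethod that `RegexPreprocessor` subclasses expose:

```python
from sonatoki.Preprocessors import Backticks, Codeblock

msg = "toki!\n```\nprint('hello')\n```\nmi pona"
print(Codeblock.process(msg))               # strips only the fenced block
print(Backticks.process("mi `toki` pona"))  # the broader rule strips inline code too
```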
sonatoki/Scorers.py
CHANGED
```diff
@@ -1,17 +1,15 @@
 # STL
 import math
 from abc import ABC, abstractmethod
-from typing import Dict, List, Type, Union
+from typing import List, Type
 
 # PDM
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
 
-Number = Union[int, float]
-Weights = Dict[str, Number]
-
 
 class Scorer(ABC):
     @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
     scoring."""
 
 
-…
+class SentenceScorer(ABC):
+    @classmethod
+    @abstractmethod
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        """Re-score a list of sentences (scorecards, sentences with all their
+        metadata) and return them."""
+        raise NotImplementedError
+
+
+class SentNoOp(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return scorecards
 
 
-…
+class SentAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        total = sum(card["score"] for card in scorecards)
+        avg = total / len(scorecards)
+        for card in scorecards:
+            card["score"] = avg
+        return scorecards
+
+
+class SentWeightedAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        weighted_total = 0
+        total_len = 0
+        for card in scorecards:
+            cardlen = len(card["cleaned"])
+            cardscore = card["score"]
+
+            weighted_total += cardlen * cardscore
+            total_len += cardlen
+
+        weighted_avg = weighted_total / total_len
+        for card in scorecards:
+            card["score"] = weighted_avg
+        return scorecards
+
+
+__all__ = [
+    "PassFail",
+    "Scaling",
+    "SoftPassFail",
+    "SoftScaling",
+    "Soften",
+    "SentAvg",
+    "SentWeightedAvg",
+]
```
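`SentAvg` assigns every scorecard the plain mean of the batch, while `SentWeightedAvg` weights each sentence by its number of cleaned tokens, so a long sentence moves the shared score more than a short one. A sketch with hand-built scorecards:

```python
from typing import List

from sonatoki.types import Scorecard
from sonatoki.Scorers import SentAvg

cards: List[Scorecard] = [
    {"text": "mi pona", "tokenized": [], "filtered": [], "cleaned": ["mi", "pona"], "score": 1.0},
    {"text": "hi", "tokenized": [], "filtered": [], "cleaned": ["hi"], "score": 0.0},
]
print([c["score"] for c in SentAvg.score(cards)])  # [0.5, 0.5], the plain mean
# SentWeightedAvg on the original scores would give (2*1.0 + 1*0.0) / 3 ≈ 0.67
```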
sonatoki/constants.py
CHANGED
```diff
@@ -1,11 +1,16 @@
 # STL
 import json
-from typing import Set, Dict
+from typing import Set, Dict, Optional
 from pathlib import Path
 
 # LOCAL
+from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
+LATEST_DATE = "2023-09"
+# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
 # `\p{Punctuation}` character class
 # https://www.compart.com/en/unicode/category
 # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
     "iluminate",
     "imense",
     "imitate",
+    "inanimate",
     "injoke",
     "insane",
     "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
 
 
-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
-    return {d["word"] for d in data.values() if d[key] == value}
+def linku_data() -> Dict[str, LinkuWord]:
+    # NOTE: this does open+read+parse two files each time you construct a filter
+    # but i expect users to construct filters only at the start of runtime
+    # there is no reason to waste your RAM by leaving the linku data in it
+    with open(LINKU) as f:
+        linku: Dict[str, LinkuWord] = json.loads(f.read())
+    with open(SANDBOX) as f:
+        sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+    return {**linku, **sandbox}
+
 
+def words_by_tag(tag: str, value: str) -> Set[str]:
+    data = linku_data()
+    return {d["word"] for d in data.values() if d[tag] == value}
 
-with open(LINKU) as f:
-    linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-    NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
-    NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+def words_by_usage(
+    usage: int,
+    date: Optional[LinkuUsageDate] = None,
+) -> Set[str]:
+    if not date:
+        date = LATEST_DATE
+    data = linku_data()
 
-    NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
-    NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
-    NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
-    NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+    result: Set[str] = set()
+    for word in data.values():
+        usages = word["usage"]
+        if date in usages and usages[date] >= usage:
+            result.add(word["word"])
+
+    return result
+
+
+NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-with open(SANDBOX) as f:
-    sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 # with open(SYLLABICS) as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
 # with open(ALPHABETICS) as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
-del linku
-del sandbox
-
 __all__ = [
     "ALLOWABLES",
     "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
     "CONSONANTS",
     "EMOJI_VARIATION_SELECTOR_RANGES",
     "EMOJI_VARIATION_SELECTOR_RANGES_STR",
-    "NIMI_KU_LILI",
-    "NIMI_KU_SULI",
-    "NIMI_LINKU_COMMON",
-    "NIMI_LINKU_CORE",
-    "NIMI_LINKU_OBSCURE",
-    "NIMI_LINKU_SANDBOX",
-    "NIMI_LINKU_UNCOMMON",
-    "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
     "POSIX_PUNCT_RANGES",
```
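With the module-level word sets gone, callers derive them on demand; per the NOTE in `linku_data`, each call re-reads the bundled JSON, so results are worth caching if queried repeatedly. A sketch of both helpers (the cutoffs are illustrative):

```python
from sonatoki.constants import words_by_tag, words_by_usage

pu_words = words_by_tag("book", "pu")       # replaces the removed NIMI_PU set
frequent = words_by_usage(80)               # usage >= 80 at LATEST_DATE ("2023-09")
older = words_by_usage(80, date="2022-08")  # the same cutoff at an earlier survey

print(len(pu_words), len(frequent & older))
```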
sonatoki/ilo.py
CHANGED
````diff
@@ -1,17 +1,14 @@
 # STL
-from typing import List, Type, Tuple
+from typing import List, Type
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
-from sonatoki.Scorers import Number, Scorer
+from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
 from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-# tokenized, filtered, cleaned, score, result
-Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
-# TODO: scorecard kinda sucks as a name
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
+    __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number
 
     def __init__(
@@ -31,6 +29,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
@@ -43,6 +42,7 @@ class Ilo:
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
+        self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score
 
     def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@ class Ilo:
         return self.__word_tokenizer.tokenize(msg)
 
     def sent_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__sent_tokenizer.tokenize(msg)
 
     def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
+    def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return self.__sentence_scorer.score(scorecards)
+
     def _is_toki_pona(self, message: str) -> Scorecard:
         """Process a message into its tokens, then filters, cleans, and scores
-        them.
-
-
-        Returns all
-        - Tokenized message (list[str])
-        - Filtered message (list[str])
-        - Cleaned message (list[str])
-        - Score (float)
-        - Result (bool)
+        them. Message must already be preprocessed, normally done in
+        `self.is_toki_pona(message)`.
+
+        Returns a `Scorecard` with all changes to the input text and a score.
         """
         tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
-        result = score >= self.__passing_score
 
-        return tokenized, filtered, cleaned, score, result
+        scorecard: Scorecard = {
+            "text": message,
+            "tokenized": tokenized,
+            "filtered": filtered,
+            "cleaned": cleaned,
+            "score": score,
+        }
+
+        return scorecard
 
     def is_toki_pona(self, message: str) -> bool:
-        """Determines whether a single statement is or is not Toki Pona."""
+        """Determines whether a text is or is not Toki Pona."""
         message = self.preprocess(message)
-        *_, result = self._is_toki_pona(message)
-        return result
+        scorecard = self._is_toki_pona(message)
+        return scorecard["score"] >= self.__passing_score
 
     def _are_toki_pona(self, message: str) -> List[Scorecard]:
-        """Split a message into sentences, then return a list each
-        sentence's result from `self._is_toki_pona()`.
+        """Split a message into sentences, then return a list with each
+        sentence's scorecard from `self._is_toki_pona()`.
 
         Message must already be preprocessed, normally done in
         `self.are_toki_pona(message)`.
         """
-        results: List[Scorecard] = list()
+        scorecards: List[Scorecard] = list()
         for sentence in self.sent_tokenize(message):
             result = self._is_toki_pona(sentence)
-            results.append(result)
-        return results
+            scorecards.append(result)
+        scorecards = self.score_sentences(scorecards)
+        return scorecards
 
     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
         ```
         """
         message = self.preprocess(message)
-
-        return [result[-1] for result in self._are_toki_pona(message)]
+        scorecards = self._are_toki_pona(message)
+        return [card["score"] >= self.__passing_score for card in scorecards]
````
sonatoki/types.py
ADDED
```diff
@@ -0,0 +1,60 @@
+# STL
+from typing import Dict, List, Union, Literal, TypedDict
+
+Number = Union[int, float]
+
+
+# TODO: scorecard kinda sucks as a name
+class Scorecard(TypedDict):
+    text: str
+    tokenized: List[str]
+    filtered: List[str]
+    cleaned: List[str]
+    score: Number
+
+
+LinkuUsageDate = Union[
+    Literal["2020-04"],
+    Literal["2021-10"],
+    Literal["2022-08"],
+    Literal["2023-09"],
+    # Literal["2024-09"],
+]
+
+LinkuUsageCategory = Union[
+    Literal["core"],
+    Literal["common"],
+    Literal["uncommon"],
+    Literal["obscure"],
+    Literal["sandbox"],
+]
+
+LinkuBooks = Union[
+    Literal["pu"],
+    Literal["ku suli"],
+    Literal["ku lili"],
+    Literal["none"],
+]
+
+
+class LinkuWord(TypedDict):
+    id: str
+    author_verbatim: str
+    author_verbatim_source: str
+    book: str
+    coined_era: str
+    coined_year: str
+    creator: List[str]
+    ku_data: Dict[str, int]
+    see_also: List[str]
+    resources: Dict[str, str]
+    representations: Dict[str, Union[str, List[str]]]
+    source_language: str
+    usage_category: LinkuUsageCategory
+    word: str
+    deprecated: bool
+    etymology: List[Dict[str, str]]
+    audio: List[Dict[str, str]]
+    pu_verbatim: Dict[str, str]
+    usage: Dict[LinkuUsageDate, int]
+    translations: Dict[str, Dict[str, str]]
```
{sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/RECORD
CHANGED
```diff
@@ -1,20 +1,21 @@
-sonatoki-0.6.3.dist-info/METADATA,sha256=…
-sonatoki-0.6.3.dist-info/WHEEL,sha256=…
-sonatoki-0.6.3.dist-info/licenses/LICENSE,sha256=…
+sonatoki-0.7.0.dist-info/METADATA,sha256=s6w7_WaARQijvFIFIWtg8hL2WzAkj19N7-DsKgfhi3s,6517
+sonatoki-0.7.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.7.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=…
-sonatoki/Filters.py,sha256=…
-sonatoki/Preprocessors.py,sha256=…
-sonatoki/Scorers.py,sha256=…
+sonatoki/Configs.py,sha256=rIvrkYjeJeCuWwJIjvmJX6keRZcUJ0pt7h7KdYT5IFI,4766
+sonatoki/Filters.py,sha256=cJ5skX9yeqd4HvjzPxIAswigRWvO0ZV2nepQksFedtk,12575
+sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
+sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=…
-sonatoki/ilo.py,sha256=…
+sonatoki/constants.py,sha256=BxE_MME2XZUZLg9ZezPirUO2sxw4JkujsrKoENeYORc,19313
+sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
+sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.6.3.dist-info/RECORD,,
+sonatoki-0.7.0.dist-info/RECORD,,
```

{sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/WHEEL
File without changes

{sonatoki-0.6.3.dist-info → sonatoki-0.7.0.dist-info}/licenses/LICENSE
File without changes