sonatoki 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- sonatoki/Configs.py +3 -3
- sonatoki/Filters.py +20 -7
- sonatoki/Preprocessors.py +48 -6
- sonatoki/Scorers.py +2 -14
- sonatoki/Tokenizers.py +22 -7
- sonatoki/constants.py +16 -0
- sonatoki/ilo.py +0 -12
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.5.dist-info}/METADATA +1 -1
- sonatoki-0.1.5.dist-info/RECORD +16 -0
- sonatoki-0.1.3.dist-info/RECORD +0 -16
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.5.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.5.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     NimiPuAle,
     Alphabetic,
     ProperName,
     Phonotactic,
+    Punctuation,
     NimiLinkuAle,
-    Punctuations,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numerics, Punctuations],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
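The net effect is that `BaseConfig` now ignores numeric and punctuation tokens under the filters' new singular names. As a rough sketch of what an ignoring filter does (stand-in callables, not sonatoki's classes; the pipeline presumably drops ignored tokens before the scoring filters run):

```python
from typing import Callable, List

TokenFilter = Callable[[str], bool]

def drop_ignored(tokens: List[str], ignoring: List[TokenFilter]) -> List[str]:
    # tokens matched by any ignoring filter never reach the scoring filters
    return [t for t in tokens if not any(f(t) for f in ignoring)]

numeric: TokenFilter = str.isnumeric
punctuation: TokenFilter = lambda t: not any(c.isalnum() or c.isspace() for c in t)

print(drop_ignored(["toki", "42", "!", "pona"], [numeric, punctuation]))
# ['toki', 'pona']
```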
sonatoki/Filters.py
CHANGED
@@ -1,10 +1,11 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import Set
 from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
-import regex
+import regex
 from typing_extensions import override

 # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
     CONSONANTS,
     NIMI_PU_SET,
     ALPHABET_SET,
+    UNICODE_PUNCT,
     ALLOWABLES_SET,
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    PRUNED_POSIX_PUNCT,
     NIMI_LINKU_SANDBOX_SET,
 )

-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
         return not not re.fullmatch(cls.pattern, token)


+class Regex1Filter(Filter):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not not regex.fullmatch(cls.pattern, token)
+
+
 class SetFilter(Filter):
     tokens: Set[str]

@@ -131,7 +144,7 @@ class Alphabetic(Filter):
         return set(token.lower()).issubset(ALPHABET_SET)


-class Numerics(Filter):
+class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.

@@ -147,8 +160,8 @@ class Numerics(Filter):
         return msg.isnumeric()


-class Punctuations(RegexFilter):
-    pattern = re.compile(
+class Punctuation(RegexFilter):
+    pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")


 __all__ = [
@@ -159,6 +172,6 @@ __all__ = [
     "Syllabic",
     "Alphabetic",
     "ProperName",
-    "Numerics",
-    "Punctuations",
+    "Punctuation",
+    "Numeric",
 ]
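The new `Regex1Filter` is a sibling of `RegexFilter` that evaluates its pattern with the third-party `regex` module under `VERSION1` semantics (set module-wide above). A short standalone illustration of what that buys; the pattern here is illustrative, not one shipped by the package:

```python
import regex

regex.DEFAULT_VERSION = regex.VERSION1  # mirrors the module-level assignment above

# Unicode property classes like \p{Punctuation} are a `regex` feature that the
# stdlib `re` module lacks; constants.py instead bakes the equivalent
# characters into a literal class so RegexFilter can stay on stdlib `re`.
punct = regex.compile(r"\p{Punctuation}+")
print(bool(punct.fullmatch("⸘?!")))   # True: every character is punctuation
print(bool(punct.fullmatch("toki")))  # False
```

The new `Punctuation` filter takes the stdlib route, concatenating `PRUNED_POSIX_PUNCT` and `UNICODE_PUNCT` into a single character class.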
sonatoki/Preprocessors.py
CHANGED
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
 """

 # STL
+import re
 from abc import ABC, abstractmethod

 # PDM
-import regex
+import regex
 from typing_extensions import override

-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
         return re.sub(cls.pattern, cls.replace, msg)


+class Regex1Preprocessor(Preprocessor):
+    pattern: "regex.Pattern[str]"
+    replace: str = " "
+
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return regex.sub(cls.pattern, cls.replace, msg)
+
+
 """
 The following classes are Ignorables.

@@ -62,6 +73,13 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")


+class Reference(RegexPreprocessor):
+    """Remove text contained in double brackets.
+    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    pattern = re.compile(r"\[\[.+\]\]")
+
+
 class DiscordEmotes(RegexPreprocessor):
     """Remove text-formatted Discord emotes `<flags:name:id>`"""

@@ -80,6 +98,13 @@ class DiscordSpecial(RegexPreprocessor):
     pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")


+class AngleBracketObject(RegexPreprocessor):
+    """A generalized version of the Discord-specific angle bracket objects.
+    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    pattern = re.compile(r"<[^<>\s]+>")
+
+
 """
 The following classes are Containers.

@@ -92,23 +117,23 @@ would likely be using a language other than Toki Pona.


 class SingleQuotes(RegexPreprocessor):
-    pattern = re.compile(r"'[^']+'", flags=re.S)
+    pattern = re.compile(r"'[^']+'", flags=re.DOTALL)


 class DoubleQuotes(RegexPreprocessor):
-    pattern = re.compile(r'"[^"]+"', flags=re.S)
+    pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)


 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""

-    pattern = re.compile(r"`[^`]+`", flags=re.S)
+    pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""

-    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
+    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)


 class ArrowQuote(RegexPreprocessor):
@@ -117,7 +142,22 @@ class ArrowQuote(RegexPreprocessor):
     pattern = re.compile(r"^>\ .+$", re.MULTILINE)


+class AllQuotes(RegexPreprocessor):
+    pattern = re.compile(
+        "|".join(
+            [
+                SingleQuotes.pattern.pattern,
+                DoubleQuotes.pattern.pattern,
+                Backticks.pattern.pattern,
+                ArrowQuote.pattern.pattern,
+            ]
+        ),
+        flags=re.MULTILINE | re.DOTALL,
+    )
+
+
 __all__ = [
+    "AngleBracketObject",
     "DiscordChannels",
     "DiscordMentions",
     "DiscordSpecial",
@@ -125,7 +165,9 @@ __all__ = [
     "SingleQuotes",
     "DoubleQuotes",
     "ArrowQuote",
+    "AllQuotes",
     "Backticks",
+    "Reference",
     "Spoilers",
     "URLs",
 ]
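Since the new preprocessors are ordinary regex substitutions, their effect is easy to reproduce standalone. The patterns below are copied from the diff; the `" ".join(...)` normalization at the end is only for display:

```python
import re

ANGLE_BRACKET_OBJECT = re.compile(r"<[^<>\s]+>")  # AngleBracketObject
REFERENCE = re.compile(r"\[\[.+\]\]")             # Reference

msg = "toki! <a:wave:12345> sina lukin ala lukin e [[Toki Pona]]?"
msg = ANGLE_BRACKET_OBJECT.sub(" ", msg)  # " " is the default replacement,
msg = REFERENCE.sub(" ", msg)             # per Regex1Preprocessor above
print(" ".join(msg.split()))
# toki! sina lukin ala lukin e ?
```

Note that `AllQuotes` simply ORs together the already-compiled quote patterns via `pattern.pattern`, so the four container classes remain the single source of truth.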
sonatoki/Scorers.py
CHANGED
@@ -10,8 +10,6 @@ from typing_extensions import override
 # LOCAL
 from sonatoki.Filters import Filter

-LOG = logging.getLogger(__name__)
-
 Number = Union[int, float]
 Weights = Dict[str, Number]

@@ -37,12 +35,7 @@ class PassFail(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
-                score = 1
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return 1
         return 0

     @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
         for i, f in enumerate(filters):
             if f.filter(token):
-                score = scale - i
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return scale - i
         return 0

     @classmethod
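With the `LOG.debug` scaffolding removed, both `score_token` implementations reduce to a first-match rule: `PassFail` awards 1 for any match, while `Scaling` weights earlier filters more heavily. A self-contained sketch of that logic (plain callables stand in for the `Filter` classes):

```python
from typing import Callable, List

def pass_fail(token: str, filters: List[Callable[[str], bool]]) -> int:
    # 1 if any filter accepts the token, else 0
    return next((1 for f in filters if f(token)), 0)

def scaling(token: str, filters: List[Callable[[str], bool]], scale: int) -> int:
    # earlier filters are worth more: first -> scale, second -> scale - 1, ...
    return next((scale - i for i, f in enumerate(filters) if f(token)), 0)

filters = [str.isalpha, str.isnumeric]
print(pass_fail("toki", filters))   # 1
print(scaling("toki", filters, 2))  # 2 (matched the first filter)
print(scaling("42", filters, 2))    # 1 (matched the second filter)
print(scaling("?!", filters, 2))    # 0 (matched none)
```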
sonatoki/Tokenizers.py
CHANGED
@@ -1,11 +1,15 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import List

 # PDM
-import regex
+import regex
 from typing_extensions import override

+# LOCAL
+from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
 try:
     # PDM
     import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
     nltk = e


-
+regex.DEFAULT_VERSION = regex.VERSION1


 class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]


+class Regex1Tokenizer(Tokenizer):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [
+            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+        ]
+
+
 class WordTokenizerTok(RegexTokenizer):
-    pattern = re.compile(
-    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-    # TODO: do the typography characters matter?
-    # NOTE: | / and , are *not* sentence delimiters for my purpose
+    pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")


 class SentTokenizerTok(RegexTokenizer):
-    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose


 class WordTokenizerRe(RegexTokenizer):
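`WordTokenizerTok` now splits on runs of punctuation or runs of whitespace, and because the alternation is wrapped in a capturing group, `re.split` returns the punctuation runs as tokens in their own right, which the new `Punctuation` ignoring filter can then discard. A simplified recreation with a small ASCII-only character class standing in for the full `PRUNED_POSIX_PUNCT`/`UNICODE_PUNCT` constants:

```python
import re

# stand-in for [{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}] from the real pattern
pattern = re.compile(r"""([!"#%&'()*,\-./:;?@\[\\\]_{}]+|\s+)""")

s = "toki, pona! sina seme?"
tokens = [clean for word in re.split(pattern, s) if (clean := word.strip())]
print(tokens)  # ['toki', ',', 'pona', '!', 'sina', 'seme', '?']
```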
sonatoki/constants.py
CHANGED
@@ -11,6 +11,22 @@ CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
 ALPHABET_SET = set(ALPHABET)

+LANGUAGE = "english"  # for NLTK
+
+# `\p{posix_punct}` character class
+POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+PRUNED_POSIX_PUNCT = r"""$+<=>^`|~"""  # only those that are not in UNICODE_PUNCT
+
+# `\p{Punctuation}` character class
+UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫！＂＃％＆＇（）＊，－．／：；？＠［＼］＿｛｝⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
+# NOTE: This list diverges slightly from the raw list, since []\ must be escaped
+# The [] need to be escaped to avoid prematurely closing the regex character class
+# The \ needs to be escaped to be considered as a raw \
+
+# https://www.compart.com/en/unicode/category
+# https://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+
 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
     "cw",  # Content Warning
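The comment on `PRUNED_POSIX_PUNCT` can be sanity-checked: it should contain exactly the POSIX punctuation characters that `UNICODE_PUNCT` does not already cover, so concatenating the two inside one character class never duplicates a character. A quick check against the ASCII portion of `UNICODE_PUNCT` (escapes resolved to plain characters):

```python
# plain-character versions of the two constants' ASCII content
posix = set('-!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')
unicode_punct_ascii = set('!"#%&\'()*,-./:;?@[\\]_{}')

print("".join(sorted(posix - unicode_punct_ascii)))
# $+<=>^`|~  -- identical to PRUNED_POSIX_PUNCT
```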
sonatoki/ilo.py
CHANGED
@@ -1,5 +1,4 @@
 # STL
-import logging
 from typing import List, Type, Tuple

 # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import Preprocessor

-LOG = logging.getLogger(__name__)
-


 class Ilo:
@@ -20,7 +17,6 @@ class Ilo:
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
     __passing_score: Number
-    logging_threshold: Number = -1

     def __init__(
         self,
@@ -104,14 +100,6 @@ class Ilo:
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        if score <= self.logging_threshold:
-            LOG.debug("msg: %.2f %s", score, repr(message))
-            LOG.debug("preproc: %s", repr(preprocessed))
-            LOG.debug("tokenized: %s", tokenized)
-            LOG.debug("filtered: %s", filtered)
-            LOG.debug("cleaned: %s", cleaned)
-            # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
         return preprocessed, tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
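Callers who depended on `logging_threshold` can reproduce the deleted output themselves, since the scoring method still returns every intermediate stage. A sketch on the caller's side (the function name and wiring here are hypothetical; only the tuple shape comes from the diff):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger("sonatoki-debug")

def log_stages(message, stages):
    # `stages` is the 6-tuple the scoring method returns:
    # (preprocessed, tokenized, filtered, cleaned, score, result)
    preprocessed, tokenized, filtered, cleaned, score, result = stages
    LOG.debug("msg: %.2f %s", score, repr(message))
    LOG.debug("preproc: %s", repr(preprocessed))
    LOG.debug("tokenized: %s", tokenized)
    LOG.debug("filtered: %s", filtered)
    LOG.debug("cleaned: %s", cleaned)
    return result
```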
sonatoki-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+sonatoki-0.1.5.dist-info/METADATA,sha256=wJBa9CKSni9dcfQGQZp_-FSfANJHmTfZdYSCMH0Wolg,5225
+sonatoki-0.1.5.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.1.5.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
+sonatoki/Configs.py,sha256=iY6Lyn1rMi7iF0M62yx0ET4pEb35-QAd1FS0tkyUfSc,1935
+sonatoki/Filters.py,sha256=xanTOKxasW_2OpQXsnk5pzbM1FmG8y46pakuTfMz9Hw,4470
+sonatoki/Preprocessors.py,sha256=FqBcHirsXV_91mj99ju9AnbsHaCFctSnuz_vca9ckSY,4441
+sonatoki/Scorers.py,sha256=w5p4qPzpEhR-xHaOXzqulN01OKtLcSPsTrgKyMhfAaQ,3658
+sonatoki/Tokenizers.py,sha256=Wqyf36d1OST7sVrANlLixDIOYWSctD5rVg1_MlaPrCw,2310
+sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
+sonatoki/constants.py,sha256=LZVI0322kTsqwrGQIJfgSl6IIZS-ste1rd9isRzcNqM,4883
+sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
+sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
+sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
+sonatoki-0.1.5.dist-info/RECORD,,
sonatoki-0.1.3.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-sonatoki-0.1.3.dist-info/METADATA,sha256=ivcjgCnmdW1Typsn01RgqHX4PePGx6r4U_Ms5h5ksYo,5225
-sonatoki-0.1.3.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
-sonatoki-0.1.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
-sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
-sonatoki/Configs.py,sha256=yzTbjEyWS7sysaQ_9mIQMmO-acpEgQBzv4foP_J5x1o,1939
-sonatoki/Filters.py,sha256=RgbOXLat30WdsJW4y1DlMNttmGfVtLM6T7cD_qK-ASo,4194
-sonatoki/Preprocessors.py,sha256=uJ8-Y51gcgu5Wrri9BiP1F1YT-cYiLeWhrquFbYi9AI,3347
-sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
-sonatoki/Tokenizers.py,sha256=fvqxpubs2F63va2RzZKZQhZbFnVaC_9haXIA9Mqznis,1942
-sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
-sonatoki/constants.py,sha256=m0Z4At6MfbqZRio2glT3J3zT9x_itcWZBT_G82mpaVc,1647
-sonatoki/ilo.py,sha256=oN14iYFKxgjFjjOslgqBrMaIgpnvS5gO6MscbS0JS5A,4343
-sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
-sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
-sonatoki-0.1.3.dist-info/RECORD,,
{sonatoki-0.1.3.dist-info → sonatoki-0.1.5.dist-info}/WHEEL
File without changes
{sonatoki-0.1.3.dist-info → sonatoki-0.1.5.dist-info}/licenses/LICENSE
File without changes