sonatoki 0.1.3__tar.gz → 0.1.5__tar.gz
This diff shows the publicly available content of the two package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
- {sonatoki-0.1.3 → sonatoki-0.1.5}/PKG-INFO +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/pyproject.toml +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py +3 -3
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py +20 -7
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py +48 -6
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py +2 -14
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py +22 -7
- sonatoki-0.1.5/src/sonatoki/constants.py +83 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py +0 -12
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py +10 -11
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_ilo.py +0 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py +40 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py +8 -6
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml +18 -0
- sonatoki-0.1.3/src/sonatoki/constants.py +0 -67
- {sonatoki-0.1.3 → sonatoki-0.1.5}/LICENSE +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/README.md +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_tokenize.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_utils.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     NimiPuAle,
     Alphabetic,
     ProperName,
     Phonotactic,
+    Punctuation,
     NimiLinkuAle,
-    Punctuations,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numerics, Punctuations],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
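For context, a hedged sketch of how such a config is likely consumed: IloConfig is a TypedDict, so an Ilo is presumably built by passing these keys as keyword arguments; the constructor itself is not shown in this diff, and BaseConfig ships with an empty scoring_filters list, so a caller would supply their own scoring filters. The snippet below is illustrative, not library code.

# Hedged sketch: assumes Ilo accepts the IloConfig keys as keyword arguments.
from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinku, Syllabic, ProperName, Alphabetic

config = dict(BaseConfig)
config["scoring_filters"] = [NimiLinku, Syllabic, ProperName, Alphabetic]

ilo = Ilo(**config)                       # assumption: keyword-argument constructor
print(ilo.is_toki_pona("mi wile e ni"))   # returns a bool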
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py
@@ -1,10 +1,11 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import Set
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
 # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
     CONSONANTS,
     NIMI_PU_SET,
     ALPHABET_SET,
+    UNICODE_PUNCT,
     ALLOWABLES_SET,
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    PRUNED_POSIX_PUNCT,
     NIMI_LINKU_SANDBOX_SET,
 )
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
         return not not re.fullmatch(cls.pattern, token)
 
 
+class Regex1Filter(Filter):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not not regex.fullmatch(cls.pattern, token)
+
+
 class SetFilter(Filter):
     tokens: Set[str]
 
@@ -131,7 +144,7 @@ class Alphabetic(Filter):
         return set(token.lower()).issubset(ALPHABET_SET)
 
 
-class Numerics(Filter):
+class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.
 
@@ -147,8 +160,8 @@ class Numerics(Filter):
         return msg.isnumeric()
 
 
-class Punctuations(RegexFilter):
-    pattern = re.compile(
+class Punctuation(RegexFilter):
+    pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")
 
 
 __all__ = [
@@ -159,6 +172,6 @@ __all__ = [
     "Syllabic",
     "Alphabetic",
     "ProperName",
-    "Punctuations",
-    "Numerics",
+    "Punctuation",
+    "Numeric",
 ]
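A minimal sketch of what the renamed filters do, based on the definitions above: each filter is a classmethod over a single token returning a bool, Numeric checks str.isnumeric(), Punctuation full-matches the combined POSIX + Unicode punctuation character class, and Alphabetic checks membership in the toki pona alphabet. The printed results below follow from the shown code rather than captured output.

from sonatoki.Filters import Numeric, Punctuation, Alphabetic

print(Numeric.filter("123"))        # True: entirely numeric symbols
print(Numeric.filter("123a"))       # False
print(Punctuation.filter("...?!"))  # True: nothing but punctuation
print(Alphabetic.filter("toki"))    # True: only letters of the toki pona alphabet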
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
 """
 
 # STL
+import re
 from abc import ABC, abstractmethod
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
         return re.sub(cls.pattern, cls.replace, msg)
 
 
+class Regex1Preprocessor(Preprocessor):
+    pattern: "regex.Pattern[str]"
+    replace: str = " "
+
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return regex.sub(cls.pattern, cls.replace, msg)
+
+
 """
 The following classes are Ignorables.
 
@@ -62,6 +73,13 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class Reference(RegexPreprocessor):
+    """Remove text contained in double brackets.
+    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    pattern = re.compile(r"\[\[.+\]\]")
+
+
 class DiscordEmotes(RegexPreprocessor):
     """Remove text-formatted Discord emotes `<flags:name:id>`"""
 
@@ -80,6 +98,13 @@ class DiscordSpecial(RegexPreprocessor):
     pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
 
 
+class AngleBracketObject(RegexPreprocessor):
+    """A generalized version of the Discord-specific angle bracket objects.
+    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    pattern = re.compile(r"<[^<>\s]+>")
+
+
 """
 The following classes are Containers.
 
@@ -92,23 +117,23 @@ would likely be using a language other than Toki Pona.
 
 
 class SingleQuotes(RegexPreprocessor):
-    pattern = re.compile(r"'[^']+'", flags=re.
+    pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
 
 
 class DoubleQuotes(RegexPreprocessor):
-    pattern = re.compile(r'"[^"]+"', flags=re.
+    pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
 
 
 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""
 
-    pattern = re.compile(r"`[^`]+`", flags=re.
+    pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
-    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.
+    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
 
 
 class ArrowQuote(RegexPreprocessor):
@@ -117,7 +142,22 @@ class ArrowQuote(RegexPreprocessor):
     pattern = re.compile(r"^>\ .+$", re.MULTILINE)
 
 
+class AllQuotes(RegexPreprocessor):
+    pattern = re.compile(
+        "|".join(
+            [
+                SingleQuotes.pattern.pattern,
+                DoubleQuotes.pattern.pattern,
+                Backticks.pattern.pattern,
+                ArrowQuote.pattern.pattern,
+            ]
+        ),
+        flags=re.MULTILINE | re.DOTALL,
+    )
+
+
 __all__ = [
+    "AngleBracketObject",
     "DiscordChannels",
     "DiscordMentions",
     "DiscordSpecial",
@@ -125,7 +165,9 @@ __all__ = [
     "SingleQuotes",
     "DoubleQuotes",
     "ArrowQuote",
+    "AllQuotes",
     "Backticks",
+    "Reference",
    "Spoilers",
     "URLs",
 ]
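A rough sketch of the new preprocessors in use, following the patterns above: each process call substitutes matched spans with the default replacement (a space), so quoted text, angle-bracket objects, and double-bracket references drop out before tokenization. The message and the commented result below are illustrative, not captured output.

from sonatoki.Preprocessors import AllQuotes, AngleBracketObject, Reference

msg = 'toki! "ni li toki pona ala" <a:wave:12345> [[Phatic Phrases]]'
msg = AllQuotes.process(msg)           # drops the double-quoted span
msg = AngleBracketObject.process(msg)  # drops the <...> object
msg = Reference.process(msg)           # drops the [[...]] reference
print(msg)  # roughly: 'toki!    ' (each removed span leaves a single space)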
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py
@@ -10,8 +10,6 @@ from typing_extensions import override
 # LOCAL
 from sonatoki.Filters import Filter
 
-LOG = logging.getLogger(__name__)
-
 Number = Union[int, float]
 Weights = Dict[str, Number]
 
@@ -37,12 +35,7 @@ class PassFail(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
-                score = 1
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return 1
         return 0
 
     @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
         for i, f in enumerate(filters):
             if f.filter(token):
-                score = scale - i
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return scale - i
        return 0
 
     @classmethod
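With the debug logging stripped out, the per-token logic reduces to "the first matching filter decides the token's value". A standalone sketch of the two shapes below; the function names and filter callables are illustrative, not the library's.

from typing import Callable, List

def passfail_token(token: str, filters: List[Callable[[str], bool]]) -> int:
    # 1 if any filter accepts the token, else 0 (mirrors PassFail.score_token)
    return next((1 for f in filters if f(token)), 0)

def scaling_token(token: str, filters: List[Callable[[str], bool]], scale: int) -> int:
    # earlier filters are worth more (mirrors Scaling.score_token's `scale - i`)
    return next((scale - i for i, f in enumerate(filters) if f(token)), 0)

checks = [str.isalpha, str.isnumeric]
print(scaling_token("toki", checks, scale=len(checks)))  # 2: matched the first check
print(scaling_token("42", checks, scale=len(checks)))    # 1: matched the second
print(passfail_token("?!", checks))                      # 0: matched nothing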
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py
@@ -1,11 +1,15 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import List
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
+# LOCAL
+from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
 try:
     # PDM
     import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
     nltk = e
 
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
 
 
+class Regex1Tokenizer(Tokenizer):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [
+            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+        ]
+
+
 class WordTokenizerTok(RegexTokenizer):
-    pattern = re.compile(
-    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-    # TODO: do the typography characters matter?
-    # NOTE: | / and , are *not* sentence delimiters for my purpose
+    pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")
 
 
 class SentTokenizerTok(RegexTokenizer):
-    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose
 
 
 class WordTokenizerRe(RegexTokenizer):
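The new SentTokenizerTok pattern splits at zero-width positions: right after sentence punctuation, or at end of line ($ with re.MULTILINE). A standalone sketch of that behavior, reproducing the "newline basic" test case added in the YAML further down; the commented output follows from the pattern shown above.

import re
from typing import List

SENT_DELIM = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)

def tokenize_sentences(s: str) -> List[str]:
    # requires Python 3.7+ for zero-width re.split and 3.8+ for the walrus operator
    return [clean for piece in re.split(SENT_DELIM, s) if (clean := piece.strip())]

print(tokenize_sentences("sina lon seme?\nmi wile lon poka...\n"))
# ['sina lon seme?', 'mi wile lon poka.', '.', '.']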
sonatoki-0.1.5/src/sonatoki/constants.py (new file, 83 lines)
# STL
import json
from typing import Dict, List
from pathlib import Path

LINKU = Path(__file__).resolve().parent / Path("linku.json")
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

VOWELS = "aeiou"
CONSONANTS = "jklmnpstw"
ALPHABET = VOWELS + CONSONANTS
ALPHABET_SET = set(ALPHABET)

LANGUAGE = "english"  # for NLTK

# `\p{posix_punct}` character class
POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
PRUNED_POSIX_PUNCT = r"""$+<=>^`|~"""  # only those that are not in UNICODE_PUNCT

# `\p{Punctuation}` character class
UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
# NOTE: This list diverges slightly from the raw list, since []\ must be escaped
# The [] need to be escaped to avoid prematurely closing the regex character class
# The \ needs to be escaped to be considered as a raw \

# https://www.compart.com/en/unicode/category
# https://unicode.org/Public/UNIDATA/UnicodeData.txt


"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
ALLOWABLES = {
    "cw",  # Content Warning
    "x",  # ala
    "y",  # anu
    "kxk",  # ken ala ken
    "wxw",  # wile ala wile
}


with open(LINKU) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
    NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
    NIMI_LINKU: List[str] = [
        d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
    ]
    NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

with open(SANDBOX) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]


NIMI_PU_SET = set(NIMI_PU)
NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
NIMI_LINKU_SET = set(NIMI_LINKU)
NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
ALLOWABLES_SET = set(ALLOWABLES)

__all__ = [
    "VOWELS",
    #
    "CONSONANTS",
    #
    "ALPHABET",
    "ALPHABET_SET",
    #
    "NIMI_PU",
    "NIMI_PU_SET",
    #
    "NIMI_PU_ALE",
    "NIMI_PU_ALE_SET",
    #
    "NIMI_LINKU",
    "NIMI_LINKU_SET",
    #
    "NIMI_LINKU_ALE",
    "NIMI_LINKU_ALE_SET",
    #
    "NIMI_LINKU_SANDBOX",
    "NIMI_LINKU_SANDBOX_SET",
]
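The relationship between the two punctuation constants can be checked mechanically: PRUNED_POSIX_PUNCT is just POSIX_PUNCT minus every character already present in UNICODE_PUNCT. A small sketch, using only the ASCII slice of UNICODE_PUNCT for brevity (the full string above is abbreviated here).

POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
UNICODE_PUNCT_ASCII = """!"#%&'()*,-./:;?@[\\]_{}"""  # leading ASCII characters only

pruned = "".join(c for c in POSIX_PUNCT if c not in UNICODE_PUNCT_ASCII)
print(pruned)  # $+<=>^`|~  -- matches PRUNED_POSIX_PUNCT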
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py
@@ -1,5 +1,4 @@
 # STL
-import logging
 from typing import List, Type, Tuple
 
 # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-LOG = logging.getLogger(__name__)
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -20,7 +17,6 @@ class Ilo:
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
     __passing_score: Number
-    logging_threshold: Number = -1
 
     def __init__(
         self,
@@ -104,14 +100,6 @@ class Ilo:
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score
 
-        if score <= self.logging_threshold:
-            LOG.debug("msg: %.2f %s", score, repr(message))
-            LOG.debug("preproc: %s", repr(preprocessed))
-            LOG.debug("tokenized: %s", tokenized)
-            LOG.debug("filtered: %s", filtered)
-            LOG.debug("cleaned: %s", cleaned)
-            # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
         return preprocessed, tokenized, filtered, cleaned, score, result
 
     def is_toki_pona(self, message: str) -> bool:
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py
@@ -9,13 +9,13 @@ from hypothesis import HealthCheck, given, assume, example, settings
 # LOCAL
 from sonatoki.Filters import (
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
 from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import NIMI_PU, NIMI_LINKU
@@ -82,17 +82,16 @@ def test_ProperName(s: str):
     assert res, repr(s)
 
 
-
-
-@
+@given(st.from_regex(Punctuation.pattern.pattern, fullmatch=True))
+@example("[]")
+@example(r"\\")
+@example(r"\"")
 @example("⟨·⟩")
 @example("…")
-@example("
+@example("「」")  # ` `
 @example(string.punctuation)
-
-def test_Punctuations(s: str):
-    _ = assume(re.fullmatch(Punctuations.pattern.pattern, s))
-    res = Punctuations.filter(s)
+def test_Punctuation(s: str):
+    res = Punctuation.filter(s)
     assert res, repr(s)
 
 
@@ -100,5 +99,5 @@ def test_Punctuations(s: str):
 @example("124125")
 @example("99990000")
 def test_Numeric(s: str):
-    res = Numerics.filter(s)
+    res = Numeric.filter(s)
     assert res, repr(s)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py
@@ -6,7 +6,9 @@ from hypothesis import given, example
 from sonatoki.Preprocessors import (
     URLs,
     Spoilers,
+    AllQuotes,
     Backticks,
+    Reference,
     ArrowQuote,
     DoubleQuotes,
     SingleQuotes,
@@ -14,6 +16,7 @@ from sonatoki.Preprocessors import (
     DiscordSpecial,
     DiscordChannels,
     DiscordMentions,
+    AngleBracketObject,
 )
 
 
@@ -101,3 +104,40 @@ def test_DiscordChannels(s: str):
 def test_DiscordSpecial(s: str):
     res = DiscordSpecial.process(s).strip()
     assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
+    | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
+)
+@example("<https://example.com>")
+@example("<#123124125125>")
+def test_AngleBracketObject(s: str):
+    res = AngleBracketObject.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
+    | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
+    | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
+)
+@example("> bruh")
+@example("`bruh`")
+def test_AllQuotes(s: str):
+    res = AllQuotes.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
+@example("[[Brainstorm]]")
+@example("[[Phatic Phrases]]")
+@example("[[Yahoo!]]")
+def test_Reference(s: str):
+    res = Reference.process(s).strip()
+    assert res == "", (repr(s), repr(res))
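These property tests lean on Hypothesis' st.from_regex(..., fullmatch=True), which only generates strings that the entire pattern matches, so every generated input should be wiped to whitespace by its preprocessor. A tiny self-contained sketch of the same testing pattern, using an illustrative pattern and function name rather than library code:

import re
from hypothesis import given, strategies as st

BRACKETS = re.compile(r"\[\[.+\]\]")

@given(st.from_regex(BRACKETS.pattern, fullmatch=True))
def check_brackets_removed(s: str):
    # every generated string fully matches, so substitution leaves only whitespace
    assert BRACKETS.sub(" ", s).strip() == ""

check_brackets_removed()  # calling a @given-decorated function runs the property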
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py
@@ -4,38 +4,39 @@ from typing import List, Type
 # PDM
 import pytest
 import hypothesis.strategies as st
-from hypothesis import given
+from hypothesis import given, example
 
 # LOCAL
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
-from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling
+from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
 
 # FILESYSTEM
 from .test_utils import token_strategy
 
 FILTERS = [
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 ]
 
 SCORERS = [
     PassFail,
+    SoftPassFail,
     Scaling,
     SoftScaling,
 ]
@@ -46,6 +47,7 @@ SCORERS = [
     st.lists(st.sampled_from(FILTERS), min_size=1, unique=True),
     st.lists(token_strategy, min_size=0, max_size=10),
 )
+@example(st.sampled_from(FILTERS), [])
 def test_score_bounds(scorer: Scorer, filters: List[Type[Filter]], text: List[str]):
     score = scorer.score(text, filters)
     assert 0 <= score <= 1, (score, filters, text)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -19,6 +19,24 @@
   output:
     - "mi mu."
     - "mi wawa."
+- name: "empty"
+  input: ""
+  output: []
+- name: "whitespace"
+  input: " \n "
+  output: []
+- name: "newline basic"
+  input: "sina lon seme?\nmi wile lon poka...\n"
+  output:
+    - "sina lon seme?"
+    - "mi wile lon poka."
+    - "."
+    - "."
+- name: "newline alone"
+  input: "sina lon seme\nmi wile lon poka"
+  output:
+    - "sina lon seme"
+    - "mi wile lon poka"
 - name: "dash"
   input: "mi sona ala e ni- sina seme a"
   output:
sonatoki-0.1.3/src/sonatoki/constants.py (removed; superseded by the new constants.py above)
# STL
import json
from typing import Dict, List
from pathlib import Path

LINKU = Path(__file__).resolve().parent / Path("linku.json")
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

VOWELS = "aeiou"
CONSONANTS = "jklmnpstw"
ALPHABET = VOWELS + CONSONANTS
ALPHABET_SET = set(ALPHABET)

"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
ALLOWABLES = {
    "cw",  # Content Warning
    "x",  # ala
    "y",  # anu
    "kxk",  # ken ala ken
    "wxw",  # wile ala wile
}


with open(LINKU) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
    NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
    NIMI_LINKU: List[str] = [
        d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
    ]
    NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

with open(SANDBOX) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]


NIMI_PU_SET = set(NIMI_PU)
NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
NIMI_LINKU_SET = set(NIMI_LINKU)
NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
ALLOWABLES_SET = set(ALLOWABLES)

__all__ = [
    "VOWELS",
    #
    "CONSONANTS",
    #
    "ALPHABET",
    "ALPHABET_SET",
    #
    "NIMI_PU",
    "NIMI_PU_SET",
    #
    "NIMI_PU_ALE",
    "NIMI_PU_ALE_SET",
    #
    "NIMI_LINKU",
    "NIMI_LINKU_SET",
    #
    "NIMI_LINKU_ALE",
    "NIMI_LINKU_ALE_SET",
    #
    "NIMI_LINKU_SANDBOX",
    "NIMI_LINKU_SANDBOX_SET",
]
All remaining files listed above are unchanged between 0.1.3 and 0.1.5.