sonatoki 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +3 -3
- sonatoki/Filters.py +4 -4
- sonatoki/Preprocessors.py +35 -4
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.4.dist-info}/METADATA +1 -1
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.4.dist-info}/RECORD +7 -7
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.4.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.3.dist-info → sonatoki-0.1.4.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
|
|
9
9
|
from sonatoki.Filters import (
|
10
10
|
Filter,
|
11
11
|
NimiPu,
|
12
|
-
|
12
|
+
Numeric,
|
13
13
|
Syllabic,
|
14
14
|
NimiLinku,
|
15
15
|
NimiPuAle,
|
16
16
|
Alphabetic,
|
17
17
|
ProperName,
|
18
18
|
Phonotactic,
|
19
|
+
Punctuation,
|
19
20
|
NimiLinkuAle,
|
20
|
-
Punctuations,
|
21
21
|
)
|
22
22
|
from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
|
23
23
|
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
|
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
|
|
45
45
|
BaseConfig: IloConfig = {
|
46
46
|
"preprocessors": [URLs],
|
47
47
|
"cleaners": [ConsecutiveDuplicates],
|
48
|
-
"ignoring_filters": [
|
48
|
+
"ignoring_filters": [Numeric, Punctuation],
|
49
49
|
"scoring_filters": [],
|
50
50
|
"scorer": PassFail,
|
51
51
|
"passing_score": 0.8,
|
sonatoki/Filters.py
CHANGED
@@ -131,7 +131,7 @@ class Alphabetic(Filter):
|
|
131
131
|
return set(token.lower()).issubset(ALPHABET_SET)
|
132
132
|
|
133
133
|
|
134
|
-
class
|
134
|
+
class Numeric(Filter):
|
135
135
|
"""Determine if a given token is entirely numeric.
|
136
136
|
Covers all numeric symbols in Unicode.
|
137
137
|
|
@@ -147,7 +147,7 @@ class Numerics(Filter):
|
|
147
147
|
return msg.isnumeric()
|
148
148
|
|
149
149
|
|
150
|
-
class
|
150
|
+
class Punctuation(RegexFilter):
|
151
151
|
pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
|
152
152
|
|
153
153
|
|
@@ -159,6 +159,6 @@ __all__ = [
|
|
159
159
|
"Syllabic",
|
160
160
|
"Alphabetic",
|
161
161
|
"ProperName",
|
162
|
-
"
|
163
|
-
"
|
162
|
+
"Punctuation",
|
163
|
+
"Numeric",
|
164
164
|
]
|
sonatoki/Preprocessors.py
CHANGED
@@ -62,6 +62,13 @@ class URLs(RegexPreprocessor):
|
|
62
62
|
pattern = re.compile(r"https?:\/\/\S+")
|
63
63
|
|
64
64
|
|
65
|
+
class Reference(RegexPreprocessor):
|
66
|
+
"""Remove text contained in double brackets.
|
67
|
+
Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
|
68
|
+
|
69
|
+
pattern = re.compile(r"\[\[.+\]\]")
|
70
|
+
|
71
|
+
|
65
72
|
class DiscordEmotes(RegexPreprocessor):
|
66
73
|
"""Remove text-formatted Discord emotes `<flags:name:id>`"""
|
67
74
|
|
@@ -80,6 +87,13 @@ class DiscordSpecial(RegexPreprocessor):
|
|
80
87
|
pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
|
81
88
|
|
82
89
|
|
90
|
+
class AngleBracketObject(RegexPreprocessor):
|
91
|
+
"""A generalized version of the Discord-specific angle bracket objects.
|
92
|
+
Removes any contiguous (not broken by whitespace) text in angle brackets."""
|
93
|
+
|
94
|
+
pattern = re.compile(r"<[^<>\s]+>")
|
95
|
+
|
96
|
+
|
83
97
|
"""
|
84
98
|
The following classes are Containers.
|
85
99
|
|
@@ -92,23 +106,23 @@ would likely be using a language other than Toki Pona.
|
|
92
106
|
|
93
107
|
|
94
108
|
class SingleQuotes(RegexPreprocessor):
|
95
|
-
pattern = re.compile(r"'[^']+'", flags=re.
|
109
|
+
pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
|
96
110
|
|
97
111
|
|
98
112
|
class DoubleQuotes(RegexPreprocessor):
|
99
|
-
pattern = re.compile(r'"[^"]+"', flags=re.
|
113
|
+
pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
|
100
114
|
|
101
115
|
|
102
116
|
class Backticks(RegexPreprocessor):
|
103
117
|
"""Remove paired backticks and their contents `like this`"""
|
104
118
|
|
105
|
-
pattern = re.compile(r"`[^`]+`", flags=re.
|
119
|
+
pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
|
106
120
|
|
107
121
|
|
108
122
|
class Spoilers(RegexPreprocessor):
|
109
123
|
"""Remove paired double bars and their contents `||like this||`"""
|
110
124
|
|
111
|
-
pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.
|
125
|
+
pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
|
112
126
|
|
113
127
|
|
114
128
|
class ArrowQuote(RegexPreprocessor):
|
@@ -117,7 +131,22 @@ class ArrowQuote(RegexPreprocessor):
|
|
117
131
|
pattern = re.compile(r"^>\ .+$", re.MULTILINE)
|
118
132
|
|
119
133
|
|
134
|
+
class AllQuotes(RegexPreprocessor):
|
135
|
+
pattern = re.compile(
|
136
|
+
"|".join(
|
137
|
+
[
|
138
|
+
SingleQuotes.pattern.pattern,
|
139
|
+
DoubleQuotes.pattern.pattern,
|
140
|
+
Backticks.pattern.pattern,
|
141
|
+
ArrowQuote.pattern.pattern,
|
142
|
+
]
|
143
|
+
),
|
144
|
+
flags=re.MULTILINE | re.DOTALL,
|
145
|
+
)
|
146
|
+
|
147
|
+
|
120
148
|
__all__ = [
|
149
|
+
"AngleBracketObject",
|
121
150
|
"DiscordChannels",
|
122
151
|
"DiscordMentions",
|
123
152
|
"DiscordSpecial",
|
@@ -125,7 +154,9 @@ __all__ = [
|
|
125
154
|
"SingleQuotes",
|
126
155
|
"DoubleQuotes",
|
127
156
|
"ArrowQuote",
|
157
|
+
"AllQuotes",
|
128
158
|
"Backticks",
|
159
|
+
"Reference",
|
129
160
|
"Spoilers",
|
130
161
|
"URLs",
|
131
162
|
]
|
@@ -1,10 +1,10 @@
|
|
1
|
-
sonatoki-0.1.
|
2
|
-
sonatoki-0.1.
|
3
|
-
sonatoki-0.1.
|
1
|
+
sonatoki-0.1.4.dist-info/METADATA,sha256=cK_EyYXPeY4rm9Plcre-i_DbPJZD06572cYQEIUQ804,5225
|
2
|
+
sonatoki-0.1.4.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
|
3
|
+
sonatoki-0.1.4.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
4
|
sonatoki/Cleaners.py,sha256=gTZ9dSsnvKVUtxM_ECSZ-_2heh--nD5A9dCQR1ATb1c,1160
|
5
|
-
sonatoki/Configs.py,sha256=
|
6
|
-
sonatoki/Filters.py,sha256=
|
7
|
-
sonatoki/Preprocessors.py,sha256=
|
5
|
+
sonatoki/Configs.py,sha256=iY6Lyn1rMi7iF0M62yx0ET4pEb35-QAd1FS0tkyUfSc,1935
|
6
|
+
sonatoki/Filters.py,sha256=dL3XgH62OrVVvc8b6dtR5-JZmErVF4bl7ultAoHHqpo,4190
|
7
|
+
sonatoki/Preprocessors.py,sha256=h2sX6nJIIOPotwHL0476VQe4KxERlD_F6nrvxDyuaTs,4205
|
8
8
|
sonatoki/Scorers.py,sha256=V293DBiupBiujzuc4yMrKOAiuNTLltIsiCzIAlLeokA,4129
|
9
9
|
sonatoki/Tokenizers.py,sha256=fvqxpubs2F63va2RzZKZQhZbFnVaC_9haXIA9Mqznis,1942
|
10
10
|
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -13,4 +13,4 @@ sonatoki/constants.py,sha256=m0Z4At6MfbqZRio2glT3J3zT9x_itcWZBT_G82mpaVc,1647
|
|
13
13
|
sonatoki/ilo.py,sha256=oN14iYFKxgjFjjOslgqBrMaIgpnvS5gO6MscbS0JS5A,4343
|
14
14
|
sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
|
15
15
|
sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
|
16
|
-
sonatoki-0.1.
|
16
|
+
sonatoki-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|