sonatoki 0.8.0__tar.gz → 0.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.8.0 → sonatoki-0.8.2}/PKG-INFO +1 -1
- {sonatoki-0.8.0 → sonatoki-0.8.2}/pyproject.toml +1 -1
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Preprocessors.py +4 -1
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/constants.py +3 -2
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_cleaners.py +2 -2
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_filters.py +14 -14
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_preprocessors.py +30 -23
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_utils.py +3 -3
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +2 -1
- {sonatoki-0.8.0 → sonatoki-0.8.2}/LICENSE +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/README.md +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Configs.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/types.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/__init__.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_ilo.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_properties.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.8.0 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -149,7 +149,10 @@ class Codeblock(RegexPreprocessor):
|
|
149
149
|
Subset of what would be removed by Backticks, but may be preferable.
|
150
150
|
"""
|
151
151
|
|
152
|
-
pattern = re.compile(
|
152
|
+
pattern = re.compile(
|
153
|
+
r"```.+?```",
|
154
|
+
flags=re.DOTALL,
|
155
|
+
)
|
153
156
|
|
154
157
|
|
155
158
|
class Spoilers(RegexPreprocessor):
|
@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
|
|
503
503
|
ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
|
504
504
|
# combined bc the result could be simpler
|
505
505
|
|
506
|
-
SENTENCE_PUNCT = """.?!:;()[-]
|
507
|
-
#
|
506
|
+
SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
|
507
|
+
# single quotes are word boundaries if not intra-word, but double quotes are sentence
|
508
|
+
# boundaries
|
508
509
|
|
509
510
|
INTRA_WORD_PUNCT = """-'’"""
|
510
511
|
|
@@ -12,7 +12,7 @@ from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDupli
|
|
12
12
|
from .test_utils import PROPER_NAME_RE
|
13
13
|
|
14
14
|
|
15
|
-
@given(st.from_regex(ConsecutiveDuplicatesRe.pattern
|
15
|
+
@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
|
16
16
|
@example("tooooki a")
|
17
17
|
@example("muuuuuu")
|
18
18
|
@example("nnn")
|
@@ -25,7 +25,7 @@ def test_ConsecutiveDuplicatesRe(s: str):
|
|
25
25
|
assert a.lower() != b.lower(), (s, res)
|
26
26
|
|
27
27
|
|
28
|
-
@given(st.from_regex(ConsecutiveDuplicatesRe.pattern
|
28
|
+
@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
|
29
29
|
@example("Aaa")
|
30
30
|
@example("aAa")
|
31
31
|
@example("aaA")
|
@@ -90,7 +90,7 @@ def test_NimiLinkuSandbox(s: str):
|
|
90
90
|
assert res, repr(s)
|
91
91
|
|
92
92
|
|
93
|
-
@given(st.from_regex(Phonotactic.pattern
|
93
|
+
@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
|
94
94
|
@example("kijetesantakalu")
|
95
95
|
@example("n")
|
96
96
|
def test_Phonotactic(s: str):
|
@@ -98,28 +98,28 @@ def test_Phonotactic(s: str):
|
|
98
98
|
assert res, repr(s)
|
99
99
|
|
100
100
|
|
101
|
-
@given(st.from_regex(Phonotactic.pattern
|
101
|
+
@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
|
102
102
|
def test_LongPhonotactic(s: str):
|
103
103
|
len_ok = len(s) >= LongPhonotactic.length
|
104
104
|
res = LongPhonotactic.filter(s)
|
105
105
|
assert res == len_ok, repr(s) # will match given fullmatch
|
106
106
|
|
107
107
|
|
108
|
-
@given(st.from_regex(Syllabic.pattern
|
108
|
+
@given(st.from_regex(Syllabic.pattern, fullmatch=True))
|
109
109
|
@example("wuwojitiwunwonjintinmanna")
|
110
110
|
def test_Syllabic(s: str):
|
111
111
|
res = Syllabic.filter(s)
|
112
112
|
assert res, repr(s)
|
113
113
|
|
114
114
|
|
115
|
-
@given(st.from_regex(Syllabic.pattern
|
115
|
+
@given(st.from_regex(Syllabic.pattern, fullmatch=True))
|
116
116
|
def test_LongSyllabic(s: str):
|
117
117
|
len_ok = len(s) >= LongSyllabic.length
|
118
118
|
res = LongSyllabic.filter(s)
|
119
119
|
assert res == len_ok
|
120
120
|
|
121
121
|
|
122
|
-
@given(st.from_regex(AlphabeticRe.pattern
|
122
|
+
@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
|
123
123
|
@example("muems")
|
124
124
|
@example("mpptp")
|
125
125
|
@example("tptpt")
|
@@ -129,14 +129,14 @@ def test_Alphabetic(s: str):
|
|
129
129
|
assert res_fn == res_re, repr(s)
|
130
130
|
|
131
131
|
|
132
|
-
@given(st.from_regex(AlphabeticRe.pattern
|
132
|
+
@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
|
133
133
|
def test_LongAlphabetic(s: str):
|
134
134
|
len_ok = len(s) >= LongAlphabetic.length
|
135
135
|
res = LongAlphabetic.filter(s)
|
136
136
|
assert res == len_ok
|
137
137
|
|
138
138
|
|
139
|
-
@given(st.from_regex(AlphabeticRe.pattern
|
139
|
+
@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
|
140
140
|
def test_AlphabeticRe(s: str):
|
141
141
|
res_re = AlphabeticRe.filter(s)
|
142
142
|
assert res_re, repr(s)
|
@@ -148,7 +148,7 @@ def test_ProperName(s: str):
|
|
148
148
|
assert res, repr(s)
|
149
149
|
|
150
150
|
|
151
|
-
@given(st.from_regex(PunctuationRe.pattern
|
151
|
+
@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
|
152
152
|
@example("[]")
|
153
153
|
@example(r"\\")
|
154
154
|
@example(r"\"")
|
@@ -161,14 +161,14 @@ def test_PunctuationRe1(s: str):
|
|
161
161
|
assert res, repr(s)
|
162
162
|
|
163
163
|
|
164
|
-
@given(st.from_regex(PunctuationRe.pattern
|
164
|
+
@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
|
165
165
|
def test_PunctuationRe(s: str):
|
166
166
|
res_re = PunctuationRe.filter(s)
|
167
167
|
res_re1 = PunctuationRe1.filter(s)
|
168
168
|
assert res_re == res_re1, repr(s)
|
169
169
|
|
170
170
|
|
171
|
-
@given(st.from_regex(PunctuationRe.pattern
|
171
|
+
@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
|
172
172
|
@example("\U000f1990") # UCSUR char
|
173
173
|
def test_Punctuation(s: str):
|
174
174
|
res_fn = Punctuation.filter(s)
|
@@ -185,7 +185,7 @@ def test_Numeric(s: str):
|
|
185
185
|
|
186
186
|
|
187
187
|
@given(
|
188
|
-
st.from_regex(PunctuationRe.pattern
|
188
|
+
st.from_regex(PunctuationRe.pattern, fullmatch=True)
|
189
189
|
| st.from_regex(r"\d+", fullmatch=True),
|
190
190
|
)
|
191
191
|
def test_OrFilter(s: str):
|
@@ -259,8 +259,8 @@ def test_NotFilter(s: str):
|
|
259
259
|
|
260
260
|
@given(
|
261
261
|
st.sampled_from(list(FALSE_POS_SYLLABIC))
|
262
|
-
| st.from_regex(Syllabic.pattern
|
263
|
-
| st.from_regex(AlphabeticRe.pattern
|
262
|
+
| st.from_regex(Syllabic.pattern, fullmatch=True)
|
263
|
+
| st.from_regex(AlphabeticRe.pattern, fullmatch=True)
|
264
264
|
)
|
265
265
|
def test_AndNotFilter(s: str):
|
266
266
|
AndNotFilter = And(Syllabic, Not(FalsePosSyllabic))
|
@@ -309,7 +309,7 @@ def test_AddTokensToMemberFilterNegative(s: str):
|
|
309
309
|
| words_by_tag("usage_category", "sandbox")
|
310
310
|
),
|
311
311
|
)
|
312
|
-
| st.from_regex(Syllabic.pattern
|
312
|
+
| st.from_regex(Syllabic.pattern, fullmatch=True)
|
313
313
|
)
|
314
314
|
def test_SubTokensFromMemberFilter(s: str):
|
315
315
|
NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
|
@@ -22,7 +22,7 @@ from sonatoki.Preprocessors import (
|
|
22
22
|
)
|
23
23
|
|
24
24
|
|
25
|
-
@given(st.from_regex(URLs.pattern
|
25
|
+
@given(st.from_regex(URLs.pattern, fullmatch=True))
|
26
26
|
@example("https://google.com")
|
27
27
|
@example("https://mun.la")
|
28
28
|
@example("https://discord.gg/")
|
@@ -32,7 +32,7 @@ def test_URLs(s: str):
|
|
32
32
|
assert URLs.process(s).strip() == ""
|
33
33
|
|
34
34
|
|
35
|
-
@given(st.from_regex(Spoilers.pattern
|
35
|
+
@given(st.from_regex(Spoilers.pattern, fullmatch=True))
|
36
36
|
@example("|| | ||")
|
37
37
|
@example("|| content\n\n\ncontent ||")
|
38
38
|
@example("||\n||")
|
@@ -42,14 +42,15 @@ def test_Spoilers(s: str):
|
|
42
42
|
assert res == "", (repr(s), repr(res))
|
43
43
|
|
44
44
|
|
45
|
-
@given(st.from_regex(Backticks.pattern
|
45
|
+
@given(st.from_regex(Backticks.pattern, fullmatch=True))
|
46
46
|
@example("` ` ` `")
|
47
47
|
def test_Backticks(s: str):
|
48
48
|
res = Backticks.process(s).strip()
|
49
49
|
assert res == "", (repr(s), repr(res))
|
50
50
|
|
51
51
|
|
52
|
-
@given(st.from_regex(
|
52
|
+
@given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
|
53
|
+
@example("""```0```""")
|
53
54
|
@example(
|
54
55
|
"""```
|
55
56
|
```"""
|
@@ -63,12 +64,18 @@ blocky message
|
|
63
64
|
second blocky message
|
64
65
|
```"""
|
65
66
|
)
|
67
|
+
@example(
|
68
|
+
"""```oisandm123-_mu
|
69
|
+
arbitrary content
|
70
|
+
```"""
|
71
|
+
)
|
72
|
+
@example("""```mu```""")
|
66
73
|
def test_Codeblock(s: str):
|
67
74
|
res = Codeblock.process(s).strip()
|
68
75
|
assert res == "", (repr(s), repr(res))
|
69
76
|
|
70
77
|
|
71
|
-
@given(st.from_regex(ArrowQuote.pattern
|
78
|
+
@given(st.from_regex(ArrowQuote.pattern, fullmatch=True))
|
72
79
|
@example("> base")
|
73
80
|
@example("> newline\n> newline")
|
74
81
|
def test_ArrowQuote(s: str):
|
@@ -76,7 +83,7 @@ def test_ArrowQuote(s: str):
|
|
76
83
|
assert res == "", (repr(s), repr(res))
|
77
84
|
|
78
85
|
|
79
|
-
@given(st.from_regex(DoubleQuotes.pattern
|
86
|
+
@given(st.from_regex(DoubleQuotes.pattern, fullmatch=True))
|
80
87
|
@example('" "" "')
|
81
88
|
@example('" "\n" "')
|
82
89
|
@example('" \n "')
|
@@ -85,7 +92,7 @@ def test_DoubleQuotes(s: str):
|
|
85
92
|
assert res == "", (repr(s), repr(res))
|
86
93
|
|
87
94
|
|
88
|
-
@given(st.from_regex(SingleQuotes.pattern
|
95
|
+
@given(st.from_regex(SingleQuotes.pattern, fullmatch=True))
|
89
96
|
@example("' '' '")
|
90
97
|
@example("' '\n' '")
|
91
98
|
@example("' \n '")
|
@@ -94,7 +101,7 @@ def test_SingleQuotes(s: str):
|
|
94
101
|
assert res == "", (repr(s), repr(res))
|
95
102
|
|
96
103
|
|
97
|
-
@given(st.from_regex(DiscordEmotes.pattern
|
104
|
+
@given(st.from_regex(DiscordEmotes.pattern, fullmatch=True))
|
98
105
|
@example("<a:example:123123>")
|
99
106
|
@example("<:example:123123>")
|
100
107
|
def test_DiscordEmotes(s: str):
|
@@ -102,7 +109,7 @@ def test_DiscordEmotes(s: str):
|
|
102
109
|
assert res == "", (repr(s), repr(res))
|
103
110
|
|
104
111
|
|
105
|
-
@given(st.from_regex(DiscordMentions.pattern
|
112
|
+
@given(st.from_regex(DiscordMentions.pattern, fullmatch=True))
|
106
113
|
@example("<@497549183847497739>")
|
107
114
|
@example("<@!457890000>")
|
108
115
|
@example("<@&18398198981985>")
|
@@ -111,7 +118,7 @@ def test_DiscordMentions(s: str):
|
|
111
118
|
assert res == "", (repr(s), repr(res))
|
112
119
|
|
113
120
|
|
114
|
-
@given(st.from_regex(DiscordChannels.pattern
|
121
|
+
@given(st.from_regex(DiscordChannels.pattern, fullmatch=True))
|
115
122
|
@example("<#19858915>")
|
116
123
|
@example("<#18591912589812985>")
|
117
124
|
def test_DiscordChannels(s: str):
|
@@ -119,7 +126,7 @@ def test_DiscordChannels(s: str):
|
|
119
126
|
assert res == "", (repr(s), repr(res))
|
120
127
|
|
121
128
|
|
122
|
-
@given(st.from_regex(DiscordSpecial.pattern
|
129
|
+
@given(st.from_regex(DiscordSpecial.pattern, fullmatch=True))
|
123
130
|
@example("<id:guide>")
|
124
131
|
@example("<id:browse>")
|
125
132
|
def test_DiscordSpecial(s: str):
|
@@ -128,11 +135,11 @@ def test_DiscordSpecial(s: str):
|
|
128
135
|
|
129
136
|
|
130
137
|
@given(
|
131
|
-
st.from_regex(DiscordEmotes.pattern
|
132
|
-
| st.from_regex(DiscordMentions.pattern
|
133
|
-
| st.from_regex(DiscordChannels.pattern
|
134
|
-
| st.from_regex(DiscordSpecial.pattern
|
135
|
-
| st.from_regex(AngleBracketObject.pattern
|
138
|
+
st.from_regex(DiscordEmotes.pattern, fullmatch=True)
|
139
|
+
| st.from_regex(DiscordMentions.pattern, fullmatch=True)
|
140
|
+
| st.from_regex(DiscordChannels.pattern, fullmatch=True)
|
141
|
+
| st.from_regex(DiscordSpecial.pattern, fullmatch=True)
|
142
|
+
| st.from_regex(AngleBracketObject.pattern, fullmatch=True)
|
136
143
|
)
|
137
144
|
@example("<https://example.com>")
|
138
145
|
@example("<#123124125125>")
|
@@ -142,11 +149,11 @@ def test_AngleBracketObject(s: str):
|
|
142
149
|
|
143
150
|
|
144
151
|
@given(
|
145
|
-
st.from_regex(SingleQuotes.pattern
|
146
|
-
| st.from_regex(DoubleQuotes.pattern
|
147
|
-
| st.from_regex(Backticks.pattern
|
148
|
-
| st.from_regex(ArrowQuote.pattern
|
149
|
-
| st.from_regex(AllQuotes.pattern
|
152
|
+
st.from_regex(SingleQuotes.pattern, fullmatch=True)
|
153
|
+
| st.from_regex(DoubleQuotes.pattern, fullmatch=True)
|
154
|
+
| st.from_regex(Backticks.pattern, fullmatch=True)
|
155
|
+
| st.from_regex(ArrowQuote.pattern, fullmatch=True)
|
156
|
+
| st.from_regex(AllQuotes.pattern, fullmatch=True)
|
150
157
|
)
|
151
158
|
@example("> bruh")
|
152
159
|
@example("`bruh`")
|
@@ -155,7 +162,7 @@ def test_AllQuotes(s: str):
|
|
155
162
|
assert res == "", (repr(s), repr(res))
|
156
163
|
|
157
164
|
|
158
|
-
@given(st.from_regex(Reference.pattern
|
165
|
+
@given(st.from_regex(Reference.pattern, fullmatch=True))
|
159
166
|
@example("[[Brainstorm]]")
|
160
167
|
@example("[[Phatic Phrases]]")
|
161
168
|
@example("[[Yahoo!]]")
|
@@ -164,7 +171,7 @@ def test_Reference(s: str):
|
|
164
171
|
assert res == "", (repr(s), repr(res))
|
165
172
|
|
166
173
|
|
167
|
-
@given(st.from_regex(ColonEmotes.pattern
|
174
|
+
@given(st.from_regex(ColonEmotes.pattern, fullmatch=True))
|
168
175
|
@example(":owe::owe:")
|
169
176
|
@example(":suffering:")
|
170
177
|
@example(":presid65despair:")
|
@@ -9,10 +9,10 @@ PROPER_NAME_RE = r"[A-Z][a-z]*"
|
|
9
9
|
|
10
10
|
token_strategy = (
|
11
11
|
st.sampled_from(list(words_by_usage(60)))
|
12
|
-
| st.from_regex(Phonotactic.pattern
|
13
|
-
| st.from_regex(Syllabic.pattern
|
12
|
+
| st.from_regex(Phonotactic.pattern, fullmatch=True)
|
13
|
+
| st.from_regex(Syllabic.pattern, fullmatch=True)
|
14
14
|
| st.from_regex(PROPER_NAME_RE, fullmatch=True)
|
15
|
-
| st.from_regex(AlphabeticRe.pattern
|
15
|
+
| st.from_regex(AlphabeticRe.pattern, fullmatch=True)
|
16
16
|
)
|
17
17
|
|
18
18
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|