sonatoki 0.9.2__tar.gz → 0.10.0__tar.gz
This diff shows the publicly released contents of the two package versions as published to their public registry, and is provided for informational purposes only.
- {sonatoki-0.9.2 → sonatoki-0.10.0}/PKG-INFO +1 -1
- {sonatoki-0.9.2 → sonatoki-0.10.0}/pyproject.toml +1 -1
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Tokenizers.py +45 -6
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/__main__.py +33 -12
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/constants.py +54 -20
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_tokenize.py +15 -2
- sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml +162 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/tokenize_cases/tokenize_words_tok.yml +22 -1
- sonatoki-0.9.2/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -71
- {sonatoki-0.9.2 → sonatoki-0.10.0}/LICENSE +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/README.md +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Configs.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/types.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/__init__.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_filters.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_ilo.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_properties.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_scorers.py +0 -0
- {sonatoki-0.9.2 → sonatoki-0.10.0}/tests/test_utils.py +0 -0
--- sonatoki-0.9.2/src/sonatoki/Tokenizers.py
+++ sonatoki-0.10.0/src/sonatoki/Tokenizers.py
@@ -12,9 +12,13 @@ from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
 from sonatoki.constants import (
     ALL_PUNCT,
-    SENTENCE_PUNCT,
     INTRA_WORD_PUNCT,
+    ALL_SENTENCE_PUNCT,
+    UNICODE_WHITESPACE,
     ALL_PUNCT_RANGES_STR,
+    UCSUR_CARTOUCHE_LEFT,
+    UCSUR_CARTOUCHE_RIGHT,
+    UCSUR_MINUS_CARTOUCHE,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -146,7 +150,9 @@ class WordTokenizerRe1(Regex1Tokenizer):
 
 
 class SentTokenizer(SetTokenizer):
-    delimiters = set(
+    delimiters: Set[str] = set(ALL_SENTENCE_PUNCT + "\n")  # regex does \n with a flag
+    intra_word_punct: Set[str] = set(INTRA_WORD_PUNCT)
+    all_punct: Set[str] = set(ALL_PUNCT + UNICODE_WHITESPACE)
 
     @classmethod
     @override
@@ -155,16 +161,43 @@ class SentTokenizer(SetTokenizer):
             return []
 
         tokens: List[str] = []
+
+        slen = len(s)
         last_match = 0
-
-
+        i = 0
+        while i < slen:
+            # if a cartouche appears, we do not want to split on its punctuation
+            if s[i] == UCSUR_CARTOUCHE_LEFT:
+                right_i = s.find(UCSUR_CARTOUCHE_RIGHT, i)
+                contained: set[str] = set()
+                if right_i > 0:
+                    contained = set(s[i + 1 : right_i])
+                # but it must contain only non-cartouche UCSUR chars
+                if contained and contained.issubset(UCSUR_MINUS_CARTOUCHE):
+                    i = right_i + 1
+                    continue
+            if s[i] not in cls.delimiters:
+                i += 1
                 continue
+            if s[i] in cls.intra_word_punct:
+                prev = s[i - 1] if i > 0 else ""
+                next = s[i + 1] if i + 1 < slen else ""
+                if (
+                    prev
+                    and next
+                    and prev not in cls.all_punct
+                    and next not in cls.all_punct
+                ):
+                    i += 2
+                    continue
 
             match = s[last_match : i + 1].strip()
             last_match = i + 1  # newlines can strip but idc
             if not match:
+                i += 1
                 continue
             tokens.append(match)
+            i += 1
 
         match = s[last_match:].strip()
         if match:
@@ -173,18 +206,24 @@ class SentTokenizer(SetTokenizer):
         return tokens
 
 
+@deprecated(
+    "SentTokenizerRe is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe(RegexTokenizer):
     pattern = re.compile(
-        rf"""(?<=[{regex_escape(
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}])|$""", flags=re.MULTILINE
     )
     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
     # TODO: do the typography characters matter?
     # NOTE: | / and , are *not* sentence delimiters for my purpose
 
 
+@deprecated(
+    "SentTokenizerRe1 is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe1(Regex1Tokenizer):
     pattern = regex.compile(
-        rf"""(?<=[{regex_escape(
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}]|$)""", flags=regex.MULTILINE
     )
 
 
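The rewritten SentTokenizer walks the string index by index: a complete cartouche containing only non-cartouche UCSUR characters is skipped whole, intra-word punctuation flanked by non-punctuation on both sides stays inside the sentence, and any other delimiter closes a match. A minimal usage sketch; the expected outputs are taken from the tokenize_sentences_tok.yml cases added later in this diff:

    # Behavior sketch for the new SentTokenizer (expected values mirror
    # tests/tokenize_cases/tokenize_sentences_tok.yml from this release).
    from sonatoki.Tokenizers import SentTokenizer

    assert SentTokenizer.tokenize("mi mu. mi wawa.") == ["mi mu.", "mi wawa."]

    # intra-word punctuation flanked by letters no longer ends a sentence
    assert SentTokenizer.tokenize("isn't that game-breaking? i think so") == [
        "isn't that game-breaking?",
        "i think so",
    ]

    # punctuation adjacent to other punctuation still splits
    assert SentTokenizer.tokenize("!.h") == ["!", ".", "h"]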
--- sonatoki-0.9.2/src/sonatoki/__main__.py
+++ sonatoki-0.10.0/src/sonatoki/__main__.py
@@ -24,6 +24,7 @@ from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import (
     UCSUR_PUNCT_RANGES,
     UNICODE_PUNCT_RANGES,
+    UNICODE_WHITESPACE_RANGES,
     EMOJI_VARIATION_SELECTOR_RANGES,
 )
 
@@ -121,6 +122,11 @@ def regen_unicode_data():
         "Sc",  # Currency
         "So",  # Other
     }
+    WHITESPACE_CATEGORIES = {
+        "Zl",  # Line Separator
+        "Zp",  # Paragraph Separator
+        "Zs",  # Space Separator
+    }
     r"""These characters are in Symbol other (So) but are not in
     `\p{Punctuation}` However, I began excluding them again, because it turns
     out that some sequences of latin alphabet emoji."""
@@ -134,11 +140,15 @@ def regen_unicode_data():
     def is_punctuation(data: List[str]):
         return data[2] in PUNCT_CATEGORIES
 
+    def is_whitespace(data: List[str]):
+        return data[2] in WHITESPACE_CATEGORIES
+
     def get_character(data: List[str]):
         return chr(int(data[0], 16))
 
     unicode_data = download(UNICODE_DATA)
     unicode_punctuation = ""
+    unicode_whitespace = ""
     for line in unicode_data.split("\n"):
         if not line:  # damn you, trailing newline
             continue
@@ -147,24 +157,35 @@ def regen_unicode_data():
        # This does not apply to any currently defined punctuation category.
 
         unicode_data = line.split(";")
-        if
+        if is_punctuation(unicode_data):
+            char = get_character(unicode_data)
+            unicode_punctuation += char
+            continue
+        if is_whitespace((unicode_data)):
+            char = get_character(unicode_data)
+            unicode_whitespace += char
             continue
-
-        char = get_character(unicode_data)
-
-        unicode_punctuation += char
 
     unicode_punctuation = emoji.replace_emoji(unicode_punctuation)
 
-
-
-
-    unicode_ranges = sorted(unicode_ranges)
+    unicode_punct_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_punct_ranges.extend(UCSUR_PUNCT_RANGES)
+    unicode_punct_ranges = sorted(unicode_punct_ranges)
     # sorted in case my manual additions are out of order
 
-
-
-
+    # TODO: can i push these outputs directly into the constants.py file?
+
+    if unicode_punct_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_punct_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_punct_ranges.txt", "w") as f:
+            f.write(output)
+
+    unicode_whitespace_ranges = find_unicode_ranges(unicode_whitespace)
+    unicode_whitespace_ranges = sorted(unicode_whitespace_ranges)
+    if unicode_whitespace_ranges != UNICODE_WHITESPACE_RANGES:
+        output = json.dumps(unicode_whitespace_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_whitespace_ranges.txt", "w") as f:
+            f.write(output)
 
 
 def main(argv: argparse.Namespace):
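With these changes, regen_unicode_data collects whitespace (Zl/Zp/Zs) alongside punctuation and writes an updated_*.txt file only when the freshly derived ranges differ from the shipped constants. Both passes lean on the range helpers from src/sonatoki/utils.py (unchanged in this release); a hedged sketch of the contract they appear to satisfy at these call sites:

    # Assumed contract, inferred from usage in __main__.py and constants.py:
    # find_unicode_ranges collapses characters into escaped codepoint ranges;
    # find_unicode_chars expands such ranges back into the member characters.
    from sonatoki.utils import find_unicode_chars, find_unicode_ranges

    chars = '!"#%'                       # a contiguous run (U+21..U+23) plus a singleton (U+25)
    ranges = find_unicode_ranges(chars)  # expected: ["\\U00000021-\\U00000023", "\\U00000025"]
    assert find_unicode_chars(ranges) == chars  # round-trip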
--- sonatoki-0.9.2/src/sonatoki/constants.py
+++ sonatoki-0.10.0/src/sonatoki/constants.py
@@ -109,8 +109,9 @@ UNICODE_PUNCT_RANGES = [
     "\\U00001a1e-\\U00001a1f",
     "\\U00001aa0-\\U00001aa6",
     "\\U00001aa8-\\U00001aad",
+    "\\U00001b4e-\\U00001b4f",
     "\\U00001b5a-\\U00001b6a",
-    "\\U00001b74-\\
+    "\\U00001b74-\\U00001b7f",
     "\\U00001bfc-\\U00001bff",
     "\\U00001c3b-\\U00001c3f",
     "\\U00001c7e-\\U00001c7f",
@@ -152,7 +153,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00002329-\\U000023ce",
     "\\U000023d0-\\U000023e8",
     "\\U000023f4-\\U000023f7",
-    "\\U000023fb-\\
+    "\\U000023fb-\\U00002429",
     "\\U00002440-\\U0000244a",
     "\\U0000249c-\\U000024c1",
     "\\U000024c3-\\U000024e9",
@@ -248,7 +249,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U000030fb",
     "\\U00003190-\\U00003191",
     "\\U00003196-\\U0000319f",
-    "\\U000031c0-\\
+    "\\U000031c0-\\U000031e5",
     "\\U000031ef",
     "\\U00003200-\\U0000321e",
     "\\U0000322a-\\U00003247",
@@ -321,6 +322,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U00010af0-\\U00010af6",
     "\\U00010b39-\\U00010b3f",
     "\\U00010b99-\\U00010b9c",
+    "\\U00010d6e",
+    "\\U00010d8e-\\U00010d8f",
     "\\U00010ead",
     "\\U00010f55-\\U00010f59",
     "\\U00010f86-\\U00010f89",
@@ -335,6 +338,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U000111dd-\\U000111df",
     "\\U00011238-\\U0001123d",
     "\\U000112a9",
+    "\\U000113d4-\\U000113d5",
+    "\\U000113d7-\\U000113d8",
     "\\U0001144b-\\U0001144f",
     "\\U0001145a-\\U0001145b",
     "\\U0001145d",
@@ -351,6 +356,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00011a9a-\\U00011a9c",
     "\\U00011a9e-\\U00011aa2",
     "\\U00011b00-\\U00011b09",
+    "\\U00011be1",
     "\\U00011c41-\\U00011c45",
     "\\U00011c70-\\U00011c71",
     "\\U00011ef7-\\U00011ef8",
@@ -363,10 +369,13 @@ UNICODE_PUNCT_RANGES = [
     "\\U00016af5",
     "\\U00016b37-\\U00016b3f",
     "\\U00016b44-\\U00016b45",
+    "\\U00016d6d-\\U00016d6f",
     "\\U00016e97-\\U00016e9a",
     "\\U00016fe2",
     "\\U0001bc9c",
     "\\U0001bc9f",
+    "\\U0001cc00-\\U0001ccef",
+    "\\U0001cd00-\\U0001ceb3",
     "\\U0001cf50-\\U0001cfc3",
     "\\U0001d000-\\U0001d0f5",
     "\\U0001d100-\\U0001d126",
@@ -395,6 +404,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001da85-\\U0001da8b",
     "\\U0001e14f",
     "\\U0001e2ff",
+    "\\U0001e5ff",
     "\\U0001e95e-\\U0001e95f",
     "\\U0001ecac",
     "\\U0001ecb0",
@@ -464,16 +474,41 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001f850-\\U0001f859",
     "\\U0001f860-\\U0001f887",
     "\\U0001f890-\\U0001f8ad",
-    "\\U0001f8b0-\\
+    "\\U0001f8b0-\\U0001f8bb",
+    "\\U0001f8c0-\\U0001f8c1",
     "\\U0001f900-\\U0001f90b",
     "\\U0001f93b",
     "\\U0001f946",
     "\\U0001fa00-\\U0001fa53",
     "\\U0001fa60-\\U0001fa6d",
     "\\U0001fb00-\\U0001fb92",
-    "\\U0001fb94-\\
+    "\\U0001fb94-\\U0001fbef",
     "\\U000f1990-\\U000f199d",
 ]
+UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
+# this is a large string.
+
+# `\p{posix_punct}` character class
+POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
+
+ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
+ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
+# combined bc the result could be simpler
+
+
+UNICODE_WHITESPACE_RANGES = [
+    "\\U00000020",
+    "\\U000000a0",
+    "\\U00001680",
+    "\\U00002000-\\U0000200a",
+    "\\U00002028-\\U00002029",
+    "\\U0000202f",
+    "\\U0000205f",
+    "\\U00003000",
+]
+UNICODE_WHITESPACE = find_unicode_chars(UNICODE_WHITESPACE_RANGES)
+UNICODE_WHITESPACE_RANGES_STR = "".join(UNICODE_WHITESPACE_RANGES)
 
 
 NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
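The new UNICODE_WHITESPACE_RANGES table reuses the escaped-range notation of the punctuation tables above, and UNICODE_WHITESPACE is its expansion. A small sanity sketch of what the eight entries decode to:

    from sonatoki.constants import UNICODE_WHITESPACE

    assert " " in UNICODE_WHITESPACE       # "\\U00000020", ASCII space
    assert "\u00a0" in UNICODE_WHITESPACE  # no-break space
    assert "\u3000" in UNICODE_WHITESPACE  # ideographic (full-width) space
    assert "\t" not in UNICODE_WHITESPACE  # tab is in Cc, not a Z* separator category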
@@ -482,9 +517,7 @@ EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
 EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
 """All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
 mark these two as punctuation, since they are used exclusively for rendering
-emoji.
-
-But it's even better to use the Emoji filter.
+emoji. But it's best to use the Emoji filter.
 """
 
 UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
@@ -492,20 +525,15 @@ UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
 """Private Use Area glyphs are given the apt but unhelpful 'Private Use'
 class."""
 
-
-
-
-# `\p{posix_punct}` character class
-POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
-POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
-
-ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
-ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
-# combined bc the result could be simpler
+UCSUR_CARTOUCHE_LEFT = ""
+UCSUR_CARTOUCHE_RIGHT = ""
 
-SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
+BASIC_SENTENCE_PUNCT = """.?!:;()[-]‽·•…"""
+QUOTATIVE_PUNCT = """"«»‹›“”‟„⹂「」『』"""
+UCSUR_SENTENCE_PUNCT = """"""
+ALL_SENTENCE_PUNCT = BASIC_SENTENCE_PUNCT + UCSUR_SENTENCE_PUNCT
 
 INTRA_WORD_PUNCT = """-'’."""
 
@@ -691,7 +719,11 @@ UCSUR_RANGES = [
     "\\U000F19A0-\\U000F19A3",  # ku lili
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
-
+ALL_UCSUR = NIMI_UCSUR + find_unicode_chars(UCSUR_PUNCT_RANGES)
+UCSUR_MINUS_CARTOUCHE = set(ALL_UCSUR).difference(
+    {UCSUR_CARTOUCHE_LEFT, UCSUR_CARTOUCHE_RIGHT}
+)
+print(UCSUR_MINUS_CARTOUCHE)
 
 # NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
@@ -757,7 +789,9 @@ __all__ = [
     "POSIX_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES_STR",
+    "UCSUR_SENTENCE_PUNCT",
     "UNICODE_PUNCT",
     "UNICODE_PUNCT_RANGES",
+    "UNICODE_WHITESPACE",
     "VOWELS",
 ]
--- sonatoki-0.9.2/tests/test_tokenize.py
+++ sonatoki-0.10.0/tests/test_tokenize.py
@@ -54,10 +54,11 @@ def test_SentTokenizer(test: TokenizerTest):
         pytest.xfail()
 
     fn_tokenized = SentTokenizer.tokenize(test["input"])
-    re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
-    assert fn_tokenized ==
+    # re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
+    assert fn_tokenized == test["output"], test["name"]
 
 
+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
@@ -65,11 +66,23 @@ def test_SentTokenizerRe(test: TokenizerTest):
     if test["xfail"]:
         pytest.xfail()
 
+    re_tokenized = SentTokenizerRe.tokenize(test["input"])
+    assert re_tokenized == test["output"], test["name"]
+
+
+@pytest.mark.parametrize(
+    "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
+)
+def test_SentTokenizerReCompare(test: TokenizerTest):
+    if test["xfail"]:
+        pytest.xfail()
+
     re_tokenized = SentTokenizerRe.tokenize(test["input"])
     re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
     assert re_tokenized == re1_tokenized, test["name"]
 
 
+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
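Each parametrized case above deserializes into a small record. A hedged sketch of its shape, with fields inferred from the subscripting in these tests (the actual TokenizerTest definition lives in the suite's helpers):

    from typing import List, TypedDict

    class TokenizerTest(TypedDict):
        name: str          # used as the assertion message
        input: str         # raw text handed to tokenize()
        output: List[str]  # expected token list
        xfail: bool        # case is expected to fail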
--- /dev/null
+++ sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -0,0 +1,162 @@
+---
+- name: "basic1"
+  input: "mu. mu."
+  output:
+    - "mu."
+    - "mu."
+- name: "basic2"
+  input: "mu! mu!"
+  output:
+    - "mu!"
+    - "mu!"
+- name: "basic3"
+  input: "mu? mu?"
+  output:
+    - "mu?"
+    - "mu?"
+- name: "basic4"
+  input: "mi mu. mi wawa."
+  output:
+    - "mi mu."
+    - "mi wawa."
+- name: "empty"
+  input: ""
+  output: []
+- name: "whitespace"
+  input: " \n "
+  output: []
+- name: "newline basic"
+  input: "sina lon seme?\nmi wile lon poka...\n"
+  output:
+    - "sina lon seme?"
+    - "mi wile lon poka."
+    - "."
+    - "."
+- name: "newline alone"
+  input: "sina lon seme\nmi wile lon poka"
+  output:
+    - "sina lon seme"
+    - "mi wile lon poka"
+- name: "dash"
+  input: "mi sona ala e ni- sina seme a"
+  output:
+    - "mi sona ala e ni-"
+    - "sina seme a"
+- name: "comma"
+  input: "mi mu tawa sina, mi wawa e sina."
+  output:
+    - "mi mu tawa sina, mi wawa e sina."
+- name: "singlequotes"
+  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
+  output:
+    - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
+    - "'"
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
+- name: "doublequotes 2"
+  input: 'this is a bit dumb, right? they said "where is the pacific ocean?"'
+  output:
+    - "this is a bit dumb, right?"
+    - 'they said "where is the pacific ocean?'
+    - '"'
+- name: "doublequotes 3"
+  input: 'they said "wow, its made"'
+  output:
+    - they said "wow, its made"
+- name: "mixed periods spoilers"
+  input: "||...||"
+  output:
+    - "||."
+    - "."
+    - "."
+    - "||"
+- name: "trailing periods"
+  input: "h.."
+  output:
+    - "h."
+    - "."
+- name: "trailing periods 2"
+  input: "h.!"
+  output:
+    - "h."
+    - "!"
+- name: "intraword punctuation 1"
+  input: "e.g. monsuta"
+  output:
+    - "e.g."
+    - "monsuta"
+- name: "intraword punctuation 2"
+  input: "isn't that game-breaking? i think so"
+  output:
+    - "isn't that game-breaking?"
+    - "i think so"
+- name: "fake intraword punct 1"
+  input: "!.h"
+  output:
+    - "!"
+    - "."
+    - "h"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering pseudo-science."
+    - "and non-sense"
+- name: "discovered case 1"
+  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
+  output:
+    - "ona li ken lukin e sitelen ["
+    - "_ike_nanpa_lete_ike]"
+    - "."
+    - "ni li pona kin."
+- name: "zwj in emoji"
+  input: "👨👩👧👧"
+  output:
+    - "👨👩👧👧"
+
+- name: UCSUR 1
+  input: ""
+  output:
+    - ""
+    - ""
+# - name: "UCSUR 2 (original)"
+#   input: ""
+#   output:
+#     - ""
+#     - ""
+#     - ""
+#     - ""
+- name: "UCSUR 2 (preferred)"
+  input: ""
+  output:
+    - ""
+- name: "UCSUR 3"
+  input: ""
+  output:
+    - ""
+- name: "UCSUR 4"
+  input: ""
+  output:
+    - ""
+- name: "UCSUR 5"
+  input: ""
+  output:
+    - ""
+- name: "UCSUR 6"
+  input: "nvidia shield. and other nvidia products."
+  output:
+    - "nvidia shield."
+    - "and other nvidia products."
+    - ""
+- name: "UCSUR 7"
+  input: ""
+  output:
+    - ""
+    - ""
+- name: "UCSUR 8"
+  input: ""
+  output:
+    - ""
--- sonatoki-0.9.2/tests/tokenize_cases/tokenize_words_tok.yml
+++ sonatoki-0.10.0/tests/tokenize_cases/tokenize_words_tok.yml
@@ -289,7 +289,20 @@
     - "「"
     - "Direct"
     - "」"
-
+- name: "UCSUR 4"
+  input: ""
+  output:
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
+    - ""
 - name: "simple intrapunct 1"
   input: "i'm"
   output:
@@ -376,3 +389,11 @@
   input: "q.q"
   output:
     - "q.q"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering"
+    - "pseudo-science"
+    - "."
+    - "and"
+    - "non-sense"
--- sonatoki-0.9.2/tests/tokenize_cases/tokenize_sentences_tok.yml
+++ /dev/null
@@ -1,71 +0,0 @@
----
--- name: "basic1"
-  input: "mu. mu."
-  output:
-    - "mu."
-    - "mu."
--- name: "basic2"
-  input: "mu! mu!"
-  output:
-    - "mu!"
-    - "mu!"
--- name: "basic3"
-  input: "mu? mu?"
-  output:
-    - "mu?"
-    - "mu?"
--- name: "basic4"
-  input: "mi mu. mi wawa."
-  output:
-    - "mi mu."
-    - "mi wawa."
--- name: "empty"
-  input: ""
-  output: []
--- name: "whitespace"
-  input: " \n "
-  output: []
--- name: "newline basic"
-  input: "sina lon seme?\nmi wile lon poka...\n"
-  output:
-    - "sina lon seme?"
-    - "mi wile lon poka."
-    - "."
-    - "."
--- name: "newline alone"
-  input: "sina lon seme\nmi wile lon poka"
-  output:
-    - "sina lon seme"
-    - "mi wile lon poka"
--- name: "dash"
-  input: "mi sona ala e ni- sina seme a"
-  output:
-    - "mi sona ala e ni-"
-    - "sina seme a"
--- name: "comma"
-  input: "mi mu tawa sina, mi wawa e sina."
-  output:
-    - "mi mu tawa sina, mi wawa e sina."
--- name: "singlequotes"
-  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output:
-    - "toki li tan kulupu Kuko li ni:"
-    - "'o ike ala!"
-    - "'"
--- name: "doublequotes"
-  input: 'ona li toki e ni: "mama sina"'
-  output:
-    - "ona li toki e ni:"
-    - '"'
-    - 'mama sina"'
--- name: "discovered case 1"
-  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
-  output:
-    - "ona li ken lukin e sitelen ["
-    - "_ike_nanpa_lete_ike]"
-    - "."
-    - "ni li pona kin."
--- name: "zwj in emoji"
-  input: "👨👩👧👧"
-  output:
-    - "👨👩👧👧"