sonatoki 0.9.1__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {sonatoki-0.9.1 → sonatoki-0.10.0}/PKG-INFO +1 -1
  2. {sonatoki-0.9.1 → sonatoki-0.10.0}/pyproject.toml +1 -1
  3. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Tokenizers.py +49 -6
  4. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/__main__.py +33 -12
  5. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/constants.py +55 -21
  6. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_ilo.py +1 -0
  7. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_tokenize.py +15 -2
  8. sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml +162 -0
  9. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/tokenize_cases/tokenize_words_tok.yml +73 -13
  10. sonatoki-0.9.1/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -71
  11. {sonatoki-0.9.1 → sonatoki-0.10.0}/LICENSE +0 -0
  12. {sonatoki-0.9.1 → sonatoki-0.10.0}/README.md +0 -0
  13. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Cleaners.py +0 -0
  14. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Configs.py +0 -0
  15. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Filters.py +0 -0
  16. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Preprocessors.py +0 -0
  17. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Scorers.py +0 -0
  18. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/__init__.py +0 -0
  19. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/alphabetic.txt +0 -0
  20. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/ilo.py +0 -0
  21. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/linku.json +0 -0
  22. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/py.typed +0 -0
  23. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/sandbox.json +0 -0
  24. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/types.py +0 -0
  26. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/__init__.py +0 -0
  28. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_filters.py +0 -0
  30. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_preprocessors.py +0 -0
  31. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_properties.py +0 -0
  32. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_scorers.py +0 -0
  33. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.9.1
+Version: 0.10.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.9.1"
+version = "0.10.0"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
src/sonatoki/Tokenizers.py
@@ -12,9 +12,13 @@ from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
 from sonatoki.constants import (
     ALL_PUNCT,
-    SENTENCE_PUNCT,
     INTRA_WORD_PUNCT,
+    ALL_SENTENCE_PUNCT,
+    UNICODE_WHITESPACE,
     ALL_PUNCT_RANGES_STR,
+    UCSUR_CARTOUCHE_LEFT,
+    UCSUR_CARTOUCHE_RIGHT,
+    UCSUR_MINUS_CARTOUCHE,
 )

 regex.DEFAULT_VERSION = regex.VERSION1
@@ -104,6 +108,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1

         cls.add_token(s, tokens, last_match, i)

@@ -142,7 +150,9 @@ class WordTokenizerRe1(Regex1Tokenizer):


 class SentTokenizer(SetTokenizer):
-    delimiters = set(SENTENCE_PUNCT + "\n")  # regex does \n with a flag
+    delimiters: Set[str] = set(ALL_SENTENCE_PUNCT + "\n")  # regex does \n with a flag
+    intra_word_punct: Set[str] = set(INTRA_WORD_PUNCT)
+    all_punct: Set[str] = set(ALL_PUNCT + UNICODE_WHITESPACE)

     @classmethod
     @override
@@ -151,16 +161,43 @@ class SentTokenizer(SetTokenizer):
             return []

         tokens: List[str] = []
+
+        slen = len(s)
         last_match = 0
-        for i, char in enumerate(s):
-            if char not in cls.delimiters:
+        i = 0
+        while i < slen:
+            # if a cartouche appears, we do not want to split on its punctuation
+            if s[i] == UCSUR_CARTOUCHE_LEFT:
+                right_i = s.find(UCSUR_CARTOUCHE_RIGHT, i)
+                contained: set[str] = set()
+                if right_i > 0:
+                    contained = set(s[i + 1 : right_i])
+                # but it must contain only non-cartouche UCSUR chars
+                if contained and contained.issubset(UCSUR_MINUS_CARTOUCHE):
+                    i = right_i + 1
+                    continue
+            if s[i] not in cls.delimiters:
+                i += 1
                 continue
+            if s[i] in cls.intra_word_punct:
+                prev = s[i - 1] if i > 0 else ""
+                next = s[i + 1] if i + 1 < slen else ""
+                if (
+                    prev
+                    and next
+                    and prev not in cls.all_punct
+                    and next not in cls.all_punct
+                ):
+                    i += 2
+                    continue

             match = s[last_match : i + 1].strip()
             last_match = i + 1  # newlines can strip but idc
             if not match:
+                i += 1
                 continue
             tokens.append(match)
+            i += 1

         match = s[last_match:].strip()
         if match:
@@ -169,18 +206,24 @@ class SentTokenizer(SetTokenizer):
         return tokens


+@deprecated(
+    "SentTokenizerRe is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe(RegexTokenizer):
     pattern = re.compile(
-        rf"""(?<=[{regex_escape(SENTENCE_PUNCT)}])|$""", flags=re.MULTILINE
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}])|$""", flags=re.MULTILINE
     )
     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
     # TODO: do the typography characters matter?
     # NOTE: | / and , are *not* sentence delimiters for my purpose


+@deprecated(
+    "SentTokenizerRe1 is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe1(Regex1Tokenizer):
     pattern = regex.compile(
-        rf"""(?<=[{regex_escape(SENTENCE_PUNCT)}]|$)""", flags=regex.MULTILINE
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}]|$)""", flags=regex.MULTILINE
     )

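A quick usage sketch (not part of the diff) of the reworked SentTokenizer, with expected outputs taken from the new tokenize_sentences_tok.yml test cases; tokenize() is called as a classmethod, exactly as the tests do:

    from sonatoki.Tokenizers import SentTokenizer

    # ASCII sentence punctuation still splits as before
    print(SentTokenizer.tokenize("mi mu. mi wawa."))
    # ["mi mu.", "mi wawa."]  (per the "basic4" test case)

    # UCSUR sentence punctuation now delimits, and cartouche contents stay intact
    print(SentTokenizer.tokenize("󱥄󱥬󱥩󱤴󱦜󱥄󱥬󱥩󱤴"))
    # ["󱥄󱥬󱥩󱤴󱦜", "󱥄󱥬󱥩󱤴"]  (per the "UCSUR 1" test case)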
src/sonatoki/__main__.py
@@ -24,6 +24,7 @@ from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import (
     UCSUR_PUNCT_RANGES,
     UNICODE_PUNCT_RANGES,
+    UNICODE_WHITESPACE_RANGES,
     EMOJI_VARIATION_SELECTOR_RANGES,
 )

@@ -121,6 +122,11 @@ def regen_unicode_data():
         "Sc",  # Currency
         "So",  # Other
     }
+    WHITESPACE_CATEGORIES = {
+        "Zl",  # Line Separator
+        "Zp",  # Paragraph Separator
+        "Zs",  # Space Separator
+    }
     r"""These characters are in Symbol other (So) but are not in
    `\p{Punctuation}` However, I began excluding them again, because it turns
    out that some sequences of latin alphabet emoji."""
@@ -134,11 +140,15 @@ def regen_unicode_data():
     def is_punctuation(data: List[str]):
         return data[2] in PUNCT_CATEGORIES

+    def is_whitespace(data: List[str]):
+        return data[2] in WHITESPACE_CATEGORIES
+
     def get_character(data: List[str]):
         return chr(int(data[0], 16))

     unicode_data = download(UNICODE_DATA)
     unicode_punctuation = ""
+    unicode_whitespace = ""
     for line in unicode_data.split("\n"):
         if not line:  # damn you, trailing newline
             continue
@@ -147,24 +157,35 @@ def regen_unicode_data():
         # This does not apply to any currently defined punctuation category.

         unicode_data = line.split(";")
-        if not is_punctuation(unicode_data):
+        if is_punctuation(unicode_data):
+            char = get_character(unicode_data)
+            unicode_punctuation += char
+            continue
+        if is_whitespace((unicode_data)):
+            char = get_character(unicode_data)
+            unicode_whitespace += char
             continue
-
-        char = get_character(unicode_data)
-
-        unicode_punctuation += char

     unicode_punctuation = emoji.replace_emoji(unicode_punctuation)

-    unicode_ranges = find_unicode_ranges(unicode_punctuation)
-    unicode_ranges.extend(UCSUR_PUNCT_RANGES)
-    # unicode_ranges.extend(EMOJI_VARIATION_SELECTOR_RANGES)  # made unnecessary by emoji library
-    unicode_ranges = sorted(unicode_ranges)
+    unicode_punct_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_punct_ranges.extend(UCSUR_PUNCT_RANGES)
+    unicode_punct_ranges = sorted(unicode_punct_ranges)
     # sorted in case my manual additions are out of order

-    if unicode_ranges != UNICODE_PUNCT_RANGES:
-        output = json.dumps(unicode_ranges, indent=4, ensure_ascii=True)
-        print(output)
+    # TODO: can i push these outputs directly into the constants.py file?
+
+    if unicode_punct_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_punct_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_punct_ranges.txt", "w") as f:
+            f.write(output)
+
+    unicode_whitespace_ranges = find_unicode_ranges(unicode_whitespace)
+    unicode_whitespace_ranges = sorted(unicode_whitespace_ranges)
+    if unicode_whitespace_ranges != UNICODE_WHITESPACE_RANGES:
+        output = json.dumps(unicode_whitespace_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_whitespace_ranges.txt", "w") as f:
+            f.write(output)


 def main(argv: argparse.Namespace):
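For context, the new WHITESPACE_CATEGORIES set mirrors the Unicode general categories Zl, Zp, and Zs. A standalone illustration of the same classification using only the standard library (the real script instead parses a downloaded UnicodeData.txt, as shown above):

    import unicodedata

    WHITESPACE_CATEGORIES = {"Zl", "Zp", "Zs"}

    def is_whitespace_char(ch: str) -> bool:
        # same category test the diff adds, but via unicodedata's lookup
        return unicodedata.category(ch) in WHITESPACE_CATEGORIES

    print(is_whitespace_char(" "))       # True  (Zs, Space Separator)
    print(is_whitespace_char("\u2028"))  # True  (Zl, Line Separator)
    print(is_whitespace_char("a"))       # False (Ll)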
src/sonatoki/constants.py
@@ -109,8 +109,9 @@ UNICODE_PUNCT_RANGES = [
     "\\U00001a1e-\\U00001a1f",
     "\\U00001aa0-\\U00001aa6",
     "\\U00001aa8-\\U00001aad",
+    "\\U00001b4e-\\U00001b4f",
     "\\U00001b5a-\\U00001b6a",
-    "\\U00001b74-\\U00001b7e",
+    "\\U00001b74-\\U00001b7f",
     "\\U00001bfc-\\U00001bff",
     "\\U00001c3b-\\U00001c3f",
     "\\U00001c7e-\\U00001c7f",
@@ -152,7 +153,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00002329-\\U000023ce",
     "\\U000023d0-\\U000023e8",
     "\\U000023f4-\\U000023f7",
-    "\\U000023fb-\\U00002426",
+    "\\U000023fb-\\U00002429",
     "\\U00002440-\\U0000244a",
     "\\U0000249c-\\U000024c1",
     "\\U000024c3-\\U000024e9",
@@ -248,7 +249,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U000030fb",
     "\\U00003190-\\U00003191",
     "\\U00003196-\\U0000319f",
-    "\\U000031c0-\\U000031e3",
+    "\\U000031c0-\\U000031e5",
     "\\U000031ef",
     "\\U00003200-\\U0000321e",
     "\\U0000322a-\\U00003247",
@@ -321,6 +322,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U00010af0-\\U00010af6",
     "\\U00010b39-\\U00010b3f",
     "\\U00010b99-\\U00010b9c",
+    "\\U00010d6e",
+    "\\U00010d8e-\\U00010d8f",
     "\\U00010ead",
     "\\U00010f55-\\U00010f59",
     "\\U00010f86-\\U00010f89",
@@ -335,6 +338,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U000111dd-\\U000111df",
     "\\U00011238-\\U0001123d",
     "\\U000112a9",
+    "\\U000113d4-\\U000113d5",
+    "\\U000113d7-\\U000113d8",
     "\\U0001144b-\\U0001144f",
     "\\U0001145a-\\U0001145b",
     "\\U0001145d",
@@ -351,6 +356,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00011a9a-\\U00011a9c",
     "\\U00011a9e-\\U00011aa2",
     "\\U00011b00-\\U00011b09",
+    "\\U00011be1",
     "\\U00011c41-\\U00011c45",
     "\\U00011c70-\\U00011c71",
     "\\U00011ef7-\\U00011ef8",
@@ -363,10 +369,13 @@ UNICODE_PUNCT_RANGES = [
     "\\U00016af5",
     "\\U00016b37-\\U00016b3f",
     "\\U00016b44-\\U00016b45",
+    "\\U00016d6d-\\U00016d6f",
     "\\U00016e97-\\U00016e9a",
     "\\U00016fe2",
     "\\U0001bc9c",
     "\\U0001bc9f",
+    "\\U0001cc00-\\U0001ccef",
+    "\\U0001cd00-\\U0001ceb3",
     "\\U0001cf50-\\U0001cfc3",
     "\\U0001d000-\\U0001d0f5",
     "\\U0001d100-\\U0001d126",
@@ -395,6 +404,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001da85-\\U0001da8b",
     "\\U0001e14f",
     "\\U0001e2ff",
+    "\\U0001e5ff",
     "\\U0001e95e-\\U0001e95f",
     "\\U0001ecac",
     "\\U0001ecb0",
@@ -464,16 +474,41 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001f850-\\U0001f859",
     "\\U0001f860-\\U0001f887",
     "\\U0001f890-\\U0001f8ad",
-    "\\U0001f8b0-\\U0001f8b1",
+    "\\U0001f8b0-\\U0001f8bb",
+    "\\U0001f8c0-\\U0001f8c1",
     "\\U0001f900-\\U0001f90b",
     "\\U0001f93b",
     "\\U0001f946",
     "\\U0001fa00-\\U0001fa53",
     "\\U0001fa60-\\U0001fa6d",
     "\\U0001fb00-\\U0001fb92",
-    "\\U0001fb94-\\U0001fbca",
+    "\\U0001fb94-\\U0001fbef",
     "\\U000f1990-\\U000f199d",
 ]
+UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
+# this is a large string.
+
+# `\p{posix_punct}` character class
+POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
+
+ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
+ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
+# combined bc the result could be simpler
+
+
+UNICODE_WHITESPACE_RANGES = [
+    "\\U00000020",
+    "\\U000000a0",
+    "\\U00001680",
+    "\\U00002000-\\U0000200a",
+    "\\U00002028-\\U00002029",
+    "\\U0000202f",
+    "\\U0000205f",
+    "\\U00003000",
+]
+UNICODE_WHITESPACE = find_unicode_chars(UNICODE_WHITESPACE_RANGES)
+UNICODE_WHITESPACE_RANGES_STR = "".join(UNICODE_WHITESPACE_RANGES)


 NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
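The "\\UXXXXXXXX" strings above are expanded into literal characters by the package's find_unicode_chars helper, whose implementation is not shown in this diff. A hypothetical stand-in, just to illustrate the format of entries like those in UNICODE_WHITESPACE_RANGES:

    from typing import List

    def expand_ranges(ranges: List[str]) -> str:
        # hypothetical stand-in for find_unicode_chars; assumes each entry is
        # either "\\UXXXXXXXX" or "\\UXXXXXXXX-\\UXXXXXXXX"
        chars = ""
        for r in ranges:
            parts = r.split("-")
            start = int(parts[0][2:], 16)   # strip the leading "\U"
            end = int(parts[-1][2:], 16)
            chars += "".join(chr(cp) for cp in range(start, end + 1))
        return chars

    print(repr(expand_ranges(["\\U00002000-\\U0000200a"])))  # the en/em space block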
@@ -482,9 +517,7 @@ EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
 EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
 """All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
 mark these two as punctuation, since they are used exclusively for rendering
-emoji.
-
-But it's even better to use the Emoji filter.
+emoji. But it's best to use the Emoji filter.
 """

 UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
@@ -492,22 +525,17 @@ UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
 """Private Use Area glyphs are given the apt but unhelpful 'Private Use'
 class."""

-UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
-# this is a large string.
-
-# `\p{posix_punct}` character class
-POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
-POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
-
-ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
-ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
-# combined bc the result could be simpler
+UCSUR_CARTOUCHE_LEFT = "󱦐"
+UCSUR_CARTOUCHE_RIGHT = "󱦑"

-SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
+BASIC_SENTENCE_PUNCT = """.?!:;()[-]‽·•…"""
+QUOTATIVE_PUNCT = """"«»‹›“”‟„⹂「」『』"""
+UCSUR_SENTENCE_PUNCT = """󱦜󱦝"""
+ALL_SENTENCE_PUNCT = BASIC_SENTENCE_PUNCT + UCSUR_SENTENCE_PUNCT

-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""


 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -691,7 +719,11 @@ UCSUR_RANGES = [
     "\\U000F19A0-\\U000F19A3",  # ku lili
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
-
+ALL_UCSUR = NIMI_UCSUR + find_unicode_chars(UCSUR_PUNCT_RANGES)
+UCSUR_MINUS_CARTOUCHE = set(ALL_UCSUR).difference(
+    {UCSUR_CARTOUCHE_LEFT, UCSUR_CARTOUCHE_RIGHT}
+)
+print(UCSUR_MINUS_CARTOUCHE)

 # NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
@@ -757,7 +789,9 @@ __all__ = [
     "POSIX_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES_STR",
+    "UCSUR_SENTENCE_PUNCT",
     "UNICODE_PUNCT",
     "UNICODE_PUNCT_RANGES",
+    "UNICODE_WHITESPACE",
     "VOWELS",
 ]
tests/test_ilo.py
@@ -165,6 +165,7 @@ EXCESSIVE_ENGLISH = [
     "I wanna see",  # same down to here
     "i'm online all the time",
     "How to Cut a Kiwi",
+    "ni li make e sense",
     "21st",  # previous false positive; fixed by ProperName change
     "a e i o u",  # voting brings this back to false positive zone...
 ]
tests/test_tokenize.py
@@ -54,10 +54,11 @@ def test_SentTokenizer(test: TokenizerTest):
         pytest.xfail()

     fn_tokenized = SentTokenizer.tokenize(test["input"])
-    re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
-    assert fn_tokenized == re1_tokenized, test["name"]
+    # re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
+    assert fn_tokenized == test["output"], test["name"]


+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
@@ -65,11 +66,23 @@ def test_SentTokenizerRe(test: TokenizerTest):
     if test["xfail"]:
         pytest.xfail()

+    re_tokenized = SentTokenizerRe.tokenize(test["input"])
+    assert re_tokenized == test["output"], test["name"]
+
+
+@pytest.mark.parametrize(
+    "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
+)
+def test_SentTokenizerReCompare(test: TokenizerTest):
+    if test["xfail"]:
+        pytest.xfail()
+
     re_tokenized = SentTokenizerRe.tokenize(test["input"])
     re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
     assert re_tokenized == re1_tokenized, test["name"]


+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml (new file)
@@ -0,0 +1,162 @@
+---
+- name: "basic1"
+  input: "mu. mu."
+  output:
+    - "mu."
+    - "mu."
+- name: "basic2"
+  input: "mu! mu!"
+  output:
+    - "mu!"
+    - "mu!"
+- name: "basic3"
+  input: "mu? mu?"
+  output:
+    - "mu?"
+    - "mu?"
+- name: "basic4"
+  input: "mi mu. mi wawa."
+  output:
+    - "mi mu."
+    - "mi wawa."
+- name: "empty"
+  input: ""
+  output: []
+- name: "whitespace"
+  input: " \n "
+  output: []
+- name: "newline basic"
+  input: "sina lon seme?\nmi wile lon poka...\n"
+  output:
+    - "sina lon seme?"
+    - "mi wile lon poka."
+    - "."
+    - "."
+- name: "newline alone"
+  input: "sina lon seme\nmi wile lon poka"
+  output:
+    - "sina lon seme"
+    - "mi wile lon poka"
+- name: "dash"
+  input: "mi sona ala e ni- sina seme a"
+  output:
+    - "mi sona ala e ni-"
+    - "sina seme a"
+- name: "comma"
+  input: "mi mu tawa sina, mi wawa e sina."
+  output:
+    - "mi mu tawa sina, mi wawa e sina."
+- name: "singlequotes"
+  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
+  output:
+    - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
+    - "'"
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
+- name: "doublequotes 2"
+  input: 'this is a bit dumb, right? they said "where is the pacific ocean?"'
+  output:
+    - "this is a bit dumb, right?"
+    - 'they said "where is the pacific ocean?'
+    - '"'
+- name: "doublequotes 3"
+  input: 'they said "wow, its made"'
+  output:
+    - they said "wow, its made"
+- name: "mixed periods spoilers"
+  input: "||...||"
+  output:
+    - "||."
+    - "."
+    - "."
+    - "||"
+- name: "trailing periods"
+  input: "h.."
+  output:
+    - "h."
+    - "."
+- name: "trailing periods 2"
+  input: "h.!"
+  output:
+    - "h."
+    - "!"
+- name: "intraword punctuation 1"
+  input: "e.g. monsuta"
+  output:
+    - "e.g."
+    - "monsuta"
+- name: "intraword punctuation 2"
+  input: "isn't that game-breaking? i think so"
+  output:
+    - "isn't that game-breaking?"
+    - "i think so"
+- name: "fake intraword punct 1"
+  input: "!.h"
+  output:
+    - "!"
+    - "."
+    - "h"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering pseudo-science."
+    - "and non-sense"
+- name: "discovered case 1"
+  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
+  output:
+    - "ona li ken lukin e sitelen ["
+    - "_ike_nanpa_lete_ike]"
+    - "."
+    - "ni li pona kin."
+- name: "zwj in emoji"
+  input: "👨‍👩‍👧‍👧"
+  output:
+    - "👨‍👩‍👧‍👧"
+
+- name: UCSUR 1
+  input: "󱥄󱥬󱥩󱤴󱦜󱥄󱥬󱥩󱤴"
+  output:
+    - "󱥄󱥬󱥩󱤴󱦜"
+    - "󱥄󱥬󱥩󱤴"
+# - name: "UCSUR 2 (original)"
+#   input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+#   output:
+#     - "󱤴󱤺󱦐󱤘󱦜"
+#     - "󱤕󱦜"
+#     - "󱤾󱦑󱦐󱤼󱦝"
+#     - "󱦑"
+- name: "UCSUR 2 (preferred)"
+  input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+- name: "UCSUR 3"
+  input: "󱤴󱦐󱦑󱦐󱦑"
+  output:
+    - "󱤴󱦐󱦑󱦐󱦑"
+- name: "UCSUR 4"
+  input: "󱤴󱦐󱦐"
+  output:
+    - "󱤴󱦐󱦐"
+- name: "UCSUR 5"
+  input: "󱦑󱤴󱦐󱦐"
+  output:
+    - "󱦑󱤴󱦐󱦐"
+- name: "UCSUR 6"
+  input: "󱦐nvidia shield. and other nvidia products.󱦑"
+  output:
+    - "󱦐nvidia shield."
+    - "and other nvidia products."
+    - "󱦑"
+- name: "UCSUR 7"
+  input: "󱤴󱤺󱦐󱤘󱦜󱦐󱤕󱦐󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱤘󱦜"
+    - "󱦐󱤕󱦐󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+- name: "UCSUR 8"
+  input: "󱤴󱤺󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦝󱦑"
tests/tokenize_cases/tokenize_words_tok.yml
@@ -53,9 +53,7 @@
   output:
     - "i'm"
     - "an"
-    - "m"
-    - "."
-    - "d"
+    - "m.d"
     - "."
 - name: "english 4"
   input: "it's mind-numbing honestly"
@@ -142,15 +140,7 @@
 - name: periods every word
   input: "mi.unpa.e.mama.sina"
   output:
-    - "mi"
-    - "."
-    - "unpa"
-    - "."
-    - "e"
-    - "."
-    - "mama"
-    - "."
-    - "sina"
+    - "mi.unpa.e.mama.sina"
 - name: simple bold
   input: "**mi unpa e mama sina**"
   output:
@@ -299,7 +289,20 @@
     - "「"
     - "Direct"
     - "」"
-
+- name: "UCSUR 4"
+  input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴"
+    - "󱤺"
+    - "󱦐"
+    - "󱤘"
+    - "󱦜"
+    - "󱤕"
+    - "󱦜"
+    - "󱤾"
+    - "󱦑󱦐"
+    - "󱤼"
+    - "󱦝󱦑"
 - name: "simple intrapunct 1"
   input: "i'm"
   output:
@@ -313,6 +316,11 @@
   input: "isn't"
   output:
     - "isn't"
+- name: "simple intrapunct with punct"
+  input: "isn't."
+  output:
+    - "isn't"
+    - "."
 - name: "quoted with intrapunct"
   input: "'bother'"
   output:
@@ -337,3 +345,55 @@
   input: "whom's't'd've'n't"
   output:
     - "whom's't'd've'n't"
+- name: "just periods"
+  input: "..."
+  output:
+    - "..."
+- name: "just periods 2"
+  input: "... ..."
+  output:
+    - "..."
+    - "..."
+- name: "mixed periods spoilers"
+  input: "||...||"
+  output:
+    - "||...||"
+- name: "trailing periods"
+  input: "h.."
+  output:
+    - "h"
+    - ".."
+- name: "trailing periods"
+  input: "h.!"
+  output:
+    - "h"
+    - ".!"
+- name: "trailing period"
+  input: "h."
+  output:
+    - "h"
+    - "."
+- name: "trailing interpunctuation"
+  input: "h-.'"
+  output:
+    - "h"
+    - "-.'"
+- name: "trailing period 2"
+  input: "h. h."
+  output:
+    - "h"
+    - "."
+    - "h"
+    - "."
+- name: "sad face"
+  input: "q.q"
+  output:
+    - "q.q"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering"
+    - "pseudo-science"
+    - "."
+    - "and"
+    - "non-sense"
sonatoki-0.9.1/tests/tokenize_cases/tokenize_sentences_tok.yml (removed file)
@@ -1,71 +0,0 @@
----
-- name: "basic1"
-  input: "mu. mu."
-  output:
-    - "mu."
-    - "mu."
-- name: "basic2"
-  input: "mu! mu!"
-  output:
-    - "mu!"
-    - "mu!"
-- name: "basic3"
-  input: "mu? mu?"
-  output:
-    - "mu?"
-    - "mu?"
-- name: "basic4"
-  input: "mi mu. mi wawa."
-  output:
-    - "mi mu."
-    - "mi wawa."
-- name: "empty"
-  input: ""
-  output: []
-- name: "whitespace"
-  input: " \n "
-  output: []
-- name: "newline basic"
-  input: "sina lon seme?\nmi wile lon poka...\n"
-  output:
-    - "sina lon seme?"
-    - "mi wile lon poka."
-    - "."
-    - "."
-- name: "newline alone"
-  input: "sina lon seme\nmi wile lon poka"
-  output:
-    - "sina lon seme"
-    - "mi wile lon poka"
-- name: "dash"
-  input: "mi sona ala e ni- sina seme a"
-  output:
-    - "mi sona ala e ni-"
-    - "sina seme a"
-- name: "comma"
-  input: "mi mu tawa sina, mi wawa e sina."
-  output:
-    - "mi mu tawa sina, mi wawa e sina."
-- name: "singlequotes"
-  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output:
-    - "toki li tan kulupu Kuko li ni:"
-    - "'o ike ala!"
-    - "'"
-- name: "doublequotes"
-  input: 'ona li toki e ni: "mama sina"'
-  output:
-    - "ona li toki e ni:"
-    - '"'
-    - 'mama sina"'
-- name: "discovered case 1"
-  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
-  output:
-    - "ona li ken lukin e sitelen ["
-    - "_ike_nanpa_lete_ike]"
-    - "."
-    - "ni li pona kin."
-- name: "zwj in emoji"
-  input: "👨‍👩‍👧‍👧"
-  output:
-    - "👨‍👩‍👧‍👧"