sonatoki 0.9.1__tar.gz → 0.9.2__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.9.1 → sonatoki-0.9.2}/PKG-INFO +1 -1
  2. {sonatoki-0.9.1 → sonatoki-0.9.2}/pyproject.toml +1 -1
  3. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Tokenizers.py +4 -0
  4. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/constants.py +1 -1
  5. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_ilo.py +1 -0
  6. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_words_tok.yml +51 -12
  7. {sonatoki-0.9.1 → sonatoki-0.9.2}/LICENSE +0 -0
  8. {sonatoki-0.9.1 → sonatoki-0.9.2}/README.md +0 -0
  9. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Cleaners.py +0 -0
  10. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Configs.py +0 -0
  11. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Filters.py +0 -0
  12. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Preprocessors.py +0 -0
  13. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/Scorers.py +0 -0
  14. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/__init__.py +0 -0
  15. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/__main__.py +0 -0
  16. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/alphabetic.txt +0 -0
  17. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/ilo.py +0 -0
  18. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/linku.json +0 -0
  19. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/py.typed +0 -0
  20. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/sandbox.json +0 -0
  21. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/syllabic.txt +0 -0
  22. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/types.py +0 -0
  23. {sonatoki-0.9.1 → sonatoki-0.9.2}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/__init__.py +0 -0
  25. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_filters.py +0 -0
  27. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_properties.py +0 -0
  29. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/test_utils.py +0 -0
  32. {sonatoki-0.9.1 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.9.1
3
+ Version: 0.9.2
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.9.1"
3
+ version = "0.9.2"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
104
104
  # we skipped, but there wasn't another writing character
105
105
  cls.add_token(s, tokens, last_match, i - 1)
106
106
  last_match = i - 1
107
+ # there may be punctuation though
108
+ # TODO: this is duplicated
109
+ while i < slen and cls.is_delimiter(s[i]):
110
+ i += 1
107
111
 
108
112
  cls.add_token(s, tokens, last_match, i)
109
113
 
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
507
507
  # single quotes are word boundaries if not intra-word, but double quotes are sentence
508
508
  # boundaries
509
509
 
510
- INTRA_WORD_PUNCT = """-'"""
510
+ INTRA_WORD_PUNCT = """-'’."""
511
511
 
512
512
 
513
513
  LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -165,6 +165,7 @@ EXCESSIVE_ENGLISH = [
165
165
  "I wanna see", # same down to here
166
166
  "i'm online all the time",
167
167
  "How to Cut a Kiwi",
168
+ "ni li make e sense",
168
169
  "21st", # previous false positive; fixed by ProperName change
169
170
  "a e i o u", # voting brings this back to false positive zone...
170
171
  ]
@@ -53,9 +53,7 @@
53
53
  output:
54
54
  - "i'm"
55
55
  - "an"
56
- - "m"
57
- - "."
58
- - "d"
56
+ - "m.d"
59
57
  - "."
60
58
  - name: "english 4"
61
59
  input: "it's mind-numbing honestly"
@@ -142,15 +140,7 @@
142
140
  - name: periods every word
143
141
  input: "mi.unpa.e.mama.sina"
144
142
  output:
145
- - "mi"
146
- - "."
147
- - "unpa"
148
- - "."
149
- - "e"
150
- - "."
151
- - "mama"
152
- - "."
153
- - "sina"
143
+ - "mi.unpa.e.mama.sina"
154
144
  - name: simple bold
155
145
  input: "**mi unpa e mama sina**"
156
146
  output:
@@ -313,6 +303,11 @@
313
303
  input: "isn't"
314
304
  output:
315
305
  - "isn't"
306
+ - name: "simple intrapunct with punct"
307
+ input: "isn't."
308
+ output:
309
+ - "isn't"
310
+ - "."
316
311
  - name: "quoted with intrapunct"
317
312
  input: "'bother'"
318
313
  output:
@@ -337,3 +332,47 @@
337
332
  input: "whom's't'd've'n't"
338
333
  output:
339
334
  - "whom's't'd've'n't"
335
+ - name: "just periods"
336
+ input: "..."
337
+ output:
338
+ - "..."
339
+ - name: "just periods 2"
340
+ input: "... ..."
341
+ output:
342
+ - "..."
343
+ - "..."
344
+ - name: "mixed periods spoilers"
345
+ input: "||...||"
346
+ output:
347
+ - "||...||"
348
+ - name: "trailing periods"
349
+ input: "h.."
350
+ output:
351
+ - "h"
352
+ - ".."
353
+ - name: "trailing periods"
354
+ input: "h.!"
355
+ output:
356
+ - "h"
357
+ - ".!"
358
+ - name: "trailing period"
359
+ input: "h."
360
+ output:
361
+ - "h"
362
+ - "."
363
+ - name: "trailing interpunctuation"
364
+ input: "h-.'"
365
+ output:
366
+ - "h"
367
+ - "-.'"
368
+ - name: "trailing period 2"
369
+ input: "h. h."
370
+ output:
371
+ - "h"
372
+ - "."
373
+ - "h"
374
+ - "."
375
+ - name: "sad face"
376
+ input: "q.q"
377
+ output:
378
+ - "q.q"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes