sonatoki 0.11.0__tar.gz → 0.11.1__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.11.0 → sonatoki-0.11.1}/PKG-INFO +1 -1
  2. {sonatoki-0.11.0 → sonatoki-0.11.1}/pyproject.toml +1 -1
  3. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/constants.py +4 -1
  4. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +23 -0
  5. {sonatoki-0.11.0 → sonatoki-0.11.1}/LICENSE +0 -0
  6. {sonatoki-0.11.0 → sonatoki-0.11.1}/README.md +0 -0
  7. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Cleaners.py +0 -0
  8. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Configs.py +0 -0
  9. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Filters.py +0 -0
  10. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Preprocessors.py +0 -0
  11. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Scorers.py +0 -0
  12. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/Tokenizers.py +0 -0
  13. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/__init__.py +0 -0
  14. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/__main__.py +0 -0
  15. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/alphabetic.txt +0 -0
  16. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/ilo.py +0 -0
  17. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/linku.json +0 -0
  18. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/py.typed +0 -0
  19. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/sandbox.json +0 -0
  20. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/syllabic.txt +0 -0
  21. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/types.py +0 -0
  22. {sonatoki-0.11.0 → sonatoki-0.11.1}/src/sonatoki/utils.py +0 -0
  23. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/__init__.py +0 -0
  24. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_cleaners.py +0 -0
  25. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_filters.py +0 -0
  26. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_ilo.py +0 -0
  27. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_properties.py +0 -0
  29. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/test_utils.py +0 -0
  32. {sonatoki-0.11.0 → sonatoki-0.11.1}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.11.0
3
+ Version: 0.11.1
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.11.0"
3
+ version = "0.11.1"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -498,7 +498,10 @@ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
498
498
 
499
499
 
500
500
  UNICODE_WHITESPACE_RANGES = [
501
- "\\U00000020",
501
+ "\\U00000009", # tab
502
+ "\\U0000000A", # line feed
503
+ "\\U0000000D", # carriage return
504
+ "\\U00000020", # space
502
505
  "\\U000000a0",
503
506
  "\\U00001680",
504
507
  "\\U00002000-\\U0000200a",
@@ -94,6 +94,29 @@
94
94
  output:
95
95
  - "isn't that game-breaking?"
96
96
  - "i think so"
97
+ - name: "intraword punctuation 3"
98
+ input: "e.g.\n- monsuta\n- monsi\n- ma"
99
+ output:
100
+ - "e.g."
101
+ - "-"
102
+ - "monsuta"
103
+ - "-"
104
+ - "monsi"
105
+ - "-"
106
+ - "ma"
107
+ - name: "multiline with fake intraword"
108
+ input: >
109
+ toki!
110
+ sitelen pini ni li tu ala e toki.
111
+ ni kin.
112
+ taso ni li pini e toki anu seme:
113
+ pini la ni li toki sin.
114
+ output:
115
+ - "toki!"
116
+ - "sitelen pini ni li tu ala e toki."
117
+ - "ni kin."
118
+ - "taso ni li pini e toki anu seme:"
119
+ - "pini la ni li toki sin."
97
120
  - name: "fake intraword punct 1"
98
121
  input: "!.h"
99
122
  output:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes