sonatoki 0.11.2__tar.gz → 0.11.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.11.2 → sonatoki-0.11.3}/PKG-INFO +1 -1
- {sonatoki-0.11.2 → sonatoki-0.11.3}/pyproject.toml +1 -1
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/constants.py +1 -1
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +5 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/tokenize_cases/tokenize_words_tok.yml +10 -7
- {sonatoki-0.11.2 → sonatoki-0.11.3}/LICENSE +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/README.md +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Configs.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/types.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/__init__.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_cleaners.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_filters.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_ilo.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_properties.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_scorers.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_tokenize.py +0 -0
- {sonatoki-0.11.2 → sonatoki-0.11.3}/tests/test_utils.py +0 -0
@@ -538,7 +538,7 @@ QUOTATIVE_PUNCT = """"«»‹›“”‟„⹂「」『』"""
|
|
538
538
|
UCSUR_SENTENCE_PUNCT = """"""
|
539
539
|
ALL_SENTENCE_PUNCT = BASIC_SENTENCE_PUNCT + UCSUR_SENTENCE_PUNCT
|
540
540
|
|
541
|
-
INTRA_WORD_PUNCT = """-'’."""
|
541
|
+
INTRA_WORD_PUNCT = """-'’._"""
|
542
542
|
|
543
543
|
|
544
544
|
LINKU = Path(__file__).resolve().parent / Path("linku.json")
|
@@ -104,6 +104,11 @@
|
|
104
104
|
- "monsi"
|
105
105
|
- "-"
|
106
106
|
- "ma"
|
107
|
+
- name: "intraword punctuation 4"
|
108
|
+
input: "look at this variable: leaf_node_right"
|
109
|
+
output:
|
110
|
+
- "look at this variable:"
|
111
|
+
- "leaf_node_right"
|
107
112
|
- name: "multiline with fake intraword"
|
108
113
|
input: >
|
109
114
|
toki!
|
@@ -187,13 +187,7 @@
|
|
187
187
|
- "e"
|
188
188
|
- "sitelen"
|
189
189
|
- "[_"
|
190
|
-
- "ike"
|
191
|
-
- "_"
|
192
|
-
- "nanpa"
|
193
|
-
- "_"
|
194
|
-
- "lete"
|
195
|
-
- "_"
|
196
|
-
- "ike"
|
190
|
+
- "ike_nanpa_lete_ike"
|
197
191
|
- "]."
|
198
192
|
- "ni"
|
199
193
|
- "li"
|
@@ -345,6 +339,15 @@
|
|
345
339
|
input: "whom's't'd've'n't"
|
346
340
|
output:
|
347
341
|
- "whom's't'd've'n't"
|
342
|
+
- name: "underscore"
|
343
|
+
input: "look at this variable: leaf_node_right"
|
344
|
+
output:
|
345
|
+
- "look"
|
346
|
+
- "at"
|
347
|
+
- "this"
|
348
|
+
- "variable"
|
349
|
+
- ":"
|
350
|
+
- "leaf_node_right"
|
348
351
|
- name: "just periods"
|
349
352
|
input: "..."
|
350
353
|
output:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|