sonatoki 0.5.2__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.5.2 → sonatoki-0.5.3}/PKG-INFO +1 -1
- {sonatoki-0.5.2 → sonatoki-0.5.3}/pyproject.toml +1 -1
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Configs.py +24 -10
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_properties.py +4 -2
- {sonatoki-0.5.2 → sonatoki-0.5.3}/LICENSE +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/README.md +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/__init__.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_cleaners.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_filters.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_ilo.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_scorers.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_tokenize.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/test_utils.py +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.5.2 → sonatoki-0.5.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -112,16 +112,30 @@ __corpus_tokens_dict: Set[str] = cast(
|
|
112
112
|
].tokens, # pyright: ignore[reportAttributeAccessIssue]
|
113
113
|
)
|
114
114
|
__corpus_tokens_dict -= {
|
115
|
-
|
116
|
-
|
117
|
-
"
|
118
|
-
"
|
119
|
-
"
|
120
|
-
"
|
121
|
-
"
|
122
|
-
"
|
123
|
-
"
|
124
|
-
"
|
115
|
+
# Sandbox words are removed from the CorpusConfig if they appear more frequently in English than in Toki Pona by a factor of at least 3.
|
116
|
+
# In this case, all of these appear more often in English by a factor of at least 10.
|
117
|
+
"aka", # also known as
|
118
|
+
"an", # article
|
119
|
+
"api", # API
|
120
|
+
"i", # 1st person
|
121
|
+
"kana", # japanese script
|
122
|
+
"me", # 1st person
|
123
|
+
"ne", # "no" in several languages
|
124
|
+
"nu", # "new", now in dutch
|
125
|
+
"se", # spanish particle, "see"
|
126
|
+
"take", # acquire, perhaps forcefully or without permission
|
127
|
+
"ten", # 10
|
128
|
+
"to", # to, too
|
129
|
+
"u", # no u
|
130
|
+
"we", # 1st person plural
|
131
|
+
"wi", # wii and discussions of syllables
|
132
|
+
"sole", # singular, of shoe
|
133
|
+
# unexplored candidates for removal
|
134
|
+
# "omen", # ominous
|
135
|
+
# "papa", # father
|
136
|
+
# "lo", # "lo" and "loo"
|
137
|
+
# "ewe", # sheep
|
138
|
+
# "pa", # father- eh?
|
125
139
|
}
|
126
140
|
"""Mimics the previous implementation of ilo pi toki pona taso."""
|
127
141
|
LazyConfig: IloConfig = {
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# PDM
|
2
2
|
import hypothesis.strategies as st
|
3
|
-
from hypothesis import given
|
3
|
+
from hypothesis import given, assume
|
4
4
|
|
5
5
|
# LOCAL
|
6
6
|
from sonatoki.Filters import (
|
@@ -54,11 +54,13 @@ def test_ku_filters_non_overlap(s: str):
|
|
54
54
|
| NIMI_LINKU_COMMON
|
55
55
|
| NIMI_LINKU_UNCOMMON
|
56
56
|
| NIMI_LINKU_OBSCURE
|
57
|
-
| NIMI_LINKU_SANDBOX
|
57
|
+
| NIMI_LINKU_SANDBOX
|
58
58
|
)
|
59
59
|
)
|
60
60
|
)
|
61
61
|
def test_linku_filters_non_overlap(s: str):
|
62
|
+
_ = assume(s != "su")
|
63
|
+
|
62
64
|
s = Lowercase.clean(s)
|
63
65
|
s = ConsecutiveDuplicates.clean(s)
|
64
66
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|