sonatoki 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sonatoki/Configs.py +3 -3
- sonatoki/Scorers.py +13 -8
- sonatoki/Tokenizers.py +4 -0
- sonatoki/constants.py +9 -3
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/types.py +1 -1
- {sonatoki-0.9.0.dist-info → sonatoki-0.9.2.dist-info}/METADATA +1 -1
- sonatoki-0.9.2.dist-info/RECORD +22 -0
- {sonatoki-0.9.0.dist-info → sonatoki-0.9.2.dist-info}/WHEEL +1 -1
- sonatoki-0.9.0.dist-info/RECORD +0 -22
- {sonatoki-0.9.0.dist-info → sonatoki-0.9.2.dist-info}/entry_points.txt +0 -0
- {sonatoki-0.9.0.dist-info → sonatoki-0.9.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
```diff
@@ -52,20 +52,20 @@ __DICT_PHONOMATCHES = {
     "an",  # article
     "api",  # API
     "i",  # 1st person
+    "je",  # 1st person pronoun, french
     "kana",  # japanese script
     "me",  # 1st person singular, english
     "ne",  # "no" in several languages
     "nu",  # "new" in english, "now" in dutch
+    "omen",  # ominous
     "se",  # spanish particle, english "see"
+    "sole",  # singular, of shoe
     "take",  # acquire, perhaps forcefully or without permission
     "ten",  # 10
     "to",  # to, too
-    "je",  # 1st person pronoun, french
     "u",  # no u
     "we",  # 1st person plural, english
     "wi",  # wii and discussions of syllables
-    "sole",  # singular, of shoe
-    "omen",  # ominous
     # unexplored candidates for removal
     # "papa",  # father
     # "lo",  # "lo" and "loo"
```
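The only change here is alphabetizing the `__DICT_PHONOMATCHES` set literal ("je", "omen", and "sole" move; nothing is added or removed). That is behavior-neutral, since Python sets compare by membership rather than source order. A minimal check, with members taken from the diff:

```python
# Reordering a set literal is behavior-neutral: Python sets compare by
# membership, not by the order entries appear in source.
before = {"je", "sole", "omen", "kana", "ten"}  # 0.9.0 source order
after = {"je", "kana", "omen", "sole", "ten"}   # 0.9.2 alphabetized order
assert before == after
```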
sonatoki/Scorers.py
CHANGED
```diff
@@ -113,19 +113,24 @@ class Scaling(Scorer):


 class Voting(Scaling):
-    """Derives from `Scaling` in assigning scores from 0 to 1 based on
-
-
-
-
-    If
-
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on the
+    first matching filter out of the list of filters. However, after all scores
+    are derived, each token scoring less than the threshold is assigned the
+    average score of its nearest 3 neighbors. The default threshold is 0.
+
+    If there are 3 or fewer tokens, this scorer is identical to the
+    Scaling scorer.
+
+    If the Voting scorer is created with a Filter, tokens must also
+    match that filter to be considered for voting. For example, the
+    following Voting filter would only check words with a score of 0.3
+    or less that still match the Syllabic filter: `Voting(Syllabic, 0.3)`
     """

     prereq: Type[Filter] = Pass
     threshold: int = 0

-    def __new__(cls, filter: Type[Filter], threshold_: int):
+    def __new__(cls, filter: Type[Filter], threshold_: int = 0) -> Type[Scorer]:
         class AnonVoting(Voting):
             prereq = filter
             threshold = threshold_
```
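The rewritten docstring pins down the voting behavior, and `__new__` gains a default threshold and a return annotation. A sketch of the class-factory pattern this implies, grounded in the docstring's own `Voting(Syllabic, 0.3)` example; the hunk cuts off before the `return`, so the assumption that the anonymous subclass is returned (and the import paths) should be treated as unverified:

```python
# Sketch of the class-factory pattern shown in the diff: calling
# Voting(...) returns an anonymous *subclass* of Voting, not an instance.
# Assumes __new__ ends with `return AnonVoting`, which the hunk cuts off.
from typing import Type

from sonatoki.Filters import Syllabic
from sonatoki.Scorers import Scorer, Voting

# Tokens scoring below the 0.3 threshold are re-scored from their 3
# nearest neighbors, but only if they also match the Syllabic prereq.
LenientVoting: Type[Scorer] = Voting(Syllabic, 0.3)

assert issubclass(LenientVoting, Voting)
assert LenientVoting.prereq is Syllabic
```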
sonatoki/Tokenizers.py
CHANGED
```diff
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
             # we skipped, but there wasn't another writing character
             cls.add_token(s, tokens, last_match, i - 1)
             last_match = i - 1
+            # there may be punctuation though
+            # TODO: this is duplicated
+            while i < slen and cls.is_delimiter(s[i]):
+                i += 1

         cls.add_token(s, tokens, last_match, i)

```
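The added loop advances the cursor past a run of delimiter characters after a token boundary, so trailing punctuation is not folded into the final token. A standalone sketch of the same skip logic; the inline `is_delimiter` is a stand-in for `WordTokenizer.is_delimiter`, not sonatoki's actual predicate:

```python
# Standalone sketch of the added delimiter-skip loop; the delimiter set
# here is illustrative, not sonatoki's real one.
def skip_delimiters(s: str, i: int) -> int:
    """Advance i past any run of delimiter characters starting at s[i]."""
    slen = len(s)
    is_delimiter = lambda c: c in "!?,. "
    while i < slen and is_delimiter(s[i]):
        i += 1
    return i

# cursor lands on the "p" of "pona" after skipping "!! "
assert skip_delimiters("toki!! pona", 4) == 7
```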
sonatoki/constants.py
CHANGED
```diff
@@ -7,7 +7,7 @@ from pathlib import Path
 from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges

-LATEST_DATE = "
+LATEST_DATE = "2024-09"
 # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!


@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries

-INTRA_WORD_PUNCT = """-'
+INTRA_WORD_PUNCT = """-'’."""


 LINKU = Path(__file__).resolve().parent / Path("linku.json")
```
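With `’` and `.` now in `INTRA_WORD_PUNCT`, contractions and abbreviations can survive tokenization as single tokens. A hypothetical predicate illustrating the rule; sonatoki's tokenizer applies it positionally while scanning rather than through a helper like this:

```python
# Illustrative only: a hypothetical predicate for the new
# INTRA_WORD_PUNCT value; not sonatoki's actual tokenizer code.
INTRA_WORD_PUNCT = """-'’."""

def joins_word(prev: str, ch: str, nxt: str) -> bool:
    # punctuation only binds a word together when flanked by letters
    return ch in INTRA_WORD_PUNCT and prev.isalpha() and nxt.isalpha()

assert joins_word("n", "’", "t")      # "don’t" stays one token
assert not joins_word(" ", "-", "a")  # a leading hyphen still splits
```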
```diff
@@ -668,10 +668,11 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "not",
     "link",
     "wait",
+    "just",
     "lol",
     "new",
     "also",
-    "
+    "isnt",
     "mean",
     "means",
     "it",
@@ -681,6 +682,7 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "new",
     "wel",
     "makes",
+    "unles",
 }

 UCSUR_RANGES = [
```
```diff
@@ -722,6 +724,10 @@ def words_by_usage(

     result: Set[str] = set()
     for word in data.values():
+        if usage == 0:
+            result.add(word["word"])
+            continue
+
         usages = word["usage"]
         if date in usages and usages[date] >= usage:
             result.add(word["word"])
```
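The new `usage == 0` branch short-circuits the per-date lookup: at the zero threshold every word qualifies, including words with no recorded usage for the requested date. A self-contained sketch of the updated loop over toy data; the dict shape only loosely follows `sonatoki.types.LinkuWord`:

```python
# Self-contained sketch of the updated words_by_usage loop; toy data
# shape loosely follows sonatoki.types.LinkuWord.
from typing import Dict, Set

def words_by_usage(data: Dict[str, dict], usage: int = 0, date: str = "2024-09") -> Set[str]:
    result: Set[str] = set()
    for word in data.values():
        if usage == 0:
            # zero threshold: every word qualifies, even ones with no
            # usage entry for the requested date
            result.add(word["word"])
            continue

        usages = word["usage"]
        if date in usages and usages[date] >= usage:
            result.add(word["word"])
    return result

data = {
    "toki": {"word": "toki", "usage": {"2024-09": 100}},
    "kijetesantakalu": {"word": "kijetesantakalu", "usage": {}},
}
assert words_by_usage(data, usage=0) == {"toki", "kijetesantakalu"}
assert words_by_usage(data, usage=50) == {"toki"}
```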