sonatoki-0.9.0-py3-none-any.whl → sonatoki-0.9.2-py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -52,20 +52,20 @@ __DICT_PHONOMATCHES = {
     "an", # article
     "api", # API
     "i", # 1st person
+    "je", # 1st person pronoun, french
     "kana", # japanese script
     "me", # 1st person singular, english
     "ne", # "no" in several languages
     "nu", # "new" in english, "now" in dutch
+    "omen", # ominous
     "se", # spanish particle, english "see"
+    "sole", # singular, of shoe
     "take", # acquire, perhaps forcefully or without permission
     "ten", # 10
     "to", # to, too
-    "je", # 1st person pronoun, french
     "u", # no u
     "we", # 1st person plural, english
     "wi", # wii and discussions of syllables
-    "sole", # singular, of shoe
-    "omen", # ominous
     # unexplored candidates for removal
     # "papa", # father
     # "lo", # "lo" and "loo"
sonatoki/Scorers.py CHANGED
@@ -113,19 +113,24 @@ class Scaling(Scorer):
 
 
 class Voting(Scaling):
-    """Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
-    a filter matches, with the first filter scoring a 1. However, after all
-    scores are derived, each token scoring 0 is given a is given an opportunity
-    to score based on its nearest 3 neighbors.
-
-    If created with a Filter, tokens must also pass that filter to be
-    considered for voting.
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on the
+    first matching filter out of the list of filters. However, after all scores
+    are derived, each token scoring less than the threshold is assigned the
+    average score of its nearest 3 neighbors. The default threshold is 0.
+
+    If there are 3 or fewer tokens, this scorer is identical to the
+    Scaling scorer.
+
+    If the Voting scorer is created with a Filter, tokens must also
+    match that filter to be considered for voting. For example, the
+    following Voting filter would only check words with a score of 0.3
+    or less that still match the Syllabic filter: `Voting(Syllabic, 0.3)`
     """
 
     prereq: Type[Filter] = Pass
     threshold: int = 0
 
-    def __new__(cls, filter: Type[Filter], threshold_: int):
+    def __new__(cls, filter: Type[Filter], threshold_: int = 0) -> Type[Scorer]:
         class AnonVoting(Voting):
             prereq = filter
             threshold = threshold_
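
The rewritten docstring pins down the voting rule, and the new __new__ signature makes threshold_ optional with a default of 0. A minimal sketch of the documented behavior, assuming "nearest 3 neighbors" means the three closest tokens by position and that passes[i] records whether token i matches the prereq filter; this illustrates the docstring, not sonatoki's implementation:

from statistics import mean
from typing import List

def vote(scores: List[float], passes: List[bool], threshold: float = 0.0) -> List[float]:
    if len(scores) <= 3:
        return list(scores)  # with 3 or fewer tokens, identical to Scaling
    voted = list(scores)
    for i, score in enumerate(scores):
        if score > threshold or not passes[i]:
            continue  # only low-scoring tokens that pass the filter are revoted
        by_distance = sorted(range(len(scores)), key=lambda j: (abs(j - i), j))
        nearest = [scores[j] for j in by_distance if j != i][:3]
        voted[i] = mean(nearest)  # neighbors vote with their original scores
    return voted

For example, vote([1.0, 0.0, 1.0, 1.0, 0.8], [True] * 5) lifts the second token to 1.0, the average of its three nearest neighbors.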
sonatoki/Tokenizers.py CHANGED
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1
 
         cls.add_token(s, tokens, last_match, i)
 
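The new while loop consumes a run of delimiters right after a token boundary, so the final add_token call no longer sweeps trailing punctuation into the last token; the in-diff TODO notes the same skip already exists elsewhere in the tokenizer. A standalone sketch of the scanning pattern, simplified to whitespace delimiters; this mirrors the shape of the fix, not sonatoki's actual tokenizer:

DELIMITERS = set(" \t\n")

def scan(s: str) -> list[str]:
    tokens: list[str] = []
    last_match = 0
    i = 0
    while i < len(s):
        if s[i] in DELIMITERS:
            if i > last_match:
                tokens.append(s[last_match:i])
            while i < len(s) and s[i] in DELIMITERS:  # the added loop
                i += 1
            last_match = i
            continue
        i += 1
    if i > last_match:
        tokens.append(s[last_match:i])  # safe: delimiters were consumed above
    return tokens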
sonatoki/constants.py CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
 from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
-LATEST_DATE = "2023-09"
+LATEST_DATE = "2024-09"
 # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
 
 
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
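
Adding the curly apostrophe and the period to INTRA_WORD_PUNCT lets contractions like isn’t and dotted abbreviations survive tokenization as single words; the FALSE_POS_ALPHABETIC change below from "isn" to "isnt" (whose old entry carried a "# TODO: tokenizer...." note) appears to track this. A sketch of the rule the constant implies, with a hypothetical is_intra_word helper rather than sonatoki's API:

INTRA_WORD_PUNCT = "-'’."

def is_intra_word(s: str, i: int) -> bool:
    # Punctuation counts as part of a word only when letters flank it;
    # at a word edge it remains a boundary.
    return (
        s[i] in INTRA_WORD_PUNCT
        and 0 < i < len(s) - 1
        and s[i - 1].isalpha()
        and s[i + 1].isalpha()
    )

assert is_intra_word("isn’t", 3)     # curly apostrophe stays word-internal
assert not is_intra_word("end.", 3)  # a trailing period is still a boundary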
@@ -668,10 +668,11 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "not",
     "link",
     "wait",
+    "just",
     "lol",
     "new",
     "also",
-    "isn", # TODO: tokenizer....
+    "isnt",
     "mean",
     "means",
     "it",
@@ -681,6 +682,7 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "new",
     "wel",
     "makes",
+    "unles",
 }
 
 UCSUR_RANGES = [
@@ -722,6 +724,10 @@ def words_by_usage(
 
     result: Set[str] = set()
     for word in data.values():
+        if usage == 0:
+            result.add(word["word"])
+            continue
+
         usages = word["usage"]
         if date in usages and usages[date] >= usage:
             result.add(word["word"])
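
The early continue changes what usage == 0 means: a zero threshold now returns every word, including entries whose usage map has no value for date, rather than silently requiring the date to be present with usage >= 0. A self-contained sketch of the amended behavior; the function's full signature is elided in the diff, so the parameters and defaults here are assumed:

from typing import Dict, Set

def words_by_usage(
    data: Dict[str, dict], usage: int = 0, date: str = "2024-09"
) -> Set[str]:
    result: Set[str] = set()
    for word in data.values():
        if usage == 0:
            result.add(word["word"])  # zero threshold keeps every word
            continue
        usages = word["usage"]
        if date in usages and usages[date] >= usage:
            result.add(word["word"])
    return result

# "apeja" has no usage recorded for the date, so the old code dropped it
# even at usage=0; the early continue now keeps it.
data = {
    "a": {"word": "a", "usage": {"2024-09": 100}},
    "apeja": {"word": "apeja", "usage": {}},
}
assert words_by_usage(data, usage=0) == {"a", "apeja"}
assert words_by_usage(data, usage=50) == {"a"}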