sonatoki-0.9.0-py3-none-any.whl → sonatoki-0.9.2-py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -52,20 +52,20 @@ __DICT_PHONOMATCHES = {
     "an", # article
     "api", # API
     "i", # 1st person
+    "je", # 1st person pronoun, french
     "kana", # japanese script
     "me", # 1st person singular, english
     "ne", # "no" in several languages
     "nu", # "new" in english, "now" in dutch
+    "omen", # ominous
     "se", # spanish particle, english "see"
+    "sole", # singular, of shoe
     "take", # acquire, perhaps forcefully or without permission
     "ten", # 10
     "to", # to, too
-    "je", # 1st person pronoun, french
     "u", # no u
     "we", # 1st person plural, english
     "wi", # wii and discussions of syllables
-    "sole", # singular, of shoe
-    "omen", # ominous
     # unexplored candidates for removal
     # "papa", # father
     # "lo", # "lo" and "loo"
sonatoki/Scorers.py CHANGED
@@ -113,19 +113,24 @@ class Scaling(Scorer):
 
 
 class Voting(Scaling):
-    """Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
-    a filter matches, with the first filter scoring a 1. However, after all
-    scores are derived, each token scoring 0 is given a is given an opportunity
-    to score based on its nearest 3 neighbors.
-
-    If created with a Filter, tokens must also pass that filter to be
-    considered for voting.
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on the
+    first matching filter out of the list of filters. However, after all scores
+    are derived, each token scoring less than the threshold is assigned the
+    average score of its nearest 3 neighbors. The default threshold is 0.
+
+    If there are 3 or fewer tokens, this scorer is identical to the
+    Scaling scorer.
+
+    If the Voting scorer is created with a Filter, tokens must also
+    match that filter to be considered for voting. For example, the
+    following Voting filter would only check words with a score of 0.3
+    or less that still match the Syllabic filter: `Voting(Syllabic, 0.3)`
     """
 
     prereq: Type[Filter] = Pass
     threshold: int = 0
 
-    def __new__(cls, filter: Type[Filter], threshold_: int):
+    def __new__(cls, filter: Type[Filter], threshold_: int = 0) -> Type[Scorer]:
         class AnonVoting(Voting):
             prereq = filter
             threshold = threshold_
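
The rewritten docstring pins down the voting rule, and the new __new__ signature makes threshold_ optional with a default of 0. A minimal sketch of the documented behavior, assuming "nearest 3 neighbors" means the three closest tokens by position and that passes[i] records whether token i matches the prereq filter; this illustrates the docstring, not sonatoki's implementation:

from statistics import mean
from typing import List

def vote(scores: List[float], passes: List[bool], threshold: float = 0.0) -> List[float]:
    if len(scores) <= 3:
        return list(scores)  # with 3 or fewer tokens, identical to Scaling
    voted = list(scores)
    for i, score in enumerate(scores):
        if score > threshold or not passes[i]:
            continue  # only low-scoring tokens that pass the filter are revoted
        by_distance = sorted(range(len(scores)), key=lambda j: (abs(j - i), j))
        nearest = [scores[j] for j in by_distance if j != i][:3]
        voted[i] = mean(nearest)  # neighbors vote with their original scores
    return voted

For example, vote([1.0, 0.0, 1.0, 1.0, 0.8], [True] * 5) lifts the second token to 1.0, the average of its three nearest neighbors.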
sonatoki/Tokenizers.py CHANGED
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1
 
         cls.add_token(s, tokens, last_match, i)
 
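The new while loop consumes a run of delimiters right after a token boundary, so the final add_token call no longer sweeps trailing punctuation into the last token; the in-diff TODO notes the same skip already exists elsewhere in the tokenizer. A standalone sketch of the scanning pattern, simplified to whitespace delimiters; this mirrors the shape of the fix, not sonatoki's actual tokenizer:

DELIMITERS = set(" \t\n")

def scan(s: str) -> list[str]:
    tokens: list[str] = []
    last_match = 0
    i = 0
    while i < len(s):
        if s[i] in DELIMITERS:
            if i > last_match:
                tokens.append(s[last_match:i])
            while i < len(s) and s[i] in DELIMITERS:  # the added loop
                i += 1
            last_match = i
            continue
        i += 1
    if i > last_match:
        tokens.append(s[last_match:i])  # safe: delimiters were consumed above
    return tokens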
sonatoki/constants.py CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
 from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
-LATEST_DATE = "2023-09"
+LATEST_DATE = "2024-09"
 # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
 
 
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
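
Adding the curly apostrophe and the period to INTRA_WORD_PUNCT lets contractions like isn’t and dotted abbreviations survive tokenization as single words; the FALSE_POS_ALPHABETIC change below from "isn" to "isnt" (whose old entry carried a "# TODO: tokenizer...." note) appears to track this. A sketch of the rule the constant implies, with a hypothetical is_intra_word helper rather than sonatoki's API:

INTRA_WORD_PUNCT = "-'’."

def is_intra_word(s: str, i: int) -> bool:
    # Punctuation counts as part of a word only when letters flank it;
    # at a word edge it remains a boundary.
    return (
        s[i] in INTRA_WORD_PUNCT
        and 0 < i < len(s) - 1
        and s[i - 1].isalpha()
        and s[i + 1].isalpha()
    )

assert is_intra_word("isn’t", 3)     # curly apostrophe stays word-internal
assert not is_intra_word("end.", 3)  # a trailing period is still a boundary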
@@ -668,10 +668,11 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "not",
     "link",
     "wait",
+    "just",
     "lol",
     "new",
     "also",
-    "isn", # TODO: tokenizer....
+    "isnt",
     "mean",
     "means",
     "it",
@@ -681,6 +682,7 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "new",
     "wel",
     "makes",
+    "unles",
 }
 
 UCSUR_RANGES = [
@@ -722,6 +724,10 @@ def words_by_usage(
 
     result: Set[str] = set()
     for word in data.values():
+        if usage == 0:
+            result.add(word["word"])
+            continue
+
         usages = word["usage"]
         if date in usages and usages[date] >= usage:
             result.add(word["word"])
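
The early continue changes what usage == 0 means: a zero threshold now returns every word, including entries whose usage map has no value for date, rather than silently requiring the date to be present with usage >= 0. A self-contained sketch of the amended behavior; the function's full signature is elided in the diff, so the parameters and defaults here are assumed:

from typing import Dict, Set

def words_by_usage(
    data: Dict[str, dict], usage: int = 0, date: str = "2024-09"
) -> Set[str]:
    result: Set[str] = set()
    for word in data.values():
        if usage == 0:
            result.add(word["word"])  # zero threshold keeps every word
            continue
        usages = word["usage"]
        if date in usages and usages[date] >= usage:
            result.add(word["word"])
    return result

# "apeja" has no usage recorded for the date, so the old code dropped it
# even at usage=0; the early continue now keeps it.
data = {
    "a": {"word": "a", "usage": {"2024-09": 100}},
    "apeja": {"word": "apeja", "usage": {}},
}
assert words_by_usage(data, usage=0) == {"a", "apeja"}
assert words_by_usage(data, usage=50) == {"a"}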