sonatoki 0.9.0__tar.gz → 0.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {sonatoki-0.9.0 → sonatoki-0.9.2}/PKG-INFO +1 -1
  2. {sonatoki-0.9.0 → sonatoki-0.9.2}/pyproject.toml +1 -1
  3. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Scorers.py +13 -8
  4. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Tokenizers.py +4 -0
  5. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/constants.py +9 -3
  6. sonatoki-0.9.2/src/sonatoki/linku.json +1 -0
  7. sonatoki-0.9.2/src/sonatoki/sandbox.json +1 -0
  8. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/types.py +1 -1
  9. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_ilo.py +4 -1
  10. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_words_tok.yml +51 -12
  11. sonatoki-0.9.0/src/sonatoki/linku.json +0 -1
  12. sonatoki-0.9.0/src/sonatoki/sandbox.json +0 -1
  13. {sonatoki-0.9.0 → sonatoki-0.9.2}/LICENSE +0 -0
  14. {sonatoki-0.9.0 → sonatoki-0.9.2}/README.md +0 -0
  15. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Cleaners.py +0 -0
  16. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Configs.py +3 -3
  17. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Filters.py +0 -0
  18. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Preprocessors.py +0 -0
  19. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/alphabetic.txt +0 -0
  22. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/ilo.py +0 -0
  23. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/py.typed +0 -0
  24. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/utils.py +0 -0
  26. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/__init__.py +0 -0
  27. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_cleaners.py +0 -0
  28. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_filters.py +0 -0
  29. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_preprocessors.py +0 -0
  30. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_properties.py +0 -0
  31. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_scorers.py +0 -0
  32. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_tokenize.py +0 -0
  33. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/test_utils.py +0 -0
  34. {sonatoki-0.9.0 → sonatoki-0.9.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
{sonatoki-0.9.0 → sonatoki-0.9.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.9.0
+Version: 0.9.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
{sonatoki-0.9.0 → sonatoki-0.9.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.9.0"
+version = "0.9.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Scorers.py
@@ -113,19 +113,24 @@ class Scaling(Scorer):
 
 
 class Voting(Scaling):
-    """Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
-    a filter matches, with the first filter scoring a 1. However, after all
-    scores are derived, each token scoring 0 is given a is given an opportunity
-    to score based on its nearest 3 neighbors.
-
-    If created with a Filter, tokens must also pass that filter to be
-    considered for voting.
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on the
+    first matching filter out of the list of filters. However, after all scores
+    are derived, each token scoring less than the threshold is assigned the
+    average score of its nearest 3 neighbors. The default threshold is 0.
+
+    If there are 3 or fewer tokens, this scorer is identical to the
+    Scaling scorer.
+
+    If the Voting scorer is created with a Filter, tokens must also
+    match that filter to be considered for voting. For example, the
+    following Voting filter would only check words with a score of 0.3
+    or less that still match the Syllabic filter: `Voting(Syllabic, 0.3)`
     """
 
     prereq: Type[Filter] = Pass
     threshold: int = 0
 
-    def __new__(cls, filter: Type[Filter], threshold_: int):
+    def __new__(cls, filter: Type[Filter], threshold_: int = 0) -> Type[Scorer]:
         class AnonVoting(Voting):
             prereq = filter
             threshold = threshold_
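The rewritten docstring pins down the voting pass: tokens scoring at or below the threshold inherit the average score of their nearest 3 neighbors, and short inputs fall back to plain Scaling. A minimal sketch of that behavior follows (an illustration only, not the package's actual code; the prereq filter check is omitted, and the exact neighbor-selection rule is an assumption):

from statistics import fmean
from typing import List


def vote(scores: List[float], threshold: float = 0.0) -> List[float]:
    # With 3 or fewer tokens there are not enough neighbors to vote,
    # so this is identical to the Scaling scorer, per the docstring.
    if len(scores) <= 3:
        return list(scores)
    voted = list(scores)
    for i, score in enumerate(scores):
        if score > threshold:
            continue  # only tokens at or below the threshold are re-scored
        # "Nearest 3 neighbors" is read here as the three indices closest
        # to i; averages are taken over the original pre-voting scores.
        nearest = sorted(
            (j for j in range(len(scores)) if j != i),
            key=lambda j: abs(j - i),
        )[:3]
        voted[i] = fmean(scores[j] for j in nearest)
    return voted

Per the new `__new__` signature, `Voting(Syllabic, 0.3)` builds an anonymous subclass with `prereq = Syllabic` and `threshold = 0.3`, and the threshold now defaults to 0 when omitted.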
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/Tokenizers.py
@@ -104,6 +104,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1
 
         cls.add_token(s, tokens, last_match, i)
 
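The added loop consumes any run of delimiters that follows an emitted token instead of re-entering the main scan mid-run. Assuming `WordTokenizer.tokenize` is the public entry point (the `tokenize_words_tok.yml` test cases exercise it), the effect can be observed directly; exact output depends on the installed version:

from sonatoki.Tokenizers import WordTokenizer

# A word followed by a run of punctuation should yield the word and the
# punctuation run as clean tokens, with no empty or duplicated token
# where the run ends.
print(WordTokenizer.tokenize("toki!!! pona"))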
{sonatoki-0.9.0 → sonatoki-0.9.2}/src/sonatoki/constants.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
-LATEST_DATE = "2023-09"
+LATEST_DATE = "2024-09"
 # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
 
 
@@ -507,7 +507,7 @@ SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
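Adding `’` and `.` to `INTRA_WORD_PUNCT` keeps curly apostrophes and word-internal periods attached to their word during tokenization; the switch from the fragment `"isn"` to the whole word `"isnt"` in `FALSE_POS_ALPHABETIC` in the next hunk follows from the same change (the old entry even carried a `# TODO: tokenizer....` note). A hedged example, again assuming `WordTokenizer.tokenize` as the entry point:

from sonatoki.Tokenizers import WordTokenizer

# "isn’t" is now expected to survive as one token rather than splitting
# at the curly apostrophe into "isn" and "t", so the alphabetic false
# positive list can name the full word "isnt".
print(WordTokenizer.tokenize("that isn’t toki pona"))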
@@ -668,10 +668,11 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "not",
     "link",
     "wait",
+    "just",
     "lol",
     "new",
     "also",
-    "isn",  # TODO: tokenizer....
+    "isnt",
     "mean",
     "means",
     "it",
@@ -681,6 +682,7 @@ FALSE_POS_ALPHABETIC: Set[str] = {
     "new",
     "wel",
     "makes",
+    "unles",
 }
 
 UCSUR_RANGES = [
@@ -722,6 +724,10 @@ def words_by_usage(
 
     result: Set[str] = set()
     for word in data.values():
+        if usage == 0:
+            result.add(word["word"])
+            continue
+
         usages = word["usage"]
         if date in usages and usages[date] >= usage:
             result.add(word["word"])
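The new branch gives `usage == 0` a precise meaning: every word in the Linku data is included, even words with no usage statistics for the requested date. Previously such words were silently dropped, because the `date in usages` lookup failed for them. A sketch of the resulting contract (assuming the `words_by_usage(usage, date)` signature implied by the hunk):

from sonatoki.constants import words_by_usage

all_words = words_by_usage(0)  # every word, no per-date lookup at all
common = words_by_usage(80)    # words at or above 80% usage
# Any positive threshold selects a subset of the zero-threshold set.
assert common <= all_words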