sonatoki 0.8.3__tar.gz → 0.9.0__tar.gz

This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (32)
  1. {sonatoki-0.8.3 → sonatoki-0.9.0}/PKG-INFO +1 -1
  2. {sonatoki-0.8.3 → sonatoki-0.9.0}/pyproject.toml +3 -6
  3. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Configs.py +20 -14
  4. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Filters.py +65 -11
  5. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Preprocessors.py +15 -0
  6. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Scorers.py +67 -1
  7. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/__main__.py +4 -4
  8. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/constants.py +5 -4
  9. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_filters.py +35 -3
  10. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_ilo.py +2 -0
  11. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_preprocessors.py +13 -1
  12. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_scorers.py +11 -1
  13. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_tokenize.py +1 -1
  14. {sonatoki-0.8.3 → sonatoki-0.9.0}/LICENSE +0 -0
  15. {sonatoki-0.8.3 → sonatoki-0.9.0}/README.md +0 -0
  16. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Cleaners.py +0 -0
  17. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Tokenizers.py +0 -0
  18. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/__init__.py +0 -0
  19. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/alphabetic.txt +0 -0
  20. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/ilo.py +0 -0
  21. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/linku.json +0 -0
  22. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/py.typed +0 -0
  23. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/sandbox.json +0 -0
  24. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/types.py +0 -0
  26. {sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/__init__.py +0 -0
  28. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_properties.py +0 -0
  30. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_utils.py +0 -0
  31. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.8.3 → sonatoki-0.9.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.8.3 → sonatoki-0.9.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.3
+Version: 0.9.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
{sonatoki-0.8.3 → sonatoki-0.9.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.8.3"
+version = "0.9.0"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -41,11 +41,7 @@ lint = [
     "isort>=5.12.0",
     "docformatter>=1.7.5",
 ]
-doc = [
-    "sphinx>=7.1.2",
-    "furo>=2023.9.10",
-    "sphinx-intl>=2.1.0",
-]
+doc = []

 [tool.pytest.ini_options]
 log_cli = true
@@ -55,6 +51,7 @@ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 testpaths = [
     "tests/",
 ]
+asyncio_default_fixture_loop_scope = "function"

 [tool.isort]
 length_sort = "1"
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Configs.py
@@ -9,14 +9,17 @@ from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
+    Len,
     Not,
     Filter,
     PuName,
     Numeric,
+    Syllabic,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
+    ProperName,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -29,7 +32,7 @@ from sonatoki.Filters import (
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Scorers import Scorer, Soften, Voting, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
@@ -62,8 +65,8 @@ __DICT_PHONOMATCHES = {
     "we",  # 1st person plural, english
     "wi",  # wii and discussions of syllables
     "sole",  # singular, of shoe
+    "omen",  # ominous
     # unexplored candidates for removal
-    # "omen",  # ominous
     # "papa",  # father
     # "lo",  # "lo" and "loo"
     # "ewe",  # sheep
@@ -99,11 +102,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuByUsage(30), NimiUCSUR),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
+        Len(Or(NimiLinkuByUsage(30), NimiUCSUR), max=15),
+        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
+        Len(ProperName, min=2, max=24),
+        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -114,15 +117,18 @@ CorpusConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(
-            # awkward but efficient syntax
-            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
-            NimiUCSUR,
-            Miscellaneous,
+        Len(
+            Or(
+                # awkward but efficient syntax
+                NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
+                NimiUCSUR,
+                Miscellaneous,
+            ),
+            max=19,
         ),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
+        Len(And(Syllabic, Not(FalsePosSyllabic)), min=3, max=24),
+        Len(ProperName, min=2, max=24),
+        Len(And(Alphabetic, Not(FalsePosAlphabetic)), min=3, max=24),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Filters.py
@@ -7,7 +7,7 @@ from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated

 # LOCAL
 from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
@@ -41,6 +41,7 @@ class Filter(ABC):
         raise NotImplementedError


+@deprecated("Use sonatoki.Filters.Len instead")
 class MinLen(Filter):
     """
     Meta filter meant to be inherited by another filter to add a length requirement.
@@ -62,12 +63,54 @@ class MinLen(Filter):
         return super().filter(token)

     def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
-        class MinLenFilter(MinLen, Filter):
+        class MinLenFilter(MinLen, filter):
             length = length_

         return MinLenFilter


+class Len(Filter):
+    """Meta filter to be inherited by another filter to add any length
+    requirement. A bound will only be considered if it is non-zero, so you may
+    omit a minimum length or a maximum length to bound only one of them.
+
+    If inherited when defining a class, `Len` must be the first argument so `super()` resolves correctly.
+
+    To add minimum or maximum length requirements when defining a class:
+    ```
+    class LongAlphabetic(Len, Alphabetic):
+        minlen = 3
+        maxlen = 20
+    ```
+
+    You may also construct any other filter with a length requirement like so:
+    ```
+    Len(Alphabetic, min=3, max=20)
+    ```
+    """
+
+    minlen = 0
+    maxlen = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        tokenlen = len(token)
+
+        if cls.minlen and tokenlen < cls.minlen:
+            return False
+        if cls.maxlen and tokenlen > cls.maxlen:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], min: int = 0, max: int = 0) -> Type[Filter]:
+        class LenFilter(Len, filter):
+            minlen = min
+            maxlen = max
+
+        return LenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"

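Note: as the docstring above shows, `Len` can be applied two ways; both yield a subclass whose `filter()` checks the bounds before deferring to the wrapped filter via `super()`. A small sketch against the new API (bounds chosen arbitrarily):

```python
from sonatoki.Filters import Len, Syllabic, Alphabetic

# constructor form: builds an anonymous subclass with the given bounds
BoundedAlphabetic = Len(Alphabetic, min=3, max=20)
print(BoundedAlphabetic.filter("toki"))  # True: within bounds and alphabetic
print(BoundedAlphabetic.filter("la"))    # False: alphabetic, but under the minimum

# subclass form: Len must come first so its filter() runs before Syllabic's
class BoundedSyllabic(Len, Syllabic):
    minlen = 3
    maxlen = 24
```

This also explains the one-word fix in `MinLen.__new__`: the old code inherited from the base `Filter` class instead of the `filter` argument, so the length check never delegated to the wrapped filter.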
@@ -183,8 +226,8 @@ class PuName(Filter):
     # this will errantly match.


-class LongProperName(MinLen, ProperName):
-    length = 2  # reject "names" of length 1
+class LongProperName(Len, ProperName):
+    minlen = 2  # reject "names" of length 1


 class NimiLinkuByUsage:
@@ -252,8 +295,8 @@ class Phonotactic(RegexFilter):
     )


-class LongPhonotactic(MinLen, Phonotactic):
-    length = 3
+class LongPhonotactic(Len, Phonotactic):
+    minlen = 3


 class Syllabic(RegexFilter):
@@ -271,8 +314,8 @@ class Syllabic(RegexFilter):
     )


-class LongSyllabic(MinLen, Syllabic):
-    length = 3
+class LongSyllabic(Len, Syllabic):
+    minlen = 3


 class Alphabetic(SubsetFilter):
@@ -283,8 +326,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


-class LongAlphabetic(MinLen, Alphabetic):
-    length = 3
+class LongAlphabetic(Len, Alphabetic):
+    minlen = 3


 class Numeric(Filter):
@@ -448,15 +491,26 @@ class Not(Filter):
         return NotFilter


+class Pass(Filter):
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return True
+
+
+class Fail(Not, Pass): ...
+
+
 __all__ = [
     "Alphabetic",
     "And",
     "FalsePosSyllabic",
+    "Len",
     "LongAlphabetic",
     "LongPhonotactic",
     "LongProperName",
     "LongSyllabic",
-    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Preprocessors.py
@@ -83,6 +83,19 @@ class MarkdownURLs(RegexPreprocessor):
     replace = r"\1"


+class Emails(RegexPreprocessor):
+    """Attempt to remove emails, for a particularly strong definition of
+    "email".
+
+    https://www.regular-expressions.info/email.html
+    """
+
+    pattern = re.compile(
+        r"\b[a-zA-Z0-9._%+-]{2,}@[a-zA-Z0-9.-]{2,}\.[a-zA-Z]{2,24}\b",
+        flags=re.IGNORECASE,
+    )
+
+
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.

@@ -228,6 +241,7 @@ RECOMMENDED_PREPROCESSORS: List[Type[Preprocessor]] = [
     Reference,
     MarkdownURLs,
     URLs,
+    Emails,
     Emoji,
 ]

@@ -242,6 +256,7 @@ __all__ = [
     "DiscordMentions",
     "DiscordSpecial",
     "DoubleQuotes",
+    "Emails",
     "Emoji",
     "MarkdownURLs",
     "RECOMMENDED_PREPROCESSORS",
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/Scorers.py
@@ -8,7 +8,7 @@ from typing_extensions import override

 # LOCAL
 from sonatoki.types import Number, Scorecard
-from sonatoki.Filters import Filter
+from sonatoki.Filters import Pass, Filter


 class Scorer(ABC):
@@ -112,6 +112,67 @@ class Scaling(Scorer):
         return total_score / max_score if max_score else 0


+class Voting(Scaling):
+    """Derives from `Scaling` in assigning scores from 0 to 1 based on how soon
+    a filter matches, with the first filter scoring a 1. However, after all
+    scores are derived, each token scoring 0 is given an opportunity to score
+    based on its nearest 3 neighbors.
+
+    If created with a Filter, tokens must also pass that filter to be
+    considered for voting.
+    """
+
+    prereq: Type[Filter] = Pass
+    threshold: int = 0
+
+    def __new__(cls, filter: Type[Filter], threshold_: int):
+        class AnonVoting(Voting):
+            prereq = filter
+            threshold = threshold_
+
+        return AnonVoting
+
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        if not tokens:
+            return 1
+
+        if len(tokens) < 4:
+            return super().score(tokens, filters)
+
+        len_filters = len(filters)
+        max_score = len(tokens) * len_filters
+
+        # score_token only emits ints
+        # but the averaging emits floats
+        # it doesn't really matter as long as no score exceeds len_filters
+        scores: List[Number] = []
+        for token in tokens:
+            score = cls.score_token(token, filters, len_filters)
+            scores.append(score)
+
+        # only consider scores from before voting
+        copied_scores = scores[:]
+        for i, (token, score) in enumerate(zip(tokens, copied_scores)):
+            if score > cls.threshold:
+                continue
+            if not cls.prereq.filter(token):
+                continue
+
+            # TODO: this is kinda dumb.
+            # we want to get exactly 3 neighbors, favoring 2 before and 1 after
+            # the way i'm doing this is both bad and slow as hell
+            start = max(i - 2, 0)
+            end = min(i + 1, len(scores) - 1)
+            neighbors = copied_scores[start:i] + copied_scores[i + 1 : end + 1]
+            scores[i] = sum(neighbors) / len(neighbors)
+
+        total_score = sum(scores)
+
+        return total_score / max_score if max_score else 0
+
+
 class SoftPassFail(Soften, PassFail):
     """Same as `PassFail`, but shorter messages are subject to less harsh
     scoring."""
@@ -122,6 +183,11 @@ class SoftScaling(Soften, Scaling):
     scoring."""


+class SoftVoting(Soften, Voting):
+    """Same as `Voting`, but shorter messages are subject to less harsh
+    scoring."""
+
+
 class SentenceScorer(ABC):
     @classmethod
     @abstractmethod
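
Note: read independently of the class machinery, the voting pass gives each zero-scoring token the mean of up to three neighboring scores (two before, one after, clamped at the ends). A standalone sketch of just that arithmetic, with invented scores rather than the library's API:

```python
from typing import List


def vote(scores: List[float], threshold: float = 0) -> List[float]:
    voted = scores[:]  # read pre-vote scores, write post-vote scores
    for i, score in enumerate(scores):
        if score > threshold:
            continue
        start = max(i - 2, 0)              # up to 2 neighbors before
        end = min(i + 1, len(scores) - 1)  # up to 1 neighbor after
        neighbors = scores[start:i] + scores[i + 1 : end + 1]
        voted[i] = sum(neighbors) / len(neighbors)
    return voted


# a lone unknown word among known words inherits its neighbors' scores
print(vote([1, 1, 0, 1]))  # [1, 1, 1.0, 1]
```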
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/__main__.py
@@ -60,11 +60,11 @@ def download_json(url: str) -> Dict[str, Any]:

 def regen_linku_data():
     data = download_json(LINKU_WORDS)
-    with open(os.path.join(HERE, "linku.json"), "w") as f:
+    with open(os.path.join(HERE, "linku.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))

     data = download_json(LINKU_SANDBOX)
-    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+    with open(os.path.join(HERE, "sandbox.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))


@@ -96,11 +96,11 @@ def regen_false_negatives():
             continue

     # TODO: include short matches or no?
-    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+    with open(os.path.join(HERE, "syllabic.txt"), "w", encoding="utf-8") as f:
         syllabic_final = sorted([word + "\n" for word in syllabic_matches])
         f.writelines(syllabic_final)

-    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+    with open(os.path.join(HERE, "alphabetic.txt"), "w", encoding="utf-8") as f:
         alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
         f.writelines(alphabetic_final)
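Note: the `encoding="utf-8"` additions here (and in constants.py below) guard against a platform-dependent default: without an explicit encoding, text-mode `open()` uses the locale's preferred encoding, which is often cp1252 on Windows and would mangle the non-ASCII dictionary data. A one-liner to see the default on your system:

```python
import locale

# what text-mode open() falls back to when no encoding is given
print(locale.getpreferredencoding(False))  # e.g. "cp1252" on Windows, "UTF-8" on most Linux
```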
{sonatoki-0.8.3 → sonatoki-0.9.0}/src/sonatoki/constants.py
@@ -648,6 +648,7 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    "saluton",
     # "june",
     "lemon",
     "manipulate",
@@ -698,9 +699,9 @@ def linku_data() -> Dict[str, LinkuWord]:
     # NOTE: this does open+read+parse two files each time you construct a filter
     # but i expect users to construct filters only at the start of runtime
     # there is no reason to waste your RAM by leaving the linku data in it
-    with open(LINKU) as f:
+    with open(LINKU, "r", encoding="utf-8") as f:
         linku: Dict[str, LinkuWord] = json.loads(f.read())
-    with open(SANDBOX) as f:
+    with open(SANDBOX, "r", encoding="utf-8") as f:
         sandbox: Dict[str, LinkuWord] = json.loads(f.read())

     return {**linku, **sandbox}
@@ -731,10 +732,10 @@ def words_by_usage(
 NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}


-# with open(SYLLABICS) as f:
+# with open(SYLLABICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
 #
-# with open(ALPHABETICS) as f:
+# with open(ALPHABETICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}

 __all__ = [
{sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_filters.py
@@ -9,6 +9,7 @@ from hypothesis import given, example
 from sonatoki.Filters import (
     Or,
     And,
+    Len,
     Not,
     NimiPu,
     PuName,
@@ -100,7 +101,7 @@ def test_Phonotactic(s: str):

 @given(st.from_regex(Phonotactic.pattern, fullmatch=True))
 def test_LongPhonotactic(s: str):
-    len_ok = len(s) >= LongPhonotactic.length
+    len_ok = len(s) >= LongPhonotactic.minlen
     res = LongPhonotactic.filter(s)
     assert res == len_ok, repr(s)  # will match given fullmatch

@@ -114,7 +115,7 @@ def test_Syllabic(s: str):

 @given(st.from_regex(Syllabic.pattern, fullmatch=True))
 def test_LongSyllabic(s: str):
-    len_ok = len(s) >= LongSyllabic.length
+    len_ok = len(s) >= LongSyllabic.minlen
     res = LongSyllabic.filter(s)
     assert res == len_ok

@@ -131,7 +132,7 @@ def test_Alphabetic(s: str):

 @given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 def test_LongAlphabetic(s: str):
-    len_ok = len(s) >= LongAlphabetic.length
+    len_ok = len(s) >= LongAlphabetic.minlen
     res = LongAlphabetic.filter(s)
     assert res == len_ok

@@ -184,6 +185,37 @@ def test_Numeric(s: str):
     assert res, repr(s)


+@given(st.from_regex(r"\d+", fullmatch=True))
+def test_Len_minimum(s: str):
+    minlen = 4
+    filter = Len(Numeric, min=minlen)
+
+    res = filter.filter(s)
+    exp = len(s) >= minlen
+    assert res == exp
+
+
+@given(st.from_regex(r"\d+", fullmatch=True))
+def test_Len_maximum(s: str):
+    maxlen = 6
+    filter = Len(Numeric, max=maxlen)
+
+    res = filter.filter(s)
+    exp = len(s) <= maxlen
+    assert res == exp
+
+
+@given(st.from_regex(r"\d+", fullmatch=True))
+def test_Len_min_and_max(s: str):
+    minlen = 3
+    maxlen = 7
+    filter = Len(Numeric, min=minlen, max=maxlen)
+
+    res = filter.filter(s)
+    exp = minlen <= len(s) <= maxlen
+    assert res == exp
+
+
 @given(
     st.from_regex(PunctuationRe.pattern, fullmatch=True)
     | st.from_regex(r"\d+", fullmatch=True),
{sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_ilo.py
@@ -42,6 +42,7 @@ ALL_VALID = [
     "ni li sona kiwen",
     "nimi namako li toki e ale",
     "mi open mute a",  # mostly eng words
+    "mi pali ilo to",
 ]

 IGNORABLES = [
@@ -201,6 +202,7 @@ FALSE_NEGATIVES = [
     "poan",
     "mtue",
     "mi nasa B^)",  # emoticon
+    "musi :P",  # emoticon
     "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
     "😃⃢👍",  # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
 ]
{sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_preprocessors.py
@@ -2,6 +2,7 @@
 from typing import Optional

 # PDM
+import pytest
 import hypothesis.strategies as st
 from hypothesis import given, example

@@ -24,6 +25,7 @@ from sonatoki.Preprocessors import (
     DiscordMentions,
     AngleBracketObject,
 )
+from src.sonatoki.Preprocessors import Emails


 def extract_bracket_content(markdown_text: str) -> Optional[str]:
@@ -31,7 +33,7 @@ def extract_bracket_content(markdown_text: str) -> Optional[str]:
     if start == -1:
         return None

-    end = markdown_text.rfind("]")
+    end = markdown_text.rfind("](")
     if end == -1 or end <= start:
         return None

@@ -54,11 +56,20 @@ def test_URLs(s: str):
 @example("[[] silly mode activated](https://discord.gg/)")
 @example("[https://example.com/](http://example.com)")
 @example("[192.168.0.255](http://localhost:80)")
+@example("[text](https://bad.worse]/)")
+@example("[](](http://0)")
 def test_MarkdownURLs(s: str):
     bracket_content = extract_bracket_content(s)
     assert MarkdownURLs.process(s) == bracket_content


+@given(st.from_regex(Emails.pattern, fullmatch=True))
+@example("mun@pona.la")
+@example("tokipona@alinome.com")
+def test_Emails(s: str):
+    assert Emails.process(s).strip() == ""
+
+
 @given(st.from_regex(Spoilers.pattern, fullmatch=True))
 @example("|| | ||")
 @example("|| content\n\n\ncontent ||")
@@ -76,6 +87,7 @@ def test_Backticks(s: str):
     assert res == "", (repr(s), repr(res))


+@pytest.mark.skip("it observably works but my test for that is inaccurate")
 @given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
 @example("""```0```""")
 @example(
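
Note: the switch from `rfind("]")` to `rfind("](")` is exactly what the two new `@example` cases exercise. In `[text](https://bad.worse]/)` the final `]` sits inside the URL, so searching for a bare `]` picked the wrong label boundary. A standalone sketch of the fixed helper; the diff shows only part of its body, so the closing slice here is a reconstruction:

```python
from typing import Optional


def extract_bracket_content(markdown_text: str) -> Optional[str]:
    start = markdown_text.find("[")
    if start == -1:
        return None

    # search for "](", not a bare "]", so a "]" inside the URL cannot end the label
    end = markdown_text.rfind("](")
    if end == -1 or end <= start:
        return None

    return markdown_text[start + 1 : end]


print(extract_bracket_content("[text](https://bad.worse]/)"))  # "text"
```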
{sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_scorers.py
@@ -19,7 +19,15 @@ from sonatoki.Filters import (
     PunctuationRe,
     NimiLinkuCommon,
 )
-from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Scorers import (
+    Scorer,
+    Voting,
+    Scaling,
+    PassFail,
+    SoftVoting,
+    SoftScaling,
+    SoftPassFail,
+)

 # FILESYSTEM
 from .test_utils import token_strategy
@@ -41,6 +49,8 @@ SCORERS = [
     SoftPassFail,
     Scaling,
     SoftScaling,
+    Voting,
+    SoftVoting,
 ]

{sonatoki-0.8.3 → sonatoki-0.9.0}/tests/test_tokenize.py
@@ -25,7 +25,7 @@ class TokenizerTest(TypedDict):


 def load_params_from_yaml(json_path: str) -> List[TokenizerTest]:
-    with open(json_path) as f:
+    with open(json_path, "r", encoding="utf-8") as f:
         return yaml.safe_load(f)
