sonatoki 0.6.1.tar.gz → 0.6.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.6.1 → sonatoki-0.6.3}/PKG-INFO +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/pyproject.toml +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Configs.py +31 -39
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Filters.py +16 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/constants.py +1 -1
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_filters.py +33 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/LICENSE +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/README.md +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/__init__.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_cleaners.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_ilo.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_properties.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_scorers.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_tokenize.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/test_utils.py +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.6.1 → sonatoki-0.6.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
--- sonatoki-0.6.1/src/sonatoki/Configs.py
+++ sonatoki-0.6.3/src/sonatoki/Configs.py
@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import Set, List, Type, TypedDict, cast
+from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
@@ -12,13 +12,11 @@ from sonatoki.Filters import (
     Not,
     Filter,
     Numeric,
-    Syllabic,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -44,6 +42,34 @@ from sonatoki.Preprocessors import (
     AngleBracketObject,
 )
 
+__DICT_PHONOMATCHES = {
+    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person singular, english
+    "ne",  # "no" in several languages
+    "nu",  # "new" in english, "now" in dutch
+    "se",  # spanish particle, english "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "je",  # 1st person pronoun, french
+    "u",  # no u
+    "we",  # 1st person plural, english
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
+}
+
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
@@ -92,8 +118,8 @@ CorpusConfig: IloConfig = {
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
-            NimiLinkuObscure,
-            NimiLinkuSandbox,
+            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
+            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -104,40 +130,6 @@ CorpusConfig: IloConfig = {
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
-
-# TODO: create a mechanism to omit tokens from a filter with more granularity
-__corpus_tokens_dict: Set[str] = cast(
-    Set[str],
-    CorpusConfig["scoring_filters"][
-        0
-    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
-)
-__corpus_tokens_dict -= {
-    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
-    # In this case, all of these appear more often in English by a factor of at least 10.
-    "aka",  # also known as
-    "an",  # article
-    "api",  # API
-    "i",  # 1st person
-    "kana",  # japanese script
-    "me",  # 1st person
-    "ne",  # "no" in several languages
-    "nu",  # "new", now in dutch
-    "se",  # spanish particle, "see"
-    "take",  # acquire, perhaps forcefully or without permission
-    "ten",  # 10
-    "to",  # to, too
-    "u",  # no u
-    "we",  # 1st person plural
-    "wi",  # wii and discussions of syllables
-    "sole",  # singular, of shoe
-    # unexplored candidates for removal
-    # "omen",  # ominous
-    # "papa",  # father
-    # "lo",  # "lo" and "loo"
-    # "ewe",  # sheep
-    # "pa",  # father- eh?
-}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
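The net effect on CorpusConfig is that the obscure and sandbox filters are now constructed with the English-lookalike words subtracted. A minimal sketch of that behavior, using only names visible in the diff (whether each listed word actually appears in the obscure or sandbox word lists is not re-checked here):

```python
from sonatoki.Filters import NimiLinkuObscure, NimiLinkuSandbox

# A few entries from __DICT_PHONOMATCHES above; sub= guarantees the constructed
# filter rejects them, whatever the underlying word lists contain.
PHONOMATCHES = {"aka", "api", "take", "sole"}

ObscureFiltered = NimiLinkuObscure(sub=PHONOMATCHES)
SandboxFiltered = NimiLinkuSandbox(sub=PHONOMATCHES)

for word in PHONOMATCHES:
    assert not ObscureFiltered.filter(word)
    assert not SandboxFiltered.filter(word)
```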

--- sonatoki-0.6.1/src/sonatoki/Filters.py
+++ sonatoki-0.6.3/src/sonatoki/Filters.py
@@ -1,7 +1,8 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set, List, Type
+from copy import deepcopy
+from typing import Set, List, Type, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
@@ -101,6 +102,20 @@ class MemberFilter(Filter):
     def filter(cls, token: str) -> bool:
         return token.lower() in cls.tokens
 
+    def __new__(
+        cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+    ) -> Type[Filter]:
+        parent_tokens = deepcopy(cls.tokens)
+        if add:
+            parent_tokens = parent_tokens.union(add)
+        if sub:
+            parent_tokens -= sub
+
+        class AnonMemberFilter(MemberFilter):
+            tokens = parent_tokens
+
+        return AnonMemberFilter
+
 
 class SubsetFilter(Filter):
     tokens: Set[str]
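This `__new__` override is what makes the `sub=` calls in Configs.py work: "calling" a `MemberFilter` subclass returns a new anonymous subclass whose token set is a deep copy of the parent's, optionally extended with `add` and reduced with `sub`. A rough usage sketch, assuming the word lists are lowercase as elsewhere in the package:

```python
from sonatoki.Filters import MemberFilter, NimiPu, NimiKuSuli

# NimiPu(add=...) does not mutate NimiPu; it returns a fresh MemberFilter
# subclass with its own copied-and-modified token set.
PuEnKuSuli = NimiPu(add=NimiKuSuli.tokens)

assert issubclass(PuEnKuSuli, MemberFilter)
assert PuEnKuSuli.filter(next(iter(NimiKuSuli.tokens)))  # added words now pass
assert NimiPu.tokens == NimiPu(sub=set()).tokens  # a plain copy equals the original set
```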

--- sonatoki-0.6.1/src/sonatoki/constants.py
+++ sonatoki-0.6.3/src/sonatoki/constants.py
@@ -501,7 +501,7 @@ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 SENTENCE_PUNCT = """.?!:;()[-]·•…"""
 # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
 
-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
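The one-character change adds the typographic apostrophe (U+2019) to the characters allowed inside a word. The snippet below is not sonatoki's tokenizer, only a hypothetical regex sketch of why that matters for text written with curly quotes:

```python
import re

INTRA_WORD_PUNCT = """-'’"""  # value from the diff above

# Hypothetical word pattern: runs of letters optionally joined by intra-word punctuation.
WORD = re.compile(rf"[^\W\d_]+(?:[{re.escape(INTRA_WORD_PUNCT)}][^\W\d_]+)*")

print(WORD.findall("o’clock isn't split"))  # ['o’clock', "isn't", 'split']
```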

--- sonatoki-0.6.1/tests/test_filters.py
+++ sonatoki-0.6.3/tests/test_filters.py
@@ -280,3 +280,36 @@ def test_AndNotFilter(s: str):
     if res_fp:
         # syl matched- but if fp matches, then the composed filter should not match
         assert not res_composed
+
+
+@given(st.sampled_from(list(NIMI_PU | NIMI_KU_SULI)))
+def test_AddTokensToMemberFilter(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+    assert PuEnKuSuliFilter.filter(s)
+
+
+@given(st.sampled_from(list(NIMI_LINKU_SANDBOX | NIMI_KU_LILI)))
+def test_AddTokensToMemberFilterNegative(s: str):
+    PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+    assert not PuEnKuSuliFilter.filter(s)
+
+
+@given(
+    st.sampled_from(
+        list(
+            NIMI_PU
+            | NIMI_KU_SULI
+            | NIMI_KU_LILI
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        ),
+    )
+    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+)
+def test_SubTokensFromMemberFilter(s: str):
+    NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
+    # core is a strict subset of pu
+    # if kin becomes core, needs to be corrected
+
+    assert not NimiAlaFilter.filter(s)