sonatoki 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.3.1 → sonatoki-0.3.2}/PKG-INFO +1 -1
- {sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml +1 -3
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py +26 -25
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py +58 -12
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py +53 -28
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py +38 -59
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py +31 -11
- sonatoki-0.3.2/tests/test_properties.py +78 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py +1 -1
- {sonatoki-0.3.1 → sonatoki-0.3.2}/LICENSE +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/README.md +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/__init__.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_cleaners.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.3.1 → sonatoki-0.3.2}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.1 → sonatoki-0.3.2}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.3.1"
+version = "0.3.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -16,8 +16,6 @@ readme = "README.md"
 [project.license]
 text = "AGPL-3.0-or-later"

-[project.optional-dependencies]
-
 [build-system]
 requires = [
     "pdm-backend",
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Configs.py

@@ -5,17 +5,17 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
@@ -28,12 +28,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )

@@ -62,14 +59,14 @@ BaseConfig: IloConfig = {


 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +74,9 @@ PrefConfig: IloConfig = {
 }

 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,10 +85,11 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-
-
-
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -99,25 +97,28 @@ CorpusConfig: IloConfig = {
 }


+"""
+Mimics the previous implementation of ilo pi toki pona taso
+"""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }

 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
         OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-
-
-
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
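Context for the configs above (an illustration, not part of the released diff): each `IloConfig` is a TypedDict of pipeline stages, and the test files further down build checkers by unpacking one into `Ilo`. A minimal usage sketch, assuming `Ilo` lives in `sonatoki.ilo` and accepts the config fields as keyword arguments, the same way the test fixtures construct it; the example strings are taken from the test data in this release:

```python
# Sketch only: mirrors how the tests below construct their `ilo` fixtures.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)  # unpack the TypedDict into the checker

# is_toki_pona() runs preprocessors, cleaners, filters, and the scorer in order.
print(ilo.is_toki_pona("mi sona ala e nimi sunopatikuna"))  # expected: True (from KNOWN_GOOD)
print(ilo.is_toki_pona("super bruh moment 64"))             # expected: False (from NON_MATCHES)
```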
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/Filters.py

@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError


+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, Filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"

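To illustrate the new meta filter (a sketch based on the docstring above, not code from the release): constructing `MinLen(SomeFilter, n)` yields a filter that first enforces the length floor and then defers to the wrapped filter, which is how the `Long*` classes further down are built.

```python
# Sketch: the name AtLeastThreeAlphabetic is made up for illustration.
from sonatoki.Filters import Alphabetic, MinLen

AtLeastThreeAlphabetic = MinLen(Alphabetic, 3)

print(AtLeastThreeAlphabetic.filter("ijo"))  # expected: True  (3+ chars, all in the toki pona alphabet)
print(AtLeastThreeAlphabetic.filter("mi"))   # expected: False (rejected by the length floor alone)
```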
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):


 class Miscellaneous(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(ALLOWABLES)


 class EnglishIgnorables(MemberFilter):
-
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)


 class ProperName(Filter):
@@ -109,6 +141,10 @@ class ProperName(Filter):
     # this will errantly match.


+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)

@@ -166,6 +202,10 @@ class Phonotactic(RegexFilter):
     )


+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
     Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -179,6 +219,10 @@ class Syllabic(RegexFilter):
     )


+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)

@@ -187,9 +231,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3


 class Numeric(Filter):
@@ -266,11 +309,9 @@ class OrFilter:
         if not len(filters) >= 2:
             raise ValueError("Provide at least two Filters to OrFilter.")

-
-        if len(
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")

         filter = cls.__generic_filter(*filters)

@@ -279,7 +320,7 @@ class OrFilter:

 class OrMemberFilter:
     @staticmethod
-    def
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)

@@ -291,7 +332,7 @@ class OrMemberFilter:
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.
+        filter = cls.__member_filter(*filters_)
         return filter


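For context on the renamed private helper (again a sketch, not part of the diff): `OrMemberFilter` returns a single `MemberFilter` whose token set is the union of its arguments, so one set lookup covers every combined dictionary. The pairing used in the tests below would behave like this:

```python
# Sketch mirroring test_OrMemberFilter in tests/test_filters.py below.
from sonatoki.Filters import NimiPu, NimiLinkuObscure, OrMemberFilter

combined = OrMemberFilter(NimiPu, NimiLinkuObscure)

print(combined.filter("toki"))  # expected: True, "toki" is in the pu dictionary
print(combined.filter("asdf"))  # expected: False, in neither token set
```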
@@ -323,6 +364,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
{sonatoki-0.3.1 → sonatoki-0.3.2}/src/sonatoki/constants.py

@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path

 # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK

 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
-    "cw", # Content Warning
     "x", # ala
     "y", # anu
     "kxk", # ken ala ken
     "wxw", # wile ala wile
 }

-
-    #
-    "
-    "
-    "
-    "i",
-    "in",
-    "is",
-    "l", # they'll
-    "m", # i'm
-    "me",
-    "no",
-    "s", # let's
-    "so",
-    "t", # don't
-    "to",
-    "u", # you
-    "we",
-    "un", # un-
-    "use",
+PHONOMATCHES = {
+    # "a", # ignore
+    # "an", # against
+    # "i", # against
+    # "in", # against
     "some",
-    "like",
+    "like", # against
+    # "me", # against
+    # "no", # against
+    # "on", # against
+    # "se", # against
+    # "so", # against
+    # "some", # against
+    "to", # ignore
+    # "u", # against
+    # "un", # against
+    "use", # against
+    # "we", # against
 }

+ALPHABETIC_MATCHES = PHONOMATCHES | {
+    "a",
+    # "am",
+    # "as",
+    # "at",
+    # "aw", # aww
+    # "ek", # eek
+    # "ew",
+    # "ik",
+    # "il", # ill
+    # "im",
+    # "im",
+    # "ip",
+    # "is",
+    # "it",
+    # "l", # they'll
+    # "m", # i'm
+    # "ok",
+    # "op",
+    # "ow",
+    # "s", # let's
+    # "t", # don't
+    # "up",
+    # "us",
+    # "ut",
+    # "uw",
+}
+
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977", # pu
     "\\U000F1978-\\U000F1988", # ku suli
@@ -426,14 +451,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) ->
-    return
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}


 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU
-    NIMI_PU_SYNONYMS
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

     NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
     NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
@@ -445,7 +470,7 @@ with open(LINKU) as f:

 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

 del linku
 del sandbox
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_filters.py

@@ -18,11 +18,13 @@ from sonatoki.Filters import (
     Phonotactic,
     Punctuation,
     AlphabeticRe,
+    LongSyllabic,
     NimiLinkuCore,
     PunctuationRe,
-
+    LongAlphabetic,
     OrMemberFilter,
     PunctuationRe1,
+    LongPhonotactic,
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
@@ -34,7 +36,6 @@ from sonatoki.constants import (
     NIMI_KU_LILI,
     NIMI_KU_SULI,
     NIMI_LINKU_CORE,
-    NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
@@ -45,7 +46,7 @@ from sonatoki.constants import (
 from .test_utils import PROPER_NAME_RE


-@given(st.sampled_from(NIMI_PU))
+@given(st.sampled_from(list(NIMI_PU)))
 @example("lukin")
 @example("selo")
 @example("li")
@@ -54,14 +55,14 @@ def test_NimiPu(s: str):
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_CORE))
+@given(st.sampled_from(list(NIMI_LINKU_CORE)))
 @example("pona")
 def test_NimiLinkuCore(s: str):
     res = NimiLinkuCore.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_COMMON))
+@given(st.sampled_from(list(NIMI_LINKU_COMMON)))
 @example("n")
 @example("tonsi")
 @example("kipisi")
@@ -70,19 +71,19 @@ def test_NimiLinkuCommon(s: str):
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_UNCOMMON))
+@given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
 def test_NimiLinkuUncommon(s: str):
     res = NimiLinkuUncommon.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_OBSCURE))
+@given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
 def test_NimiLinkuObscure(s: str):
     res = NimiLinkuObscure.filter(s)
     assert res, repr(s)


-@given(st.sampled_from(NIMI_LINKU_SANDBOX))
+@given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
 @example("kalamARR")
 @example("Pingo")
 def test_NimiLinkuSandbox(s: str):
@@ -101,6 +102,13 @@ def test_Phonotactic(s: str):
     assert res, repr(s)


+@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+def test_LongPhonotactic(s: str):
+    len_ok = len(s) >= LongPhonotactic.length
+    res = LongPhonotactic.filter(s)
+    assert res == len_ok, repr(s)  # will match given fullmatch
+
+
 @given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
 @example("wuwojitiwunwonjintinmanna")
 def test_Syllabic(s: str):
@@ -108,6 +116,13 @@ def test_Syllabic(s: str):
     assert res, repr(s)


+@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+def test_LongSyllabic(s: str):
+    len_ok = len(s) >= LongSyllabic.length
+    res = LongSyllabic.filter(s)
+    assert res == len_ok
+
+
 @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
 @example("muems")
 @example("mpptp")
@@ -118,6 +133,13 @@ def test_Alphabetic(s: str):
     assert res_fn == res_re, repr(s)


+@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+def test_LongAlphabetic(s: str):
+    len_ok = len(s) >= LongAlphabetic.length
+    res = LongAlphabetic.filter(s)
+    assert res == len_ok
+
+
 @given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
 def test_AlphabeticRe(s: str):
     res_re = AlphabeticRe.filter(s)
@@ -181,7 +203,7 @@ def test_OrFilter(s: str):
 # NOTE: No subset filter test because A | B is not the same as A combined with B.
 # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
 # but would incorrectly pass a combined filter.
-@given(st.sampled_from(NIMI_PU
+@given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
 def test_OrMemberFilter(s: str):
     filter = OrMemberFilter(NimiPu, NimiLinkuObscure)
     res = filter.filter(s)
@@ -192,11 +214,13 @@ def test_OrMemberFilter(s: str):

 @given(
     st.sampled_from(
-
-
-
-
-
+        list(
+            NIMI_KU_SULI
+            | NIMI_KU_LILI
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        ),
     )
 )
 def test_OrMemberFilter_IsipinEpiku(s: str):
@@ -216,48 +240,3 @@ def test_OrMemberFilter_IsipinEpiku(s: str):
     assert res and (
         res_ku_suli or res_ku_lili or res_uncommon or res_obscure or res_sandbox
     )
-
-
-@given(st.sampled_from(NIMI_PU + NIMI_PU_SYNONYMS))
-def test_pu_filters_non_overlap(s: str):
-    res_pu = NimiPu.filter(s)
-    res_synonyms = NimiPuSynonyms.filter(s)
-    assert (res_pu + res_synonyms) == 1
-
-
-@given(st.sampled_from(NIMI_KU_SULI + NIMI_KU_LILI))
-def test_ku_filters_non_overlap(s: str):
-    res_ku_suli = NimiKuSuli.filter(s)
-    res_ku_lili = NimiKuLili.filter(s)
-    assert (res_ku_suli + res_ku_lili) == 1
-
-
-@given(
-    st.sampled_from(
-        NIMI_LINKU_CORE
-        + NIMI_LINKU_COMMON
-        + NIMI_LINKU_UNCOMMON
-        + NIMI_LINKU_OBSCURE
-        + NIMI_LINKU_SANDBOX
-    )
-)
-def test_linku_filters_non_overlap(s: str):
-    s = Lowercase.clean(s)
-    s = ConsecutiveDuplicates.clean(s)
-
-    res_core = NimiLinkuCore.filter(s)
-    res_common = NimiLinkuCommon.filter(s)
-    res_uncommon = NimiLinkuUncommon.filter(s)
-    res_obscure = NimiLinkuObscure.filter(s)
-    res_sandbox = NimiLinkuSandbox.filter(s)
-
-    assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
-
-
-@given(st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON + NIMI_LINKU_UNCOMMON))
-def test_nimi_linku_properties(s: str):
-    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
-    assert Alphabetic.filter(s), repr(s)
-    assert Syllabic.filter(s), repr(s)
-    assert Phonotactic.filter(s), repr(s)
-    # Passing phonotactic implies all of the above
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_ilo.py

@@ -59,6 +59,8 @@ SYLLABIC_MATCHES = [
     "mi sona ala e nimi sunopatikuna",
     "kalama wuwojiti li pana e sona",
     "jan Awaja en jan Alasali en jan Akesinu li pona",  # syllables match before names here
+    "jan Ke Tami",
+    "kulupu Kuko",
 ]

 ALPHABETIC_MATCHES = [
@@ -85,13 +87,20 @@ SOME_INVALID = [
     "mi tawa ma ohio",
     "sina toki e nimi what pi toki Inli",
     "wawa la o lukin e ni: your mom",
+    "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
 ]

 CORPUS_SPECIFIC = [
-    "ki le konsi si te isipin epiku le pasila to",
+    # "ki le konsi si te isipin epiku le pasila to",
+    "ki konsi te isipin epiku pasila to",  # the sandbox has not documented si or le
     'jasima omekapo, ki nimisin "jasima enko nimisin". ki enko alu linluwi Jutu alu epiku ki epiku baba is you. ki likujo "SINtelen pona", ki epiku alu "sitelen pona". ki kepen wawajete isipin, kin ki yupekosi alu lipamanka alu wawajete, kin ki enko isipin lipamanka linluwi alu wawajete',
     "kalamARRRR",
     "Pingo",
+    "we Luke",
+]
+CORPUS_SPECIFIC_XFAIL = [
+    "How to Cut a Kiwi",
+    "a e i o u",
 ]


@@ -103,6 +112,7 @@ EXCESSIVE_SYLLABICS = [
     "I manipulate a passe pile so a ton emulate, akin to intake",
     "a ton of insolate puke. make no amen, no joke.",
     "I elope so, to an elite untaken tune, some unwise tone",
+    "insane asinine lemon awesome atone joke",
 ]

 EXCESSIVE_ALPHABETICS = [
@@ -122,11 +132,13 @@ EXCESSIVE_NAMES = [
     "I Want To Evade The Filter",
     "If You Do This The Bot Can't See You",
     "This Is A Statement In Perfect Toki Pona, I Guarantee",
-    "How to Cut a Kiwi",  # previous false positive; fixed by english ignorables
 ]

 EXCESSIVE_ENGLISH = [
     "me when i tawa sike",  # previous false positive; fixed by english ignorables
+    "Maybe I’m too nasa",  # previous false positive; fixed by LongSyllabic and LongAlphabetic
+    "I see :)",
+    "I wanna see",  # same down to here
 ]

 NON_MATCHES = [
@@ -134,6 +146,7 @@ NON_MATCHES = [
     "super bruh moment 64",
     "homestuck",
     "homestuck Homestuck",
+    "what if i went to the store ",
 ]

 KNOWN_GOOD = (
@@ -150,22 +163,23 @@ KNOWN_BAD = (
     + EXCESSIVE_ALPHABETICS
     + EXCESSIVE_NAMES
     + EXCESSIVE_TYPOES
+    + EXCESSIVE_ENGLISH
     + NON_MATCHES
 )

 FALSE_NEGATIVES = [
     # emoticon should not be a problem
-    "lete li ike x.x",
     # a token that is one edit off a known word should be allowed
     "mi pnoa",
     "tok",
     "mut",
     "poan",
     "mtue",
+    "mi nasa B^)",  # emoticon
 ]

 FALSE_POSITIVES = [
-    "
+    "insane asinine lemon awesome atone",
 ]


@@ -174,16 +188,16 @@ def test_known_good_pref(ilo: Ilo, text: str):
     assert ilo.is_toki_pona(text), text


+@pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
+def test_known_bad_pref(ilo: Ilo, text: str):
+    assert not ilo.is_toki_pona(text), text
+
+
 @pytest.mark.parametrize("text", KNOWN_GOOD + CORPUS_SPECIFIC)
 def test_known_good_corpus(corpus_ilo: Ilo, text: str):
     assert corpus_ilo.is_toki_pona(text), text


-@pytest.mark.parametrize("text", KNOWN_BAD + CORPUS_SPECIFIC)
-def test_known_bad(ilo: Ilo, text: str):
-    assert not ilo.is_toki_pona(text), text
-
-
 @pytest.mark.parametrize("text", KNOWN_BAD)
 def test_known_bad_corpus(corpus_ilo: Ilo, text: str):
     assert not corpus_ilo.is_toki_pona(text), text
@@ -209,11 +223,17 @@ def test_weakness_of_lazy(lazy_ilo: Ilo, text: str):

 @pytest.mark.xfail
 @pytest.mark.parametrize("text", FALSE_POSITIVES)
-def
+def test_false_positives_pref(ilo: Ilo, text: str):
     assert not ilo.is_toki_pona(text)


 @pytest.mark.xfail
 @pytest.mark.parametrize("text", FALSE_NEGATIVES)
-def
+def test_false_negatives_pref(ilo: Ilo, text: str):
     assert ilo.is_toki_pona(text)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
+def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
+    assert not corpus_ilo.is_toki_pona(text)
sonatoki-0.3.2/tests/test_properties.py (new file)

@@ -0,0 +1,78 @@
+# PDM
+import hypothesis.strategies as st
+from hypothesis import given
+
+# LOCAL
+from sonatoki.Filters import (
+    NimiPu,
+    Syllabic,
+    Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
+    Phonotactic,
+    NimiLinkuCore,
+    NimiPuSynonyms,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuSandbox,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
+from sonatoki.constants import (
+    NIMI_PU,
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
+    NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
+    NIMI_LINKU_SANDBOX,
+    NIMI_LINKU_UNCOMMON,
+)
+
+
+@given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+def test_pu_filters_non_overlap(s: str):
+    res_pu = NimiPu.filter(s)
+    res_synonyms = NimiPuSynonyms.filter(s)
+    assert (res_pu + res_synonyms) == 1
+
+
+@given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+def test_ku_filters_non_overlap(s: str):
+    res_ku_suli = NimiKuSuli.filter(s)
+    res_ku_lili = NimiKuLili.filter(s)
+    assert (res_ku_suli + res_ku_lili) == 1
+
+
+@given(
+    st.sampled_from(
+        list(
+            NIMI_LINKU_CORE
+            | NIMI_LINKU_COMMON
+            | NIMI_LINKU_UNCOMMON
+            | NIMI_LINKU_OBSCURE
+            | NIMI_LINKU_SANDBOX
+        )
+    )
+)
+def test_linku_filters_non_overlap(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
+
+    res_core = NimiLinkuCore.filter(s)
+    res_common = NimiLinkuCommon.filter(s)
+    res_uncommon = NimiLinkuUncommon.filter(s)
+    res_obscure = NimiLinkuObscure.filter(s)
+    res_sandbox = NimiLinkuSandbox.filter(s)
+
+    assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
+
+
+@given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+def test_nimi_linku_properties(s: str):
+    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
+    assert Alphabetic.filter(s), repr(s)
+    assert Syllabic.filter(s), repr(s)
+    assert Phonotactic.filter(s), repr(s)
+    # Passing phonotactic implies all of the above
{sonatoki-0.3.1 → sonatoki-0.3.2}/tests/test_utils.py

@@ -11,7 +11,7 @@ from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
 PROPER_NAME_RE = r"[A-Z][a-z]*"

 token_strategy = (
-    st.sampled_from(NIMI_LINKU_CORE
+    st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
     | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
     | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
     | st.from_regex(PROPER_NAME_RE, fullmatch=True)