sonatoki 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +41 -30
- sonatoki/Filters.py +121 -24
- sonatoki/constants.py +74 -38
- sonatoki/utils.py +14 -1
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/METADATA +28 -17
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/RECORD +8 -8
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/WHEEL +0 -0
- {sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED

```diff
@@ -1,36 +1,36 @@
 # STL
 from copy import deepcopy
-from typing import List, Type,
+from typing import List, Type, TypedDict
 
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
-    NimiLinku,
-    NimiPuAle,
     NimiUCSUR,
     Alphabetic,
     ProperName,
-    Phonotactic,
     Punctuation,
-
+    LongSyllabic,
+    Miscellaneous,
+    NimiLinkuCore,
+    LongAlphabetic,
+    LongProperName,
+    OrMemberFilter,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
     NimiLinkuSandbox,
     EnglishIgnorables,
+    NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )
 
@@ -59,14 +59,14 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -74,14 +74,22 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(
+            NimiLinkuCore,
+            NimiLinkuCommon,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+            NimiUCSUR,
+            Miscellaneous,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -89,25 +97,28 @@ CorpusConfig: IloConfig = {
 }
 
 
+"""
+Mimics the previous implementation of ilo pi toki pona taso
+"""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
-
-
-
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
    ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
```
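The practical effect of these config changes: every non-Base config now runs `Backticks` first and scores with a merged member filter plus the new `Long*` filters. A minimal sketch of exercising the updated `PrefConfig`; the `Ilo(**config)` pattern is from the README quoted in the METADATA diff below, while `is_toki_pona` and its result here are assumptions, not verified against this release:

```python
# Minimal sketch, assuming the Ilo(**config) pattern from the project README.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# Backticks is now the first preprocessor, so inline code spans are stripped
# before tokenization and scoring.
print(ilo.is_toki_pona("mi `unrelated_code()` sona ala"))
```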
sonatoki/Filters.py
CHANGED

```diff
@@ -9,6 +9,7 @@ import regex
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
@@ -17,13 +18,17 @@ from sonatoki.constants import (
     ALLOWABLES,
     CONSONANTS,
     IGNORABLES,
-    NIMI_LINKU,
     NIMI_UCSUR,
-
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
     UCSUR_PUNCT_RANGES,
+    NIMI_LINKU_UNCOMMON,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -37,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError
 
 
+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"
 
```
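Both spellings of a length-gated filter named in `MinLen`'s docstring produce the same kind of class; a short sketch, with illustrative tokens:

```python
# Sketch: both forms gate on length, then defer to Alphabetic.
# LongAlphabetic is the prebuilt subclass added in this release; MinLen(...)
# builds an equivalent class at runtime via MinLen.__new__.
from sonatoki.Filters import Alphabetic, LongAlphabetic, MinLen

Min3Alphabetic = MinLen(Alphabetic, 3)

assert not LongAlphabetic.filter("mi")  # shorter than 3: rejected outright
assert LongAlphabetic.filter("sitelen")
assert Min3Alphabetic.filter("sitelen") == LongAlphabetic.filter("sitelen")
```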
```diff
@@ -78,11 +110,16 @@ class SubsetFilter(Filter):
 
 
 class Miscellaneous(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(ALLOWABLES)
 
 
 class EnglishIgnorables(MemberFilter):
-
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)
 
 
 class ProperName(Filter):
@@ -104,28 +141,48 @@ class ProperName(Filter):
     # this will errantly match.
 
 
+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_PU)
+
+
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+
+
+class NimiKuSuli(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_SULI)
+
 
+class NimiKuLili(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_LILI)
 
-class NimiPuAle(MemberFilter):
-    tokens = set(NIMI_PU + NIMI_PU_SYNONYMS)
 
+class NimiLinkuCore(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_CORE)
 
-class NimiLinku(MemberFilter):
-    tokens = set(NIMI_LINKU)
 
+class NimiLinkuCommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_COMMON)
 
-
-
+
+class NimiLinkuUncommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+
+
+class NimiLinkuObscure(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
 
 
 class NimiLinkuSandbox(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
 
 
 class NimiUCSUR(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_UCSUR)
 
 
 class Phonotactic(RegexFilter):
@@ -145,6 +202,10 @@ class Phonotactic(RegexFilter):
     )
 
 
+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
     Words must have correctly ordered vowels and consonants, but the phonotactic
@@ -158,6 +219,10 @@ class Syllabic(RegexFilter):
     )
 
 
+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)
 
@@ -166,9 +231,8 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3
 
 
 class Numeric(Filter):
@@ -224,11 +288,10 @@ class OrFilter:
     Instead, the user is responsible for building an OrFilter out of their desired filters.
     """
 
-
-
-        raise ValueError("Must provide at least two Filters to OrFilter.")
+    @staticmethod
+    def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
 
-        class
+        class CombinedFilter(Filter):
             filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
             @classmethod
@@ -240,7 +303,37 @@ class OrFilter:
                         return True
                 return False
 
-        return
+        return CombinedFilter
+
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
+
+        filter = cls.__generic_filter(*filters)
+
+        return filter
+
+
+class OrMemberFilter:
+    @staticmethod
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+        all_token_sets: List[Set[str]] = [f.tokens for f in filters]
+        all_tokens: Set[str] = set().union(*all_token_sets)
+
+        class CombinedFilter(MemberFilter):
+            tokens = all_tokens
+
+        return CombinedFilter
+
+    def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
+        filter = cls.__member_filter(*filters_)
+        return filter
 
 
 class AndFilter(Filter):
```
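The difference between the two combinators, in short: `OrMemberFilter` unions the member filters' token sets once at class-creation time, so the combined filter is still a single set lookup per token, while `OrFilter` tries each wrapped filter in turn. A sketch, with illustrative tokens:

```python
# Sketch of the new combinator: the token sets are merged up front, so the
# combined filter costs one set-membership check per token.
from sonatoki.Filters import NimiLinkuCommon, NimiLinkuCore, NimiUCSUR, OrMemberFilter

CommonWords = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR)

assert CommonWords.filter("toki")       # a core word, found in the merged set
assert not CommonWords.filter("toki3")  # in none of the merged sets
```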
```diff
@@ -271,11 +364,15 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
-    "
-    "
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
+    "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
-    "
+    "NimiPuSynonyms",
     "NimiUCSUR",
     "Numeric",
     "OrFilter",
```
sonatoki/constants.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path
 
 # LOCAL
@@ -383,37 +383,62 @@ LANGUAGE = "english" # for NLTK
 
 """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
 ALLOWABLES = {
-    "cw",  # Content Warning
     "x",  # ala
     "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
 }
 
-
-#
-"
-"
-"
-    "i",
-    "in",
-    "is",
-    "l",  # they'll
-    "m",  # i'm
-    "me",
-    "no",
-    "s",  # let's
-    "so",
-    "t",  # don't
-    "to",
-    "u",  # you
-    "we",
-    "un",  # un-
-    "use",
+PHONOMATCHES = {
+    # "a",  # ignore
+    # "an",  # against
+    # "i",  # against
+    # "in",  # against
     "some",
-    "like",
+    "like",  # against
+    # "me",  # against
+    # "no",  # against
+    # "on",  # against
+    # "se",  # against
+    # "so",  # against
+    # "some",  # against
+    "to",  # ignore
+    # "u",  # against
+    # "un",  # against
+    "use",  # against
+    # "we",  # against
+}
+
+ALPHABETIC_MATCHES = PHONOMATCHES | {
+    "a",
+    # "am",
+    # "as",
+    # "at",
+    # "aw",  # aww
+    # "ek",  # eek
+    # "ew",
+    # "ik",
+    # "il",  # ill
+    # "im",
+    # "im",
+    # "ip",
+    # "is",
+    # "it",
+    # "l",  # they'll
+    # "m",  # i'm
+    # "ok",
+    # "op",
+    # "ow",
+    # "s",  # let's
+    # "t",  # don't
+    # "up",
+    # "us",
+    # "ut",
+    # "uw",
 }
 
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
+
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977",  # pu
     "\\U000F1978-\\U000F1988",  # ku suli
```
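Note that `PHONOMATCHES` and the literal it is combined with are sets, so `|` here is plain set union; with most entries commented out, the new constants reduce to a handful of words. A sketch of what actually survives the comments above:

```python
# Sketch: the unions reduce to small sets once the commented entries are gone.
PHONOMATCHES = {"some", "like", "to", "use"}
ALPHABETIC_MATCHES = PHONOMATCHES | {"a"}  # set union, not a dict merge
IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES

assert IGNORABLES == {"a", "like", "some", "to", "use"}
```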
```diff
@@ -421,24 +446,31 @@ UCSUR_RANGES = [
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 
+
+# NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
+# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
+
+
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}
+
+
 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU
-    NIMI_PU_SYNONYMS
-
-
-
-
-
-
-
-
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
+
+    NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
+    NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+
+    NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
+    NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
+    NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
+    NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
 
 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX
-        d["word"] for d in sandbox.values()
-    ]
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 del linku
 del sandbox
```
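For the shape of the data `category_helper` consumes, here is a sketch against a hand-built two-entry dict; the sample entries are invented, but they use the same `word`, `book`, and `usage_category` fields the loader reads above:

```python
# Sketch with invented sample data mirroring the fields used above.
def category_helper(data, key, value):
    return {d["word"] for d in data.values() if d[key] == value}

linku_sample = {
    "toki": {"word": "toki", "book": "pu", "usage_category": "core"},
    "kipisi": {"word": "kipisi", "book": "none", "usage_category": "uncommon"},
}

assert category_helper(linku_sample, "book", "pu") == {"toki"}
assert category_helper(linku_sample, "usage_category", "uncommon") == {"kipisi"}
```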
```diff
@@ -449,9 +481,13 @@ __all__ = [
     "ALL_PUNCT_RANGES",
     "ALPHABET",
     "CONSONANTS",
-    "
-    "
+    "NIMI_KU_LILI",
+    "NIMI_KU_SULI",
+    "NIMI_LINKU_COMMON",
+    "NIMI_LINKU_CORE",
+    "NIMI_LINKU_OBSCURE",
     "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_UNCOMMON",
     "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
```
sonatoki/utils.py
CHANGED

```diff
@@ -1,10 +1,23 @@
 # STL
 import re
-from typing import List
+from typing import Set, List, Iterable
+
+# LOCAL
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
 
 TO_ESCAPE = ["\\", "^", "[", "]", "-"]
 
 
+def prep_dictionary(words: Iterable[str]) -> Set[str]:
+    out: Set[str] = set()
+    cleaners = [Lowercase, ConsecutiveDuplicates]
+    for word in words:
+        for c in cleaners:
+            word = c.clean(word)
+        out.add(word)
+    return out
+
+
 def regex_escape(s: str) -> str:
     """Escape all characters which must be escaped when embedded in a character class."""
     for c in TO_ESCAPE:
```
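`prep_dictionary` pushes every dictionary entry through the same cleaners that incoming tokens go through, so `MemberFilter` lookups compare like with like. A sketch; the "manna" to "mana" collapse matches the `ConsecutiveDuplicates` example in the README below:

```python
# Sketch: entries are normalized the way tokens are, so a MemberFilter built
# from this set can actually match cleaned input.
from sonatoki.utils import prep_dictionary

assert prep_dictionary(["Namako", "manna"]) == {"namako", "mana"}
```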
{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.
+Version: 0.3.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -12,15 +12,22 @@ Description-Content-Type: text/markdown
 
 # sona toki
 
+<div align="center">
+
+![Test workflow for this library]
+[![Version number for this library]](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -53,12 +60,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
@@ -88,24 +95,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly caused by applying Latin alphabet filters to non-Latin text. Working on it!
```
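The last FAQ bullet is easy to trip over when extending the library; a sketch of the failure mode and the fix, with an invented word list for illustration:

```python
# Sketch: raw entries with capitals or doubled letters would never match
# cleaned tokens; prep_dictionary normalizes them first.
from sonatoki.Filters import MemberFilter
from sonatoki.utils import prep_dictionary

class MyWords(MemberFilter):
    tokens = prep_dictionary({"Namako", "manna"})  # stored as {"namako", "mana"}

assert MyWords.filter("namako")
```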
{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/RECORD
CHANGED

```diff
@@ -1,18 +1,18 @@
-sonatoki-0.3.
-sonatoki-0.3.
-sonatoki-0.3.
+sonatoki-0.3.2.dist-info/METADATA,sha256=9cnhaaYFLxN3uaubD0jfTAU_CC9wUGtzho4fs1UGLFc,6341
+sonatoki-0.3.2.dist-info/WHEEL,sha256=vnE8JVcI2Wz7GRKorsPArnBdnW2SWKWGow5gu5tHlRU,90
+sonatoki-0.3.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=m0j1a1vs9Mdqp724r9Xfh1Y_tyP6GYCkihv8rH8m7lA,1871
-sonatoki/Configs.py,sha256=
-sonatoki/Filters.py,sha256
+sonatoki/Configs.py,sha256=o_uFp-Z6sbhbMi8drgQTkdu8S5LaTr0Xnns6Cg0cHSY,3548
+sonatoki/Filters.py,sha256=-7zIV_IBsbASR7pF5WuoABNtBW5a7L135Ev_Rrn35o4,10664
 sonatoki/Preprocessors.py,sha256=aMXXuFBDlJudvzvukvCa7BixuROXXEb62un7I-TGOGs,4441
 sonatoki/Scorers.py,sha256=W-1uYiqjsDejJzoe592ixs7wHazjJXPhuo-41zuJ26U,3643
 sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
-sonatoki/constants.py,sha256=
+sonatoki/constants.py,sha256=qq1_ZTsVKG_d7nqlJv3a-KS6ZvYwfUSHWA--e0BuyXc,13268
 sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
 sonatoki/linku.json,sha256=B5KNdhyM5UEfMciROgh1ECHr3i-ASBeMvwrkzNJX47c,271013
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=hx6LRsfvmmTtqXcXIyCsfSaGK3DZ-GCdbM8xhZQBHoA,77650
-sonatoki/utils.py,sha256=
-sonatoki-0.3.
+sonatoki/utils.py,sha256=OMaRyoNvKGKYQCBDjQyaCI58-wMpQ0wrrNjTJKsEZ9Y,3550
+sonatoki-0.3.2.dist-info/RECORD,,
```

{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/WHEEL
File without changes

{sonatoki-0.3.0.dist-info → sonatoki-0.3.2.dist-info}/licenses/LICENSE
File without changes