sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Cleaners.py +4 -1
- sonatoki/Configs.py +52 -31
- sonatoki/Filters.py +96 -33
- sonatoki/Preprocessors.py +12 -6
- sonatoki/Scorers.py +54 -51
- sonatoki/constants.py +21 -29
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/utils.py +23 -5
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/METADATA +1 -1
- sonatoki-0.3.3.dist-info/RECORD +18 -0
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/WHEEL +1 -1
- sonatoki-0.3.1.dist-info/RECORD +0 -18
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Cleaners.py
CHANGED
@@ -10,6 +10,7 @@ class Cleaner(ABC):
     @classmethod
     @abstractmethod
     def clean(cls, token: str) -> str:
+        """Transform a token to remove some undesirable part."""
        raise NotImplementedError
 
 
@@ -33,7 +34,8 @@ class ConsecutiveDuplicates(Cleaner):
     may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".
 
     This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
-    incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""
+    incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+    """
 
     @classmethod
     @override
@@ -69,4 +71,5 @@ class Lowercase(Cleaner):
 
 __all__ = [
     "ConsecutiveDuplicates",
+    "Lowercase",
 ]
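The new `clean` docstring pins down the cleaner contract: a classmethod from token to token. A minimal sketch of the two exported cleaners in use, assuming the behavior their docstrings describe (the sample tokens are illustrative):

```python
from sonatoki.Cleaners import ConsecutiveDuplicates, Lowercase

# Emphasis-lengthened letters collapse, per the ConsecutiveDuplicates docstring
assert ConsecutiveDuplicates.clean("sonaaaa") == "sona"
# Lowercase is trivial, and is now exported via __all__
assert Lowercase.clean("Toki") == "toki"
```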
sonatoki/Configs.py
CHANGED
@@ -5,22 +5,23 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
-    EnglishIgnorables,
     NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
@@ -28,12 +29,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )
 
@@ -48,7 +46,7 @@ class IloConfig(TypedDict):
     passing_score: Number
 
 
-# TODO: branching configs?
+# TODO: branching configs? config builder?
 
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
@@ -62,14 +60,14 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +75,9 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,36 +86,58 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
-
-
+"""Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
+"""This is extremely silly."""
+IsipinEpikuConfig: IloConfig = {
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [
+        OrMemberFilter(
+            NimiKuSuli,
+            NimiKuLili,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -127,6 +147,7 @@ DiscordConfig: IloConfig = {
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
 ForumConfig: IloConfig = deepcopy(PrefConfig)
 
+
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
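Every config above is a plain `IloConfig` dict, so swapping behavior is a matter of splatting a different dict into the `Ilo` constructor. A minimal sketch, assuming the `Ilo(**config)` pattern shown in the package README (the sample sentence is illustrative):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, CorpusConfig

ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi lukin e ni: sina pona"))

# The corpus-oriented config accepts far more vocabulary
corpus_ilo = Ilo(**CorpusConfig)
```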
sonatoki/Filters.py
CHANGED
@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError
 
 
+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"
 
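Because `MinLen.__new__` returns a class rather than an instance, `MinLen` doubles as a factory for length-gated variants of any filter; the `Long*` filters added below are exactly this pattern with a fixed `length`. A sketch of both spellings from the docstring (`Alphabetic3` and `Alphabetic4` are hypothetical names):

```python
from sonatoki.Filters import Alphabetic, MinLen

# Factory form: build a length-gated Alphabetic on the fly
Alphabetic3 = MinLen(Alphabetic, 3)
assert not Alphabetic3.filter("mi")  # shorter than 3: rejected before Alphabetic runs
assert Alphabetic3.filter("sona")    # long enough, so Alphabetic decides

# Inheritance form, MinLen first so super().filter() resolves to Alphabetic
class Alphabetic4(MinLen, Alphabetic):
    length = 4
```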
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):
 
 
 class Miscellaneous(MemberFilter):
-    tokens = set(ALLOWABLES)
+    tokens = prep_dictionary(ALLOWABLES)
 
 
 class EnglishIgnorables(MemberFilter):
-    tokens = set(IGNORABLES)
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)
 
 
 class ProperName(Filter):
@@ -95,9 +127,11 @@ class ProperName(Filter):
     When Toki Pona is written with the Latin alphabet, names are generally
     capitalized at their start. This filter identifies those tokens.
 
-    Note that this alone cannot determine if a token is a valid name,
-    a standalone name is considered invalid in Toki Pona- names
-    This tool only examines one token at a time.
+    Note that this alone cannot determine if a token is a valid name,
+    because a standalone name is considered invalid in Toki Pona- names
+    generally have head nouns. This tool only examines one token at a
+    time, so cannot detect names any better than identifying their
+    capital letter.
     """
 
     @classmethod
@@ -109,6 +143,10 @@ class ProperName(Filter):
     # this will errantly match.
 
 
+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)
 
@@ -151,12 +189,14 @@ class NimiUCSUR(MemberFilter):
 
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+
     Excludes both consecutive nasals and the illegal syllables:
     - "nm", "nn"
     - "wu", "wo", "ji", "ti"
 
     Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
-    "nn" cannot be found."""
+    "nn" cannot be found.
+    """
 
     pattern = re.compile(
         rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
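The gap between the two regex filters shows up with a token that is syllabically well-formed but phonotactically illegal. A sketch, assuming the regexes match their docstrings; `wuwojiti` is an invented counterexample:

```python
from sonatoki.Filters import Phonotactic, Syllabic

assert Phonotactic.filter("kijetesantakalu")  # every syllable is legal
assert not Phonotactic.filter("wuwojiti")     # "wu", "wo", "ji", "ti" are excluded
assert Syllabic.filter("wuwojiti")            # shape is fine; exceptions not checked
```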
@@ -166,10 +206,16 @@ class Phonotactic(RegexFilter):
     )
 
 
+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
-
-
+
+    Words must have correctly ordered vowels and consonants, but the
+    phonotactic exceptions are not considered.
+    """
 
     # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
     # Alterative I was exploring takes ~15% more steps
@@ -179,6 +225,10 @@ class Syllabic(RegexFilter):
     )
 
 
+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)
 
@@ -187,19 +237,19 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
-class
-
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3
 
 
 class Numeric(Filter):
-    """Determine if a given token is entirely numeric.
-
+    """Determine if a given token is entirely numeric. Covers all numeric
+    symbols in Unicode.
 
     This will fail to find numeric tokens such as "1.111" or "-42",
     but if used with the aggressive tokenizer designed for `tok`, these will be
     split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
-    numeric tokens will be split from their punctuation."""
+    numeric tokens will be split from their punctuation.
+    """
 
     @classmethod
     @override
@@ -209,13 +259,17 @@ class Numeric(Filter):
 
 
 class Punctuation(SubsetFilter):
-    """Identify whether a token is entirely punctuation. Fastest implementation."""
+    """Identify whether a token is entirely punctuation.
+
+    Fastest implementation.
+    """
 
     tokens = set(ALL_PUNCT)
 
 
 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
+
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
@@ -223,7 +277,8 @@ class PunctuationRe(RegexFilter):
 
 
 class PunctuationRe1(Regex1Filter):
-    """Reference implementation for identifying tokens made entirely of punctuation."""
+    """Reference implementation for identifying tokens made entirely of
+    punctuation."""
 
     pattern = regex.compile(
         rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
@@ -235,14 +290,16 @@ class OrFilter:
     returning True when any individual filter matches or False otherwise.
     Requires at least two filters.
 
-    OrFilter exists as a compromise between the need to score some
-    while not adding custom behavior to scorers.
-
-    but this would require cleaning the user's input, and
-    It also would not have been as powerful- I
-    or to not give users
+    OrFilter exists as a compromise between the need to score some
+    filters equally, while not adding custom behavior to scorers. I
+    could have allowed a position to have a list of filters instead of
+    one filter, but this would require cleaning the user's input, and
+    nested handling of lists. It also would not have been as powerful- I
+    would need another param for the and/or switch, or to not give users
+    the choice.
 
-    Instead, the user is responsible for building an OrFilter out of
+    Instead, the user is responsible for building an OrFilter out of
+    their desired filters.
     """
 
     @staticmethod
@@ -266,11 +323,9 @@ class OrFilter:
         if not len(filters) >= 2:
             raise ValueError("Provide at least two Filters to OrFilter.")
 
-        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(member_filters) >= 2:
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
 
         filter = cls.__generic_filter(*filters)
 
@@ -279,7 +334,7 @@ class OrFilter:
 
 class OrMemberFilter:
     @staticmethod
-    def
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)
 
@@ -291,14 +346,17 @@ class OrMemberFilter:
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.
+        filter = cls.__member_filter(*filters_)
         return filter
 
 
-class AndFilter
+class AndFilter:
     """Instantiate with more than one filter to compose them into one filter,
-    returning False when any individual filter fails to match or True
-    otherwise. Requires at least two filters."""
+    returning False when any individual filter fails to match or True
+    otherwise.
+
+    Requires at least two filters.
+    """
 
     def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
         if not len(filters_) >= 2:
@@ -323,6 +381,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
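The two combinators differ in mechanism: `OrMemberFilter` unions the member filters' token sets into one set-membership test, while `OrFilter` wraps arbitrary filters and calls each in turn. A sketch under that reading (the variable names are hypothetical):

```python
from sonatoki.Filters import (
    NimiLinkuCommon,
    NimiLinkuCore,
    OrFilter,
    OrMemberFilter,
    ProperName,
    Syllabic,
)

# MemberFilters share a token-set shape, so one merged lookup suffices
CoreOrCommon = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon)

# Arbitrary filters need their predicates tried one by one
SyllabicOrName = OrFilter(Syllabic, ProperName)
```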
sonatoki/Preprocessors.py
CHANGED
@@ -2,7 +2,7 @@
 "Preprocessors" are classes which strip content from a given string prior to tokenization.
 There are currently two distinct types of Preprocessor:
 
-- Remove a token from a string which would be difficult to identify after tokenization.
+- Remove a token from a string which would be difficult to identify after tokenization.
     - URLs
     - DiscordEmotes
 - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
@@ -61,21 +61,24 @@ Ignorables are tokens which do not count toward the accepted number of tokens
 or the total number of tokens.
 This is generally because they are considered external to Toki Pona.
 
-It is likely that every user will want to use these.
+It is likely that every user will want to use these.
 Not having them will cause many false negatives, such as when a URL is divided
 into its parts and checked as a token.
 """
 
 
 class URLs(RegexPreprocessor):
-    """Remove http(s) protocol URLs"""
+    """Remove http(s) protocol URLs."""
 
     pattern = re.compile(r"https?:\/\/\S+")
 
 
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
-    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    Often used to fetch articles on Wikipedia, or Magic the Gathering
+    cards.
+    """
 
     pattern = re.compile(r"\[\[.+\]\]")
 
@@ -100,7 +103,10 @@ class DiscordSpecial(RegexPreprocessor):
 
 class AngleBracketObject(RegexPreprocessor):
     """A generalized version of the Discord-specific angle bracket objects.
-    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    Removes any contiguous (not broken by whitespace) text in angle
+    brackets.
+    """
 
     pattern = re.compile(r"<[^<>\s]+>")
 
@@ -111,7 +117,7 @@ The following classes are Containers.
 Containers are a special case of Ignorables, where an entire segment of an input
 may be removed and not counted toward the accepted or total number of tokens.
 
-Some users may prefer to use these so that they may quote third parties who
+Some users may prefer to use these so that they may quote third parties who
 would likely be using a language other than Toki Pona.
 """
 
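Preprocessors compose by simple chaining: each strips its pattern and hands the string on. A sketch, assuming each preprocessor exposes a `process` classmethod taking and returning a string (the method name and sample text are assumptions, not shown in this diff):

```python
from sonatoki.Preprocessors import URLs, Reference, AngleBracketObject

text = "sina toki e ni [[lipu Wikipesija]] <:wave:12345> https://example.com"
for preprocessor in (AngleBracketObject, Reference, URLs):
    text = preprocessor.process(text)
# roughly "sina toki e ni" remains for the tokenizer
```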
sonatoki/Scorers.py
CHANGED
@@ -13,22 +13,52 @@ Number = Union[int, float]
 Weights = Dict[str, Number]
 
 
-def sigmoid(n: int) -> Number:
-    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-    # n-1 makes sigmoid(1) == 0.5
-    # 0.30 softens scaling in favor of short input
-    # return n / (1+abs(n)) # too weak in 0.7+
-
-
 class Scorer(ABC):
     @classmethod
     @abstractmethod
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        """Score a list of tokens using the given `Filter`s, returning a
+        `Number` between 0 and 1 inclusive."""
         raise NotImplementedError
 
 
+class Soften(Scorer):
+    """Meta `Scorer` which scales the scores of short messages to reduce the
+    impact of shortness on scoring.
+
+    The scores of short messages are scaled by mapping the token count
+    to [0.5, 1.0] via the sigmoid function, then raising the score to
+    the resultant power.
+
+    For example, a single token scoring 0.64 will score 0.8 instead.
+    """
+
+    @staticmethod
+    def sigmoid(n: int) -> Number:
+        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+        # n-1 makes sigmoid(1) == 0.5
+        # 0.30 softens scaling in favor of short input
+        # return n / (1+abs(n)) # too weak in 0.7+
+
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        percentage = super().score(tokens, filters)  # type: ignore [abstractmethod]
+        len_tokens = len(tokens)
+        percentage **= cls.sigmoid(len_tokens)
+        return percentage
+
+    def __new__(cls, scorer: Type[Scorer]) -> Type[Scorer]:
+        class SoftenedScorer(Soften, scorer): ...
+
+        return SoftenedScorer
+
+
 class PassFail(Scorer):
-    """
+    """If a token matches any filter, it scores 1.
+
+    Otherwise, it scores 0.
+    """
 
     @classmethod
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
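The numbers in the `Soften` docstring follow directly from its sigmoid. A standalone check that restates the curve rather than importing it:

```python
import math

def sigmoid(n: int) -> float:
    # same curve as Soften.sigmoid: n - 1 pins sigmoid(1) at exactly 0.5
    return 1 / (1 + math.exp(-(0.30 * (n - 1))))

assert sigmoid(1) == 0.5                      # one token: take the square root
assert abs(0.64 ** sigmoid(1) - 0.8) < 1e-12  # the docstring's 0.64 -> 0.8 example
assert 0.93 < sigmoid(10) < 0.94              # longer messages are barely softened
```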
@@ -50,28 +80,17 @@ class PassFail(Scorer):
         return total_score / len_tokens if len_tokens else 0
 
 
-class SoftPassFail(PassFail):
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
-
-        total_score = 0
-        len_tokens = len(tokens)
-        for token in tokens:
-            total_score += cls.score_token(token, filters)
-
-        percentage = total_score / len_tokens if len_tokens else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class Scaling(Scorer):
+    """Tokens score 1 for matching the first filter, and a linearly reduced
+    amount for matching later filters based on how many filters there are.
 
+    For example, if there are 4 filters, a token scores 1.0, 0.75, 0.50,
+    and 0.25 for matching each respectively.
 
-
-
-
-
-    This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
+    In other words, filter order matters, weighing earlier listed
+    filters higher than later ones. This is desirable to avoid messages
+    which would only match weaker filters, as these are less likely to
+    be Toki Pona.
     """
 
     @classmethod
@@ -95,33 +114,17 @@ class Scaling(Scorer):
         return total_score / max_score if max_score else 0
 
 
-class SoftScaling(Scaling):
-    """
-
-    then raising the score to the resultant power.
-    For example, a single token scoring 0.64 will now score 0.8.
-    """
+class SoftPassFail(Soften, PassFail):
+    """Same as `PassFail`, but shorter messages are subject to less harsh
+    scoring."""
 
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
 
-
-
-
-
-        max_score = len_tokens * len_filters
-        for token in tokens:
-            total_score += cls.score_token(token, filters, len_filters)
-
-        percentage = total_score / max_score if max_score else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class SoftScaling(Soften, Scaling):
+    """Same as `Scaling`, but shorter messages are subject to less harsh
+    scoring."""
 
 
-class Logarithmic(Scorer): ...
+# class Logarithmic(Scorer): ...
 
 
 __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
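The `Scaling` docstring's example numbers come from a straight linear ramp over filter positions. A hypothetical restatement of that per-token weighting (the real method's signature may differ):

```python
def scale(filter_index: int, len_filters: int) -> float:
    # earlier filters (lower index) weigh more, falling off linearly
    return (len_filters - filter_index) / len_filters

# four filters, as in the docstring: 1.0, 0.75, 0.5, 0.25
assert [scale(i, 4) for i in range(4)] == [1.0, 0.75, 0.5, 0.25]
```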
|