sonatoki 0.2.2__tar.gz → 0.3.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {sonatoki-0.2.2 → sonatoki-0.3.0}/PKG-INFO +1 -1
- {sonatoki-0.2.2 → sonatoki-0.3.0}/pyproject.toml +1 -1
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Cleaners.py +7 -0
- sonatoki-0.3.0/src/sonatoki/Configs.py +129 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Filters.py +86 -6
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Tokenizers.py +33 -17
- sonatoki-0.3.0/src/sonatoki/constants.py +462 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/utils.py +26 -1
- sonatoki-0.3.0/tests/__init__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_filters.py +1 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_ilo.py +92 -35
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_tokenize.py +28 -27
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/tokenize_cases/tokenize_words_tok.yml +44 -0
- sonatoki-0.2.2/src/sonatoki/Configs.py +0 -80
- sonatoki-0.2.2/src/sonatoki/constants.py +0 -72
- {sonatoki-0.2.2 → sonatoki-0.3.0}/LICENSE +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/README.md +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/linku.json +0 -0
- /sonatoki-0.2.2/tests/__init__.py → /sonatoki-0.3.0/src/sonatoki/py.typed +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_scorers.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/test_utils.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
sonatoki-0.3.0/src/sonatoki/Configs.py (new file)
@@ -0,0 +1,129 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, Union, TypedDict
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numeric,
+    OrFilter,
+    Syllabic,
+    NimiLinku,
+    NimiPuAle,
+    NimiUCSUR,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    Punctuation,
+    NimiLinkuAle,
+    NimiLinkuSandbox,
+    EnglishIgnorables,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+from sonatoki.Preprocessors import (
+    URLs,
+    Reference,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+    AngleBracketObject,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+# TODO: branching configs?
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+PrefConfig: IloConfig = {
+    "preprocessors": [URLs, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinku, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+CorpusConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinkuSandbox, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+LazyConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scorer": SoftPassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+DiscordConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrFilter(NimiLinku, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "BaseConfig",
+    "CorpusConfig",
+    "DiscordConfig",
+    "ForumConfig",
+    "IloConfig",
+    "LazyConfig",
+    "PrefConfig",
+    "TelegramConfig",
+]
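For context, each of these configs is meant to be unpacked into the `Ilo` constructor (the `IloConfig` keys mirror its keyword arguments). A minimal usage sketch, assuming `Ilo(**config)` and the `is_toki_pona` method as shown in the package README; actual results depend on the bundled word lists:

    # Sketch: consuming one of the configs added above.
    from copy import deepcopy

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("mi kama sona e toki pona")     # expected: True
    ilo.is_toki_pona("this is an english sentence")  # expected: False

    # TelegramConfig and ForumConfig are deepcopies of PrefConfig, so a custom
    # variant can be derived the same way without mutating the shared config:
    MyConfig = deepcopy(PrefConfig)
    MyConfig["passing_score"] = 0.7  # hypothetical threshold tweak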
{sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Filters.py
@@ -1,7 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set
+from typing import Set, List, Type
 from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
@@ -13,15 +13,17 @@ from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
     ALPHABET,
+    ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
+    IGNORABLES,
     NIMI_LINKU,
-
-    UNICODE_PUNCT,
+    NIMI_UCSUR,
     NIMI_LINKU_LILI,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_SANDBOX,
+    UCSUR_PUNCT_RANGES,
 )

 regex.DEFAULT_VERSION = regex.VERSION1
@@ -79,6 +81,10 @@ class Miscellaneous(MemberFilter):
     tokens = set(ALLOWABLES)


+class EnglishIgnorables(MemberFilter):
+    tokens = set(IGNORABLES)
+
+
 class ProperName(Filter):
     """Determines if a given token is a valid name (also called a loan word).
     When Toki Pona is written with the Latin alphabet, names are generally
@@ -118,6 +124,10 @@ class NimiLinkuSandbox(MemberFilter):
     tokens = set(NIMI_LINKU + NIMI_LINKU_LILI + NIMI_LINKU_SANDBOX)


+class NimiUCSUR(MemberFilter):
+    tokens = set(NIMI_UCSUR)
+
+
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
     Excludes both consecutive nasals and the illegal syllables:
@@ -156,6 +166,11 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)


+class TwoOrMoreAlphabetic(Filter):
+    # TODO: alphabetic implementation that ignores single characters
+    pass
+
+
 class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.
@@ -175,12 +190,13 @@ class Numeric(Filter):
 class Punctuation(SubsetFilter):
     """Identify whether a token is entirely punctuation. Fastest implementation."""

-    tokens = set(
+    tokens = set(ALL_PUNCT)


 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
-    Goes out of date compared to the `regex` library if
+    Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
+    """

     pattern = re.compile(rf"[{ALL_PUNCT_RANGES}]+")

@@ -188,17 +204,81 @@ class PunctuationRe(RegexFilter):
 class PunctuationRe1(Regex1Filter):
     """Reference implementation for identifying tokens made entirely of punctuation."""

-    pattern = regex.compile(
+    pattern = regex.compile(
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
+    )
+
+
+class OrFilter:
+    """Instantiate with more than one filter to compose them into one filter,
+    returning True when any individual filter matches or False otherwise.
+    Requires at least two filters.
+
+    OrFilter exists as a compromise between the need to score some filters equally,
+    while not adding custom behavior to scorers.
+    I could have allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested handling of lists.
+    It also would not have been as powerful- I would need another param for the and/or switch,
+    or to not give users the choice.
+
+    Instead, the user is responsible for building an OrFilter out of their desired filters.
+    """
+
+    def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Must provide at least two Filters to OrFilter.")
+
+        class AnonymousOrFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if f.filter(token):
+                        return True
+                return False
+
+        return AnonymousOrFilter
+
+
+class AndFilter(Filter):
+    """Instantiate with more than one filter to compose them into one filter,
+    returning False when any individual filter fails to match or True otherwise.
+    Requires at least two filters."""
+
+    def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Must provide at least two Filters to AndFilter.")
+
+        class AnonymousAndFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if not f.filter(token):
+                        return False
+                return True
+
+        return AnonymousAndFilter


 __all__ = [
     "Alphabetic",
+    "AndFilter",
+    "EnglishIgnorables",
     "NimiLinku",
     "NimiLinkuAle",
     "NimiLinkuSandbox",
     "NimiPu",
     "NimiPuAle",
+    "NimiUCSUR",
     "Numeric",
+    "OrFilter",
     "Phonotactic",
     "ProperName",
     "Punctuation",
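The new OrFilter and AndFilter are factories: called with two or more Filter subclasses, they return a new anonymous Filter whose cached `filter()` classmethod ORs (respectively ANDs) the members. This is exactly how the configs above build `OrFilter(NimiLinku, NimiUCSUR)`. A brief sketch based on the definitions in this diff; the True/False outcomes depend on the bundled word lists:

    # Sketch: composing filters. OrFilter/AndFilter return Filter subclasses (types),
    # not instances, so filter() is called directly on the result.
    from sonatoki.Filters import OrFilter, AndFilter, Syllabic, NimiLinku, NimiUCSUR

    NimiLinkuOrUCSUR = OrFilter(NimiLinku, NimiUCSUR)
    NimiLinkuOrUCSUR.filter("toki")  # True if either member filter accepts the token

    SyllabicNimiLinku = AndFilter(Syllabic, NimiLinku)
    SyllabicNimiLinku.filter("toki")  # True only if every member filter accepts it

    # Passing fewer than two filters raises:
    # OrFilter(NimiLinku)  ->  ValueError: Must provide at least two Filters to OrFilter.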
{sonatoki-0.2.2 → sonatoki-0.3.0}/src/sonatoki/Tokenizers.py
@@ -5,16 +5,12 @@ from typing import Set, List

 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated

 # LOCAL
 from sonatoki.utils import regex_escape
-from sonatoki.
-
-    UNICODE_PUNCT,
-    SENTENCE_PUNCT,
-    ALL_PUNCT_RANGES,
-)
+from sonatoki.Filters import NimiUCSUR  # seriously this sucks
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES

 regex.DEFAULT_VERSION = regex.VERSION1

@@ -50,7 +46,12 @@ class Regex1Tokenizer(Tokenizer):


 class WordTokenizer(SetTokenizer):
-    delimiters = set(
+    delimiters = set(ALL_PUNCT)
+
+    @classmethod
+    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
+        match = s[last_match:i].split()
+        [tokens.append(t) for t in match if t]

     @classmethod
     @override
@@ -60,32 +61,47 @@ class WordTokenizer(SetTokenizer):

         tokens: List[str] = []

+        i = 0  # ensure i is bound
         last_match = 0
         last_membership = s[0] in cls.delimiters
         for i, char in enumerate(s):
             mem = char in cls.delimiters
-
+            ucsur = NimiUCSUR.filter(char)  # always "changed" means
+            changed = (mem != last_membership) or ucsur
+            # this keeps contiguous words together, but splits UCSUR
+            if not changed:
+                continue
+
+            if ucsur:
+                if i > last_match:
+                    # Add the token before UCSUR character
+                    cls.__helper(s, tokens, last_match, i)
+                # Add UCSUR character itself as a token
+                tokens.append(char)
+                last_match = i + 1
+                last_membership = mem
                 continue

-
-            # TODO: kinda sucks? what about unicode whitespace?
+            cls.__helper(s, tokens, last_match, i)
             last_match = i
             last_membership = mem
-            [tokens.append(t) for t in match if t]
-
-        match = s[last_match:].strip().split()
-        if match:
-            tokens.extend(match)

+        cls.__helper(s, tokens, last_match, i + 1)
         return tokens


+@deprecated(
+    "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe(RegexTokenizer):
     pattern = re.compile(rf"""([{ALL_PUNCT_RANGES}]+|\s+)""")


+@deprecated(
+    "WordTokenizerRe1 is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe1(Regex1Tokenizer):
-    """Reference implementation for
+    """Reference implementation for WordTokenizer."""

     pattern = regex.compile(r"""([\p{posix_punct}\p{Punctuation}]+|\s+)""")

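The reworked WordTokenizer keeps contiguous non-delimiter runs together, emits punctuation runs as their own tokens, and splits every UCSUR character into its own token via `NimiUCSUR.filter`. A short sketch, assuming the public entry point is the `tokenize` classmethod inherited from the `Tokenizer` base class; the exact splits depend on the `ALL_PUNCT` delimiter set:

    # Sketch of the new WordTokenizer behavior (tokenize() assumed from the base class).
    from sonatoki.Tokenizers import WordTokenizer

    WordTokenizer.tokenize("toki, pona!")
    # e.g. ["toki", ",", "pona", "!"] - words and punctuation runs become separate
    # tokens, and the whitespace between them is dropped by __helper's split().

    # A UCSUR toki pona glyph (a private-use codepoint matched by NimiUCSUR) is
    # emitted as its own single-character token even with no surrounding delimiters.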