sonatoki 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sonatoki/Configs.py +18 -14
- sonatoki/Filters.py +75 -45
- sonatoki/Preprocessors.py +31 -0
- sonatoki/Tokenizers.py +3 -3
- sonatoki/__main__.py +176 -3
- sonatoki/alphabetic.txt +1771 -0
- sonatoki/constants.py +236 -47
- sonatoki/ilo.py +1 -1
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/syllabic.txt +297 -0
- sonatoki/utils.py +0 -56
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/METADATA +2 -1
- sonatoki-0.5.0.dist-info/RECORD +20 -0
- sonatoki-0.4.0.dist-info/RECORD +0 -18
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
CHANGED
```diff
@@ -7,6 +7,9 @@ from typing_extensions import NotRequired
 
 # LOCAL
 from sonatoki.Filters import (
+    Or,
+    And,
+    Not,
     Filter,
     Numeric,
     Syllabic,
@@ -21,8 +24,8 @@ from sonatoki.Filters import (
     NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    OrMemberFilter,
     NimiLinkuCommon,
+    FalsePosSyllabic,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
@@ -32,6 +35,7 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Emoji,
     Backticks,
     Reference,
     Preprocessor,
@@ -63,12 +67,12 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -77,11 +81,11 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
+        Or(
             NimiLinkuCore,
             NimiLinkuCommon,
             NimiLinkuUncommon,
@@ -90,7 +94,7 @@ CorpusConfig: IloConfig = {
             NimiUCSUR,
             Miscellaneous,
         ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -99,7 +103,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -108,18 +112,18 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
+        Or(
             NimiKuSuli,
             NimiKuLili,
             NimiLinkuUncommon,
             NimiLinkuObscure,
             NimiLinkuSandbox,
         ),
-        LongSyllabic,
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
@@ -129,12 +133,12 @@ IsipinEpikuConfig: IloConfig = {
 
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-
-        LongSyllabic,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
         LongAlphabetic,
     ],
```
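The reworked configs plug into the same pipeline entry point as before. Below is a minimal usage sketch, not part of the diff: it assumes the `Ilo` class in `sonatoki/ilo.py` still accepts the config dict unpacked as keyword arguments and still exposes `is_toki_pona`, as in 0.4.x.

```python
# Minimal sketch (assumption: Ilo(**config) and is_toki_pona() are unchanged from 0.4.x).
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# Known words now score via the merged Or(...) set; unknown-but-syllabic words
# are only scored when they are not in the new false-positive word list.
print(ilo.is_toki_pona("mi olin e sina"))
```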
sonatoki/Filters.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated
 
 # LOCAL
 from sonatoki.utils import prep_dictionary
@@ -17,18 +17,21 @@ from sonatoki.constants import (
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
-    IGNORABLES,
     NIMI_UCSUR,
     NIMI_KU_LILI,
     NIMI_KU_SULI,
     NIMI_LINKU_CORE,
-    ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
     NIMI_LINKU_COMMON,
+    FALSE_POS_SYLLABIC,
     NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
-
+    NOT_IN_PUNCT_CLASS,
     NIMI_LINKU_UNCOMMON,
+    ALL_PUNCT_RANGES_STR,
+    FALSE_POS_ALPHABETIC,
+    UCSUR_PUNCT_RANGES_STR,
+    EMOJI_VARIATION_SELECTOR_RANGES_STR,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -113,13 +116,18 @@ class Miscellaneous(MemberFilter):
     tokens = prep_dictionary(ALLOWABLES)
 
 
-class 
-    """
-
-    This filter hides words from scoring rather than scoring them poorly,
-    which is more of a benefit than a loss for a word you would like to omit."""
+class FalsePosSyllabic(MemberFilter):
+    """A MemberFilter of words which would match Syllabic (and often Phonetic),
+    but are words in other languages."""
 
-    tokens = prep_dictionary(
+    tokens = prep_dictionary(FALSE_POS_SYLLABIC)
+
+
+class FalsePosAlphabetic(MemberFilter):
+    """A MemberFilter of words which would match Alphabetic, but are words in
+    other languages."""
+
+    tokens = prep_dictionary(FALSE_POS_ALPHABETIC)
 
 
 class ProperName(Filter):
@@ -273,7 +281,7 @@ class PunctuationRe(RegexFilter):
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
-    pattern = re.compile(rf"[{
+    pattern = re.compile(rf"[{ALL_PUNCT_RANGES_STR}]+")
 
 
 class PunctuationRe1(Regex1Filter):
@@ -281,22 +289,24 @@ class PunctuationRe1(Regex1Filter):
     punctuation."""
 
     pattern = regex.compile(
-        rf"[\p{{Punctuation}}\p{{posix_punct}}{
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{NOT_IN_PUNCT_CLASS}{UCSUR_PUNCT_RANGES_STR}{EMOJI_VARIATION_SELECTOR_RANGES_STR}]+"
     )
 
 
-class 
+class Or:
     """Instantiate with more than one filter to compose them into one filter,
     returning True when any individual filter matches or False otherwise.
-    Requires at least two filters.
-
-
-
-
-
-
-
-
+    Requires at least two filters. If two or more MemberFilters are provided,
+    they will be combined by creating a single set with the members of every
+    individual filter.
+
+    Or exists as a compromise between the need to score some filters
+    equally, while not adding custom behavior to scorers. I could have
+    allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested
+    handling of lists. It also would not have been as powerful- I would
+    need another param for the and/or switch, or to not give users the
+    choice.
 
     Instead, the user is responsible for building an OrFilter out of
     their desired filters.
@@ -304,7 +314,6 @@ class OrFilter:
 
     @staticmethod
     def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
-
        class CombinedFilter(Filter):
            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
 
@@ -319,20 +328,6 @@ class OrFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
-        if not len(filters) >= 2:
-            raise ValueError("Provide at least two Filters to OrFilter.")
-
-        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(member_filters) >= 2:
-            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
-
-        filter = cls.__generic_filter(*filters)
-
-        return filter
-
-
-class OrMemberFilter:
     @staticmethod
     def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
@@ -343,14 +338,24 @@ class OrMemberFilter:
 
         return CombinedFilter
 
-    def __new__(cls, *
-        if not len(
-            raise ValueError("Provide
-
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        other_filters = [f for f in filters if not issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            # we can save some effort by making a single filter out of these
+            member_filter = cls.__member_filter(*member_filters)
+            other_filters.append(member_filter)
+        else:
+            other_filters.extend(member_filters)
+
+        filter = cls.__generic_filter(*other_filters)
         return filter
 
 
-class 
+class And:
     """Instantiate with more than one filter to compose them into one filter,
     returning False when any individual filter fails to match or True
     otherwise.
@@ -377,10 +382,34 @@ class AndFilter:
         return AnonymousAndFilter
 
 
+class Not(Filter):
+    """
+    Meta filter which may be inherited by or constructed with a filter to invert its output.
+    ---
+    ```
+    from sonatoki.Filters import Alphabetic, Not
+
+    my_filter = Not(Alphabetic)
+    class MyFilter(Not, Alphabetic):
+        ...
+    ```
+    """
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not super().filter(token)
+
+    def __new__(cls, filter: Type[Filter]) -> Type[Filter]:
+        class NotFilter(Not, filter): ...
+
+        return NotFilter
+
+
 __all__ = [
     "Alphabetic",
-    "
-    "
+    "And",
+    "FalsePosSyllabic",
     "LongAlphabetic",
     "LongPhonotactic",
     "LongProperName",
@@ -391,8 +420,9 @@ __all__ = [
     "NimiPu",
     "NimiPuSynonyms",
     "NimiUCSUR",
+    "Not",
     "Numeric",
-    "
+    "Or",
     "Phonotactic",
     "ProperName",
     "Punctuation",
```
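The new `Or`, `And`, and `Not` compose filter *classes* into new anonymous filter classes, so the configs above pass the composed result where a single filter used to go. A minimal sketch of the same pattern, using only names present in this diff (the variable names and sample tokens are illustrative, not from the package):

```python
from sonatoki.Filters import And, Not, Or, FalsePosSyllabic, LongSyllabic, NimiLinkuCore, NimiLinkuCommon

# accept long syllabic tokens unless they are known false positives from other languages
StrictSyllabic = And(LongSyllabic, Not(FalsePosSyllabic))

# two MemberFilters: per the Or docstring, their token sets are merged into one set lookup
KnownWords = Or(NimiLinkuCore, NimiLinkuCommon)

for token in ("pona", "kijetesantakalu", "misikeke"):
    print(token, KnownWords.filter(token) or StrictSyllabic.filter(token))
```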
sonatoki/Preprocessors.py
CHANGED
```diff
@@ -21,6 +21,7 @@ import re
 from abc import ABC, abstractmethod
 
 # PDM
+import emoji
 import regex
 from typing_extensions import override
 
@@ -162,6 +163,34 @@ class AllQuotes(RegexPreprocessor):
     )
 
 
+class Emoji(Preprocessor):
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return emoji.replace_emoji(msg)
+
+
+class ZeroWidths(RegexPreprocessor):
+    """Remove the Zero Width Joiner and Zero Width Non-Joiner from the input.
+
+    ZWJ and ZWNJ do serve semantic purposes,
+    such as combining many person emojis into the family emojis,
+    or ensuring two characters do not become a ligature.
+    However, all emojis are considered punctuation by this library,
+    so preprocessing ZWJ out is more accurate:
+    It will leave behind the component emojis, which will be ignored.
+
+    But ZWJ cannot be considered punctuation for tokenizing purposes because it is used in the middle of words to render them differently.
+    In this vein, ZWJ is a function character.
+
+    In the future, it may be smarter to omit ZWJ in the tokenization process,
+    or to make the tokenizer smarter by having it keep together collected emojis.
+    But in order to do this, emoji would have to be accurately distinguished from all other punctuation.
+    """
+
+    pattern = re.compile("[\\U0000200C-\\U0000200D]")
+
+
 __all__ = [
     "AllQuotes",
     "AngleBracketObject",
@@ -176,4 +205,6 @@ __all__ = [
     "SingleQuotes",
     "Spoilers",
     "URLs",
+    "ZeroWidths",
+    "Emoji",
 ]
```
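`Emoji` delegates to the third-party `emoji` package and removes whole emoji sequences before tokenization, while `ZeroWidths` strips only the joiner characters. A minimal usage sketch, not from the diff; it assumes the `process` classmethod on `RegexPreprocessor` has the same signature as the one shown for `Emoji`:

```python
from sonatoki.Preprocessors import Emoji, ZeroWidths

msg = "toki 👩‍👩‍👧 pona"
print(Emoji.process(msg))       # emoji sequence removed entirely
print(ZeroWidths.process(msg))  # only the ZWJs removed; component emojis remain
```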
sonatoki/Tokenizers.py
CHANGED
```diff
@@ -10,7 +10,7 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT,
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -66,7 +66,7 @@ class WordTokenizer(SetTokenizer):
         last_membership = s[0] in cls.delimiters
         for i, char in enumerate(s):
             mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
+            ucsur = NimiUCSUR.filter(char)
             changed = (mem != last_membership) or ucsur
             # this keeps contiguous words together, but splits UCSUR
             if not changed:
@@ -94,7 +94,7 @@ class WordTokenizer(SetTokenizer):
     "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
 )
 class WordTokenizerRe(RegexTokenizer):
-    pattern = re.compile(rf"""([{
+    pattern = re.compile(rf"""([{ALL_PUNCT_RANGES_STR}]+|\s+)""")
 
 
 @deprecated(
```
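The tokenizer change is essentially the renamed constant (`ALL_PUNCT_RANGES` → `ALL_PUNCT_RANGES_STR`); tokenization behavior is unchanged. A minimal sketch of the class-level API, assuming `WordTokenizer.tokenize` remains available as in 0.4.x:

```python
from sonatoki.Tokenizers import WordTokenizer

# contiguous words stay together; punctuation runs become their own tokens
print(WordTokenizer.tokenize("mi pona, sina pona!"))
```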
sonatoki/__main__.py
CHANGED
```diff
@@ -1,9 +1,182 @@
 #!/bin/env python3
+# STL
+import os
+import json
+import argparse
+from typing import Any, Set, Dict, List
 
+# PDM
+import emoji
+import requests
 
-
-
+# LOCAL
+from sonatoki.utils import find_unicode_ranges
+from sonatoki.Filters import (
+    Or,
+    LongSyllabic,
+    NimiLinkuCore,
+    LongAlphabetic,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import ConsecutiveDuplicates
+from sonatoki.constants import (
+    UCSUR_PUNCT_RANGES,
+    UNICODE_PUNCT_RANGES,
+    EMOJI_VARIATION_SELECTOR_RANGES,
+)
+
+HERE = os.path.dirname(os.path.realpath(__file__))
+
+UNICODE_DATA = "https://unicode.org/Public/UNIDATA/UnicodeData.txt"
+
+LINKU_WORDS = "https://api.linku.la/v1/words?lang=en"
+LINKU_SANDBOX = "https://api.linku.la/v1/sandbox?lang=en"
+
+WORDS_10K = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
+WORDS_25K = "https://raw.githubusercontent.com/dolph/dictionary/master/popular.txt"
+WORDS_479K = (
+    "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt"
+)
+
+HEADERS = {  # pretend to be Chrome 121, just in case
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.3"
+}
+
+
+def download(url: str) -> str:
+    if not url.startswith("https://"):
+        raise ValueError(url)
+
+    resp = requests.get(url, timeout=5, headers=HEADERS)
+    return resp.text
+
+
+def download_json(url: str) -> Dict[str, Any]:
+    resp = download(url)
+    return json.loads(resp)
+
+
+def regen_linku_data():
+    data = download_json(LINKU_WORDS)
+    with open(os.path.join(HERE, "linku.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+    data = download_json(LINKU_SANDBOX)
+    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+        _ = f.write(json.dumps(data))
+
+
+def regen_false_negatives():
+    # TODO: regen from my frequency data where the score is below 0.8?
+    KnownWords = Or(
+        NimiLinkuCore,
+        NimiLinkuCommon,
+        NimiLinkuUncommon,
+        NimiLinkuObscure,
+    )
+
+    syllabic_matches: Set[str] = set()
+    alphabetic_matches: Set[str] = set()
+    data = download(WORDS_25K)
+    for word in data.splitlines():
+        if not word:
+            continue
+        word = ConsecutiveDuplicates.clean(word)
+
+        if KnownWords.filter(word):
+            # ignore dictionary
+            continue
+        if LongSyllabic.filter(word):
+            syllabic_matches.add(word)
+            continue
+        if LongAlphabetic.filter(word):
+            alphabetic_matches.add(word)
+            continue
+
+    # TODO: include short matches or no?
+    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+        syllabic_final = sorted([word + "\n" for word in syllabic_matches])
+        f.writelines(syllabic_final)
+
+    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+        alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
+        f.writelines(alphabetic_final)
+
+
+def regen_unicode_data():
+    PUNCT_CATEGORIES = {
+        # Punctuation
+        "Pc",  # Connector
+        "Pd",  # Dash
+        "Pe",  # Close (end)
+        "Pf",  # Final
+        "Pi",  # Initial
+        "Po",  # Other
+        "Ps",  # Open (sOpen)
+        # Symbol
+        "Sm",  # Math
+        "Sk",  # Modifier (kModifier)
+        "Sc",  # Currency
+        "So",  # Other
+    }
+    r"""These characters are in Symbol other (So) but are not in
+    `\p{Punctuation}` However, I began excluding them again, because it turns
+    out that some sequences of latin alphabet emoji."""
+
+    # NOTE: There are many characters which look like writing characters but are in the punctuation character class. Examples:
+    # - kangxi radicals from ⺀ to ⿕ which are for demonstration, not writing
+    # - parenthesized hangul letters and syllables from ㈀ to ㈜
+    # - circled katakana from ㋐ to ㋾
+    # the latter two shouldn't be in `\p{Punctuation}` if the latin alphabet isn't... oof
+
+    def is_punctuation(data: List[str]):
+        return data[2] in PUNCT_CATEGORIES
+
+    def get_character(data: List[str]):
+        return chr(int(data[0], 16))
+
+    unicode_data = download(UNICODE_DATA)
+    unicode_punctuation = ""
+    for line in unicode_data.split("\n"):
+        if not line:  # damn you, trailing newline
+            continue
+        # NOTE: UnicodeData.txt lists a range if there are many consecutive similar characters
+        # (e.g. CJK Ideograph, First at 4E00 and CJK Ideograph, Last at 9FFF).
+        # This does not apply to any currently defined punctuation category.
+
+        unicode_data = line.split(";")
+        if not is_punctuation(unicode_data):
+            continue
+
+        char = get_character(unicode_data)
+
+        unicode_punctuation += char
+
+    unicode_punctuation = emoji.replace_emoji(unicode_punctuation)
+
+    unicode_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_ranges.extend(UCSUR_PUNCT_RANGES)
+    # unicode_ranges.extend(EMOJI_VARIATION_SELECTOR_RANGES)  # made unnecessary by emoji library
+    unicode_ranges = sorted(unicode_ranges)
+    # sorted in case my manual additions are out of order
+
+    if unicode_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_ranges, indent=4, ensure_ascii=True)
+        print(output)
+
+
+def main(argv: argparse.Namespace):
+    regen_unicode_data()
+    regen_linku_data()
+    regen_false_negatives()
 
 
 if __name__ == "__main__":
-
+    """Helper script to fetch UNICODE_PUNCT in constants.py."""
+    parser = argparse.ArgumentParser()
+
+    # TODO: choice between regen unicode data, regen linku, regen english phonomatches
+    argv = parser.parse_args()
+    main(argv)
```
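The expanded `__main__.py` turns the package into a data-regeneration helper, which explains the new `alphabetic.txt` and `syllabic.txt` files in this release. A minimal sketch of calling the helpers directly, equivalent to running `python -m sonatoki`; network access is required and the data files are rewritten in place:

```python
# The three functions below are defined in the diff above; importing the
# __main__ module does not trigger main() because of the __name__ guard.
from sonatoki.__main__ import regen_unicode_data, regen_linku_data, regen_false_negatives

regen_unicode_data()     # prints updated UNICODE_PUNCT_RANGES as JSON if it changed
regen_linku_data()       # rewrites linku.json and sandbox.json next to the module
regen_false_negatives()  # rewrites syllabic.txt and alphabetic.txt
```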