sonatoki 0.1.3__tar.gz → 0.1.5__tar.gz
This diff shows the publicly available content of the two package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
- {sonatoki-0.1.3 → sonatoki-0.1.5}/PKG-INFO +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/pyproject.toml +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py +3 -3
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py +20 -7
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py +48 -6
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py +2 -14
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py +22 -7
- sonatoki-0.1.5/src/sonatoki/constants.py +83 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py +0 -12
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py +10 -11
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_ilo.py +0 -1
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py +40 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py +8 -6
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml +18 -0
- sonatoki-0.1.3/src/sonatoki/constants.py +0 -67
- {sonatoki-0.1.3 → sonatoki-0.1.5}/LICENSE +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/README.md +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_tokenize.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_utils.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Configs.py
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     NimiPuAle,
     Alphabetic,
     ProperName,
     Phonotactic,
+    Punctuation,
     NimiLinkuAle,
-    Punctuations,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numerics, Punctuations],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
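For context, a hedged sketch of how such a config is likely consumed: IloConfig is a TypedDict, so an Ilo is presumably built by passing these keys as keyword arguments; the constructor itself is not shown in this diff, and BaseConfig ships with an empty scoring_filters list, so a caller would supply their own scoring filters. The snippet below is illustrative, not library code.

# Hedged sketch: assumes Ilo accepts the IloConfig keys as keyword arguments.
from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinku, Syllabic, ProperName, Alphabetic

config = dict(BaseConfig)
config["scoring_filters"] = [NimiLinku, Syllabic, ProperName, Alphabetic]

ilo = Ilo(**config)                       # assumption: keyword-argument constructor
print(ilo.is_toki_pona("mi wile e ni"))   # returns a bool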
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Filters.py
@@ -1,10 +1,11 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import Set
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
 # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
     CONSONANTS,
     NIMI_PU_SET,
     ALPHABET_SET,
+    UNICODE_PUNCT,
     ALLOWABLES_SET,
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    PRUNED_POSIX_PUNCT,
     NIMI_LINKU_SANDBOX_SET,
 )
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
         return not not re.fullmatch(cls.pattern, token)
 
 
+class Regex1Filter(Filter):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        return not not regex.fullmatch(cls.pattern, token)
+
+
 class SetFilter(Filter):
     tokens: Set[str]
 
@@ -131,7 +144,7 @@ class Alphabetic(Filter):
         return set(token.lower()).issubset(ALPHABET_SET)
 
 
-class Numerics(Filter):
+class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.
 
@@ -147,8 +160,8 @@ class Numerics(Filter):
         return msg.isnumeric()
 
 
-class Punctuations(RegexFilter):
-    pattern = re.compile(
+class Punctuation(RegexFilter):
+    pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")
 
 
 __all__ = [
@@ -159,6 +172,6 @@ __all__ = [
     "Syllabic",
     "Alphabetic",
     "ProperName",
-    "Punctuations",
-    "Numerics",
+    "Punctuation",
+    "Numeric",
 ]
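A minimal sketch of what the renamed filters do, based on the definitions above: each filter is a classmethod over a single token returning a bool, Numeric checks str.isnumeric(), Punctuation full-matches the combined POSIX + Unicode punctuation character class, and Alphabetic checks membership in the toki pona alphabet. The printed results below follow from the shown code rather than captured output.

from sonatoki.Filters import Numeric, Punctuation, Alphabetic

print(Numeric.filter("123"))        # True: entirely numeric symbols
print(Numeric.filter("123a"))       # False
print(Punctuation.filter("...?!"))  # True: nothing but punctuation
print(Alphabetic.filter("toki"))    # True: only letters of the toki pona alphabet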
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
 """
 
 # STL
+import re
 from abc import ABC, abstractmethod
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
         return re.sub(cls.pattern, cls.replace, msg)
 
 
+class Regex1Preprocessor(Preprocessor):
+    pattern: "regex.Pattern[str]"
+    replace: str = " "
+
+    @classmethod
+    @override
+    def process(cls, msg: str) -> str:
+        return regex.sub(cls.pattern, cls.replace, msg)
+
+
 """
 The following classes are Ignorables.
 
@@ -62,6 +73,13 @@ class URLs(RegexPreprocessor):
     pattern = re.compile(r"https?:\/\/\S+")
 
 
+class Reference(RegexPreprocessor):
+    """Remove text contained in double brackets.
+    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    pattern = re.compile(r"\[\[.+\]\]")
+
+
 class DiscordEmotes(RegexPreprocessor):
     """Remove text-formatted Discord emotes `<flags:name:id>`"""
 
@@ -80,6 +98,13 @@ class DiscordSpecial(RegexPreprocessor):
     pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
 
 
+class AngleBracketObject(RegexPreprocessor):
+    """A generalized version of the Discord-specific angle bracket objects.
+    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    pattern = re.compile(r"<[^<>\s]+>")
+
+
 """
 The following classes are Containers.
 
@@ -92,23 +117,23 @@ would likely be using a language other than Toki Pona.
 
 
 class SingleQuotes(RegexPreprocessor):
-    pattern = re.compile(r"'[^']+'", flags=re.
+    pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
 
 
 class DoubleQuotes(RegexPreprocessor):
-    pattern = re.compile(r'"[^"]+"', flags=re.
+    pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
 
 
 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""
 
-    pattern = re.compile(r"`[^`]+`", flags=re.
+    pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
-    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.
+    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
 
 
 class ArrowQuote(RegexPreprocessor):
@@ -117,7 +142,22 @@ class ArrowQuote(RegexPreprocessor):
     pattern = re.compile(r"^>\ .+$", re.MULTILINE)
 
 
+class AllQuotes(RegexPreprocessor):
+    pattern = re.compile(
+        "|".join(
+            [
+                SingleQuotes.pattern.pattern,
+                DoubleQuotes.pattern.pattern,
+                Backticks.pattern.pattern,
+                ArrowQuote.pattern.pattern,
+            ]
+        ),
+        flags=re.MULTILINE | re.DOTALL,
+    )
+
+
 __all__ = [
+    "AngleBracketObject",
     "DiscordChannels",
     "DiscordMentions",
     "DiscordSpecial",
@@ -125,7 +165,9 @@ __all__ = [
     "SingleQuotes",
     "DoubleQuotes",
     "ArrowQuote",
+    "AllQuotes",
     "Backticks",
+    "Reference",
    "Spoilers",
     "URLs",
 ]
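A rough sketch of the new preprocessors in use, following the patterns above: each process call substitutes matched spans with the default replacement (a space), so quoted text, angle-bracket objects, and double-bracket references drop out before tokenization. The message and the commented result below are illustrative, not captured output.

from sonatoki.Preprocessors import AllQuotes, AngleBracketObject, Reference

msg = 'toki! "ni li toki pona ala" <a:wave:12345> [[Phatic Phrases]]'
msg = AllQuotes.process(msg)           # drops the double-quoted span
msg = AngleBracketObject.process(msg)  # drops the <...> object
msg = Reference.process(msg)           # drops the [[...]] reference
print(msg)  # roughly: 'toki!    ' (each removed span leaves a single space)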
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Scorers.py
@@ -10,8 +10,6 @@ from typing_extensions import override
 # LOCAL
 from sonatoki.Filters import Filter
 
-LOG = logging.getLogger(__name__)
-
 Number = Union[int, float]
 Weights = Dict[str, Number]
 
@@ -37,12 +35,7 @@ class PassFail(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
-                score = 1
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return 1
         return 0
 
     @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
     def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
         for i, f in enumerate(filters):
             if f.filter(token):
-                score = scale - i
-                LOG.debug(
-                    "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                )
-                return score
-        LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                return scale - i
        return 0
 
     @classmethod
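With the debug logging stripped out, the per-token logic reduces to "the first matching filter decides the token's value". A standalone sketch of the two shapes below; the function names and filter callables are illustrative, not the library's.

from typing import Callable, List

def passfail_token(token: str, filters: List[Callable[[str], bool]]) -> int:
    # 1 if any filter accepts the token, else 0 (mirrors PassFail.score_token)
    return next((1 for f in filters if f(token)), 0)

def scaling_token(token: str, filters: List[Callable[[str], bool]], scale: int) -> int:
    # earlier filters are worth more (mirrors Scaling.score_token's `scale - i`)
    return next((scale - i for i, f in enumerate(filters) if f(token)), 0)

checks = [str.isalpha, str.isnumeric]
print(scaling_token("toki", checks, scale=len(checks)))  # 2: matched the first check
print(scaling_token("42", checks, scale=len(checks)))    # 1: matched the second
print(passfail_token("?!", checks))                      # 0: matched nothing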
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py
@@ -1,11 +1,15 @@
 # STL
+import re
 from abc import ABC, abstractmethod
 from typing import List
 
 # PDM
-import regex as re
+import regex
 from typing_extensions import override
 
+# LOCAL
+from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
 try:
     # PDM
     import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
     nltk = e
 
 
-re.DEFAULT_VERSION = re.VERSION1
+regex.DEFAULT_VERSION = regex.VERSION1
 
 
 class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
         return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
 
 
+class Regex1Tokenizer(Tokenizer):
+    pattern: "regex.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [
+            clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+        ]
+
+
 class WordTokenizerTok(RegexTokenizer):
-    pattern = re.compile(
-    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-    # TODO: do the typography characters matter?
-    # NOTE: | / and , are *not* sentence delimiters for my purpose
+    pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")
 
 
 class SentTokenizerTok(RegexTokenizer):
-    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose
 
 
 class WordTokenizerRe(RegexTokenizer):
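The new SentTokenizerTok pattern splits at zero-width positions: right after sentence punctuation, or at end of line ($ with re.MULTILINE). A standalone sketch of that behavior, reproducing the "newline basic" test case added in the YAML further down; the commented output follows from the pattern shown above.

import re
from typing import List

SENT_DELIM = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)

def tokenize_sentences(s: str) -> List[str]:
    # requires Python 3.7+ for zero-width re.split and 3.8+ for the walrus operator
    return [clean for piece in re.split(SENT_DELIM, s) if (clean := piece.strip())]

print(tokenize_sentences("sina lon seme?\nmi wile lon poka...\n"))
# ['sina lon seme?', 'mi wile lon poka.', '.', '.']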
sonatoki-0.1.5/src/sonatoki/constants.py (new file, 83 lines)
# STL
import json
from typing import Dict, List
from pathlib import Path

LINKU = Path(__file__).resolve().parent / Path("linku.json")
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

VOWELS = "aeiou"
CONSONANTS = "jklmnpstw"
ALPHABET = VOWELS + CONSONANTS
ALPHABET_SET = set(ALPHABET)

LANGUAGE = "english"  # for NLTK

# `\p{posix_punct}` character class
POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
PRUNED_POSIX_PUNCT = r"""$+<=>^`|~"""  # only those that are not in UNICODE_PUNCT

# `\p{Punctuation}` character class
UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
# NOTE: This list diverges slightly from the raw list, since []\ must be escaped
# The [] need to be escaped to avoid prematurely closing the regex character class
# The \ needs to be escaped to be considered as a raw \

# https://www.compart.com/en/unicode/category
# https://unicode.org/Public/UNIDATA/UnicodeData.txt


"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
ALLOWABLES = {
    "cw",  # Content Warning
    "x",  # ala
    "y",  # anu
    "kxk",  # ken ala ken
    "wxw",  # wile ala wile
}


with open(LINKU) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
    NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
    NIMI_LINKU: List[str] = [
        d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
    ]
    NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

with open(SANDBOX) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]


NIMI_PU_SET = set(NIMI_PU)
NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
NIMI_LINKU_SET = set(NIMI_LINKU)
NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
ALLOWABLES_SET = set(ALLOWABLES)

__all__ = [
    "VOWELS",
    #
    "CONSONANTS",
    #
    "ALPHABET",
    "ALPHABET_SET",
    #
    "NIMI_PU",
    "NIMI_PU_SET",
    #
    "NIMI_PU_ALE",
    "NIMI_PU_ALE_SET",
    #
    "NIMI_LINKU",
    "NIMI_LINKU_SET",
    #
    "NIMI_LINKU_ALE",
    "NIMI_LINKU_ALE_SET",
    #
    "NIMI_LINKU_SANDBOX",
    "NIMI_LINKU_SANDBOX_SET",
]
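The relationship between the two punctuation constants can be checked mechanically: PRUNED_POSIX_PUNCT is just POSIX_PUNCT minus every character already present in UNICODE_PUNCT. A small sketch, using only the ASCII slice of UNICODE_PUNCT for brevity (the full string above is abbreviated here).

POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
UNICODE_PUNCT_ASCII = """!"#%&'()*,-./:;?@[\\]_{}"""  # leading ASCII characters only

pruned = "".join(c for c in POSIX_PUNCT if c not in UNICODE_PUNCT_ASCII)
print(pruned)  # $+<=>^`|~  -- matches PRUNED_POSIX_PUNCT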
{sonatoki-0.1.3 → sonatoki-0.1.5}/src/sonatoki/ilo.py
@@ -1,5 +1,4 @@
 # STL
-import logging
 from typing import List, Type, Tuple
 
 # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-LOG = logging.getLogger(__name__)
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -20,7 +17,6 @@ class Ilo:
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
     __passing_score: Number
-    logging_threshold: Number = -1
 
     def __init__(
         self,
@@ -104,14 +100,6 @@ class Ilo:
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score
 
-        if score <= self.logging_threshold:
-            LOG.debug("msg: %.2f %s", score, repr(message))
-            LOG.debug("preproc: %s", repr(preprocessed))
-            LOG.debug("tokenized: %s", tokenized)
-            LOG.debug("filtered: %s", filtered)
-            LOG.debug("cleaned: %s", cleaned)
-            # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
         return preprocessed, tokenized, filtered, cleaned, score, result
 
     def is_toki_pona(self, message: str) -> bool:
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_filters.py
@@ -9,13 +9,13 @@ from hypothesis import HealthCheck, given, assume, example, settings
 # LOCAL
 from sonatoki.Filters import (
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
 from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import NIMI_PU, NIMI_LINKU
@@ -82,17 +82,16 @@ def test_ProperName(s: str):
     assert res, repr(s)
 
 
-
-
-@
+@given(st.from_regex(Punctuation.pattern.pattern, fullmatch=True))
+@example("[]")
+@example(r"\\")
+@example(r"\"")
 @example("⟨·⟩")
 @example("…")
-@example("
+@example("「」")  # ` `
 @example(string.punctuation)
-
-def test_Punctuations(s: str):
-    _ = assume(re.fullmatch(Punctuations.pattern.pattern, s))
-    res = Punctuations.filter(s)
+def test_Punctuation(s: str):
+    res = Punctuation.filter(s)
     assert res, repr(s)
 
 
@@ -100,5 +99,5 @@ def test_Punctuations(s: str):
 @example("124125")
 @example("99990000")
 def test_Numeric(s: str):
-    res = Numerics.filter(s)
+    res = Numeric.filter(s)
     assert res, repr(s)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_preprocessors.py
@@ -6,7 +6,9 @@ from hypothesis import given, example
 from sonatoki.Preprocessors import (
     URLs,
     Spoilers,
+    AllQuotes,
     Backticks,
+    Reference,
     ArrowQuote,
     DoubleQuotes,
     SingleQuotes,
@@ -14,6 +16,7 @@ from sonatoki.Preprocessors import (
     DiscordSpecial,
     DiscordChannels,
     DiscordMentions,
+    AngleBracketObject,
 )
 
 
@@ -101,3 +104,40 @@ def test_DiscordChannels(s: str):
 def test_DiscordSpecial(s: str):
     res = DiscordSpecial.process(s).strip()
     assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
+    | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
+    | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
+)
+@example("<https://example.com>")
+@example("<#123124125125>")
+def test_AngleBracketObject(s: str):
+    res = AngleBracketObject.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(
+    st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
+    | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
+    | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
+    | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
+)
+@example("> bruh")
+@example("`bruh`")
+def test_AllQuotes(s: str):
+    res = AllQuotes.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
+@given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
+@example("[[Brainstorm]]")
+@example("[[Phatic Phrases]]")
+@example("[[Yahoo!]]")
+def test_Reference(s: str):
+    res = Reference.process(s).strip()
+    assert res == "", (repr(s), repr(res))
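These property tests lean on Hypothesis' st.from_regex(..., fullmatch=True), which only generates strings that the entire pattern matches, so every generated input should be wiped to whitespace by its preprocessor. A tiny self-contained sketch of the same testing pattern, using an illustrative pattern and function name rather than library code:

import re
from hypothesis import given, strategies as st

BRACKETS = re.compile(r"\[\[.+\]\]")

@given(st.from_regex(BRACKETS.pattern, fullmatch=True))
def check_brackets_removed(s: str):
    # every generated string fully matches, so substitution leaves only whitespace
    assert BRACKETS.sub(" ", s).strip() == ""

check_brackets_removed()  # calling a @given-decorated function runs the property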
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/test_scorers.py
@@ -4,38 +4,39 @@ from typing import List, Type
 # PDM
 import pytest
 import hypothesis.strategies as st
-from hypothesis import given
+from hypothesis import given, example
 
 # LOCAL
 from sonatoki.Filters import (
     Filter,
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 )
-from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling
+from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
 
 # FILESYSTEM
 from .test_utils import token_strategy
 
 FILTERS = [
     NimiPu,
-    Numerics,
+    Numeric,
     Syllabic,
     NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
-    Punctuations,
+    Punctuation,
 ]
 
 SCORERS = [
     PassFail,
+    SoftPassFail,
     Scaling,
     SoftScaling,
 ]
@@ -46,6 +47,7 @@ SCORERS = [
     st.lists(st.sampled_from(FILTERS), min_size=1, unique=True),
     st.lists(token_strategy, min_size=0, max_size=10),
 )
+@example(st.sampled_from(FILTERS), [])
 def test_score_bounds(scorer: Scorer, filters: List[Type[Filter]], text: List[str]):
     score = scorer.score(text, filters)
     assert 0 <= score <= 1, (score, filters, text)
{sonatoki-0.1.3 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -19,6 +19,24 @@
   output:
     - "mi mu."
     - "mi wawa."
+- name: "empty"
+  input: ""
+  output: []
+- name: "whitespace"
+  input: " \n "
+  output: []
+- name: "newline basic"
+  input: "sina lon seme?\nmi wile lon poka...\n"
+  output:
+    - "sina lon seme?"
+    - "mi wile lon poka."
+    - "."
+    - "."
+- name: "newline alone"
+  input: "sina lon seme\nmi wile lon poka"
+  output:
+    - "sina lon seme"
+    - "mi wile lon poka"
 - name: "dash"
   input: "mi sona ala e ni- sina seme a"
   output:
sonatoki-0.1.3/src/sonatoki/constants.py (removed; superseded by the new constants.py above)
# STL
import json
from typing import Dict, List
from pathlib import Path

LINKU = Path(__file__).resolve().parent / Path("linku.json")
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

VOWELS = "aeiou"
CONSONANTS = "jklmnpstw"
ALPHABET = VOWELS + CONSONANTS
ALPHABET_SET = set(ALPHABET)

"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
ALLOWABLES = {
    "cw",  # Content Warning
    "x",  # ala
    "y",  # anu
    "kxk",  # ken ala ken
    "wxw",  # wile ala wile
}


with open(LINKU) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
    NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
    NIMI_LINKU: List[str] = [
        d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
    ]
    NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

with open(SANDBOX) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]


NIMI_PU_SET = set(NIMI_PU)
NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
NIMI_LINKU_SET = set(NIMI_LINKU)
NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
ALLOWABLES_SET = set(ALLOWABLES)

__all__ = [
    "VOWELS",
    #
    "CONSONANTS",
    #
    "ALPHABET",
    "ALPHABET_SET",
    #
    "NIMI_PU",
    "NIMI_PU_SET",
    #
    "NIMI_PU_ALE",
    "NIMI_PU_ALE_SET",
    #
    "NIMI_LINKU",
    "NIMI_LINKU_SET",
    #
    "NIMI_LINKU_ALE",
    "NIMI_LINKU_ALE_SET",
    #
    "NIMI_LINKU_SANDBOX",
    "NIMI_LINKU_SANDBOX_SET",
]
All remaining files listed above are unchanged between 0.1.3 and 0.1.5.