sonatoki 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.6.2 → sonatoki-0.7.0}/PKG-INFO +1 -1
  2. {sonatoki-0.6.2 → sonatoki-0.7.0}/pyproject.toml +1 -1
  3. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Configs.py +40 -73
  4. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Filters.py +50 -34
  5. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Preprocessors.py +9 -0
  6. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Scorers.py +61 -6
  7. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/constants.py +38 -27
  8. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/ilo.py +34 -27
  9. sonatoki-0.7.0/src/sonatoki/types.py +60 -0
  10. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_filters.py +62 -25
  11. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_ilo.py +50 -4
  12. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_preprocessors.py +20 -0
  13. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_properties.py +12 -22
  14. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_utils.py +2 -5
  15. {sonatoki-0.6.2 → sonatoki-0.7.0}/LICENSE +0 -0
  16. {sonatoki-0.6.2 → sonatoki-0.7.0}/README.md +0 -0
  17. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Cleaners.py +0 -0
  18. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Tokenizers.py +0 -0
  19. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/alphabetic.txt +0 -0
  22. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/linku.json +0 -0
  23. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/py.typed +0 -0
  24. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/sandbox.json +0 -0
  25. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/syllabic.txt +0 -0
  26. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/__init__.py +0 -0
  28. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.6.2
+ Version: 0.7.0
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.6.2"
+ version = "0.7.0"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -1,49 +1,74 @@
  # STL
- from copy import deepcopy
- from typing import Set, List, Type, TypedDict, cast
+ from typing import List, Type, TypedDict

  # PDM
  from typing_extensions import NotRequired

  # LOCAL
+ from sonatoki.types import Number
  from sonatoki.Filters import (
  Or,
  And,
  Not,
  Filter,
  Numeric,
- Syllabic,
  NimiUCSUR,
  Alphabetic,
  NimiKuLili,
  NimiKuSuli,
  ProperName,
- Phonotactic,
  Punctuation,
  LongSyllabic,
  Miscellaneous,
- NimiLinkuCore,
  LongAlphabetic,
  LongProperName,
- NimiLinkuCommon,
  FalsePosSyllabic,
+ NimiLinkuByUsage,
  NimiLinkuObscure,
  NimiLinkuSandbox,
  NimiLinkuUncommon,
  FalsePosAlphabetic,
  )
- from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
  from sonatoki.Preprocessors import (
  URLs,
  Emoji,
- Backticks,
+ Codeblock,
  Reference,
  Preprocessor,
  AngleBracketObject,
  )

+ __DICT_PHONOMATCHES = {
+ # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+ # In this case, all of these appear more often in English by a factor of at least 10.
+ "aka", # also known as
+ "an", # article
+ "api", # API
+ "i", # 1st person
+ "kana", # japanese script
+ "me", # 1st person singular, english
+ "ne", # "no" in several languages
+ "nu", # "new" in english, "now" in dutch
+ "se", # spanish particle, english "see"
+ "take", # acquire, perhaps forcefully or without permission
+ "ten", # 10
+ "to", # to, too
+ "je", # 1st person pronoun, french
+ "u", # no u
+ "we", # 1st person plural, english
+ "wi", # wii and discussions of syllables
+ "sole", # singular, of shoe
+ # unexplored candidates for removal
+ # "omen", # ominous
+ # "papa", # father
+ # "lo", # "lo" and "loo"
+ # "ewe", # sheep
+ # "pa", # father- eh?
+ }
+

  class IloConfig(TypedDict):
  preprocessors: List[Type[Preprocessor]]
@@ -69,11 +94,11 @@ BaseConfig: IloConfig = {


  PrefConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
+ Or(NimiLinkuByUsage(30), NimiUCSUR),
  And(LongSyllabic, Not(FalsePosSyllabic)),
  # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
  LongProperName,
@@ -84,16 +109,13 @@ PrefConfig: IloConfig = {
  }

  CorpusConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
  Or(
- NimiLinkuCore,
- NimiLinkuCommon,
- NimiLinkuUncommon,
- NimiLinkuObscure,
- NimiLinkuSandbox,
+ # awkward but efficient syntax
+ NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
  NimiUCSUR,
  Miscellaneous,
  ),
@@ -104,43 +126,9 @@ CorpusConfig: IloConfig = {
  "scorer": SoftScaling,
  "passing_score": 0.8,
  }
-
- # TODO: create a mechanism to omit tokens from a filter with more granularity
- __corpus_tokens_dict: Set[str] = cast(
- Set[str],
- CorpusConfig["scoring_filters"][
- 0
- ].tokens, # pyright: ignore[reportAttributeAccessIssue]
- )
- __corpus_tokens_dict -= {
- # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
- # In this case, all of these appear more often in English by a factor of at least 10.
- "aka", # also known as
- "an", # article
- "api", # API
- "i", # 1st person
- "kana", # japanese script
- "me", # 1st person
- "ne", # "no" in several languages
- "nu", # "new", now in dutch
- "se", # spanish particle, "see"
- "take", # acquire, perhaps forcefully or without permission
- "ten", # 10
- "to", # to, too
- "u", # no u
- "we", # 1st person plural
- "wi", # wii and discussions of syllables
- "sole", # singular, of shoe
- # unexplored candidates for removal
- # "omen", # ominous
- # "papa", # father
- # "lo", # "lo" and "loo"
- # "ewe", # sheep
- # "pa", # father- eh?
- }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -150,7 +138,7 @@ LazyConfig: IloConfig = {
  }
  """This is extremely silly."""
  IsipinEpikuConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
@@ -170,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
  }


- DiscordConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
- "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numeric, Punctuation],
- "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
- And(LongSyllabic, Not(FalsePosSyllabic)),
- LongProperName,
- And(LongAlphabetic, Not(FalsePosAlphabetic)),
- ],
- "scorer": SoftScaling,
- "passing_score": 0.8,
- }
-
- TelegramConfig: IloConfig = deepcopy(PrefConfig)
- ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
  __all__ = [
  "BaseConfig",
  "CorpusConfig",
- "DiscordConfig",
- "ForumConfig",
  "IloConfig",
  "LazyConfig",
  "PrefConfig",
- "TelegramConfig",
  ]
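
For readers of the CorpusConfig change above, a minimal sketch of how the chained call decomposes (names come from this diff; the subtracted words are illustrative stand-ins for __DICT_PHONOMATCHES):

    from sonatoki.Filters import NimiLinkuByUsage

    # First call: build a MemberFilter subclass from Linku usage data.
    # usage=0 keeps every dictionary and sandbox word.
    EveryLinkuWord = NimiLinkuByUsage(0)
    # Second call: MemberFilter.__new__ derives a further filter with tokens
    # removed, which is what CorpusConfig does with __DICT_PHONOMATCHES.
    WithoutFalseFriends = EveryLinkuWord(sub={"aka", "an", "api"})
    assert not WithoutFalseFriends.filter("aka")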
@@ -1,37 +1,33 @@
  # STL
  import re
  from abc import ABC, abstractmethod
- from typing import Set, List, Type
+ from copy import deepcopy
+ from typing import Set, List, Type, Union, Literal, Optional
  from functools import lru_cache as cache # cache comes in 3.9

  # PDM
  import regex
- from typing_extensions import override, deprecated
+ from typing_extensions import override

  # LOCAL
+ from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
  from sonatoki.utils import prep_dictionary
  from sonatoki.constants import (
  VOWELS,
- NIMI_PU,
  ALPHABET,
  ALL_PUNCT,
  ALLOWABLES,
  CONSONANTS,
  NIMI_UCSUR,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
  NOT_IN_PUNCT_CLASS,
- NIMI_LINKU_UNCOMMON,
  ALL_PUNCT_RANGES_STR,
  FALSE_POS_ALPHABETIC,
  UCSUR_PUNCT_RANGES_STR,
  EMOJI_VARIATION_SELECTOR_RANGES_STR,
+ words_by_tag,
+ words_by_usage,
  )

  regex.DEFAULT_VERSION = regex.VERSION1
@@ -101,6 +97,20 @@ class MemberFilter(Filter):
  def filter(cls, token: str) -> bool:
  return token.lower() in cls.tokens

+ def __new__(
+ cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+ ) -> Type[Filter]:
+ parent_tokens = deepcopy(cls.tokens)
+ if add:
+ parent_tokens = parent_tokens.union(add)
+ if sub:
+ parent_tokens -= sub
+
+ class AnonMemberFilter(MemberFilter):
+ tokens = parent_tokens
+
+ return AnonMemberFilter
+

  class SubsetFilter(Filter):
  tokens: Set[str]
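
The `__new__` override above is what lets a MemberFilter be "called" to derive a modified copy. A short sketch of the pattern the new tests exercise (kijetesantakalu is a ku suli word, used here only as an illustration):

    from sonatoki.Filters import NimiPu, NimiKuSuli

    # A new anonymous MemberFilter subclass; neither parent is mutated.
    PuEnKuSuli = NimiPu(add=NimiKuSuli.tokens)
    assert PuEnKuSuli.filter("kijetesantakalu")
    assert not NimiPu.filter("kijetesantakalu")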
@@ -155,40 +165,46 @@ class LongProperName(MinLen, ProperName):
  length = 2 # reject "names" of length 1


- class NimiPu(MemberFilter):
- tokens = prep_dictionary(NIMI_PU)
-
-
- class NimiPuSynonyms(MemberFilter):
- tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+ class NimiLinkuByUsage:
+ def __new__(
+ cls,
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Type[MemberFilter]:
+ words = words_by_usage(usage, date)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiKuSuli(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_SULI)
+ return AnonLinkuMemberFilter


- class NimiKuLili(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_LILI)
+ class NimiLinkuByTag:
+ def __new__(
+ cls,
+ tag: Union[Literal["usage_category"], Literal["book"]],
+ category: Union[LinkuUsageCategory, LinkuBooks],
+ ) -> Type[MemberFilter]:
+ words = words_by_tag(tag, category)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiLinkuCore(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_CORE)
+ return AnonLinkuMemberFilter


- class NimiLinkuCommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_COMMON)
+ NimiPu = NimiLinkuByTag("book", "pu")
+ NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+ NimiKuLili = NimiLinkuByTag("book", "ku lili")
+ NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+ NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+ NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+ NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+ NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


- class NimiLinkuUncommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
-
-
- class NimiLinkuObscure(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
-
-
- class NimiLinkuSandbox(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+ class NimiPuSynonyms(MemberFilter):
+ tokens = prep_dictionary(NIMI_PU_SYNONYMS)


  class NimiUCSUR(MemberFilter):
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
  pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


+ class Codeblock(RegexPreprocessor):
+ """Remove codeblocks marked by a set of three backticks on their own lines.
+
+ Subset of what would be removed by Backticks, but may be preferable.
+ """
+
+ pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
  class Spoilers(RegexPreprocessor):
  """Remove paired double bars and their contents `||like this||`"""

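A rough sketch of how the new Codeblock preprocessor differs from Backticks (the message is made up):

    from sonatoki.Preprocessors import Backticks, Codeblock

    msg = "o lukin e ni:\n```\nprint('test')\n```\nni li `pona`"
    # Codeblock only strips triple-backtick fences whose opening ``` is
    # immediately followed by a newline, so the inline `pona` span survives.
    print(Codeblock.process(msg))
    # Backticks strips any paired backticks, inline spans included.
    print(Backticks.process(msg))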
@@ -1,17 +1,15 @@
  # STL
  import math
  from abc import ABC, abstractmethod
- from typing import Dict, List, Type, Union
+ from typing import List, Type

  # PDM
  from typing_extensions import override

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter

- Number = Union[int, float]
- Weights = Dict[str, Number]
-

  class Scorer(ABC):
  @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
  scoring."""


- # class Logarithmic(Scorer): ...
+ class SentenceScorer(ABC):
+ @classmethod
+ @abstractmethod
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ """Re-score a list of sentences (scorecards, sentences with all their
+ metadata) and return them."""
+ raise NotImplementedError
+
+
+ class SentNoOp(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return scorecards


- __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
+ class SentAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ total = sum(card["score"] for card in scorecards)
+ avg = total / len(scorecards)
+ for card in scorecards:
+ card["score"] = avg
+ return scorecards
+
+
+ class SentWeightedAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ weighted_total = 0
+ total_len = 0
+ for card in scorecards:
+ cardlen = len(card["cleaned"])
+ cardscore = card["score"]
+
+ weighted_total += cardlen * cardscore
+ total_len += cardlen
+
+ weighted_avg = weighted_total / total_len
+ for card in scorecards:
+ card["score"] = weighted_avg
+ return scorecards
+
+
+ __all__ = [
+ "PassFail",
+ "Scaling",
+ "SoftPassFail",
+ "SoftScaling",
+ "Soften",
+ "SentAvg",
+ "SentWeightedAvg",
+ ]
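
A minimal sketch of what the new sentence scorers do to a list of scorecards (the dicts below carry only the fields the scorers read, with made-up values):

    from sonatoki.Scorers import SentAvg, SentWeightedAvg

    cards = [
        {"text": "toki", "tokenized": [], "filtered": [], "cleaned": ["toki"], "score": 1.0},
        {"text": "thing", "tokenized": [], "filtered": [], "cleaned": ["thing"], "score": 0.0},
    ]
    # SentAvg overwrites every score with the plain mean (0.5 here);
    # SentWeightedAvg weights each score by the number of cleaned tokens first.
    print(SentAvg.score(cards)[0]["score"])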
@@ -1,11 +1,16 @@
  # STL
  import json
- from typing import Set, Dict
+ from typing import Set, Dict, Optional
  from pathlib import Path

  # LOCAL
+ from sonatoki.types import LinkuWord, LinkuUsageDate
  from sonatoki.utils import find_unicode_chars, find_unicode_ranges

+ LATEST_DATE = "2023-09"
+ # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
  # `\p{Punctuation}` character class
  # https://www.compart.com/en/unicode/category
  # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
  "iluminate",
  "imense",
  "imitate",
+ "inanimate",
  "injoke",
  "insane",
  "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\U000F1978-\U000F197A"]


- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
- return {d["word"] for d in data.values() if d[key] == value}
+ def linku_data() -> Dict[str, LinkuWord]:
+ # NOTE: this does open+read+parse two files each time you construct a filter
+ # but i expect users to construct filters only at the start of runtime
+ # there is no reason to waste your RAM by leaving the linku data in it
+ with open(LINKU) as f:
+ linku: Dict[str, LinkuWord] = json.loads(f.read())
+ with open(SANDBOX) as f:
+ sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+ return {**linku, **sandbox}
+

+ def words_by_tag(tag: str, value: str) -> Set[str]:
+ data = linku_data()
+ return {d["word"] for d in data.values() if d[tag] == value}

- with open(LINKU) as f:
- linku: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_PU = category_helper(linku, "book", "pu")
- NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
- NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+ def words_by_usage(
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Set[str]:
+ if not date:
+ date = LATEST_DATE
+ data = linku_data()

- NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
- NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
- NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
- NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+ result: Set[str] = set()
+ for word in data.values():
+ usages = word["usage"]
+ if date in usages and usages[date] >= usage:
+ result.add(word["word"])
+
+ return result
+
+
+ NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- with open(SANDBOX) as f:
- sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

  # with open(SYLLABICS) as f:
  # FALSE_POS_SYLLABIC = {line.strip() for line in f}
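
A short sketch of how the new helpers are meant to be called (the printed lengths are whatever the bundled Linku data yields, not fixed numbers):

    from sonatoki.constants import words_by_tag, words_by_usage

    pu_words = words_by_tag("book", "pu")    # what the old NIMI_PU constant held
    common = words_by_usage(30)              # usage >= 30 at LATEST_DATE
    older = words_by_usage(30, "2022-08")    # or at an explicit survey date
    print(len(pu_words), len(common), len(older))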
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
  # with open(ALPHABETICS) as f:
  # FALSE_POS_ALPHABETIC = {line.strip() for line in f}

- del linku
- del sandbox
-
  __all__ = [
  "ALLOWABLES",
  "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
  "CONSONANTS",
  "EMOJI_VARIATION_SELECTOR_RANGES",
  "EMOJI_VARIATION_SELECTOR_RANGES_STR",
- "NIMI_KU_LILI",
- "NIMI_KU_SULI",
- "NIMI_LINKU_COMMON",
- "NIMI_LINKU_CORE",
- "NIMI_LINKU_OBSCURE",
- "NIMI_LINKU_SANDBOX",
- "NIMI_LINKU_UNCOMMON",
- "NIMI_PU",
  "NIMI_PU_SYNONYMS",
  "POSIX_PUNCT",
  "POSIX_PUNCT_RANGES",
@@ -1,17 +1,14 @@
  # STL
- from typing import List, Type, Tuple
+ from typing import List, Type

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter
- from sonatoki.Scorers import Number, Scorer
+ from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
  from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
  from sonatoki.Preprocessors import Preprocessor

- # tokenized, filtered, cleaned, score, result
- Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
- # TODO: scorecard kinda sucks as a name
-

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
  __ignoring_filters: List[Type[Filter]]
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
+ __sentence_scorer: Type[SentenceScorer]
  __passing_score: Number

  def __init__(
@@ -31,6 +29,7 @@ class Ilo:
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
  passing_score: Number,
+ sentence_scorer: Type[SentenceScorer] = SentNoOp,
  word_tokenizer: Type[Tokenizer] = WordTokenizer,
  sent_tokenizer: Type[Tokenizer] = SentTokenizer,
  ):
@@ -43,6 +42,7 @@ class Ilo:
  self.__ignoring_filters = [*ignoring_filters]
  self.__scoring_filters = [*scoring_filters]
  self.__scorer = scorer
+ self.__sentence_scorer = sentence_scorer
  self.__passing_score = passing_score

  def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@ class Ilo:
  return self.__word_tokenizer.tokenize(msg)

  def sent_tokenize(self, msg: str) -> List[str]:
+ """It is *highly* recommended that you run `ilo.preprocess` first."""
  return self.__sent_tokenizer.tokenize(msg)

  def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@ class Ilo:
  def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

+ def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return self.__sentence_scorer.score(scorecards)
+
  def _is_toki_pona(self, message: str) -> Scorecard:
  """Process a message into its tokens, then filters, cleans, and scores
- them. Returns all parts. Message must already be preprocessed, normally
- done in `self.is_toki_pona(message)`.
-
- Returns all components of the processing algorithm except preprocessing:
- - Tokenized message (list[str])
- - Filtered message (list[str])
- - Cleaned message (list[str])
- - Score (float)
- - Result (bool)
+ them. Message must already be preprocessed, normally done in
+ `self.is_toki_pona(message)`.
+
+ Returns a `Scorecard` with all changes to the input text and a score.
  """
  tokenized = self.word_tokenize(message)
  filtered = self.filter_tokens(tokenized)
  cleaned = self.clean_tokens(filtered)
  score = self.score_tokens(cleaned)
- result = score >= self.__passing_score

- return tokenized, filtered, cleaned, score, result
+ scorecard: Scorecard = {
+ "text": message,
+ "tokenized": tokenized,
+ "filtered": filtered,
+ "cleaned": cleaned,
+ "score": score,
+ }
+
+ return scorecard

  def is_toki_pona(self, message: str) -> bool:
- """Determines whether a single statement is or is not Toki Pona."""
+ """Determines whether a text is or is not Toki Pona."""
  message = self.preprocess(message)
- *_, result = self._is_toki_pona(message)
- return result
+ scorecard = self._is_toki_pona(message)
+ return scorecard["score"] >= self.__passing_score

  def _are_toki_pona(self, message: str) -> List[Scorecard]:
- """Split a message into sentences, then return a list each sentence's
- results via `self._is_toki_pona()`.
+ """Split a message into sentences, then return a list with each
+ sentence's scorecard from `self._is_toki_pona()`.

  Message must already be preprocessed, normally done in
  `self.are_toki_pona(message)`.
  """
- results: List[Scorecard] = list()
+ scorecards: List[Scorecard] = list()
  for sentence in self.sent_tokenize(message):
  result = self._is_toki_pona(sentence)
- results.append(result)
- return results
+ scorecards.append(result)
+ scorecards = self.score_sentences(scorecards)
+ return scorecards

  def are_toki_pona(self, message: str) -> List[bool]:
  """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
  ```
  """
  message = self.preprocess(message)
- results = self._are_toki_pona(message)
- return [res[-1] for res in results]
+ scorecards = self._are_toki_pona(message)
+ return [card["score"] >= self.__passing_score for card in scorecards]
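
Taken together, a hedged end-to-end sketch of the new flow; `Ilo(**PrefConfig)` is assumed to be the usual way to build an ilo from an IloConfig, and the sentence scorer is optional (it defaults to SentNoOp):

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig
    from sonatoki.Scorers import SentAvg

    ilo = Ilo(**PrefConfig, sentence_scorer=SentAvg)  # assumed constructor usage
    print(ilo.is_toki_pona("mi olin e sina"))                     # bool for the whole text
    print(ilo.are_toki_pona("mi olin e sina. this is english."))  # one bool per sentence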
@@ -0,0 +1,60 @@
+ # STL
+ from typing import Dict, List, Union, Literal, TypedDict
+
+ Number = Union[int, float]
+
+
+ # TODO: scorecard kinda sucks as a name
+ class Scorecard(TypedDict):
+ text: str
+ tokenized: List[str]
+ filtered: List[str]
+ cleaned: List[str]
+ score: Number
+
+
+ LinkuUsageDate = Union[
+ Literal["2020-04"],
+ Literal["2021-10"],
+ Literal["2022-08"],
+ Literal["2023-09"],
+ # Literal["2024-09"],
+ ]
+
+ LinkuUsageCategory = Union[
+ Literal["core"],
+ Literal["common"],
+ Literal["uncommon"],
+ Literal["obscure"],
+ Literal["sandbox"],
+ ]
+
+ LinkuBooks = Union[
+ Literal["pu"],
+ Literal["ku suli"],
+ Literal["ku lili"],
+ Literal["none"],
+ ]
+
+
+ class LinkuWord(TypedDict):
+ id: str
+ author_verbatim: str
+ author_verbatim_source: str
+ book: str
+ coined_era: str
+ coined_year: str
+ creator: List[str]
+ ku_data: Dict[str, int]
+ see_also: List[str]
+ resources: Dict[str, str]
+ representations: Dict[str, Union[str, List[str]]]
+ source_language: str
+ usage_category: LinkuUsageCategory
+ word: str
+ deprecated: bool
+ etymology: List[Dict[str, str]]
+ audio: List[Dict[str, str]]
+ pu_verbatim: Dict[str, str]
+ usage: Dict[LinkuUsageDate, int]
+ translations: Dict[str, Dict[str, str]]
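
For orientation, a hand-written example of the Scorecard shape that `Ilo._is_toki_pona` now returns (values are illustrative, not real output):

    from sonatoki.types import Scorecard

    card: Scorecard = {
        "text": "ni li pona",
        "tokenized": ["ni", "li", "pona"],
        "filtered": ["ni", "li", "pona"],
        "cleaned": ["ni", "li", "pona"],
        "score": 1.0,
    }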
@@ -34,23 +34,13 @@ from sonatoki.Filters import (
  NimiLinkuUncommon,
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
- from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
- NIMI_LINKU_COMMON,
- FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
- )
+ from sonatoki.constants import FALSE_POS_SYLLABIC, words_by_tag

  # FILESYSTEM
  from .test_utils import PROPER_NAME_RE


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  @example("lukin")
  @example("selo")
  @example("li")
@@ -59,14 +49,14 @@ def test_NimiPu(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_CORE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "core"))))
  @example("pona")
  def test_NimiLinkuCore(s: str):
  res = NimiLinkuCore.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_COMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "common"))))
  @example("n")
  @example("tonsi")
  @example("kipisi")
@@ -75,19 +65,21 @@ def test_NimiLinkuCommon(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "uncommon"))))
  def test_NimiLinkuUncommon(s: str):
  res = NimiLinkuUncommon.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "obscure"))))
+ @example("pake")
+ @example("san")
  def test_NimiLinkuObscure(s: str):
  res = NimiLinkuObscure.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "sandbox"))))
  @example("kalamARR")
  @example("Pingo")
  def test_NimiLinkuSandbox(s: str):
@@ -207,7 +199,11 @@ def test_OrFilter(s: str):
  # NOTE: No subset filter test because A | B is not the same as A combined with B.
  # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
  # but would incorrectly pass a combined filter.
- @given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "pu") | words_by_tag("usage_category", "obscure"))
+ )
+ )
  def test_MemberFilters_OrFilter(s: str):
  filter = Or(NimiPu, NimiLinkuObscure)
  assert issubclass(filter, MemberFilter)
@@ -221,11 +217,11 @@ def test_MemberFilters_OrFilter(s: str):
  @given(
  st.sampled_from(
  list(
- NIMI_KU_SULI
- | NIMI_KU_LILI
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
+ words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
  ),
  )
  )
@@ -248,14 +244,14 @@ def test_OrFilter_IsipinEpiku(s: str):
  )


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_AndFilter(s: str):
  s = s.capitalize()
  f = And(ProperName, NimiPu)
  assert f.filter(s)


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_NotFilter(s: str):
  f = Not(NimiPu)
  assert not f.filter(s)
@@ -280,3 +276,44 @@ def test_AndNotFilter(s: str):
  if res_fp:
  # syl matched- but if fp matches, then the composed filter should not match
  assert not res_composed
+
+
+ @given(
+ st.sampled_from(list(words_by_tag("book", "pu") | words_by_tag("book", "ku suli")))
+ )
+ def test_AddTokensToMemberFilter(s: str):
+ PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+ assert PuEnKuSuliFilter.filter(s)
+
+
+ @given(
+ st.sampled_from(
+ list(
+ words_by_tag("usage_category", "sandbox") | words_by_tag("book", "ku lili")
+ )
+ )
+ )
+ def test_AddTokensToMemberFilterNegative(s: str):
+ PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+ assert not PuEnKuSuliFilter.filter(s)
+
+
+ @given(
+ st.sampled_from(
+ list(
+ words_by_tag("book", "pu")
+ | words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
+ ),
+ )
+ | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+ )
+ def test_SubTokensFromMemberFilter(s: str):
+ NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
+ # core is a strict subset of pu
+ # if kin becomes core, needs to be corrected
+
+ assert not NimiAlaFilter.filter(s)
@@ -1,3 +1,6 @@
+ # STL
+ from typing import List, Tuple
+
  # PDM
  import pytest

@@ -35,6 +38,10 @@ ALL_VALID = [
  "󱥄󱥬󱥩󱤴", # "o toki tawa mi" in UCSUR
  "󱤴󱤧󱤑󱥍󱦗󱤖󱥡󱦘󱤬󱥭‍󱥡󱥚",
  "󱤑󱦐󱥗󱦜󱦈󱦜󱥉󱦜󱦑󱥄󱤤󱤂󱤉󱥆󱤀",
+ "o lukin, 󱤴󱥬󱥩󱤴󱤧wawa",
+ "ni li sona kiwen",
+ "nimi namako li toki e ale",
+ "mi open mute a", # mostly eng words
  ]

  IGNORABLES = [
@@ -55,10 +62,9 @@ IGNORABLES = [
  "❤️", # heart
  "😊",
  "👨‍👩‍👧‍👧", # family emoji with zwj
- # every non-emoji in
+ # every non-emoji in the writables
  "🄀🄁🄂🄃🄄🄅🄆🄇🄈🄉🄊🄋🄌🄍🄎🄏🄐🄑🄒🄓🄔🄕🄖🄗🄘🄙🄚🄛🄜🄝🄞🄟🄠🄡🄢🄣🄤🄥🄦🄧🄨🄩🄪🄫🄬🄭🄮🄯🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉🅊🅋🅌🅍🅎🅏🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩🅪🅫🅬🅭🅮🅯🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉🆊🆋🆌🆍🆏🆐 🆛🆜🆝🆞🆟🆠🆡🆢🆣🆤🆥🆦🆧🆨🆩🆪🆫🆬🆭🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿",
  "🅰️🅱️🅾️🅱️🅰️", # blood type emojis
- # "😃⃢👍", # sincerely, no idea, but it came up
  ]

  SYLLABIC_MATCHES = [
@@ -108,7 +114,7 @@ CORPUS_SPECIFIC = [
  "Pingo",
  "we Luke li alente wa",
  ]
- CORPUS_SPECIFIC_XFAIL = []
+ CORPUS_SPECIFIC_XFAIL: List[str] = []


  EXCESSIVE_SYLLABICS = [
@@ -193,10 +199,20 @@ FALSE_NEGATIVES = [
  "mtue",
  "mi nasa B^)", # emoticon
  "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+ "😃⃢👍", # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
  ]

  FALSE_POSITIVES = [
- "Knowing a little toki pona",
+ "Knowing a little toki pona", # name, dict, alphabet, dict, dict- damn, that's hard.
+ ]
+
+ IGNORABLE_PAIRS: List[Tuple[str, str]] = [
+ ("o lukin e ni: https://example.com/", "o lukin e ni:"),
+ ("ni li nasa anu seme <:musiwawa:198591138591>", "ni li nasa anu seme"),
+ ("seme la ni li toki pona ala https://example.com/", "seme la ni li toki pona ala"),
+ ("```\ndef bad():\n pass\n``` o lukin e ni", "o lukin e ni"),
+ ("mi tawa tomo telo 💦💦", "mi tawa tomo telo"),
+ ("o lukin e lipu ni: [[wp:Canvassing]]", "o lukin e lipu ni:"),
  ]


@@ -254,3 +270,33 @@ def test_false_negatives_pref(ilo: Ilo, text: str):
  @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
  def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
  assert not corpus_ilo.is_toki_pona(text)
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_pref_ignorable_doesnt_change_score(ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = ilo.preprocess(with_ignorable)
+ without_ignorable = ilo.preprocess(without_ignorable)
+ score_with = ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_lazy_ignorable_doesnt_change_score(lazy_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = lazy_ilo.preprocess(with_ignorable)
+ without_ignorable = lazy_ilo.preprocess(without_ignorable)
+ score_with = lazy_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = lazy_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_corpus_ignorable_doesnt_change_score(corpus_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = corpus_ilo.preprocess(with_ignorable)
+ without_ignorable = corpus_ilo.preprocess(without_ignorable)
+ score_with = corpus_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = corpus_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
@@ -8,6 +8,7 @@ from sonatoki.Preprocessors import (
  Spoilers,
  AllQuotes,
  Backticks,
+ Codeblock,
  Reference,
  ArrowQuote,
  ColonEmotes,
@@ -48,6 +49,25 @@ def test_Backticks(s: str):
  assert res == "", (repr(s), repr(res))


+ @given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+ @example(
+ """```
+ ```"""
+ )
+ @example(
+ """```
+ blocky message
+ ```
+
+ ```
+ second blocky message
+ ```"""
+ )
+ def test_Codeblock(s: str):
+ res = Codeblock.process(s).strip()
+ assert res == "", (repr(s), repr(res))
+
+
  @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
  @example("> base")
  @example("> newline\n> newline")
@@ -19,45 +19,35 @@ from sonatoki.Filters import (
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
  from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
  FALSE_POS_ALPHABETIC,
+ words_by_tag,
+ words_by_usage,
  )


- @given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu") | NIMI_PU_SYNONYMS)))
  def test_pu_filters_non_overlap(s: str):
  res_pu = NimiPu.filter(s)
  res_synonyms = NimiPuSynonyms.filter(s)
  assert (res_pu + res_synonyms) == 1


- @given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "ku suli") | words_by_tag("book", "ku lili"))
+ )
+ )
  def test_ku_filters_non_overlap(s: str):
+ s = Lowercase.clean(s)
+ s = ConsecutiveDuplicates.clean(s)
  res_ku_suli = NimiKuSuli.filter(s)
  res_ku_lili = NimiKuLili.filter(s)
  assert (res_ku_suli + res_ku_lili) == 1


- @given(
- st.sampled_from(
- list(
- NIMI_LINKU_CORE
- | NIMI_LINKU_COMMON
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
- )
- )
- )
+ @given(st.sampled_from(list(words_by_usage(0))))
  def test_linku_filters_non_overlap(s: str):
  _ = assume(s != "su")

@@ -73,7 +63,7 @@ def test_linku_filters_non_overlap(s: str):
  assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1


- @given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_usage(30))))
  def test_nimi_linku_properties(s: str):
  assert ConsecutiveDuplicates.clean(s) == s, repr(s)
  assert Alphabetic.filter(s), repr(s)
@@ -1,17 +1,14 @@
- # STL
- import re
-
  # PDM
  import hypothesis.strategies as st

  # LOCAL
  from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
- from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
+ from sonatoki.constants import words_by_usage

  PROPER_NAME_RE = r"[A-Z][a-z]*"

  token_strategy = (
- st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
+ st.sampled_from(list(words_by_usage(60)))
  | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
  | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
  | st.from_regex(PROPER_NAME_RE, fullmatch=True)