sonatoki 0.6.3__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.6.3 → sonatoki-0.8.0}/PKG-INFO +1 -1
- {sonatoki-0.6.3 → sonatoki-0.8.0}/pyproject.toml +1 -1
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py +13 -38
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py +60 -36
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py +9 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py +61 -6
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py +38 -27
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py +34 -27
- sonatoki-0.8.0/src/sonatoki/types.py +60 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py +40 -36
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py +54 -5
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py +20 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py +12 -22
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py +2 -2
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py +2 -5
- {sonatoki-0.6.3 → sonatoki-0.8.0}/LICENSE +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/README.md +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/__init__.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_tokenize.py +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py

@@ -1,42 +1,41 @@
 # STL
-from copy import deepcopy
 from typing import List, Type, TypedDict
 
 # PDM
 from typing_extensions import NotRequired
 
 # LOCAL
+from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
     Not,
     Filter,
+    PuName,
     Numeric,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
-    NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    NimiLinkuCommon,
     FalsePosSyllabic,
+    NimiLinkuByUsage,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
-
+    Codeblock,
     Reference,
     Preprocessor,
     AngleBracketObject,

@@ -95,11 +94,11 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(
+        Or(NimiLinkuByUsage(30), NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,

@@ -110,16 +109,13 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         Or(
-
-
-            NimiLinkuUncommon,
-            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
-            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
+            # awkward but efficient syntax
+            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),

@@ -132,17 +128,17 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR,
+    "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji,
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [

@@ -162,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
 }
 
 
-DiscordConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
-    "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
-    ],
-    "scorer": SoftScaling,
-    "passing_score": 0.8,
-}
-
-TelegramConfig: IloConfig = deepcopy(PrefConfig)
-ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
-    "DiscordConfig",
-    "ForumConfig",
     "IloConfig",
     "LazyConfig",
     "PrefConfig",
-    "TelegramConfig",
 ]
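Usage note: TelegramConfig and ForumConfig were plain deepcopy aliases of PrefConfig, and DiscordConfig differed from it only in preprocessor and filter choices, so callers importing the removed configs can generally switch to PrefConfig. A minimal sketch of how a config is consumed, assuming the package's usual Ilo entry point:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    # IloConfig is a TypedDict of Ilo's keyword arguments, so it unpacks directly.
    ilo = Ilo(**PrefConfig)
    ilo.is_toki_pona("mi olin e sina")  # True for ordinary toki pona text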
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py

@@ -2,37 +2,32 @@
 import re
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import Set, List, Type, Optional
+from typing import Set, List, Type, Union, Literal, Optional
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
 from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
-    NIMI_PU,
     ALPHABET,
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
     NIMI_UCSUR,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
     NOT_IN_PUNCT_CLASS,
-    NIMI_LINKU_UNCOMMON,
     ALL_PUNCT_RANGES_STR,
     FALSE_POS_ALPHABETIC,
     UCSUR_PUNCT_RANGES_STR,
     EMOJI_VARIATION_SELECTOR_RANGES_STR,
+    words_by_tag,
+    words_by_usage,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1

@@ -146,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):
 
 
 class ProperName(Filter):
-    """
-
+    """Determine if a given token is a valid name based on a reasonable weakening of
+    the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+    letter at its start and is **not** fully capitalized.
+
+    This corrects an issue with PuName, where scripts lacking a case distinction are
+    errantly counted"""
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        first_capitalized = token[0].isupper()
+        all_caps = token.isupper()
+
+        return first_capitalized and not all_caps
+
+
+class PuName(Filter):
+    """Determine if a given token is a valid name (also called a loan word) based on
+    the rules given in Toki Pona: The Language of Good.
+    When Toki Pona is written with the Latin alphabet, names are
     capitalized at their start. This filter identifies those tokens.
 
     Note that this alone cannot determine if a token is a valid name,

@@ -161,6 +175,9 @@ class ProperName(Filter):
     @override
     @cache(maxsize=None)
     def filter(cls, token: str) -> bool:
+        # first_capitalized = token[0].isupper()
+        # rest_capitalized = token[1:] == token[1:].upper()
+        # return first_capitalized and not rest_capitalized
         return token == token.capitalize()
         # TODO: If the token is in a script which doesn't have a case distinction,
         # this will errantly match.

@@ -170,40 +187,46 @@ class LongProperName(MinLen, ProperName):
     length = 2  # reject "names" of length 1
 
 
-class
-
-
-
-
-
-
-
-class NimiKuSuli(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_SULI)
-
-
-class NimiKuLili(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_LILI)
+class NimiLinkuByUsage:
+    def __new__(
+        cls,
+        usage: int,
+        date: Optional[LinkuUsageDate] = None,
+    ) -> Type[MemberFilter]:
+        words = words_by_usage(usage, date)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_LINKU_CORE)
+        return AnonLinkuMemberFilter
 
 
-class
-
+class NimiLinkuByTag:
+    def __new__(
+        cls,
+        tag: Union[Literal["usage_category"], Literal["book"]],
+        category: Union[LinkuUsageCategory, LinkuBooks],
+    ) -> Type[MemberFilter]:
+        words = words_by_tag(tag, category)
 
+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)
 
-
-    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+        return AnonLinkuMemberFilter
 
 
-
-
+NimiPu = NimiLinkuByTag("book", "pu")
+NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+NimiKuLili = NimiLinkuByTag("book", "ku lili")
+NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")
 
 
-class
-    tokens = prep_dictionary(
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
 
 
 class NimiUCSUR(MemberFilter):

@@ -444,6 +467,7 @@ __all__ = [
     "Or",
     "Phonotactic",
     "ProperName",
+    "PuName",
     "Punctuation",
     "Syllabic",
 ]
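The per-dictionary filters (NimiPu, NimiKuSuli, NimiLinkuCore, and so on) are now built by calling NimiLinkuByUsage or NimiLinkuByTag, each of which returns a fresh MemberFilter subclass. A short sketch of that call pattern, using only names visible in this diff:

    from sonatoki.Filters import NimiLinkuByUsage, NimiLinkuByTag

    Usage30 = NimiLinkuByUsage(30)            # words at or above 30% usage
    PuFilter = NimiLinkuByTag("book", "pu")   # the same words NimiPu is built from

    Usage30.filter("toki")      # True: "toki" is far above the threshold
    PuFilter.filter("kipisi")   # False: "kipisi" is not a pu word

Because the returned class is an ordinary MemberFilter, the existing add/sub customization still applies, which is how CorpusConfig can write NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES).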
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py

@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
 
 
+class Codeblock(RegexPreprocessor):
+    """Remove codeblocks marked by a set of three backticks on their own lines.
+
+    Subset of what would be removed by Backticks, but may be preferable.
+    """
+
+    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""
 
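Codeblock only strips fenced blocks whose opening ``` is followed by a newline, whereas Backticks pairs up any backticks it finds. A hedged sketch of the difference (the message text is made up):

    from sonatoki.Preprocessors import Backticks, Codeblock

    msg = "o lukin e ni:\n```\nprint('pona')\n```\nnimi `x` li pona"
    Codeblock.process(msg)  # drops only the fenced block; the inline `x` survives
    Backticks.process(msg)  # also removes the inline `x` span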
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py

@@ -1,17 +1,15 @@
 # STL
 import math
 from abc import ABC, abstractmethod
-from typing import
+from typing import List, Type
 
 # PDM
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
 
-Number = Union[int, float]
-Weights = Dict[str, Number]
-
 
 class Scorer(ABC):
     @classmethod

@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
     scoring."""
 
 
-
+class SentenceScorer(ABC):
+    @classmethod
+    @abstractmethod
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        """Re-score a list of sentences (scorecards, sentences with all their
+        metadata) and return them."""
+        raise NotImplementedError
+
+
+class SentNoOp(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return scorecards
 
 
-
+class SentAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        total = sum(card["score"] for card in scorecards)
+        avg = total / len(scorecards)
+        for card in scorecards:
+            card["score"] = avg
+        return scorecards
+
+
+class SentWeightedAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        weighted_total = 0
+        total_len = 0
+        for card in scorecards:
+            cardlen = len(card["cleaned"])
+            cardscore = card["score"]
+
+            weighted_total += cardlen * cardscore
+            total_len += cardlen
+
+        weighted_avg = weighted_total / total_len
+        for card in scorecards:
+            card["score"] = weighted_avg
+        return scorecards
+
+
+__all__ = [
+    "PassFail",
+    "Scaling",
+    "SoftPassFail",
+    "SoftScaling",
+    "Soften",
+    "SentAvg",
+    "SentWeightedAvg",
+]
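The new SentenceScorer hook rewrites per-sentence scores after the fact: SentNoOp leaves them alone (the default), SentAvg assigns every sentence the plain average, and SentWeightedAvg weights that average by cleaned-token count. A sketch of wiring one in, assuming the sentence_scorer keyword shown in the ilo.py changes below:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import CorpusConfig
    from sonatoki.Scorers import SentWeightedAvg

    ilo = Ilo(**CorpusConfig, sentence_scorer=SentWeightedAvg)
    # each sentence now reports the message-wide weighted average,
    # so one short aside no longer fails on its own
    ilo.are_toki_pona("toki! o lukin e ni. brb")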
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py

@@ -1,11 +1,16 @@
 # STL
 import json
-from typing import Set, Dict
+from typing import Set, Dict, Optional
 from pathlib import Path
 
 # LOCAL
+from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges
 
+LATEST_DATE = "2023-09"
+# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
 # `\p{Punctuation}` character class
 # https://www.compart.com/en/unicode/category
 # https://unicode.org/Public/UNIDATA/UnicodeData.txt

@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
     "iluminate",
     "imense",
     "imitate",
+    "inanimate",
     "injoke",
     "insane",
     "insolate",

@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
 
 
-def
-
+def linku_data() -> Dict[str, LinkuWord]:
+    # NOTE: this does open+read+parse two files each time you construct a filter
+    # but i expect users to construct filters only at the start of runtime
+    # there is no reason to waste your RAM by leaving the linku data in it
+    with open(LINKU) as f:
+        linku: Dict[str, LinkuWord] = json.loads(f.read())
+    with open(SANDBOX) as f:
+        sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+    return {**linku, **sandbox}
+
 
+def words_by_tag(tag: str, value: str) -> Set[str]:
+    data = linku_data()
+    return {d["word"] for d in data.values() if d[tag] == value}
 
-with open(LINKU) as f:
-    linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-
-
+def words_by_usage(
+    usage: int,
+    date: Optional[LinkuUsageDate] = None,
+) -> Set[str]:
+    if not date:
+        date = LATEST_DATE
+    data = linku_data()
 
-
-
-
-
+    result: Set[str] = set()
+    for word in data.values():
+        usages = word["usage"]
+        if date in usages and usages[date] >= usage:
+            result.add(word["word"])
+
+    return result
+
+
+NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
-with open(SANDBOX) as f:
-    sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 
 # with open(SYLLABICS) as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}

@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
 # with open(ALPHABETICS) as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
-del linku
-del sandbox
-
 __all__ = [
     "ALLOWABLES",
     "ALL_PUNCT",

@@ -727,14 +746,6 @@ __all__ = [
     "CONSONANTS",
     "EMOJI_VARIATION_SELECTOR_RANGES",
     "EMOJI_VARIATION_SELECTOR_RANGES_STR",
-    "NIMI_KU_LILI",
-    "NIMI_KU_SULI",
-    "NIMI_LINKU_COMMON",
-    "NIMI_LINKU_CORE",
-    "NIMI_LINKU_OBSCURE",
-    "NIMI_LINKU_SANDBOX",
-    "NIMI_LINKU_UNCOMMON",
-    "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
     "POSIX_PUNCT_RANGES",
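The module-level NIMI_* sets are replaced by lookups computed on demand from the bundled Linku data. A small sketch of the new helpers, using the signatures above:

    from sonatoki.constants import words_by_tag, words_by_usage

    pu_words = words_by_tag("book", "pu")    # replaces the old NIMI_PU constant
    frequent = words_by_usage(80)            # usage measured at LATEST_DATE ("2023-09")
    older = words_by_usage(50, "2022-08")    # or at an earlier survey date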
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py

@@ -1,17 +1,14 @@
 # STL
-from typing import List, Type
+from typing import List, Type
 
 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
-from sonatoki.Scorers import
+from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
 from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
-# tokenized, filtered, cleaned, score, result
-Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
-# TODO: scorecard kinda sucks as a name
-
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]

@@ -21,6 +18,7 @@ class Ilo:
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
+    __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number
 
     def __init__(

@@ -31,6 +29,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):

@@ -43,6 +42,7 @@ class Ilo:
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
+        self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score
 
     def preprocess(self, msg: str) -> str:

@@ -55,6 +55,7 @@ class Ilo:
         return self.__word_tokenizer.tokenize(msg)
 
     def sent_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__sent_tokenizer.tokenize(msg)
 
     def clean_token(self, token: str) -> str:

@@ -93,44 +94,50 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
+    def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return self.__sentence_scorer.score(scorecards)
+
     def _is_toki_pona(self, message: str) -> Scorecard:
         """Process a message into its tokens, then filters, cleans, and scores
-        them.
-
-        Returns all
-        - Tokenized message (list[str])
-        - Filtered message (list[str])
-        - Cleaned message (list[str])
-        - Score (float)
-        - Result (bool)
+        them. Message must already be preprocessed, normally done in
+        `self.is_toki_pona(message)`.
+
+        Returns a `Scorecard` with all changes to the input text and a score.
         """
         tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
-        result = score >= self.__passing_score
 
-
+        scorecard: Scorecard = {
+            "text": message,
+            "tokenized": tokenized,
+            "filtered": filtered,
+            "cleaned": cleaned,
+            "score": score,
+        }
+
+        return scorecard
 
     def is_toki_pona(self, message: str) -> bool:
-        """Determines whether a
+        """Determines whether a text is or is not Toki Pona."""
         message = self.preprocess(message)
-
-        return
+        scorecard = self._is_toki_pona(message)
+        return scorecard["score"] >= self.__passing_score
 
     def _are_toki_pona(self, message: str) -> List[Scorecard]:
-        """Split a message into sentences, then return a list each
-
+        """Split a message into sentences, then return a list with each
+        sentence's scorecard from `self._is_toki_pona()`.
 
         Message must already be preprocessed, normally done in
         `self.are_toki_pona(message)`.
         """
-
+        scorecards: List[Scorecard] = list()
         for sentence in self.sent_tokenize(message):
             result = self._is_toki_pona(sentence)
-
-
+            scorecards.append(result)
+        scorecards = self.score_sentences(scorecards)
+        return scorecards
 
     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.

@@ -148,5 +155,5 @@ class Ilo:
         ```
         """
         message = self.preprocess(message)
-
-        return [
+        scorecards = self._are_toki_pona(message)
+        return [card["score"] >= self.__passing_score for card in scorecards]
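Ilo._is_toki_pona now returns a Scorecard dict instead of the old five-element tuple, and is_toki_pona/are_toki_pona simply compare its "score" against the passing score. A sketch of inspecting one, reusing PrefConfig from above:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    msg = ilo.preprocess("toki! ilo Firefox li pona")  # _is_toki_pona expects preprocessed text
    card = ilo._is_toki_pona(msg)
    card["score"], card["cleaned"]  # fields are accessed by name, not tuple position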
sonatoki-0.8.0/src/sonatoki/types.py (new file)

@@ -0,0 +1,60 @@
+# STL
+from typing import Dict, List, Union, Literal, TypedDict
+
+Number = Union[int, float]
+
+
+# TODO: scorecard kinda sucks as a name
+class Scorecard(TypedDict):
+    text: str
+    tokenized: List[str]
+    filtered: List[str]
+    cleaned: List[str]
+    score: Number
+
+
+LinkuUsageDate = Union[
+    Literal["2020-04"],
+    Literal["2021-10"],
+    Literal["2022-08"],
+    Literal["2023-09"],
+    # Literal["2024-09"],
+]
+
+LinkuUsageCategory = Union[
+    Literal["core"],
+    Literal["common"],
+    Literal["uncommon"],
+    Literal["obscure"],
+    Literal["sandbox"],
+]
+
+LinkuBooks = Union[
+    Literal["pu"],
+    Literal["ku suli"],
+    Literal["ku lili"],
+    Literal["none"],
+]
+
+
+class LinkuWord(TypedDict):
+    id: str
+    author_verbatim: str
+    author_verbatim_source: str
+    book: str
+    coined_era: str
+    coined_year: str
+    creator: List[str]
+    ku_data: Dict[str, int]
+    see_also: List[str]
+    resources: Dict[str, str]
+    representations: Dict[str, Union[str, List[str]]]
+    source_language: str
+    usage_category: LinkuUsageCategory
+    word: str
+    deprecated: bool
+    etymology: List[Dict[str, str]]
+    audio: List[Dict[str, str]]
+    pu_verbatim: Dict[str, str]
+    usage: Dict[LinkuUsageDate, int]
+    translations: Dict[str, Dict[str, str]]
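The new types module centralizes the shapes shared by Scorers, constants, and ilo, so downstream code can annotate against them. A hypothetical helper for illustration:

    from sonatoki.types import Number, Scorecard

    def passed(card: Scorecard, threshold: Number) -> bool:
        # mirrors the comparison Ilo.is_toki_pona performs on a Scorecard
        return card["score"] >= threshold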
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Phonotactic,
     Punctuation,
     AlphabeticRe,

@@ -34,23 +34,13 @@ from sonatoki.Filters import (
     NimiLinkuUncommon,
 )
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
-from sonatoki.constants import
-    NIMI_PU,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
-    NIMI_LINKU_COMMON,
-    FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
-    NIMI_LINKU_UNCOMMON,
-)
+from sonatoki.constants import FALSE_POS_SYLLABIC, words_by_tag
 
 # FILESYSTEM
 from .test_utils import PROPER_NAME_RE
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 @example("lukin")
 @example("selo")
 @example("li")

@@ -59,14 +49,14 @@ def test_NimiPu(s: str):
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "core"))))
 @example("pona")
 def test_NimiLinkuCore(s: str):
     res = NimiLinkuCore.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "common"))))
 @example("n")
 @example("tonsi")
 @example("kipisi")

@@ -75,19 +65,21 @@ def test_NimiLinkuCommon(s: str):
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "uncommon"))))
 def test_NimiLinkuUncommon(s: str):
     res = NimiLinkuUncommon.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "obscure"))))
+@example("pake")
+@example("san")
 def test_NimiLinkuObscure(s: str):
     res = NimiLinkuObscure.filter(s)
     assert res, repr(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("usage_category", "sandbox"))))
 @example("kalamARR")
 @example("Pingo")
 def test_NimiLinkuSandbox(s: str):

@@ -152,7 +144,7 @@ def test_AlphabeticRe(s: str):
 
 @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
 def test_ProperName(s: str):
-    res =
+    res = PuName.filter(s)
     assert res, repr(s)
 
 

@@ -207,7 +199,11 @@ def test_OrFilter(s: str):
 # NOTE: No subset filter test because A | B is not the same as A combined with B.
 # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
 # but would incorrectly pass a combined filter.
-@given(
+@given(
+    st.sampled_from(
+        list(words_by_tag("book", "pu") | words_by_tag("usage_category", "obscure"))
+    )
+)
 def test_MemberFilters_OrFilter(s: str):
     filter = Or(NimiPu, NimiLinkuObscure)
     assert issubclass(filter, MemberFilter)

@@ -221,11 +217,11 @@ def test_MemberFilters_OrFilter(s: str):
 @given(
     st.sampled_from(
         list(
-
-            |
-            |
-            |
-            |
+            words_by_tag("book", "ku suli")
+            | words_by_tag("book", "ku lili")
+            | words_by_tag("usage_category", "uncommon")
+            | words_by_tag("usage_category", "obscure")
+            | words_by_tag("usage_category", "sandbox")
         ),
     )
 )

@@ -248,14 +244,14 @@ def test_OrFilter_IsipinEpiku(s: str):
 )
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_AndFilter(s: str):
     s = s.capitalize()
-    f = And(
+    f = And(PuName, NimiPu)
     assert f.filter(s)
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_NotFilter(s: str):
     f = Not(NimiPu)
     assert not f.filter(s)

@@ -282,13 +278,21 @@ def test_AndNotFilter(s: str):
     assert not res_composed
 
 
-@given(
+@given(
+    st.sampled_from(list(words_by_tag("book", "pu") | words_by_tag("book", "ku suli")))
+)
 def test_AddTokensToMemberFilter(s: str):
     PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
     assert PuEnKuSuliFilter.filter(s)
 
 
-@given(
+@given(
+    st.sampled_from(
+        list(
+            words_by_tag("usage_category", "sandbox") | words_by_tag("book", "ku lili")
+        )
+    )
+)
 def test_AddTokensToMemberFilterNegative(s: str):
     PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
     assert not PuEnKuSuliFilter.filter(s)

@@ -297,12 +301,12 @@ def test_AddTokensToMemberFilterNegative(s: str):
 @given(
     st.sampled_from(
         list(
-
-            |
-            |
-            |
-            |
-            |
+            words_by_tag("book", "pu")
+            | words_by_tag("book", "ku suli")
+            | words_by_tag("book", "ku lili")
+            | words_by_tag("usage_category", "uncommon")
+            | words_by_tag("usage_category", "obscure")
+            | words_by_tag("usage_category", "sandbox")
         ),
     )
     | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py

@@ -1,3 +1,6 @@
+# STL
+from typing import List, Tuple
+
 # PDM
 import pytest
 

@@ -35,6 +38,10 @@ ALL_VALID = [
     "",  # "o toki tawa mi" in UCSUR
     "",
     "",
+    "o lukin, wawa",
+    "ni li sona kiwen",
+    "nimi namako li toki e ale",
+    "mi open mute a",  # mostly eng words
 ]
 
 IGNORABLES = [

@@ -55,10 +62,9 @@ IGNORABLES = [
     "❤️",  # heart
     "😊",
     "👨👩👧👧",  # family emoji with zwj
-    # every non-emoji in
+    # every non-emoji in the writables
     "🄀🄁🄂🄃🄄🄅🄆🄇🄈🄉🄊🄋🄌🄍🄎🄏🄐🄑🄒🄓🄔🄕🄖🄗🄘🄙🄚🄛🄜🄝🄞🄟🄠🄡🄢🄣🄤🄥🄦🄧🄨🄩🄪🄫🄬🄭🄮🄯🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉🅊🅋🅌🅍🅎🅏🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩🅪🅫🅬🅭🅮🅯🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉🆊🆋🆌🆍🆏🆐 🆛🆜🆝🆞🆟🆠🆡🆢🆣🆤🆥🆦🆧🆨🆩🆪🆫🆬🆭🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿",
     "🅰️🅱️🅾️🅱️🅰️",  # blood type emojis
-    # "😃⃢👍", # sincerely, no idea, but it came up
 ]
 
 SYLLABIC_MATCHES = [

@@ -88,6 +94,9 @@ NAME_MATCHES = [
     "toki Kanse li lon",
     "toki Lojban li nasa e lawa mi",
     "ilo Firefox",
+    "ilo FaceBook li nasa",
+    "mi kepeken ilo MySQL",
+    "poki li nasin SQLite",
     "mi musi Space Station 13",
     "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
 ]

@@ -108,7 +117,7 @@ CORPUS_SPECIFIC = [
     "Pingo",
     "we Luke li alente wa",
 ]
-CORPUS_SPECIFIC_XFAIL = []
+CORPUS_SPECIFIC_XFAIL: List[str] = []
 
 
 EXCESSIVE_SYLLABICS = [

@@ -129,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
 ]
 
 EXCESSIVE_ALPHABETICS = [
-    "21st",  # candidate for xfails?
     "wen i tok usin onli notes in toki pona i look silli. ",
     "I wait, I sulk, as a tool I make stoops to ineptness.",
     "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",

@@ -155,6 +163,7 @@ EXCESSIVE_ENGLISH = [
     "i'm online all the time",
     "How to Cut a Kiwi",
     "a e i o u",
+    "21st",  # previous false positive; fixed by ProperName change
 ]
 
 NON_MATCHES = [

@@ -193,10 +202,20 @@ FALSE_NEGATIVES = [
     "mtue",
     "mi nasa B^)",  # emoticon
     "lete li ike x.x",  # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+    "😃⃢👍",  # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
 ]
 
 FALSE_POSITIVES = [
-    "Knowing a little toki pona",
+    "Knowing a little toki pona",  # name, dict, alphabet, dict, dict- damn, that's hard.
+]
+
+IGNORABLE_PAIRS: List[Tuple[str, str]] = [
+    ("o lukin e ni: https://example.com/", "o lukin e ni:"),
+    ("ni li nasa anu seme <:musiwawa:198591138591>", "ni li nasa anu seme"),
+    ("seme la ni li toki pona ala https://example.com/", "seme la ni li toki pona ala"),
+    ("```\ndef bad():\n pass\n``` o lukin e ni", "o lukin e ni"),
+    ("mi tawa tomo telo 💦💦", "mi tawa tomo telo"),
+    ("o lukin e lipu ni: [[wp:Canvassing]]", "o lukin e lipu ni:"),
 ]
 
 

@@ -254,3 +273,33 @@ def test_false_negatives_pref(ilo: Ilo, text: str):
 @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
 def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
     assert not corpus_ilo.is_toki_pona(text)
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_pref_ignorable_doesnt_change_score(ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = ilo.preprocess(with_ignorable)
+    without_ignorable = ilo.preprocess(without_ignorable)
+    score_with = ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_lazy_ignorable_doesnt_change_score(lazy_ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = lazy_ilo.preprocess(with_ignorable)
+    without_ignorable = lazy_ilo.preprocess(without_ignorable)
+    score_with = lazy_ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = lazy_ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
+
+
+@pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+def test_corpus_ignorable_doesnt_change_score(corpus_ilo: Ilo, pair: Tuple[str, str]):
+    with_ignorable, without_ignorable = pair
+    with_ignorable = corpus_ilo.preprocess(with_ignorable)
+    without_ignorable = corpus_ilo.preprocess(without_ignorable)
+    score_with = corpus_ilo._is_toki_pona(with_ignorable)["score"]
+    score_without = corpus_ilo._is_toki_pona(without_ignorable)["score"]
+    assert score_with == score_without
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py

@@ -8,6 +8,7 @@ from sonatoki.Preprocessors import (
     Spoilers,
     AllQuotes,
     Backticks,
+    Codeblock,
     Reference,
     ArrowQuote,
     ColonEmotes,

@@ -48,6 +49,25 @@ def test_Backticks(s: str):
     assert res == "", (repr(s), repr(res))
 
 
+@given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+@example(
+    """```
+```"""
+)
+@example(
+    """```
+blocky message
+```
+
+```
+second blocky message
+```"""
+)
+def test_Codeblock(s: str):
+    res = Codeblock.process(s).strip()
+    assert res == "", (repr(s), repr(res))
+
+
 @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
 @example("> base")
 @example("> newline\n> newline")
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py

@@ -19,45 +19,35 @@ from sonatoki.Filters import (
 )
 from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
 from sonatoki.constants import (
-    NIMI_PU,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
-    NIMI_LINKU_UNCOMMON,
     FALSE_POS_ALPHABETIC,
+    words_by_tag,
+    words_by_usage,
 )
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_tag("book", "pu") | NIMI_PU_SYNONYMS)))
 def test_pu_filters_non_overlap(s: str):
     res_pu = NimiPu.filter(s)
     res_synonyms = NimiPuSynonyms.filter(s)
     assert (res_pu + res_synonyms) == 1
 
 
-@given(
+@given(
+    st.sampled_from(
+        list(words_by_tag("book", "ku suli") | words_by_tag("book", "ku lili"))
+    )
+)
 def test_ku_filters_non_overlap(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
     res_ku_suli = NimiKuSuli.filter(s)
     res_ku_lili = NimiKuLili.filter(s)
     assert (res_ku_suli + res_ku_lili) == 1
 
 
-@given(
-    st.sampled_from(
-        list(
-            NIMI_LINKU_CORE
-            | NIMI_LINKU_COMMON
-            | NIMI_LINKU_UNCOMMON
-            | NIMI_LINKU_OBSCURE
-            | NIMI_LINKU_SANDBOX
-        )
-    )
-)
+@given(st.sampled_from(list(words_by_usage(0))))
 def test_linku_filters_non_overlap(s: str):
     _ = assume(s != "su")
 

@@ -73,7 +63,7 @@ def test_linku_filters_non_overlap(s: str):
     assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
 
 
-@given(st.sampled_from(list(
+@given(st.sampled_from(list(words_by_usage(30))))
 def test_nimi_linku_properties(s: str):
     assert ConsecutiveDuplicates.clean(s) == s, repr(s)
     assert Alphabetic.filter(s), repr(s)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py

@@ -10,10 +10,10 @@ from hypothesis import given, example
 from sonatoki.Filters import (
     Filter,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
-    ProperName,
     Phonotactic,
     NimiLinkuCore,
     PunctuationRe,

@@ -31,7 +31,7 @@ FILTERS = [
     NimiLinkuCore,
     NimiLinkuCommon,
     Alphabetic,
-
+    PuName,
     Phonotactic,
     PunctuationRe,
 ]
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py

@@ -1,17 +1,14 @@
-# STL
-import re
-
 # PDM
 import hypothesis.strategies as st
 
 # LOCAL
 from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
-from sonatoki.constants import
+from sonatoki.constants import words_by_usage
 
 PROPER_NAME_RE = r"[A-Z][a-z]*"
 
 token_strategy = (
-    st.sampled_from(list(
+    st.sampled_from(list(words_by_usage(60)))
     | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
    | st.from_regex(PROPER_NAME_RE, fullmatch=True)

All remaining files listed above with +0 -0 (LICENSE, README.md, Cleaners.py, Tokenizers.py, __init__.py, __main__.py, alphabetic.txt, linku.json, py.typed, sandbox.json, syllabic.txt, utils.py, tests/__init__.py, test_cleaners.py, test_tokenize.py, and the tokenize_cases yml files) are unchanged between 0.6.3 and 0.8.0.
|