sonatoki 0.7.0__tar.gz → 0.8.1__tar.gz
This diff compares two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- {sonatoki-0.7.0 → sonatoki-0.8.1}/PKG-INFO +1 -1
- {sonatoki-0.7.0 → sonatoki-0.8.1}/pyproject.toml +1 -1
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Configs.py +2 -2
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Filters.py +25 -2
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Preprocessors.py +4 -1
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_cleaners.py +2 -2
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_filters.py +17 -17
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_ilo.py +4 -1
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_preprocessors.py +29 -23
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_scorers.py +2 -2
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_utils.py +3 -3
- {sonatoki-0.7.0 → sonatoki-0.8.1}/LICENSE +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/README.md +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/types.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/__init__.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_properties.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_tokenize.py +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.7.0 → sonatoki-0.8.1}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0

{sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Configs.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     Filter,
+    PuName,
     Numeric,
     NimiUCSUR,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -131,7 +131,7 @@ LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
+    "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizerRe, # mimics old tokenizer
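
Behaviorally, LazyConfig is unchanged by this edit: PuName is the filter previously exported as ProperName, while a new, weaker ProperName is introduced alongside it (see the Filters.py hunks below). A minimal sketch of consuming the config, following the Ilo constructor pattern from the project README:

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import LazyConfig

    # LazyConfig still applies the strict pu-style capitalization rule to
    # names; only the filter's name changed (ProperName -> PuName).
    ilo = Ilo(**LazyConfig)
    ilo.is_toki_pona("mi kepeken ilo Firefox")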

{sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Filters.py

@@ -141,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):


 class ProperName(Filter):
-    """
-
+    """Determine if a given token is a valid name based on a reasonable weakening of
+    the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+    letter at its start and is **not** fully capitalized.
+
+    This corrects an issue with PuName, where scripts lacking a case distinction are
+    errantly counted"""
+
+    @classmethod
+    @override
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        first_capitalized = token[0].isupper()
+        all_caps = token.isupper()
+
+        return first_capitalized and not all_caps
+
+
+class PuName(Filter):
+    """Determine if a given token is a valid name (also called a loan word) based on
+    the rules given in Toki Pona: The Language of Good.
+    When Toki Pona is written with the Latin alphabet, names are
     capitalized at their start. This filter identifies those tokens.

     Note that this alone cannot determine if a token is a valid name,
@@ -156,6 +175,9 @@ class ProperName(Filter):
     @override
     @cache(maxsize=None)
     def filter(cls, token: str) -> bool:
+        # first_capitalized = token[0].isupper()
+        # rest_capitalized = token[1:] == token[1:].upper()
+        # return first_capitalized and not rest_capitalized
         return token == token.capitalize()
         # TODO: If the token is in a script which doesn't have a case distinction,
         # this will errantly match.
@@ -445,6 +467,7 @@ __all__ = [
     "Or",
     "Phonotactic",
     "ProperName",
+    "PuName",
     "Punctuation",
     "Syllabic",
 ]
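
Net effect of the Filters.py change: the strict rule from pu keeps its behavior under the new name PuName, and ProperName is reimplemented as the weakened rule. Expected results, worked out from the two filter bodies shown above (a sketch, not captured test output):

    from sonatoki.Filters import ProperName, PuName

    # PuName: token must equal token.capitalize() exactly.
    PuName.filter("Kekan")    # True: single leading capital
    PuName.filter("SQLite")   # False: capitals after the first letter
    PuName.filter("SQL")      # False: fully capitalized

    # ProperName: leading capital, and not all-caps.
    ProperName.filter("Kekan")    # True
    ProperName.filter("SQLite")   # True: mixed case now accepted
    ProperName.filter("SQL")      # False: all-caps still rejected

This is why names like "MySQL" and "SQLite" can join NAME_MATCHES in test_ilo.py below.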

{sonatoki-0.7.0 → sonatoki-0.8.1}/src/sonatoki/Preprocessors.py

@@ -149,7 +149,10 @@ class Codeblock(RegexPreprocessor):
     Subset of what would be removed by Backticks, but may be preferable.
     """

-    pattern = re.compile(
+    pattern = re.compile(
+        r"```.+?```",
+        flags=re.DOTALL,
+    )


 class Spoilers(RegexPreprocessor):
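
The rewritten pattern leans on re.DOTALL so the lazy `.+?` can cross newlines, which also lets a single match consume an opening fence carrying a language tag (exercised by the new test_preprocessors examples further down). A standalone sketch of just the regex, assuming RegexPreprocessor.process substitutes matches with the empty string (the tests below assert exactly that via `.process(s).strip() == ""`):

    import re

    pattern = re.compile(r"```.+?```", flags=re.DOTALL)

    text = "```sql\nSELECT 1;\n``` toki pona text"
    print(pattern.sub("", text).strip())  # -> "toki pona text"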

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_cleaners.py

@@ -12,7 +12,7 @@ from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDuplicatesRe
 from .test_utils import PROPER_NAME_RE


-@given(st.from_regex(ConsecutiveDuplicatesRe.pattern
+@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
 @example("tooooki a")
 @example("muuuuuu")
 @example("nnn")
@@ -25,7 +25,7 @@ def test_ConsecutiveDuplicatesRe(s: str):
     assert a.lower() != b.lower(), (s, res)


-@given(st.from_regex(ConsecutiveDuplicatesRe.pattern
+@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
 @example("Aaa")
 @example("aAa")
 @example("aaA")

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_filters.py

@@ -11,12 +11,12 @@ from sonatoki.Filters import (
     And,
     Not,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
     NimiKuLili,
     NimiKuSuli,
-    ProperName,
     Phonotactic,
     Punctuation,
     AlphabeticRe,
@@ -90,7 +90,7 @@ def test_NimiLinkuSandbox(s: str):
     assert res, repr(s)


-@given(st.from_regex(Phonotactic.pattern
+@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
 @example("kijetesantakalu")
 @example("n")
 def test_Phonotactic(s: str):
@@ -98,28 +98,28 @@ def test_Phonotactic(s: str):
     assert res, repr(s)


-@given(st.from_regex(Phonotactic.pattern
+@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
 def test_LongPhonotactic(s: str):
     len_ok = len(s) >= LongPhonotactic.length
     res = LongPhonotactic.filter(s)
     assert res == len_ok, repr(s) # will match given fullmatch


-@given(st.from_regex(Syllabic.pattern
+@given(st.from_regex(Syllabic.pattern, fullmatch=True))
 @example("wuwojitiwunwonjintinmanna")
 def test_Syllabic(s: str):
     res = Syllabic.filter(s)
     assert res, repr(s)


-@given(st.from_regex(Syllabic.pattern
+@given(st.from_regex(Syllabic.pattern, fullmatch=True))
 def test_LongSyllabic(s: str):
     len_ok = len(s) >= LongSyllabic.length
     res = LongSyllabic.filter(s)
     assert res == len_ok


-@given(st.from_regex(AlphabeticRe.pattern
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 @example("muems")
 @example("mpptp")
 @example("tptpt")
@@ -129,14 +129,14 @@ def test_Alphabetic(s: str):
     assert res_fn == res_re, repr(s)


-@given(st.from_regex(AlphabeticRe.pattern
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 def test_LongAlphabetic(s: str):
     len_ok = len(s) >= LongAlphabetic.length
     res = LongAlphabetic.filter(s)
     assert res == len_ok


-@given(st.from_regex(AlphabeticRe.pattern
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 def test_AlphabeticRe(s: str):
     res_re = AlphabeticRe.filter(s)
     assert res_re, repr(s)
@@ -144,11 +144,11 @@ def test_AlphabeticRe(s: str):

 @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
 def test_ProperName(s: str):
-    res = ProperName.filter(s)
+    res = PuName.filter(s)
     assert res, repr(s)


-@given(st.from_regex(PunctuationRe.pattern
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 @example("[]")
 @example(r"\\")
 @example(r"\"")
@@ -161,14 +161,14 @@ def test_PunctuationRe1(s: str):
     assert res, repr(s)


-@given(st.from_regex(PunctuationRe.pattern
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 def test_PunctuationRe(s: str):
     res_re = PunctuationRe.filter(s)
     res_re1 = PunctuationRe1.filter(s)
     assert res_re == res_re1, repr(s)


-@given(st.from_regex(PunctuationRe.pattern
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 @example("\U000f1990") # UCSUR char
 def test_Punctuation(s: str):
     res_fn = Punctuation.filter(s)
@@ -185,7 +185,7 @@ def test_Numeric(s: str):


 @given(
-    st.from_regex(PunctuationRe.pattern
+    st.from_regex(PunctuationRe.pattern, fullmatch=True)
     | st.from_regex(r"\d+", fullmatch=True),
 )
 def test_OrFilter(s: str):
@@ -247,7 +247,7 @@ def test_OrFilter_IsipinEpiku(s: str):
 @given(st.sampled_from(list(words_by_tag("book", "pu"))))
 def test_AndFilter(s: str):
     s = s.capitalize()
-    f = And(ProperName, NimiPu)
+    f = And(PuName, NimiPu)
     assert f.filter(s)


@@ -259,8 +259,8 @@ def test_NotFilter(s: str):

 @given(
     st.sampled_from(list(FALSE_POS_SYLLABIC))
-    | st.from_regex(Syllabic.pattern
-    | st.from_regex(AlphabeticRe.pattern
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
+    | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
 )
 def test_AndNotFilter(s: str):
     AndNotFilter = And(Syllabic, Not(FalsePosSyllabic))
@@ -309,7 +309,7 @@ def test_AddTokensToMemberFilterNegative(s: str):
             | words_by_tag("usage_category", "sandbox")
         ),
     )
-    | st.from_regex(Syllabic.pattern
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
 )
 def test_SubTokensFromMemberFilter(s: str):
     NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
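
The recurring edit across these tests is adding fullmatch=True to st.from_regex. By default, hypothesis only guarantees that the pattern matches somewhere in the generated string and may pad either end with arbitrary characters; for whole-token filters, that padding shows up as junk and causes spurious failures. An illustration with a throwaway pattern (standard hypothesis API):

    import hypothesis.strategies as st

    # Default: examples merely *contain* a match, e.g. "?mute\n" for r"[a-z]+".
    anywhere = st.from_regex(r"[a-z]+")

    # fullmatch=True: the entire example matches the pattern.
    whole = st.from_regex(r"[a-z]+", fullmatch=True)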

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_ilo.py

@@ -94,6 +94,9 @@ NAME_MATCHES = [
     "toki Kanse li lon",
     "toki Lojban li nasa e lawa mi",
     "ilo Firefox",
+    "ilo FaceBook li nasa",
+    "mi kepeken ilo MySQL",
+    "poki li nasin SQLite",
     "mi musi Space Station 13",
     "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
 ]
@@ -135,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
 ]

 EXCESSIVE_ALPHABETICS = [
-    "21st", # candidate for xfails?
     "wen i tok usin onli notes in toki pona i look silli. ",
     "I wait, I sulk, as a tool I make stoops to ineptness.",
     "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
@@ -161,6 +163,7 @@ EXCESSIVE_ENGLISH = [
     "i'm online all the time",
     "How to Cut a Kiwi",
     "a e i o u",
+    "21st", # previous false positive; fixed by ProperName change
 ]

 NON_MATCHES = [
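
The relocated "21st" case falls straight out of the filter split: str.capitalize leaves a leading digit untouched, so the strict PuName rule cannot reject it, while the weakened ProperName checks for an actual uppercase first character:

    # PuName: "21st" == "21st".capitalize() is True, a false positive.
    assert "21st" == "21st".capitalize()
    # ProperName: "21st"[0].isupper() is False, so the token is rejected.
    assert not "21st"[0].isupper()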

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_preprocessors.py

@@ -22,7 +22,7 @@ from sonatoki.Preprocessors import (
 )


-@given(st.from_regex(URLs.pattern
+@given(st.from_regex(URLs.pattern, fullmatch=True))
 @example("https://google.com")
 @example("https://mun.la")
 @example("https://discord.gg/")
@@ -32,7 +32,7 @@ def test_URLs(s: str):
     assert URLs.process(s).strip() == ""


-@given(st.from_regex(Spoilers.pattern
+@given(st.from_regex(Spoilers.pattern, fullmatch=True))
 @example("|| | ||")
 @example("|| content\n\n\ncontent ||")
 @example("||\n||")
@@ -42,14 +42,14 @@ def test_Spoilers(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(Backticks.pattern
+@given(st.from_regex(Backticks.pattern, fullmatch=True))
 @example("` ` ` `")
 def test_Backticks(s: str):
     res = Backticks.process(s).strip()
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(Codeblock.pattern
+@given(st.from_regex(Codeblock.pattern, fullmatch=True))
 @example(
     """```
 ```"""
@@ -63,12 +63,18 @@ blocky message
 second blocky message
 ```"""
 )
+@example(
+    """```oisandm123-_mu
+arbitrary content
+```"""
+)
+@example("""```mu```""")
 def test_Codeblock(s: str):
     res = Codeblock.process(s).strip()
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(ArrowQuote.pattern
+@given(st.from_regex(ArrowQuote.pattern, fullmatch=True))
 @example("> base")
 @example("> newline\n> newline")
 def test_ArrowQuote(s: str):
@@ -76,7 +82,7 @@ def test_ArrowQuote(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(DoubleQuotes.pattern
+@given(st.from_regex(DoubleQuotes.pattern, fullmatch=True))
 @example('" "" "')
 @example('" "\n" "')
 @example('" \n "')
@@ -85,7 +91,7 @@ def test_DoubleQuotes(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(SingleQuotes.pattern
+@given(st.from_regex(SingleQuotes.pattern, fullmatch=True))
 @example("' '' '")
 @example("' '\n' '")
 @example("' \n '")
@@ -94,7 +100,7 @@ def test_SingleQuotes(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(DiscordEmotes.pattern
+@given(st.from_regex(DiscordEmotes.pattern, fullmatch=True))
 @example("<a:example:123123>")
 @example("<:example:123123>")
 def test_DiscordEmotes(s: str):
@@ -102,7 +108,7 @@ def test_DiscordEmotes(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(DiscordMentions.pattern
+@given(st.from_regex(DiscordMentions.pattern, fullmatch=True))
 @example("<@497549183847497739>")
 @example("<@!457890000>")
 @example("<@&18398198981985>")
@@ -111,7 +117,7 @@ def test_DiscordMentions(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(DiscordChannels.pattern
+@given(st.from_regex(DiscordChannels.pattern, fullmatch=True))
 @example("<#19858915>")
 @example("<#18591912589812985>")
 def test_DiscordChannels(s: str):
@@ -119,7 +125,7 @@ def test_DiscordChannels(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(DiscordSpecial.pattern
+@given(st.from_regex(DiscordSpecial.pattern, fullmatch=True))
 @example("<id:guide>")
 @example("<id:browse>")
 def test_DiscordSpecial(s: str):
@@ -128,11 +134,11 @@ def test_DiscordSpecial(s: str):


 @given(
-    st.from_regex(DiscordEmotes.pattern
-    | st.from_regex(DiscordMentions.pattern
-    | st.from_regex(DiscordChannels.pattern
-    | st.from_regex(DiscordSpecial.pattern
-    | st.from_regex(AngleBracketObject.pattern
+    st.from_regex(DiscordEmotes.pattern, fullmatch=True)
+    | st.from_regex(DiscordMentions.pattern, fullmatch=True)
+    | st.from_regex(DiscordChannels.pattern, fullmatch=True)
+    | st.from_regex(DiscordSpecial.pattern, fullmatch=True)
+    | st.from_regex(AngleBracketObject.pattern, fullmatch=True)
 )
 @example("<https://example.com>")
 @example("<#123124125125>")
@@ -142,11 +148,11 @@ def test_AngleBracketObject(s: str):


 @given(
-    st.from_regex(SingleQuotes.pattern
-    | st.from_regex(DoubleQuotes.pattern
-    | st.from_regex(Backticks.pattern
-    | st.from_regex(ArrowQuote.pattern
-    | st.from_regex(AllQuotes.pattern
+    st.from_regex(SingleQuotes.pattern, fullmatch=True)
+    | st.from_regex(DoubleQuotes.pattern, fullmatch=True)
+    | st.from_regex(Backticks.pattern, fullmatch=True)
+    | st.from_regex(ArrowQuote.pattern, fullmatch=True)
+    | st.from_regex(AllQuotes.pattern, fullmatch=True)
 )
 @example("> bruh")
 @example("`bruh`")
@@ -155,7 +161,7 @@ def test_AllQuotes(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(Reference.pattern
+@given(st.from_regex(Reference.pattern, fullmatch=True))
 @example("[[Brainstorm]]")
 @example("[[Phatic Phrases]]")
 @example("[[Yahoo!]]")
@@ -164,7 +170,7 @@ def test_Reference(s: str):
     assert res == "", (repr(s), repr(res))


-@given(st.from_regex(ColonEmotes.pattern
+@given(st.from_regex(ColonEmotes.pattern, fullmatch=True))
 @example(":owe::owe:")
 @example(":suffering:")
 @example(":presid65despair:")

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_scorers.py

@@ -10,10 +10,10 @@ from hypothesis import given, example
 from sonatoki.Filters import (
     Filter,
     NimiPu,
+    PuName,
     Numeric,
     Syllabic,
     Alphabetic,
-    ProperName,
     Phonotactic,
     NimiLinkuCore,
     PunctuationRe,
@@ -31,7 +31,7 @@ FILTERS = [
     NimiLinkuCore,
     NimiLinkuCommon,
     Alphabetic,
-    ProperName,
+    PuName,
     Phonotactic,
     PunctuationRe,
 ]

{sonatoki-0.7.0 → sonatoki-0.8.1}/tests/test_utils.py

@@ -9,10 +9,10 @@ PROPER_NAME_RE = r"[A-Z][a-z]*"

 token_strategy = (
     st.sampled_from(list(words_by_usage(60)))
-    | st.from_regex(Phonotactic.pattern
-    | st.from_regex(Syllabic.pattern
+    | st.from_regex(Phonotactic.pattern, fullmatch=True)
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
     | st.from_regex(PROPER_NAME_RE, fullmatch=True)
-    | st.from_regex(AlphabeticRe.pattern
+    | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
 )
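
token_strategy composes its sources with `|`, which on hypothesis strategies means union (st.one_of): every generated token is drawn from exactly one branch. A reduced sketch with placeholder branches:

    import hypothesis.strategies as st

    # `a | b` is equivalent to st.one_of(a, b); each draw picks one branch.
    token_strategy = (
        st.sampled_from(["toki", "pona"])
        | st.from_regex(r"[A-Z][a-z]*", fullmatch=True)
    )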

All remaining files (+0 -0 in the listing above) are unchanged between 0.7.0 and 0.8.1.