PyPI - sonatoki - Versions diffs - 0.8.0__tar.gz → 0.8.2__tar.gz - Mend

sonatoki-0.8.0.tar.gz → sonatoki-0.8.2.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (32) hide show

{sonatoki-0.8.0 → sonatoki-0.8.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.0
+Version: 0.8.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.8.0 → sonatoki-0.8.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.8.0"
+version = "0.8.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/Preprocessors.py RENAMED Viewed

@@ -149,7 +149,10 @@ class Codeblock(RegexPreprocessor):
     Subset of what would be removed by Backticks, but may be preferable.
     """
-    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+    pattern = re.compile(
+        r"```.+?```",
+        flags=re.DOTALL,
+    )
 class Spoilers(RegexPreprocessor):

{sonatoki-0.8.0 → sonatoki-0.8.2}/src/sonatoki/constants.py RENAMED Viewed

@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
-SENTENCE_PUNCT = """.?!:;()[-]·•…"""
-# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
+# single quotes are word boundaries if not intra-word, but double quotes are sentence
+# boundaries
 INTRA_WORD_PUNCT = """-'’"""

{sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_cleaners.py RENAMED Viewed

@@ -12,7 +12,7 @@ from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates, ConsecutiveDupli
 from .test_utils import PROPER_NAME_RE
-@given(st.from_regex(ConsecutiveDuplicatesRe.pattern.pattern))
+@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
 @example("tooooki a")
 @example("muuuuuu")
 @example("nnn")
@@ -25,7 +25,7 @@ def test_ConsecutiveDuplicatesRe(s: str):
         assert a.lower() != b.lower(), (s, res)
-@given(st.from_regex(ConsecutiveDuplicatesRe.pattern.pattern))
+@given(st.from_regex(ConsecutiveDuplicatesRe.pattern))
 @example("Aaa")
 @example("aAa")
 @example("aaA")

{sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_filters.py RENAMED Viewed

@@ -90,7 +90,7 @@ def test_NimiLinkuSandbox(s: str):
     assert res, repr(s)
-@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
 @example("kijetesantakalu")
 @example("n")
 def test_Phonotactic(s: str):
@@ -98,28 +98,28 @@ def test_Phonotactic(s: str):
     assert res, repr(s)
-@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Phonotactic.pattern, fullmatch=True))
 def test_LongPhonotactic(s: str):
     len_ok = len(s) >= LongPhonotactic.length
     res = LongPhonotactic.filter(s)
     assert res == len_ok, repr(s)  # will match given fullmatch
-@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Syllabic.pattern, fullmatch=True))
 @example("wuwojitiwunwonjintinmanna")
 def test_Syllabic(s: str):
     res = Syllabic.filter(s)
     assert res, repr(s)
-@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Syllabic.pattern, fullmatch=True))
 def test_LongSyllabic(s: str):
     len_ok = len(s) >= LongSyllabic.length
     res = LongSyllabic.filter(s)
     assert res == len_ok
-@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 @example("muems")
 @example("mpptp")
 @example("tptpt")
@@ -129,14 +129,14 @@ def test_Alphabetic(s: str):
     assert res_fn == res_re, repr(s)
-@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 def test_LongAlphabetic(s: str):
     len_ok = len(s) >= LongAlphabetic.length
     res = LongAlphabetic.filter(s)
     assert res == len_ok
-@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(AlphabeticRe.pattern, fullmatch=True))
 def test_AlphabeticRe(s: str):
     res_re = AlphabeticRe.filter(s)
     assert res_re, repr(s)
@@ -148,7 +148,7 @@ def test_ProperName(s: str):
     assert res, repr(s)
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 @example("[]")
 @example(r"\\")
 @example(r"\"")
@@ -161,14 +161,14 @@ def test_PunctuationRe1(s: str):
     assert res, repr(s)
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 def test_PunctuationRe(s: str):
     res_re = PunctuationRe.filter(s)
     res_re1 = PunctuationRe1.filter(s)
     assert res_re == res_re1, repr(s)
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+@given(st.from_regex(PunctuationRe.pattern, fullmatch=True))
 @example("\U000f1990")  # UCSUR char
 def test_Punctuation(s: str):
     res_fn = Punctuation.filter(s)
@@ -185,7 +185,7 @@ def test_Numeric(s: str):
 @given(
-    st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True)
+    st.from_regex(PunctuationRe.pattern, fullmatch=True)
     | st.from_regex(r"\d+", fullmatch=True),
 )
 def test_OrFilter(s: str):
@@ -259,8 +259,8 @@ def test_NotFilter(s: str):
 @given(
     st.sampled_from(list(FALSE_POS_SYLLABIC))
-    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
-    | st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True)
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
+    | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
 )
 def test_AndNotFilter(s: str):
     AndNotFilter = And(Syllabic, Not(FalsePosSyllabic))
@@ -309,7 +309,7 @@ def test_AddTokensToMemberFilterNegative(s: str):
             | words_by_tag("usage_category", "sandbox")
         ),
     )
-    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
 )
 def test_SubTokensFromMemberFilter(s: str):
     NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)

{sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_preprocessors.py RENAMED Viewed

@@ -22,7 +22,7 @@ from sonatoki.Preprocessors import (
 )
-@given(st.from_regex(URLs.pattern.pattern, fullmatch=True))
+@given(st.from_regex(URLs.pattern, fullmatch=True))
 @example("https://google.com")
 @example("https://mun.la")
 @example("https://discord.gg/")
@@ -32,7 +32,7 @@ def test_URLs(s: str):
     assert URLs.process(s).strip() == ""
-@given(st.from_regex(Spoilers.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Spoilers.pattern, fullmatch=True))
 @example("|| | ||")
 @example("|| content\n\n\ncontent ||")
 @example("||\n||")
@@ -42,14 +42,15 @@ def test_Spoilers(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(Backticks.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Backticks.pattern, fullmatch=True))
 @example("` ` ` `")
 def test_Backticks(s: str):
     res = Backticks.process(s).strip()
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+@given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
+@example("""```0```""")
 @example(
     """```
 ```"""
@@ -63,12 +64,18 @@ blocky message
 second blocky message
 ```"""
 )
+@example(
+    """```oisandm123-_mu
+arbitrary content
+```"""
+)
+@example("""```mu```""")
 def test_Codeblock(s: str):
     res = Codeblock.process(s).strip()
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
+@given(st.from_regex(ArrowQuote.pattern, fullmatch=True))
 @example("> base")
 @example("> newline\n> newline")
 def test_ArrowQuote(s: str):
@@ -76,7 +83,7 @@ def test_ArrowQuote(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True))
+@given(st.from_regex(DoubleQuotes.pattern, fullmatch=True))
 @example('" "" "')
 @example('" "\n" "')
 @example('" \n "')
@@ -85,7 +92,7 @@ def test_DoubleQuotes(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True))
+@given(st.from_regex(SingleQuotes.pattern, fullmatch=True))
 @example("' '' '")
 @example("' '\n' '")
 @example("' \n '")
@@ -94,7 +101,7 @@ def test_SingleQuotes(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True))
+@given(st.from_regex(DiscordEmotes.pattern, fullmatch=True))
 @example("<a:example:123123>")
 @example("<:example:123123>")
 def test_DiscordEmotes(s: str):
@@ -102,7 +109,7 @@ def test_DiscordEmotes(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True))
+@given(st.from_regex(DiscordMentions.pattern, fullmatch=True))
 @example("<@497549183847497739>")
 @example("<@!457890000>")
 @example("<@&18398198981985>")
@@ -111,7 +118,7 @@ def test_DiscordMentions(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True))
+@given(st.from_regex(DiscordChannels.pattern, fullmatch=True))
 @example("<#19858915>")
 @example("<#18591912589812985>")
 def test_DiscordChannels(s: str):
@@ -119,7 +126,7 @@ def test_DiscordChannels(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True))
+@given(st.from_regex(DiscordSpecial.pattern, fullmatch=True))
 @example("<id:guide>")
 @example("<id:browse>")
 def test_DiscordSpecial(s: str):
@@ -128,11 +135,11 @@ def test_DiscordSpecial(s: str):
 @given(
-    st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
-    | st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
-    | st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
-    | st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
-    | st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
+    st.from_regex(DiscordEmotes.pattern, fullmatch=True)
+    | st.from_regex(DiscordMentions.pattern, fullmatch=True)
+    | st.from_regex(DiscordChannels.pattern, fullmatch=True)
+    | st.from_regex(DiscordSpecial.pattern, fullmatch=True)
+    | st.from_regex(AngleBracketObject.pattern, fullmatch=True)
 )
 @example("<https://example.com>")
 @example("<#123124125125>")
@@ -142,11 +149,11 @@ def test_AngleBracketObject(s: str):
 @given(
-    st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
-    | st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
-    | st.from_regex(Backticks.pattern.pattern, fullmatch=True)
-    | st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
-    | st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
+    st.from_regex(SingleQuotes.pattern, fullmatch=True)
+    | st.from_regex(DoubleQuotes.pattern, fullmatch=True)
+    | st.from_regex(Backticks.pattern, fullmatch=True)
+    | st.from_regex(ArrowQuote.pattern, fullmatch=True)
+    | st.from_regex(AllQuotes.pattern, fullmatch=True)
 )
 @example("> bruh")
 @example("`bruh`")
@@ -155,7 +162,7 @@ def test_AllQuotes(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
+@given(st.from_regex(Reference.pattern, fullmatch=True))
 @example("[[Brainstorm]]")
 @example("[[Phatic Phrases]]")
 @example("[[Yahoo!]]")
@@ -164,7 +171,7 @@ def test_Reference(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(ColonEmotes.pattern.pattern, fullmatch=True))
+@given(st.from_regex(ColonEmotes.pattern, fullmatch=True))
 @example(":owe::owe:")
 @example(":suffering:")
 @example(":presid65despair:")

{sonatoki-0.8.0 → sonatoki-0.8.2}/tests/test_utils.py RENAMED Viewed

@@ -9,10 +9,10 @@ PROPER_NAME_RE = r"[A-Z][a-z]*"
 token_strategy = (
     st.sampled_from(list(words_by_usage(60)))
-    | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
-    | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+    | st.from_regex(Phonotactic.pattern, fullmatch=True)
+    | st.from_regex(Syllabic.pattern, fullmatch=True)
     | st.from_regex(PROPER_NAME_RE, fullmatch=True)
-    | st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True)
+    | st.from_regex(AlphabeticRe.pattern, fullmatch=True)
 )

{sonatoki-0.8.0 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_sentences_tok.yml RENAMED Viewed

@@ -56,7 +56,8 @@
   input: 'ona li toki e ni: "mama sina"'
   output:
     - "ona li toki e ni:"
-    - '"mama sina"'
+    - '"'
+    - 'mama sina"'
 - name: "discovered case 1"
   input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
   output: