PyPI - sonatoki - Versions diffs - 0.8.1__tar.gz → 0.8.2__tar.gz - Mend

sonatoki 0.8.1__tar.gz → 0.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{sonatoki-0.8.1 → sonatoki-0.8.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.8.1
+Version: 0.8.2
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.8.1 → sonatoki-0.8.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.8.1"
+version = "0.8.2"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.8.1 → sonatoki-0.8.2}/src/sonatoki/constants.py RENAMED Viewed

@@ -503,8 +503,9 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
-SENTENCE_PUNCT = """.?!:;()[-]·•…"""
-# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
+# single quotes are word boundaries if not intra-word, but double quotes are sentence
+# boundaries
 INTRA_WORD_PUNCT = """-'’"""

{sonatoki-0.8.1 → sonatoki-0.8.2}/tests/test_preprocessors.py RENAMED Viewed

@@ -49,7 +49,8 @@ def test_Backticks(s: str):
     assert res == "", (repr(s), repr(res))
-@given(st.from_regex(Codeblock.pattern, fullmatch=True))
+@given(st.from_regex(r"```(?:(?!`).+?)```", fullmatch=True))
+@example("""```0```""")
 @example(
     """```
 ```"""

{sonatoki-0.8.1 → sonatoki-0.8.2}/tests/tokenize_cases/tokenize_sentences_tok.yml RENAMED Viewed

@@ -56,7 +56,8 @@
   input: 'ona li toki e ni: "mama sina"'
   output:
     - "ona li toki e ni:"
-    - '"mama sina"'
+    - '"'
+    - 'mama sina"'
 - name: "discovered case 1"
   input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
   output: