sonatoki 0.9.1__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {sonatoki-0.9.1 → sonatoki-0.10.0}/PKG-INFO +1 -1
  2. {sonatoki-0.9.1 → sonatoki-0.10.0}/pyproject.toml +1 -1
  3. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Tokenizers.py +49 -6
  4. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/__main__.py +33 -12
  5. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/constants.py +55 -21
  6. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_ilo.py +1 -0
  7. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_tokenize.py +15 -2
  8. sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml +162 -0
  9. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/tokenize_cases/tokenize_words_tok.yml +73 -13
  10. sonatoki-0.9.1/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -71
  11. {sonatoki-0.9.1 → sonatoki-0.10.0}/LICENSE +0 -0
  12. {sonatoki-0.9.1 → sonatoki-0.10.0}/README.md +0 -0
  13. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Cleaners.py +0 -0
  14. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Configs.py +0 -0
  15. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Filters.py +0 -0
  16. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Preprocessors.py +0 -0
  17. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/Scorers.py +0 -0
  18. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/__init__.py +0 -0
  19. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/alphabetic.txt +0 -0
  20. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/ilo.py +0 -0
  21. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/linku.json +0 -0
  22. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/py.typed +0 -0
  23. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/sandbox.json +0 -0
  24. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/syllabic.txt +0 -0
  25. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/types.py +0 -0
  26. {sonatoki-0.9.1 → sonatoki-0.10.0}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/__init__.py +0 -0
  28. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_filters.py +0 -0
  30. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_preprocessors.py +0 -0
  31. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_properties.py +0 -0
  32. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_scorers.py +0 -0
  33. {sonatoki-0.9.1 → sonatoki-0.10.0}/tests/test_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.9.1
+Version: 0.10.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.9.1"
+version = "0.10.0"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
src/sonatoki/Tokenizers.py
@@ -12,9 +12,13 @@ from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
 from sonatoki.constants import (
     ALL_PUNCT,
-    SENTENCE_PUNCT,
     INTRA_WORD_PUNCT,
+    ALL_SENTENCE_PUNCT,
+    UNICODE_WHITESPACE,
     ALL_PUNCT_RANGES_STR,
+    UCSUR_CARTOUCHE_LEFT,
+    UCSUR_CARTOUCHE_RIGHT,
+    UCSUR_MINUS_CARTOUCHE,
 )

 regex.DEFAULT_VERSION = regex.VERSION1
@@ -104,6 +108,10 @@ class WordTokenizer(SetTokenizer):
                 # we skipped, but there wasn't another writing character
                 cls.add_token(s, tokens, last_match, i - 1)
                 last_match = i - 1
+                # there may be punctuation though
+                # TODO: this is duplicated
+                while i < slen and cls.is_delimiter(s[i]):
+                    i += 1

         cls.add_token(s, tokens, last_match, i)

@@ -142,7 +150,9 @@ class WordTokenizerRe1(Regex1Tokenizer):


 class SentTokenizer(SetTokenizer):
-    delimiters = set(SENTENCE_PUNCT + "\n")  # regex does \n with a flag
+    delimiters: Set[str] = set(ALL_SENTENCE_PUNCT + "\n")  # regex does \n with a flag
+    intra_word_punct: Set[str] = set(INTRA_WORD_PUNCT)
+    all_punct: Set[str] = set(ALL_PUNCT + UNICODE_WHITESPACE)

     @classmethod
     @override
@@ -151,16 +161,43 @@ class SentTokenizer(SetTokenizer):
             return []

         tokens: List[str] = []
+
+        slen = len(s)
         last_match = 0
-        for i, char in enumerate(s):
-            if char not in cls.delimiters:
+        i = 0
+        while i < slen:
+            # if a cartouche appears, we do not want to split on its punctuation
+            if s[i] == UCSUR_CARTOUCHE_LEFT:
+                right_i = s.find(UCSUR_CARTOUCHE_RIGHT, i)
+                contained: set[str] = set()
+                if right_i > 0:
+                    contained = set(s[i + 1 : right_i])
+                # but it must contain only non-cartouche UCSUR chars
+                if contained and contained.issubset(UCSUR_MINUS_CARTOUCHE):
+                    i = right_i + 1
+                    continue
+            if s[i] not in cls.delimiters:
+                i += 1
                 continue
+            if s[i] in cls.intra_word_punct:
+                prev = s[i - 1] if i > 0 else ""
+                next = s[i + 1] if i + 1 < slen else ""
+                if (
+                    prev
+                    and next
+                    and prev not in cls.all_punct
+                    and next not in cls.all_punct
+                ):
+                    i += 2
+                    continue

             match = s[last_match : i + 1].strip()
             last_match = i + 1  # newlines can strip but idc
             if not match:
+                i += 1
                 continue
             tokens.append(match)
+            i += 1

         match = s[last_match:].strip()
         if match:
@@ -169,18 +206,24 @@ class SentTokenizer(SetTokenizer):
         return tokens


+@deprecated(
+    "SentTokenizerRe is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe(RegexTokenizer):
     pattern = re.compile(
-        rf"""(?<=[{regex_escape(SENTENCE_PUNCT)}])|$""", flags=re.MULTILINE
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}])|$""", flags=re.MULTILINE
     )
     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
     # TODO: do the typography characters matter?
     # NOTE: | / and , are *not* sentence delimiters for my purpose


+@deprecated(
+    "SentTokenizerRe1 is a previous reference implementation. Its behavior has diverged from SentTokenizer and it may not be restored."
+)
 class SentTokenizerRe1(Regex1Tokenizer):
     pattern = regex.compile(
-        rf"""(?<=[{regex_escape(SENTENCE_PUNCT)}]|$)""", flags=regex.MULTILINE
+        rf"""(?<=[{regex_escape(ALL_SENTENCE_PUNCT)}]|$)""", flags=regex.MULTILINE
     )

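A quick usage sketch (not part of the diff) of the reworked SentTokenizer, with expected outputs taken from the new tokenize_sentences_tok.yml test cases; tokenize() is called as a classmethod, exactly as the tests do:

    from sonatoki.Tokenizers import SentTokenizer

    # ASCII sentence punctuation still splits as before
    print(SentTokenizer.tokenize("mi mu. mi wawa."))
    # ["mi mu.", "mi wawa."]  (per the "basic4" test case)

    # UCSUR sentence punctuation now delimits, and cartouche contents stay intact
    print(SentTokenizer.tokenize("󱥄󱥬󱥩󱤴󱦜󱥄󱥬󱥩󱤴"))
    # ["󱥄󱥬󱥩󱤴󱦜", "󱥄󱥬󱥩󱤴"]  (per the "UCSUR 1" test case)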
src/sonatoki/__main__.py
@@ -24,6 +24,7 @@ from sonatoki.Cleaners import ConsecutiveDuplicates
 from sonatoki.constants import (
     UCSUR_PUNCT_RANGES,
     UNICODE_PUNCT_RANGES,
+    UNICODE_WHITESPACE_RANGES,
     EMOJI_VARIATION_SELECTOR_RANGES,
 )

@@ -121,6 +122,11 @@ def regen_unicode_data():
         "Sc",  # Currency
         "So",  # Other
     }
+    WHITESPACE_CATEGORIES = {
+        "Zl",  # Line Separator
+        "Zp",  # Paragraph Separator
+        "Zs",  # Space Separator
+    }
     r"""These characters are in Symbol other (So) but are not in
    `\p{Punctuation}` However, I began excluding them again, because it turns
    out that some sequences of latin alphabet emoji."""
@@ -134,11 +140,15 @@ def regen_unicode_data():
     def is_punctuation(data: List[str]):
         return data[2] in PUNCT_CATEGORIES

+    def is_whitespace(data: List[str]):
+        return data[2] in WHITESPACE_CATEGORIES
+
     def get_character(data: List[str]):
         return chr(int(data[0], 16))

     unicode_data = download(UNICODE_DATA)
     unicode_punctuation = ""
+    unicode_whitespace = ""
     for line in unicode_data.split("\n"):
         if not line:  # damn you, trailing newline
             continue
@@ -147,24 +157,35 @@ def regen_unicode_data():
         # This does not apply to any currently defined punctuation category.

         unicode_data = line.split(";")
-        if not is_punctuation(unicode_data):
+        if is_punctuation(unicode_data):
+            char = get_character(unicode_data)
+            unicode_punctuation += char
+            continue
+        if is_whitespace((unicode_data)):
+            char = get_character(unicode_data)
+            unicode_whitespace += char
             continue
-
-        char = get_character(unicode_data)
-
-        unicode_punctuation += char

     unicode_punctuation = emoji.replace_emoji(unicode_punctuation)

-    unicode_ranges = find_unicode_ranges(unicode_punctuation)
-    unicode_ranges.extend(UCSUR_PUNCT_RANGES)
-    # unicode_ranges.extend(EMOJI_VARIATION_SELECTOR_RANGES)  # made unnecessary by emoji library
-    unicode_ranges = sorted(unicode_ranges)
+    unicode_punct_ranges = find_unicode_ranges(unicode_punctuation)
+    unicode_punct_ranges.extend(UCSUR_PUNCT_RANGES)
+    unicode_punct_ranges = sorted(unicode_punct_ranges)
     # sorted in case my manual additions are out of order

-    if unicode_ranges != UNICODE_PUNCT_RANGES:
-        output = json.dumps(unicode_ranges, indent=4, ensure_ascii=True)
-        print(output)
+    # TODO: can i push these outputs directly into the constants.py file?
+
+    if unicode_punct_ranges != UNICODE_PUNCT_RANGES:
+        output = json.dumps(unicode_punct_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_punct_ranges.txt", "w") as f:
+            f.write(output)
+
+    unicode_whitespace_ranges = find_unicode_ranges(unicode_whitespace)
+    unicode_whitespace_ranges = sorted(unicode_whitespace_ranges)
+    if unicode_whitespace_ranges != UNICODE_WHITESPACE_RANGES:
+        output = json.dumps(unicode_whitespace_ranges, indent=4, ensure_ascii=True)
+        with open("updated_unicode_whitespace_ranges.txt", "w") as f:
+            f.write(output)


 def main(argv: argparse.Namespace):
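For context, the new WHITESPACE_CATEGORIES set mirrors the Unicode general categories Zl, Zp, and Zs. A standalone illustration of the same classification using only the standard library (the real script instead parses a downloaded UnicodeData.txt, as shown above):

    import unicodedata

    WHITESPACE_CATEGORIES = {"Zl", "Zp", "Zs"}

    def is_whitespace_char(ch: str) -> bool:
        # same category test the diff adds, but via unicodedata's lookup
        return unicodedata.category(ch) in WHITESPACE_CATEGORIES

    print(is_whitespace_char(" "))       # True  (Zs, Space Separator)
    print(is_whitespace_char("\u2028"))  # True  (Zl, Line Separator)
    print(is_whitespace_char("a"))       # False (Ll)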
src/sonatoki/constants.py
@@ -109,8 +109,9 @@ UNICODE_PUNCT_RANGES = [
     "\\U00001a1e-\\U00001a1f",
     "\\U00001aa0-\\U00001aa6",
     "\\U00001aa8-\\U00001aad",
+    "\\U00001b4e-\\U00001b4f",
     "\\U00001b5a-\\U00001b6a",
-    "\\U00001b74-\\U00001b7e",
+    "\\U00001b74-\\U00001b7f",
     "\\U00001bfc-\\U00001bff",
     "\\U00001c3b-\\U00001c3f",
     "\\U00001c7e-\\U00001c7f",
@@ -152,7 +153,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00002329-\\U000023ce",
     "\\U000023d0-\\U000023e8",
     "\\U000023f4-\\U000023f7",
-    "\\U000023fb-\\U00002426",
+    "\\U000023fb-\\U00002429",
     "\\U00002440-\\U0000244a",
     "\\U0000249c-\\U000024c1",
     "\\U000024c3-\\U000024e9",
@@ -248,7 +249,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U000030fb",
     "\\U00003190-\\U00003191",
     "\\U00003196-\\U0000319f",
-    "\\U000031c0-\\U000031e3",
+    "\\U000031c0-\\U000031e5",
     "\\U000031ef",
     "\\U00003200-\\U0000321e",
     "\\U0000322a-\\U00003247",
@@ -321,6 +322,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U00010af0-\\U00010af6",
     "\\U00010b39-\\U00010b3f",
     "\\U00010b99-\\U00010b9c",
+    "\\U00010d6e",
+    "\\U00010d8e-\\U00010d8f",
     "\\U00010ead",
     "\\U00010f55-\\U00010f59",
     "\\U00010f86-\\U00010f89",
@@ -335,6 +338,8 @@ UNICODE_PUNCT_RANGES = [
     "\\U000111dd-\\U000111df",
     "\\U00011238-\\U0001123d",
     "\\U000112a9",
+    "\\U000113d4-\\U000113d5",
+    "\\U000113d7-\\U000113d8",
     "\\U0001144b-\\U0001144f",
     "\\U0001145a-\\U0001145b",
     "\\U0001145d",
@@ -351,6 +356,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U00011a9a-\\U00011a9c",
     "\\U00011a9e-\\U00011aa2",
     "\\U00011b00-\\U00011b09",
+    "\\U00011be1",
     "\\U00011c41-\\U00011c45",
     "\\U00011c70-\\U00011c71",
     "\\U00011ef7-\\U00011ef8",
@@ -363,10 +369,13 @@ UNICODE_PUNCT_RANGES = [
     "\\U00016af5",
     "\\U00016b37-\\U00016b3f",
     "\\U00016b44-\\U00016b45",
+    "\\U00016d6d-\\U00016d6f",
     "\\U00016e97-\\U00016e9a",
     "\\U00016fe2",
     "\\U0001bc9c",
     "\\U0001bc9f",
+    "\\U0001cc00-\\U0001ccef",
+    "\\U0001cd00-\\U0001ceb3",
     "\\U0001cf50-\\U0001cfc3",
     "\\U0001d000-\\U0001d0f5",
     "\\U0001d100-\\U0001d126",
@@ -395,6 +404,7 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001da85-\\U0001da8b",
     "\\U0001e14f",
     "\\U0001e2ff",
+    "\\U0001e5ff",
     "\\U0001e95e-\\U0001e95f",
     "\\U0001ecac",
     "\\U0001ecb0",
@@ -464,16 +474,41 @@ UNICODE_PUNCT_RANGES = [
     "\\U0001f850-\\U0001f859",
     "\\U0001f860-\\U0001f887",
     "\\U0001f890-\\U0001f8ad",
-    "\\U0001f8b0-\\U0001f8b1",
+    "\\U0001f8b0-\\U0001f8bb",
+    "\\U0001f8c0-\\U0001f8c1",
     "\\U0001f900-\\U0001f90b",
     "\\U0001f93b",
     "\\U0001f946",
     "\\U0001fa00-\\U0001fa53",
     "\\U0001fa60-\\U0001fa6d",
     "\\U0001fb00-\\U0001fb92",
-    "\\U0001fb94-\\U0001fbca",
+    "\\U0001fb94-\\U0001fbef",
     "\\U000f1990-\\U000f199d",
 ]
+UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
+# this is a large string.
+
+# `\p{posix_punct}` character class
+POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
+
+ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
+ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
+# combined bc the result could be simpler
+
+
+UNICODE_WHITESPACE_RANGES = [
+    "\\U00000020",
+    "\\U000000a0",
+    "\\U00001680",
+    "\\U00002000-\\U0000200a",
+    "\\U00002028-\\U00002029",
+    "\\U0000202f",
+    "\\U0000205f",
+    "\\U00003000",
+]
+UNICODE_WHITESPACE = find_unicode_chars(UNICODE_WHITESPACE_RANGES)
+UNICODE_WHITESPACE_RANGES_STR = "".join(UNICODE_WHITESPACE_RANGES)


 NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
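The "\\UXXXXXXXX" strings above are expanded into literal characters by the package's find_unicode_chars helper, whose implementation is not shown in this diff. A hypothetical stand-in, just to illustrate the format of entries like those in UNICODE_WHITESPACE_RANGES:

    from typing import List

    def expand_ranges(ranges: List[str]) -> str:
        # hypothetical stand-in for find_unicode_chars; assumes each entry is
        # either "\\UXXXXXXXX" or "\\UXXXXXXXX-\\UXXXXXXXX"
        chars = ""
        for r in ranges:
            parts = r.split("-")
            start = int(parts[0][2:], 16)   # strip the leading "\U"
            end = int(parts[-1][2:], 16)
            chars += "".join(chr(cp) for cp in range(start, end + 1))
        return chars

    print(repr(expand_ranges(["\\U00002000-\\U0000200a"])))  # the en/em space block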
@@ -482,9 +517,7 @@ EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
 EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
 """All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
 mark these two as punctuation, since they are used exclusively for rendering
-emoji.
-
-But it's even better to use the Emoji filter.
+emoji. But it's best to use the Emoji filter.
 """

 UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
@@ -492,22 +525,17 @@ UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
 """Private Use Area glyphs are given the apt but unhelpful 'Private Use'
 class."""

-UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
-# this is a large string.
-
-# `\p{posix_punct}` character class
-POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
-POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
-
-ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
-ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
-# combined bc the result could be simpler
+UCSUR_CARTOUCHE_LEFT = "󱦐"
+UCSUR_CARTOUCHE_RIGHT = "󱦑"

-SENTENCE_PUNCT = """.?!:;"()[-]«»‹›“”‟„⹂‽·•…「」『』"""
 # single quotes are word boundaries if not intra-word, but double quotes are sentence
 # boundaries
+BASIC_SENTENCE_PUNCT = """.?!:;()[-]‽·•…"""
+QUOTATIVE_PUNCT = """"«»‹›“”‟„⹂「」『』"""
+UCSUR_SENTENCE_PUNCT = """󱦜󱦝"""
+ALL_SENTENCE_PUNCT = BASIC_SENTENCE_PUNCT + UCSUR_SENTENCE_PUNCT

-INTRA_WORD_PUNCT = """-'"""
+INTRA_WORD_PUNCT = """-'’."""


 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -691,7 +719,11 @@ UCSUR_RANGES = [
     "\\U000F19A0-\\U000F19A3",  # ku lili
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
-
+ALL_UCSUR = NIMI_UCSUR + find_unicode_chars(UCSUR_PUNCT_RANGES)
+UCSUR_MINUS_CARTOUCHE = set(ALL_UCSUR).difference(
+    {UCSUR_CARTOUCHE_LEFT, UCSUR_CARTOUCHE_RIGHT}
+)
+print(UCSUR_MINUS_CARTOUCHE)

 # NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
@@ -757,7 +789,9 @@ __all__ = [
     "POSIX_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES",
     "UCSUR_PUNCT_RANGES_STR",
+    "UCSUR_SENTENCE_PUNCT",
     "UNICODE_PUNCT",
     "UNICODE_PUNCT_RANGES",
+    "UNICODE_WHITESPACE",
     "VOWELS",
 ]
tests/test_ilo.py
@@ -165,6 +165,7 @@ EXCESSIVE_ENGLISH = [
     "I wanna see",  # same down to here
     "i'm online all the time",
     "How to Cut a Kiwi",
+    "ni li make e sense",
     "21st",  # previous false positive; fixed by ProperName change
     "a e i o u",  # voting brings this back to false positive zone...
 ]
tests/test_tokenize.py
@@ -54,10 +54,11 @@ def test_SentTokenizer(test: TokenizerTest):
         pytest.xfail()

     fn_tokenized = SentTokenizer.tokenize(test["input"])
-    re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
-    assert fn_tokenized == re1_tokenized, test["name"]
+    # re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
+    assert fn_tokenized == test["output"], test["name"]


+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
@@ -65,11 +66,23 @@ def test_SentTokenizerRe(test: TokenizerTest):
     if test["xfail"]:
         pytest.xfail()

+    re_tokenized = SentTokenizerRe.tokenize(test["input"])
+    assert re_tokenized == test["output"], test["name"]
+
+
+@pytest.mark.parametrize(
+    "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
+)
+def test_SentTokenizerReCompare(test: TokenizerTest):
+    if test["xfail"]:
+        pytest.xfail()
+
     re_tokenized = SentTokenizerRe.tokenize(test["input"])
     re1_tokenized = SentTokenizerRe1.tokenize(test["input"])
     assert re_tokenized == re1_tokenized, test["name"]


+@pytest.mark.skip("Deprecated")
 @pytest.mark.parametrize(
     "test", load_tokenizer_tests("tests/tokenize_cases/tokenize_sentences_tok.yml")
 )
sonatoki-0.10.0/tests/tokenize_cases/tokenize_sentences_tok.yml (new file)
@@ -0,0 +1,162 @@
+---
+- name: "basic1"
+  input: "mu. mu."
+  output:
+    - "mu."
+    - "mu."
+- name: "basic2"
+  input: "mu! mu!"
+  output:
+    - "mu!"
+    - "mu!"
+- name: "basic3"
+  input: "mu? mu?"
+  output:
+    - "mu?"
+    - "mu?"
+- name: "basic4"
+  input: "mi mu. mi wawa."
+  output:
+    - "mi mu."
+    - "mi wawa."
+- name: "empty"
+  input: ""
+  output: []
+- name: "whitespace"
+  input: " \n "
+  output: []
+- name: "newline basic"
+  input: "sina lon seme?\nmi wile lon poka...\n"
+  output:
+    - "sina lon seme?"
+    - "mi wile lon poka."
+    - "."
+    - "."
+- name: "newline alone"
+  input: "sina lon seme\nmi wile lon poka"
+  output:
+    - "sina lon seme"
+    - "mi wile lon poka"
+- name: "dash"
+  input: "mi sona ala e ni- sina seme a"
+  output:
+    - "mi sona ala e ni-"
+    - "sina seme a"
+- name: "comma"
+  input: "mi mu tawa sina, mi wawa e sina."
+  output:
+    - "mi mu tawa sina, mi wawa e sina."
+- name: "singlequotes"
+  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
+  output:
+    - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
+    - "'"
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
+- name: "doublequotes 2"
+  input: 'this is a bit dumb, right? they said "where is the pacific ocean?"'
+  output:
+    - "this is a bit dumb, right?"
+    - 'they said "where is the pacific ocean?'
+    - '"'
+- name: "doublequotes 3"
+  input: 'they said "wow, its made"'
+  output:
+    - they said "wow, its made"
+- name: "mixed periods spoilers"
+  input: "||...||"
+  output:
+    - "||."
+    - "."
+    - "."
+    - "||"
+- name: "trailing periods"
+  input: "h.."
+  output:
+    - "h."
+    - "."
+- name: "trailing periods 2"
+  input: "h.!"
+  output:
+    - "h."
+    - "!"
+- name: "intraword punctuation 1"
+  input: "e.g. monsuta"
+  output:
+    - "e.g."
+    - "monsuta"
+- name: "intraword punctuation 2"
+  input: "isn't that game-breaking? i think so"
+  output:
+    - "isn't that game-breaking?"
+    - "i think so"
+- name: "fake intraword punct 1"
+  input: "!.h"
+  output:
+    - "!"
+    - "."
+    - "h"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering pseudo-science."
+    - "and non-sense"
+- name: "discovered case 1"
+  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
+  output:
+    - "ona li ken lukin e sitelen ["
+    - "_ike_nanpa_lete_ike]"
+    - "."
+    - "ni li pona kin."
+- name: "zwj in emoji"
+  input: "👨‍👩‍👧‍👧"
+  output:
+    - "👨‍👩‍👧‍👧"
+
+- name: UCSUR 1
+  input: "󱥄󱥬󱥩󱤴󱦜󱥄󱥬󱥩󱤴"
+  output:
+    - "󱥄󱥬󱥩󱤴󱦜"
+    - "󱥄󱥬󱥩󱤴"
+# - name: "UCSUR 2 (original)"
+#   input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+#   output:
+#     - "󱤴󱤺󱦐󱤘󱦜"
+#     - "󱤕󱦜"
+#     - "󱤾󱦑󱦐󱤼󱦝"
+#     - "󱦑"
+- name: "UCSUR 2 (preferred)"
+  input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+- name: "UCSUR 3"
+  input: "󱤴󱦐󱦑󱦐󱦑"
+  output:
+    - "󱤴󱦐󱦑󱦐󱦑"
+- name: "UCSUR 4"
+  input: "󱤴󱦐󱦐"
+  output:
+    - "󱤴󱦐󱦐"
+- name: "UCSUR 5"
+  input: "󱦑󱤴󱦐󱦐"
+  output:
+    - "󱦑󱤴󱦐󱦐"
+- name: "UCSUR 6"
+  input: "󱦐nvidia shield. and other nvidia products.󱦑"
+  output:
+    - "󱦐nvidia shield."
+    - "and other nvidia products."
+    - "󱦑"
+- name: "UCSUR 7"
+  input: "󱤴󱤺󱦐󱤘󱦜󱦐󱤕󱦐󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱤘󱦜"
+    - "󱦐󱤕󱦐󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+- name: "UCSUR 8"
+  input: "󱤴󱤺󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦝󱦑"
+  output:
+    - "󱤴󱤺󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦐󱦝󱦑"
tests/tokenize_cases/tokenize_words_tok.yml
@@ -53,9 +53,7 @@
   output:
     - "i'm"
     - "an"
-    - "m"
-    - "."
-    - "d"
+    - "m.d"
     - "."
 - name: "english 4"
   input: "it's mind-numbing honestly"
@@ -142,15 +140,7 @@
 - name: periods every word
   input: "mi.unpa.e.mama.sina"
   output:
-    - "mi"
-    - "."
-    - "unpa"
-    - "."
-    - "e"
-    - "."
-    - "mama"
-    - "."
-    - "sina"
+    - "mi.unpa.e.mama.sina"
 - name: simple bold
   input: "**mi unpa e mama sina**"
   output:
@@ -299,7 +289,20 @@
     - "「"
     - "Direct"
     - "」"
-
+- name: "UCSUR 4"
+  input: "󱤴󱤺󱦐󱤘󱦜󱤕󱦜󱤾󱦑󱦐󱤼󱦝󱦑"
+  output:
+    - "󱤴"
+    - "󱤺"
+    - "󱦐"
+    - "󱤘"
+    - "󱦜"
+    - "󱤕"
+    - "󱦜"
+    - "󱤾"
+    - "󱦑󱦐"
+    - "󱤼"
+    - "󱦝󱦑"
 - name: "simple intrapunct 1"
   input: "i'm"
   output:
@@ -313,6 +316,11 @@
   input: "isn't"
   output:
     - "isn't"
+- name: "simple intrapunct with punct"
+  input: "isn't."
+  output:
+    - "isn't"
+    - "."
 - name: "quoted with intrapunct"
   input: "'bother'"
   output:
@@ -337,3 +345,55 @@
   input: "whom's't'd've'n't"
   output:
     - "whom's't'd've'n't"
+- name: "just periods"
+  input: "..."
+  output:
+    - "..."
+- name: "just periods 2"
+  input: "... ..."
+  output:
+    - "..."
+    - "..."
+- name: "mixed periods spoilers"
+  input: "||...||"
+  output:
+    - "||...||"
+- name: "trailing periods"
+  input: "h.."
+  output:
+    - "h"
+    - ".."
+- name: "trailing periods"
+  input: "h.!"
+  output:
+    - "h"
+    - ".!"
+- name: "trailing period"
+  input: "h."
+  output:
+    - "h"
+    - "."
+- name: "trailing interpunctuation"
+  input: "h-.'"
+  output:
+    - "h"
+    - "-.'"
+- name: "trailing period 2"
+  input: "h. h."
+  output:
+    - "h"
+    - "."
+    - "h"
+    - "."
+- name: "sad face"
+  input: "q.q"
+  output:
+    - "q.q"
+- name: "full width space"
+  input: "life-altering pseudo-science. and non-sense"
+  output:
+    - "life-altering"
+    - "pseudo-science"
+    - "."
+    - "and"
+    - "non-sense"
sonatoki-0.9.1/tests/tokenize_cases/tokenize_sentences_tok.yml (removed file)
@@ -1,71 +0,0 @@
----
-- name: "basic1"
-  input: "mu. mu."
-  output:
-    - "mu."
-    - "mu."
-- name: "basic2"
-  input: "mu! mu!"
-  output:
-    - "mu!"
-    - "mu!"
-- name: "basic3"
-  input: "mu? mu?"
-  output:
-    - "mu?"
-    - "mu?"
-- name: "basic4"
-  input: "mi mu. mi wawa."
-  output:
-    - "mi mu."
-    - "mi wawa."
-- name: "empty"
-  input: ""
-  output: []
-- name: "whitespace"
-  input: " \n "
-  output: []
-- name: "newline basic"
-  input: "sina lon seme?\nmi wile lon poka...\n"
-  output:
-    - "sina lon seme?"
-    - "mi wile lon poka."
-    - "."
-    - "."
-- name: "newline alone"
-  input: "sina lon seme\nmi wile lon poka"
-  output:
-    - "sina lon seme"
-    - "mi wile lon poka"
-- name: "dash"
-  input: "mi sona ala e ni- sina seme a"
-  output:
-    - "mi sona ala e ni-"
-    - "sina seme a"
-- name: "comma"
-  input: "mi mu tawa sina, mi wawa e sina."
-  output:
-    - "mi mu tawa sina, mi wawa e sina."
-- name: "singlequotes"
-  input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output:
-    - "toki li tan kulupu Kuko li ni:"
-    - "'o ike ala!"
-    - "'"
-- name: "doublequotes"
-  input: 'ona li toki e ni: "mama sina"'
-  output:
-    - "ona li toki e ni:"
-    - '"'
-    - 'mama sina"'
-- name: "discovered case 1"
-  input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
-  output:
-    - "ona li ken lukin e sitelen ["
-    - "_ike_nanpa_lete_ike]"
-    - "."
-    - "ni li pona kin."
-- name: "zwj in emoji"
-  input: "👨‍👩‍👧‍👧"
-  output:
-    - "👨‍👩‍👧‍👧"