sonatoki 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {sonatoki-0.5.2 → sonatoki-0.6.0}/PKG-INFO +1 -1
  2. {sonatoki-0.5.2 → sonatoki-0.6.0}/pyproject.toml +1 -1
  3. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Configs.py +32 -16
  4. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Tokenizers.py +64 -29
  5. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/constants.py +13 -3
  6. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_properties.py +4 -2
  7. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +8 -4
  8. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_words_tok.yml +115 -3
  9. {sonatoki-0.5.2 → sonatoki-0.6.0}/LICENSE +0 -0
  10. {sonatoki-0.5.2 → sonatoki-0.6.0}/README.md +0 -0
  11. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Cleaners.py +0 -0
  12. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Filters.py +0 -0
  13. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Preprocessors.py +0 -0
  14. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Scorers.py +0 -0
  15. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/__init__.py +0 -0
  16. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/__main__.py +0 -0
  17. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/alphabetic.txt +0 -0
  18. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/syllabic.txt +0 -0
  23. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/__init__.py +0 -0
  25. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_filters.py +0 -0
  27. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_ilo.py +0 -0
  28. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_preprocessors.py +0 -0
  29. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_utils.py +0 -0
{sonatoki-0.5.2 → sonatoki-0.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.5.2
+Version: 0.6.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.5.2 → sonatoki-0.6.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.5.2"
+version = "0.6.0"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Configs.py
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
+    FalsePosAlphabetic,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -112,16 +113,30 @@ __corpus_tokens_dict: Set[str] = cast(
     ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
 )
 __corpus_tokens_dict -= {
-    "an",
-    "i",
-    "me",
-    "ne",
-    "se",
-    "take",
-    "ten",
-    "to",
-    "u",
-    "we",
+    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person
+    "ne",  # "no" in several languages
+    "nu",  # "new", now in dutch
+    "se",  # spanish particle, "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "u",  # no u
+    "we",  # 1st person plural
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
@@ -131,6 +146,7 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -147,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -162,7 +178,7 @@ DiscordConfig: IloConfig = {
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,

{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Tokenizers.py
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
+from sonatoki.constants import (
+    ALL_PUNCT,
+    SENTENCE_PUNCT,
+    INTRA_WORD_PUNCT,
+    ALL_PUNCT_RANGES_STR,
+)
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):
 
 class WordTokenizer(SetTokenizer):
     delimiters = set(ALL_PUNCT)
+    intra_word_punct = set(INTRA_WORD_PUNCT)
+
+    @classmethod
+    def is_delimiter(cls, c: str) -> bool:
+        return c in cls.delimiters or not c
 
     @classmethod
-    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
-        match = s[last_match:i].split()
-        [tokens.append(t) for t in match if t]
+    def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+        if i > last_match:
+            tokens.append(s[last_match:i])
+
+    @classmethod
+    def to_tokens(cls, s: str) -> List[str]:
+        tokens: List[str] = []
+
+        slen = len(s)
+        i = 0
+        did_skip = False  # ensure exists
+        while i < slen:
+
+            # contiguous punctuation chars
+            last_match = i
+            while i < slen and cls.is_delimiter(s[i]):
+                # no special case
+                i += 1
+            cls.add_token(s, tokens, last_match, i)
+
+            # contiguous writing chars (much harder)
+            last_match = i
+            while i < slen and not cls.is_delimiter(s[i]):
+                did_skip = False
+                # we skip and see another writing char, or init
+
+                if NimiUCSUR.filter(s[i]):
+                    cls.add_token(s, tokens, last_match, i)
+                    tokens.append(s[i])
+                    i += 1
+                    last_match = i
+                    continue
+
+                next_char = s[i + 1] if i + 1 < slen else ""
+                if next_char in cls.intra_word_punct:
+                    did_skip = True
+                    i += 2
+                    continue
+
+                i += 1
+
+            if did_skip:
+                # we skipped, but there wasn't another writing character
+                cls.add_token(s, tokens, last_match, i - 1)
+                last_match = i - 1
+
+            cls.add_token(s, tokens, last_match, i)
+
+        return tokens
 
     @classmethod
     @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
             return []
 
         tokens: List[str] = []
+        candidates: List[str] = s.split()
 
-        i = 0  # ensure i is bound
-        last_match = 0
-        last_membership = s[0] in cls.delimiters
-        for i, char in enumerate(s):
-            mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
-            changed = (mem != last_membership) or ucsur
-            # this keeps contiguous words together, but splits UCSUR
-            if not changed:
-                continue
-
-            if ucsur:
-                if i > last_match:
-                    # Add the token before UCSUR character
-                    cls.__helper(s, tokens, last_match, i)
-                # Add UCSUR character itself as a token
-                tokens.append(char)
-                last_match = i + 1
-                last_membership = mem
-                continue
-
-            cls.__helper(s, tokens, last_match, i)
-            last_match = i
-            last_membership = mem
+        for candidate in candidates:
+            results = cls.to_tokens(candidate)
+            tokens.extend(results)
 
-        cls.__helper(s, tokens, last_match, i + 1)
         return tokens
 
 

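The rewritten WordTokenizer first splits the input on whitespace, then scans each candidate: runs of delimiter characters become their own tokens, UCSUR characters are emitted individually, and a character from INTRA_WORD_PUNCT (apostrophe or hyphen) stays inside a word only when another writing character follows it. A usage sketch follows; the expected outputs are taken from the test cases added to tokenize_words_tok.yml later in this diff, assuming the classmethod call style shown above.

from sonatoki.Tokenizers import WordTokenizer

# apostrophe followed by a writing character stays inside the token
print(WordTokenizer.tokenize("isn't that strange?"))
# ["isn't", "that", "strange", "?"]

# a quote with no writing character after it is still split off
print(WordTokenizer.tokenize("'bother'"))
# ["'", "bother", "'"]

# a hyphen between writing characters is treated as intra-word punctuation
print(WordTokenizer.tokenize("mi^en$sina-li*toki()="))
# ["mi", "^", "en", "$", "sina-li", "*", "toki", "()="]
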
{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/constants.py
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
+SENTENCE_PUNCT = """.?!:;()[-]·…"""
+# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+INTRA_WORD_PUNCT = """-'"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
-    "x",  # ala
-    "y",  # anu
+    # "x",  # ala
+    # "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "name",
     "time",
     "imo",  # "in my opinion"
+    "ime",  # "in my experience"
     "man",
     # "son",  # sona typo?
     "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
     # manual additions
     "alike",
     "amuse",
+    "animate",
     "antelope",
    "antena",
     "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    # "june",
     "lemon",
     "manipulate",
     "misuse",
     "ne",  # "no" in many other languages
+    "tape",
+    "onto",
     "wana",
+    "muse",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "os",  # some command prefix...
     "as",
     "not",
     "link",

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_properties.py
@@ -1,6 +1,6 @@
 # PDM
 import hypothesis.strategies as st
-from hypothesis import given
+from hypothesis import given, assume
 
 # LOCAL
 from sonatoki.Filters import (
@@ -54,11 +54,13 @@ def test_ku_filters_non_overlap(s: str):
             | NIMI_LINKU_COMMON
             | NIMI_LINKU_UNCOMMON
             | NIMI_LINKU_OBSCURE
-            | NIMI_LINKU_SANDBOX - {"su"}
+            | NIMI_LINKU_SANDBOX
         )
     )
 )
 def test_linku_filters_non_overlap(s: str):
+    _ = assume(s != "su")
+
     s = Lowercase.clean(s)
     s = ConsecutiveDuplicates.clean(s)
 

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -46,13 +46,17 @@
   input: "mi mu tawa sina, mi wawa e sina."
   output:
     - "mi mu tawa sina, mi wawa e sina."
-- name: "quotes"
+- name: "singlequotes"
   input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output: # expected; we split on right of all sentence-ending puncts
+  output:
     - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
     - "'"
-    - "o ike ala!"
-    - "'"
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
 - name: "discovered case 1"
   input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
   output:

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_words_tok.yml
@@ -34,7 +34,73 @@
     - "ike"
     - "ala"
     - "!'"
+- name: "english 1"
+  input: "isn't that strange?"
+  output:
+    - "isn't"
+    - "that"
+    - "strange"
+    - "?"
+- name: "english 2"
+  input: "i have self-respect..."
+  output:
+    - "i"
+    - "have"
+    - "self-respect"
+    - "..."
+- name: "english 3"
+  input: "i'm an m.d."
+  output:
+    - "i'm"
+    - "an"
+    - "m"
+    - "."
+    - "d"
+    - "."
+- name: "english 4"
+  input: "it's mind-numbing honestly"
+  output:
+    - "it's"
+    - "mind-numbing"
+    - "honestly"
+- name: "english 5"
+  input: "Here's what they said: 'single quotes are boring'"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+- name: "english 6"
+  input: "Here's what they said: 'single quotes are boring' and true"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+    - "and"
+    - "true"
+- name: "non-consecutive puncts"
+  input: ". . ."
+  output:
+    - "."
+    - "."
+    - "."
 - name: "url"
+  xfail: true # we get rid of URLs before tokenizing, so the result isn't material
   input: "https://mun.la/sona/"
   output:
     - "https"
@@ -85,6 +151,16 @@
     - "mama"
     - "."
     - "sina"
+- name: simple bold
+  input: "**mi unpa e mama sina**"
+  output:
+    - "**"
+    - "mi"
+    - "unpa"
+    - "e"
+    - "mama"
+    - "sina"
+    - "**"
 - name: weird punctuation characters
   input: "mi^en$sina-li*toki()="
   output:
@@ -92,9 +168,7 @@
     - "^"
     - "en"
     - "$"
-    - "sina"
-    - "-"
-    - "li"
+    - "sina-li" # intended; looks like valid intrapunct
     - "*"
     - "toki"
     - "()="
@@ -225,3 +299,41 @@
     - "「"
     - "Direct"
     - "」"
+
+- name: "simple intrapunct 1"
+  input: "i'm"
+  output:
+    - "i'm"
+- name: "intrapunct and punct"
+  input: "i'm."
+  output:
+    - "i'm"
+    - "."
+- name: "simple intrapunct 2"
+  input: "isn't"
+  output:
+    - "isn't"
+- name: "quoted with intrapunct"
+  input: "'bother'"
+  output:
+    - "'"
+    - "bother"
+    - "'"
+- name: "quoted intrapunct with intrapunct 1"
+  input: "'isn't'"
+  output:
+    - "'"
+    - "isn't"
+    - "'"
+- name: "quoted intrapunct with intrapunct 2"
+  input: "'isn't it gross?'"
+  output:
+    - "'"
+    - "isn't"
+    - "it"
+    - "gross"
+    - "?'"
+- name: "multiple intrapunct"
+  input: "whom's't'd've'n't"
+  output:
+    - "whom's't'd've'n't"

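Taken together, the config, tokenizer, and constant changes surface through the Ilo class. Below is a minimal end-to-end sketch, assuming the Ilo(**PrefConfig) construction and is_toki_pona method documented in the project README; the sample sentences come from the test data above, and the comments are expectations rather than recorded output.

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)

# every word is in the Linku core set, so this should clear the 0.8 passing_score
print(ilo.is_toki_pona("mi mu tawa sina, mi wawa e sina."))

# "isn't" now survives tokenization as one token and is not valid Toki Pona,
# so this sentence should fall below the threshold
print(ilo.is_toki_pona("isn't that strange?"))
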