sonatoki 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
```diff
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
+    FalsePosAlphabetic,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -145,6 +146,7 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -161,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -176,7 +178,7 @@ DiscordConfig: IloConfig = {
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
```
sonatoki/Tokenizers.py CHANGED
```diff
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
+from sonatoki.constants import (
+    ALL_PUNCT,
+    SENTENCE_PUNCT,
+    INTRA_WORD_PUNCT,
+    ALL_PUNCT_RANGES_STR,
+)
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):
 
 class WordTokenizer(SetTokenizer):
     delimiters = set(ALL_PUNCT)
+    intra_word_punct = set(INTRA_WORD_PUNCT)
+
+    @classmethod
+    def is_delimiter(cls, c: str) -> bool:
+        return c in cls.delimiters or not c
 
     @classmethod
-    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
-        match = s[last_match:i].split()
-        [tokens.append(t) for t in match if t]
+    def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+        if i > last_match:
+            tokens.append(s[last_match:i])
+
+    @classmethod
+    def to_tokens(cls, s: str) -> List[str]:
+        tokens: List[str] = []
+
+        slen = len(s)
+        i = 0
+        did_skip = False  # ensure exists
+        while i < slen:
+
+            # contiguous punctuation chars
+            last_match = i
+            while i < slen and cls.is_delimiter(s[i]):
+                # no special case
+                i += 1
+            cls.add_token(s, tokens, last_match, i)
+
+            # contiguous writing chars (much harder)
+            last_match = i
+            while i < slen and not cls.is_delimiter(s[i]):
+                did_skip = False
+                # we skip and see another writing char, or init
+
+                if NimiUCSUR.filter(s[i]):
+                    cls.add_token(s, tokens, last_match, i)
+                    tokens.append(s[i])
+                    i += 1
+                    last_match = i
+                    continue
+
+                next_char = s[i + 1] if i + 1 < slen else ""
+                if next_char in cls.intra_word_punct:
+                    did_skip = True
+                    i += 2
+                    continue
+
+                i += 1
+
+            if did_skip:
+                # we skipped, but there wasn't another writing character
+                cls.add_token(s, tokens, last_match, i - 1)
+                last_match = i - 1
+
+            cls.add_token(s, tokens, last_match, i)
+
+        return tokens
 
     @classmethod
     @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
             return []
 
         tokens: List[str] = []
+        candidates: List[str] = s.split()
 
-        i = 0  # ensure i is bound
-        last_match = 0
-        last_membership = s[0] in cls.delimiters
-        for i, char in enumerate(s):
-            mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
-            changed = (mem != last_membership) or ucsur
-            # this keeps contiguous words together, but splits UCSUR
-            if not changed:
-                continue
-
-            if ucsur:
-                if i > last_match:
-                    # Add the token before UCSUR character
-                    cls.__helper(s, tokens, last_match, i)
-                # Add UCSUR character itself as a token
-                tokens.append(char)
-                last_match = i + 1
-                last_membership = mem
-                continue
-
-            cls.__helper(s, tokens, last_match, i)
-            last_match = i
-            last_membership = mem
+        for candidate in candidates:
+            results = cls.to_tokens(candidate)
+            tokens.extend(results)
 
-        cls.__helper(s, tokens, last_match, i + 1)
         return tokens
 
 
```
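The rewrite replaces the old single-pass membership-transition scan with whitespace pre-splitting plus an explicit per-candidate scanner that keeps intra-word punctuation (`-`, `'`) inside tokens and still isolates each UCSUR character. A hand-traced sketch of the expected behavior, derived by stepping through `to_tokens` above; the outputs are illustrative, not taken from the package's test suite:

```python
from sonatoki.Tokenizers import WordTokenizer

# Whitespace splitting happens first, then each candidate is scanned.
# An apostrophe or hyphen (INTRA_WORD_PUNCT) followed by another writing
# character stays inside the token instead of splitting it:
print(WordTokenizer.tokenize("isn't that nice?"))
# expected: ["isn't", 'that', 'nice', '?']

# If the skipped punctuation turns out to be word-final, the did_skip
# backtrack splits it back off:
print(WordTokenizer.tokenize("nice-"))
# expected: ['nice', '-']
```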
sonatoki/constants.py CHANGED
```diff
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
+SENTENCE_PUNCT = """.?!:;()[-]·…"""
+# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+INTRA_WORD_PUNCT = """-'"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
-    "x",  # ala
-    "y",  # anu
+    # "x",  # ala
+    # "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "name",
     "time",
     "imo",  # "in my opinion"
+    "ime",  # "in my experience"
     "man",
     # "son",  # sona typo?
     "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
     # manual additions
     "alike",
     "amuse",
+    "animate",
     "antelope",
     "antena",
     "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    # "june",
     "lemon",
     "manipulate",
     "misuse",
     "ne",  # "no" in many other languages
+    "tape",
+    "onto",
     "wana",
+    "muse",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "os",  # some command prefix...
     "as",
     "not",
     "link",
```
sonatoki-0.5.3.dist-info/METADATA → sonatoki-0.6.0.dist-info/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.5.3
+Version: 0.6.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
```
sonatoki-0.5.3.dist-info/RECORD → sonatoki-0.6.0.dist-info/RECORD CHANGED
```diff
@@ -1,20 +1,20 @@
-sonatoki-0.5.3.dist-info/METADATA,sha256=mC-i9FszUcyFA8peFVjRvj5QxCoVFjfHf60UWZNxquA,6517
-sonatoki-0.5.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
-sonatoki-0.5.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.6.0.dist-info/METADATA,sha256=JuR9XrtjbWWZwtYz2rzqwMEIzR_ddQ2te2mskmc-evs,6517
+sonatoki-0.6.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.6.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=yprG3LEMyy6KKJWEEeJ7nEIC3-qtqA7p4CTHYv4a4vU,5469
+sonatoki/Configs.py,sha256=RD6YUYW45pwIFx8ebJgGs5PhIhL9sjn_VqIg4zf3VUE,5697
 sonatoki/Filters.py,sha256=nVSmw5M4sEYA_8KI1fI53rMHkd9KO6yWbKfdxxExxN8,11700
 sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
 sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
-sonatoki/Tokenizers.py,sha256=qFaA1-v-wjKMihtEJMeZpi3m4cSkJQgWhGhL-w0VgPE,4236
+sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=BYML7p9oUELgUDO0xdgmP74idcwjiFSw_NfuDLpsp8k,18952
+sonatoki/constants.py,sha256=qKjWqVcsvfScDPW4lUvRh_Qhwxv6AYkGrGQyhxEbX8w,19206
 sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.5.3.dist-info/RECORD,,
+sonatoki-0.6.0.dist-info/RECORD,,
```