PyPI - sonatoki - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

sonatoki/Cleaners.py +4 -1
sonatoki/Configs.py +52 -31
sonatoki/Filters.py +96 -33
sonatoki/Preprocessors.py +12 -6
sonatoki/Scorers.py +54 -51
sonatoki/constants.py +21 -29
sonatoki/linku.json +1 -1
sonatoki/sandbox.json +1 -1
sonatoki/utils.py +23 -5
{sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/METADATA +1 -1
sonatoki-0.3.3.dist-info/RECORD +18 -0
{sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/WHEEL +1 -1
sonatoki-0.3.1.dist-info/RECORD +0 -18
{sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/licenses/LICENSE +0 -0

sonatoki/constants.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # STL
 import json
-from typing import Dict, List
+from typing import Set, Dict, List
 from pathlib import Path
 # LOCAL
@@ -380,40 +380,32 @@ CONSONANTS = "jklmnpstw"
 ALPHABET = VOWELS + CONSONANTS
 LANGUAGE = "english"  # for NLTK
-"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
+"""Commonly occurring strings which are some kind of valid Toki Pona or
+external token."""
 ALLOWABLES = {
-    "cw",  # Content Warning
     "x",  # ala
     "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
+    "msa",
 }
-IGNORABLES = {
-    # o, e, n are not here bc they're not frequently problematic in english messages
-    "a",
-    "am",
-    "an",
-    "i",
-    "in",
-    "is",
-    "l",  # they'll
-    "m",  # i'm
-    "me",
-    "no",
-    "s",  # let's
-    "so",
-    "t",  # don't
-    "to",
-    "u",  # you
-    "we",
-    "un",  # un-
-    "use",
+PHONOMATCHES = {
+    "non",
+    "nope",
     "some",
     "like",
+    "use",
+    "imo",
+    "time",
+    "man",
+    "also",
 }
+ALPHABETIC_MATCHES: Set[str] = set()
+IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
 UCSUR_RANGES = [
     "\\U000F1900-\\U000F1977",  # pu
     "\\U000F1978-\\U000F1988",  # ku suli
@@ -426,14 +418,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
-    return [d["word"] for d in data.values() if d[key] == value]
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
+    return {d["word"] for d in data.values() if d[key] == value}
 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU: List[str] = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS: List[str] = ["namako", "kin", "oko"]
+    NIMI_PU = category_helper(linku, "book", "pu")
+    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
     NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
     NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
@@ -445,7 +437,7 @@ with open(LINKU) as f:
 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in sandbox.values()]
+    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
 del linku
 del sandbox

sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl