sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sonatoki/Cleaners.py +4 -1
- sonatoki/Configs.py +52 -31
- sonatoki/Filters.py +96 -33
- sonatoki/Preprocessors.py +12 -6
- sonatoki/Scorers.py +54 -51
- sonatoki/constants.py +21 -29
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/utils.py +23 -5
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/METADATA +1 -1
- sonatoki-0.3.3.dist-info/RECORD +18 -0
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/WHEEL +1 -1
- sonatoki-0.3.1.dist-info/RECORD +0 -18
- {sonatoki-0.3.1.dist-info → sonatoki-0.3.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/constants.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
import json
|
3
|
-
from typing import Dict, List
|
3
|
+
from typing import Set, Dict, List
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
6
|
# LOCAL
|
@@ -380,40 +380,32 @@ CONSONANTS = "jklmnpstw"
|
|
380
380
|
ALPHABET = VOWELS + CONSONANTS
|
381
381
|
|
382
382
|
LANGUAGE = "english" # for NLTK
|
383
|
-
|
384
|
-
|
383
|
+
"""Commonly occurring strings which are some kind of valid Toki Pona or
|
384
|
+
external token."""
|
385
385
|
ALLOWABLES = {
|
386
|
-
"cw", # Content Warning
|
387
386
|
"x", # ala
|
388
387
|
"y", # anu
|
389
388
|
"kxk", # ken ala ken
|
390
389
|
"wxw", # wile ala wile
|
390
|
+
"msa",
|
391
391
|
}
|
392
392
|
|
393
|
-
|
394
|
-
|
395
|
-
"
|
396
|
-
"am",
|
397
|
-
"an",
|
398
|
-
"i",
|
399
|
-
"in",
|
400
|
-
"is",
|
401
|
-
"l", # they'll
|
402
|
-
"m", # i'm
|
403
|
-
"me",
|
404
|
-
"no",
|
405
|
-
"s", # let's
|
406
|
-
"so",
|
407
|
-
"t", # don't
|
408
|
-
"to",
|
409
|
-
"u", # you
|
410
|
-
"we",
|
411
|
-
"un", # un-
|
412
|
-
"use",
|
393
|
+
PHONOMATCHES = {
|
394
|
+
"non",
|
395
|
+
"nope",
|
413
396
|
"some",
|
414
397
|
"like",
|
398
|
+
"use",
|
399
|
+
"imo",
|
400
|
+
"time",
|
401
|
+
"man",
|
402
|
+
"also",
|
415
403
|
}
|
416
404
|
|
405
|
+
ALPHABETIC_MATCHES: Set[str] = set()
|
406
|
+
|
407
|
+
IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
|
408
|
+
|
417
409
|
UCSUR_RANGES = [
|
418
410
|
"\\U000F1900-\\U000F1977", # pu
|
419
411
|
"\\U000F1978-\\U000F1988", # ku suli
|
@@ -426,14 +418,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
|
|
426
418
|
# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
|
427
419
|
|
428
420
|
|
429
|
-
def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) ->
|
430
|
-
return
|
421
|
+
def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
|
422
|
+
return {d["word"] for d in data.values() if d[key] == value}
|
431
423
|
|
432
424
|
|
433
425
|
with open(LINKU) as f:
|
434
426
|
linku: Dict[str, Dict[str, str]] = json.loads(f.read())
|
435
|
-
NIMI_PU
|
436
|
-
NIMI_PU_SYNONYMS
|
427
|
+
NIMI_PU = category_helper(linku, "book", "pu")
|
428
|
+
NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
|
437
429
|
|
438
430
|
NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
|
439
431
|
NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
|
@@ -445,7 +437,7 @@ with open(LINKU) as f:
|
|
445
437
|
|
446
438
|
with open(SANDBOX) as f:
|
447
439
|
sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
|
448
|
-
NIMI_LINKU_SANDBOX
|
440
|
+
NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
|
449
441
|
|
450
442
|
del linku
|
451
443
|
del sandbox
|