sonatoki 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
sonatoki/constants.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # STL
2
2
  import json
3
- from typing import Dict, List
3
+ from typing import Set, Dict, List
4
4
  from pathlib import Path
5
5
 
6
6
  # LOCAL
@@ -380,40 +380,32 @@ CONSONANTS = "jklmnpstw"
380
380
  ALPHABET = VOWELS + CONSONANTS
381
381
 
382
382
  LANGUAGE = "english" # for NLTK
383
-
384
- """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
383
+ """Commonly occurring strings which are some kind of valid Toki Pona or
384
+ external token."""
385
385
  ALLOWABLES = {
386
- "cw", # Content Warning
387
386
  "x", # ala
388
387
  "y", # anu
389
388
  "kxk", # ken ala ken
390
389
  "wxw", # wile ala wile
390
+ "msa",
391
391
  }
392
392
 
393
- IGNORABLES = {
394
- # o, e, n are not here bc they're not frequently problematic in english messages
395
- "a",
396
- "am",
397
- "an",
398
- "i",
399
- "in",
400
- "is",
401
- "l", # they'll
402
- "m", # i'm
403
- "me",
404
- "no",
405
- "s", # let's
406
- "so",
407
- "t", # don't
408
- "to",
409
- "u", # you
410
- "we",
411
- "un", # un-
412
- "use",
393
+ PHONOMATCHES = {
394
+ "non",
395
+ "nope",
413
396
  "some",
414
397
  "like",
398
+ "use",
399
+ "imo",
400
+ "time",
401
+ "man",
402
+ "also",
415
403
  }
416
404
 
405
+ ALPHABETIC_MATCHES: Set[str] = set()
406
+
407
+ IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
408
+
417
409
  UCSUR_RANGES = [
418
410
  "\\U000F1900-\\U000F1977", # pu
419
411
  "\\U000F1978-\\U000F1988", # ku suli
@@ -426,14 +418,14 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
426
418
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
427
419
 
428
420
 
429
- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
430
- return [d["word"] for d in data.values() if d[key] == value]
421
+ def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
422
+ return {d["word"] for d in data.values() if d[key] == value}
431
423
 
432
424
 
433
425
  with open(LINKU) as f:
434
426
  linku: Dict[str, Dict[str, str]] = json.loads(f.read())
435
- NIMI_PU: List[str] = category_helper(linku, "book", "pu")
436
- NIMI_PU_SYNONYMS: List[str] = ["namako", "kin", "oko"]
427
+ NIMI_PU = category_helper(linku, "book", "pu")
428
+ NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
437
429
 
438
430
  NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
439
431
  NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
@@ -445,7 +437,7 @@ with open(LINKU) as f:
445
437
 
446
438
  with open(SANDBOX) as f:
447
439
  sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
448
- NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in sandbox.values()]
440
+ NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
449
441
 
450
442
  del linku
451
443
  del sandbox