sonatoki 0.1.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {sonatoki-0.1.4 → sonatoki-0.1.5}/PKG-INFO +1 -1
  2. {sonatoki-0.1.4 → sonatoki-0.1.5}/pyproject.toml +1 -1
  3. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Filters.py +16 -3
  4. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py +13 -2
  5. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Scorers.py +2 -14
  6. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py +22 -7
  7. sonatoki-0.1.5/src/sonatoki/constants.py +83 -0
  8. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/ilo.py +0 -12
  9. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_filters.py +5 -6
  10. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_ilo.py +0 -1
  11. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml +18 -0
  12. sonatoki-0.1.4/src/sonatoki/constants.py +0 -67
  13. {sonatoki-0.1.4 → sonatoki-0.1.5}/LICENSE +0 -0
  14. {sonatoki-0.1.4 → sonatoki-0.1.5}/README.md +0 -0
  15. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Cleaners.py +0 -0
  16. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Configs.py +0 -0
  17. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/__init__.py +0 -0
  18. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/__main__.py +0 -0
  19. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/sandbox.json +0 -0
  21. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/__init__.py +0 -0
  22. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_cleaners.py +0 -0
  23. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_preprocessors.py +0 -0
  24. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_scorers.py +0 -0
  25. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_tokenize.py +0 -0
  26. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_utils.py +0 -0
  27. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
  28. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words.yml +0 -0
  29. {sonatoki-0.1.4 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.1.4 → sonatoki-0.1.5}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.1.4
+ Version: 0.1.5
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
{sonatoki-0.1.4 → sonatoki-0.1.5}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.1.4"
+ version = "0.1.5"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Filters.py
@@ -1,10 +1,11 @@
  # STL
+ import re
  from abc import ABC, abstractmethod
  from typing import Set
  from functools import lru_cache as cache  # cache comes in 3.9

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

  # LOCAL
@@ -13,14 +14,16 @@ from sonatoki.constants import (
      CONSONANTS,
      NIMI_PU_SET,
      ALPHABET_SET,
+     UNICODE_PUNCT,
      ALLOWABLES_SET,
      NIMI_LINKU_SET,
      NIMI_PU_ALE_SET,
      NIMI_LINKU_ALE_SET,
+     PRUNED_POSIX_PUNCT,
      NIMI_LINKU_SANDBOX_SET,
  )

- re.DEFAULT_VERSION = re.VERSION1
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Filter(ABC):
@@ -41,6 +44,16 @@ class RegexFilter(Filter):
          return not not re.fullmatch(cls.pattern, token)


+ class Regex1Filter(Filter):
+     pattern: "regex.Pattern[str]"
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         return not not regex.fullmatch(cls.pattern, token)
+
+
  class SetFilter(Filter):
      tokens: Set[str]

@@ -148,7 +161,7 @@ class Numeric(Filter):


  class Punctuation(RegexFilter):
-     pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
+     pattern = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")


  __all__ = [
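
The change above swaps the `regex`-only `\p{Punctuation}\p{posix_punct}` classes in `Punctuation` for literal character classes from `constants.py`, so that `RegexFilter` subclasses can compile under the stdlib `re`, while the new `Regex1Filter` base class keeps a path for patterns that still need third-party `regex` syntax. A minimal sketch of how a `Regex1Filter` subclass would look, assuming sonatoki 0.1.5 is installed; the `UnicodePunctuation` class here is hypothetical and not part of the package:

```python
import regex

from sonatoki.Filters import Regex1Filter


class UnicodePunctuation(Regex1Filter):
    # \p{Punctuation} is only understood by the third-party `regex` module,
    # so this subclasses the new Regex1Filter rather than RegexFilter.
    pattern = regex.compile(r"[\p{Punctuation}]+")


print(UnicodePunctuation.filter("⟨·⟩"))   # expected: True
print(UnicodePunctuation.filter("toki"))  # expected: False
```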
{sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Preprocessors.py
@@ -17,13 +17,14 @@ It is up to the user to order them appropriately.
  """

  # STL
+ import re
  from abc import ABC, abstractmethod

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

- re.DEFAULT_VERSION = re.VERSION1
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Preprocessor(ABC):
@@ -43,6 +44,16 @@ class RegexPreprocessor(Preprocessor):
          return re.sub(cls.pattern, cls.replace, msg)


+ class Regex1Preprocessor(Preprocessor):
+     pattern: "regex.Pattern[str]"
+     replace: str = " "
+
+     @classmethod
+     @override
+     def process(cls, msg: str) -> str:
+         return regex.sub(cls.pattern, cls.replace, msg)
+
+
  """
  The following classes are Ignorables.

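
The same split is applied here: `RegexPreprocessor` now runs on the stdlib `re`, and the new `Regex1Preprocessor` covers substitutions that still need `regex` syntax. A minimal sketch under that assumption; the `ZeroWidth` preprocessor below is hypothetical and not one of the package's Ignorables:

```python
import regex

from sonatoki.Preprocessors import Regex1Preprocessor


class ZeroWidth(Regex1Preprocessor):
    # \p{Cf} (format characters such as U+200B ZERO WIDTH SPACE) needs `regex`
    pattern = regex.compile(r"\p{Cf}")
    replace = ""


print(ZeroWidth.process("to\u200bki"))  # expected: "toki"
```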
{sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Scorers.py
@@ -10,8 +10,6 @@ from typing_extensions import override
  # LOCAL
  from sonatoki.Filters import Filter

- LOG = logging.getLogger(__name__)
-
  Number = Union[int, float]
  Weights = Dict[str, Number]

@@ -37,12 +35,7 @@ class PassFail(Scorer):
      def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
          for f in filters:
              if f.filter(token):
-                 score = 1
-                 LOG.debug(
-                     "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                 )
-                 return score
-         LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                 return 1
          return 0

      @classmethod
@@ -86,12 +79,7 @@ class Scaling(Scorer):
      def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
          for i, f in enumerate(filters):
              if f.filter(token):
-                 score = scale - i
-                 LOG.debug(
-                     "%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
-                 )
-                 return score
-         LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
+                 return scale - i
          return 0

      @classmethod
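
The scorers lose their logging calls but keep the same rule: `PassFail.score_token` returns 1 for the first filter that accepts the token, and `Scaling.score_token` returns `scale - i` for a match at filter index `i`, so filters earlier in the list are worth more. A small worked sketch of that rule, assuming sonatoki 0.1.5 is installed, that `score_token` is exposed as the classmethod shown in the hunk, and that `SetFilter.filter` is a plain membership test; the two toy filters are hypothetical:

```python
from sonatoki.Filters import SetFilter
from sonatoki.Scorers import Scaling


class Toy1(SetFilter):  # hypothetical stand-in for a high-priority filter
    tokens = {"toki", "pona"}


class Toy2(SetFilter):  # hypothetical fallback filter
    tokens = {"toki", "pona", "hello"}


# "toki" matches Toy1 at index 0, "hello" only Toy2 at index 1, "xyz" neither.
print(Scaling.score_token("toki", [Toy1, Toy2], 2))   # expected: 2
print(Scaling.score_token("hello", [Toy1, Toy2], 2))  # expected: 1
print(Scaling.score_token("xyz", [Toy1, Toy2], 2))    # expected: 0
```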
{sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/Tokenizers.py
@@ -1,11 +1,15 @@
  # STL
+ import re
  from abc import ABC, abstractmethod
  from typing import List

  # PDM
- import regex as re
+ import regex
  from typing_extensions import override

+ # LOCAL
+ from sonatoki.constants import UNICODE_PUNCT, PRUNED_POSIX_PUNCT
+
  try:
      # PDM
      import nltk
@@ -15,7 +19,7 @@ except ImportError as e:
      nltk = e


- LANGUAGE = "english"  # for NLTK
+ regex.DEFAULT_VERSION = regex.VERSION1


  class Tokenizer(ABC):
@@ -42,15 +46,26 @@ class RegexTokenizer(Tokenizer):
          return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]


+ class Regex1Tokenizer(Tokenizer):
+     pattern: "regex.Pattern[str]"
+
+     @classmethod
+     @override
+     def tokenize(cls, s: str) -> List[str]:
+         return [
+             clean for word in regex.split(cls.pattern, s) if (clean := word.strip())
+         ]
+
+
  class WordTokenizerTok(RegexTokenizer):
-     pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
-     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-     # TODO: do the typography characters matter?
-     # NOTE: | / and , are *not* sentence delimiters for my purpose
+     pattern = re.compile(rf"""([{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+|\s+)""")


  class SentTokenizerTok(RegexTokenizer):
-     pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+     pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-])|$""", flags=re.MULTILINE)
+     # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+     # TODO: do the typography characters matter?
+     # NOTE: | / and , are *not* sentence delimiters for my purpose


  class WordTokenizerRe(RegexTokenizer):
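
The `SentTokenizerTok` change moves `$` out of the lookbehind and adds `re.MULTILINE`, so a line break now ends a sentence even without trailing punctuation; the new cases in tokenize_sentences_tok.yml further down exercise exactly that. A minimal sketch, assuming sonatoki 0.1.5 is installed:

```python
from sonatoki.Tokenizers import SentTokenizerTok

# A bare newline is now a sentence boundary (the "newline alone" test case).
print(SentTokenizerTok.tokenize("sina lon seme\nmi wile lon poka"))
# expected: ['sina lon seme', 'mi wile lon poka']
```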
sonatoki-0.1.5/src/sonatoki/constants.py
@@ -0,0 +1,83 @@
+ # STL
+ import json
+ from typing import Dict, List
+ from pathlib import Path
+
+ LINKU = Path(__file__).resolve().parent / Path("linku.json")
+ SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
+
+ VOWELS = "aeiou"
+ CONSONANTS = "jklmnpstw"
+ ALPHABET = VOWELS + CONSONANTS
+ ALPHABET_SET = set(ALPHABET)
+
+ LANGUAGE = "english"  # for NLTK
+
+ # `\p{posix_punct}` character class
+ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
+ PRUNED_POSIX_PUNCT = r"""$+<=>^`|~"""  # only those that are not in UNICODE_PUNCT
+
+ # `\p{Punctuation}` character class
+ UNICODE_PUNCT = r"""!"#%&'()*,-./:;?@\[\\\]_{}¡§«¶·»¿;·՚՛՜՝՞՟։֊־׀׃׆׳״؉؊،؍؛؝؞؟٪٫٬٭۔܀܁܂܃܄܅܆܇܈܉܊܋܌܍߷߸߹࠰࠱࠲࠳࠴࠵࠶࠷࠸࠹࠺࠻࠼࠽࠾࡞।॥॰৽੶૰౷಄෴๏๚๛༄༅༆༇༈༉༊་༌།༎༏༐༑༒༔༺༻༼༽྅࿐࿑࿒࿓࿔࿙࿚၊။၌၍၎၏჻፠፡።፣፤፥፦፧፨᐀᙮᚛᚜᛫᛬᛭᜵᜶។៕៖៘៙៚᠀᠁᠂᠃᠄᠅᠆᠇᠈᠉᠊᥄᥅᨞᨟᪠᪡᪢᪣᪤᪥᪦᪨᪩᪪᪫᪬᪭᭚᭛᭜᭝᭞᭟᭠᭽᭾᯼᯽᯾᯿᰻᰼᰽᰾᰿᱾᱿᳀᳁᳂᳃᳄᳅᳆᳇᳓‐‑‒–—―‖‗‘’‚‛“”„‟†‡•‣․‥…‧‰‱′″‴‵‶‷‸‹›※‼‽‾‿⁀⁁⁂⁃⁅⁆⁇⁈⁉⁊⁋⁌⁍⁎⁏⁐⁑⁓⁔⁕⁖⁗⁘⁙⁚⁛⁜⁝⁞⁽⁾₍₎⌈⌉⌊⌋〈〉❨❩❪❫❬❭❮❯❰❱❲❳❴❵⟅⟆⟦⟧⟨⟩⟪⟫⟬⟭⟮⟯⦃⦄⦅⦆⦇⦈⦉⦊⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖⦗⦘⧘⧙⧚⧛⧼⧽⳹⳺⳻⳼⳾⳿⵰⸀⸁⸂⸃⸄⸅⸆⸇⸈⸉⸊⸋⸌⸍⸎⸏⸐⸑⸒⸓⸔⸕⸖⸗⸘⸙⸚⸛⸜⸝⸞⸟⸠⸡⸢⸣⸤⸥⸦⸧⸨⸩⸪⸫⸬⸭⸮⸰⸱⸲⸳⸴⸵⸶⸷⸸⸹⸺⸻⸼⸽⸾⸿⹀⹁⹂⹃⹄⹅⹆⹇⹈⹉⹊⹋⹌⹍⹎⹏⹒⹓⹔⹕⹖⹗⹘⹙⹚⹛⹜⹝、。〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〽゠・꓾꓿꘍꘎꘏꙳꙾꛲꛳꛴꛵꛶꛷꡴꡵꡶꡷꣎꣏꣸꣹꣺꣼꤮꤯꥟꧁꧂꧃꧄꧅꧆꧇꧈꧉꧊꧋꧌꧍꧞꧟꩜꩝꩞꩟꫞꫟꫰꫱꯫﴾﴿︐︑︒︓︔︕︖︗︘︙︰︱︲︳︴︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄﹅﹆﹇﹈﹉﹊﹋﹌﹍﹎﹏﹐﹑﹒﹔﹕﹖﹗﹘﹙﹚﹛﹜﹝﹞﹟﹠﹡﹣﹨﹪﹫!"#%&'()*,-./:;?@[\]_{}⦅⦆。「」、・𐄀𐄁𐄂𐎟𐏐𐕯𐡗𐤟𐤿𐩐𐩑𐩒𐩓𐩔𐩕𐩖𐩗𐩘𐩿𐫰𐫱𐫲𐫳𐫴𐫵𐫶𐬹𐬺𐬻𐬼𐬽𐬾𐬿𐮙𐮚𐮛𐮜𐺭𐽕𐽖𐽗𐽘𐽙𐾆𐾇𐾈𐾉𑁇𑁈𑁉𑁊𑁋𑁌𑁍𑂻𑂼𑂾𑂿𑃀𑃁𑅀𑅁𑅂𑅃𑅴𑅵𑇅𑇆𑇇𑇈𑇍𑇛𑇝𑇞𑇟𑈸𑈹𑈺𑈻𑈼𑈽𑊩𑑋𑑌𑑍𑑎𑑏𑑚𑑛𑑝𑓆𑗁𑗂𑗃𑗄𑗅𑗆𑗇𑗈𑗉𑗊𑗋𑗌𑗍𑗎𑗏𑗐𑗑𑗒𑗓𑗔𑗕𑗖𑗗𑙁𑙂𑙃𑙠𑙡𑙢𑙣𑙤𑙥𑙦𑙧𑙨𑙩𑙪𑙫𑙬𑚹𑜼𑜽𑜾𑠻𑥄𑥅𑥆𑧢𑨿𑩀𑩁𑩂𑩃𑩄𑩅𑩆𑪚𑪛𑪜𑪞𑪟𑪠𑪡𑪢𑬀𑬁𑬂𑬃𑬄𑬅𑬆𑬇𑬈𑬉𑱁𑱂𑱃𑱄𑱅𑱰𑱱𑻷𑻸𑽃𑽄𑽅𑽆𑽇𑽈𑽉𑽊𑽋𑽌𑽍𑽎𑽏𑿿𒑰𒑱𒑲𒑳𒑴𒿱𒿲𖩮𖩯𖫵𖬷𖬸𖬹𖬺𖬻𖭄𖺗𖺘𖺙𖺚𖿢𛲟𝪇𝪈𝪉𝪊𝪋𞥞𞥟"""
+ # NOTE: This list diverges slightly from the raw list, since []\ must be escaped
+ # The [] need to be escaped to avoid prematurely closing the regex character class
+ # The \ needs to be escaped to be considered as a raw \
+
+ # https://www.compart.com/en/unicode/category
+ # https://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+
+ """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
+ ALLOWABLES = {
+     "cw",  # Content Warning
+     "x",  # ala
+     "y",  # anu
+     "kxk",  # ken ala ken
+     "wxw",  # wile ala wile
+ }
+
+
+ with open(LINKU) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
+     NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
+     NIMI_LINKU: List[str] = [
+         d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
+     ]
+     NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
+
+ with open(SANDBOX) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
+ NIMI_PU_SET = set(NIMI_PU)
+ NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
+ NIMI_LINKU_SET = set(NIMI_LINKU)
+ NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
+ ALLOWABLES_SET = set(ALLOWABLES)
+
+ __all__ = [
+     "VOWELS",
+     #
+     "CONSONANTS",
+     #
+     "ALPHABET",
+     "ALPHABET_SET",
+     #
+     "NIMI_PU",
+     "NIMI_PU_SET",
+     #
+     "NIMI_PU_ALE",
+     "NIMI_PU_ALE_SET",
+     #
+     "NIMI_LINKU",
+     "NIMI_LINKU_SET",
+     #
+     "NIMI_LINKU_ALE",
+     "NIMI_LINKU_ALE_SET",
+     #
+     "NIMI_LINKU_SANDBOX",
+     "NIMI_LINKU_SANDBOX_SET",
+ ]
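
The new module spells out the `\p{posix_punct}` and `\p{Punctuation}` character classes as literal strings, which is what lets Filters.py and Tokenizers.py above compile their patterns with the stdlib `re` instead of `regex`. A minimal sketch of how the two constants compose, assuming sonatoki 0.1.5 is installed:

```python
import re

from sonatoki.constants import PRUNED_POSIX_PUNCT, UNICODE_PUNCT

# Same shape as the Punctuation filter: PRUNED_POSIX_PUNCT contributes only the
# ASCII punctuation that the literal UNICODE_PUNCT list does not already contain.
punct = re.compile(rf"[{PRUNED_POSIX_PUNCT}{UNICODE_PUNCT}]+")
print(bool(punct.fullmatch("⟨·⟩…!?")))  # expected: True
print(bool(punct.fullmatch("toki")))    # expected: False
```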
{sonatoki-0.1.4 → sonatoki-0.1.5}/src/sonatoki/ilo.py
@@ -1,5 +1,4 @@
  # STL
- import logging
  from typing import List, Type, Tuple

  # LOCAL
@@ -9,8 +8,6 @@ from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer
  from sonatoki.Preprocessors import Preprocessor

- LOG = logging.getLogger(__name__)
-

  class Ilo:
      __preprocessors: List[Type[Preprocessor]]
@@ -20,7 +17,6 @@ class Ilo:
      __scoring_filters: List[Type[Filter]]
      __scorer: Type[Scorer]
      __passing_score: Number
-     logging_threshold: Number = -1

      def __init__(
          self,
@@ -104,14 +100,6 @@ class Ilo:
          score = self.score_tokens(cleaned)
          result = score >= self.__passing_score

-         if score <= self.logging_threshold:
-             LOG.debug("msg: %.2f %s", score, repr(message))
-             LOG.debug("preproc: %s", repr(preprocessed))
-             LOG.debug("tokenized: %s", tokenized)
-             LOG.debug("filtered: %s", filtered)
-             LOG.debug("cleaned: %s", cleaned)
-             # TODO: Move to each function? Loses ability to control when logging occurs by threshold
-
          return preprocessed, tokenized, filtered, cleaned, score, result

      def is_toki_pona(self, message: str) -> bool:
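
With the logging hooks gone, `Ilo` no longer carries a `logging_threshold` attribute; the public entry point is unchanged. A minimal usage sketch mirroring the fixture in tests/test_ilo.py, assuming sonatoki 0.1.5 is installed; the sample sentences and expected results are illustrative, not taken from the test suite:

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi wile lon poka sina"))  # expected: True
print(ilo.is_toki_pona("this is just english"))   # expected: False
```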
{sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_filters.py
@@ -82,16 +82,15 @@ def test_ProperName(s: str):
      assert res, repr(s)


- # I use `regex`'s Unicode property feature, which Hypothesis doesn't understand
- # So I have to provide a different regex tha doesn't technically match
- @given(st.from_regex(r"[^\w\s]+", fullmatch=True))
+ @given(st.from_regex(Punctuation.pattern.pattern, fullmatch=True))
+ @example("[]")
+ @example(r"\\")
+ @example(r"\"")
  @example("⟨·⟩")
  @example("…")
- @example("「 」")
+ @example("「」")  # ` `
  @example(string.punctuation)
- @settings(suppress_health_check=[HealthCheck.filter_too_much])  # FIXME
  def test_Punctuation(s: str):
-     _ = assume(re.fullmatch(Punctuation.pattern.pattern, s))
      res = Punctuation.filter(s)
      assert res, repr(s)

{sonatoki-0.1.4 → sonatoki-0.1.5}/tests/test_ilo.py
@@ -9,7 +9,6 @@ from sonatoki.Configs import LazyConfig, PrefConfig
  @pytest.fixture
  def ilo():
      ilo = Ilo(**PrefConfig)
-     # ilo.logging_threshold = 0.8
      return ilo


{sonatoki-0.1.4 → sonatoki-0.1.5}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -19,6 +19,24 @@
    output:
      - "mi mu."
      - "mi wawa."
+ - name: "empty"
+   input: ""
+   output: []
+ - name: "whitespace"
+   input: " \n "
+   output: []
+ - name: "newline basic"
+   input: "sina lon seme?\nmi wile lon poka...\n"
+   output:
+     - "sina lon seme?"
+     - "mi wile lon poka."
+     - "."
+     - "."
+ - name: "newline alone"
+   input: "sina lon seme\nmi wile lon poka"
+   output:
+     - "sina lon seme"
+     - "mi wile lon poka"
  - name: "dash"
    input: "mi sona ala e ni- sina seme a"
    output:
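
The added cases pin down the newline behaviour of the revised `SentTokenizerTok`. A hypothetical sketch of driving this file through pytest; this is not necessarily how tests/test_tokenize.py actually loads it, and it assumes PyYAML is available:

```python
import yaml
import pytest

from sonatoki.Tokenizers import SentTokenizerTok

# The YAML file is a list of {name, input, output} mappings.
with open("tests/tokenize_cases/tokenize_sentences_tok.yml") as f:
    CASES = yaml.safe_load(f)


@pytest.mark.parametrize("case", CASES, ids=lambda c: c["name"])
def test_sent_tokenizer_tok(case):
    assert SentTokenizerTok.tokenize(case["input"]) == case["output"]
```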
sonatoki-0.1.4/src/sonatoki/constants.py
@@ -1,67 +0,0 @@
- # STL
- import json
- from typing import Dict, List
- from pathlib import Path
-
- LINKU = Path(__file__).resolve().parent / Path("linku.json")
- SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
-
- VOWELS = "aeiou"
- CONSONANTS = "jklmnpstw"
- ALPHABET = VOWELS + CONSONANTS
- ALPHABET_SET = set(ALPHABET)
-
- """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
- ALLOWABLES = {
-     "cw",  # Content Warning
-     "x",  # ala
-     "y",  # anu
-     "kxk",  # ken ala ken
-     "wxw",  # wile ala wile
- }
-
-
- with open(LINKU) as f:
-     r: Dict[str, Dict[str, str]] = json.loads(f.read())
-     NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
-     NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
-     NIMI_LINKU: List[str] = [
-         d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
-     ]
-     NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
-
- with open(SANDBOX) as f:
-     r: Dict[str, Dict[str, str]] = json.loads(f.read())
-     NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
-
-
- NIMI_PU_SET = set(NIMI_PU)
- NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
- NIMI_LINKU_SET = set(NIMI_LINKU)
- NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
- NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
- ALLOWABLES_SET = set(ALLOWABLES)
-
- __all__ = [
-     "VOWELS",
-     #
-     "CONSONANTS",
-     #
-     "ALPHABET",
-     "ALPHABET_SET",
-     #
-     "NIMI_PU",
-     "NIMI_PU_SET",
-     #
-     "NIMI_PU_ALE",
-     "NIMI_PU_ALE_SET",
-     #
-     "NIMI_LINKU",
-     "NIMI_LINKU_SET",
-     #
-     "NIMI_LINKU_ALE",
-     "NIMI_LINKU_ALE_SET",
-     #
-     "NIMI_LINKU_SANDBOX",
-     "NIMI_LINKU_SANDBOX_SET",
- ]