sonatoki 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -1,11 +1,11 @@
 # STL
-from copy import deepcopy
 from typing import List, Type, TypedDict

 # PDM
 from typing_extensions import NotRequired

 # LOCAL
+from sonatoki.types import Number
 from sonatoki.Filters import (
     Or,
     And,
@@ -20,23 +20,22 @@ from sonatoki.Filters import (
     Punctuation,
     LongSyllabic,
     Miscellaneous,
-    NimiLinkuCore,
     LongAlphabetic,
     LongProperName,
-    NimiLinkuCommon,
     FalsePosSyllabic,
+    NimiLinkuByUsage,
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
     FalsePosAlphabetic,
 )
-from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
-    Backticks,
+    Codeblock,
     Reference,
     Preprocessor,
     AngleBracketObject,
@@ -95,11 +94,11 @@ BaseConfig: IloConfig = {


 PrefConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
+        Or(NimiLinkuByUsage(30), NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
@@ -110,16 +109,13 @@ PrefConfig: IloConfig = {
 }

 CorpusConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         Or(
-            NimiLinkuCore,
-            NimiLinkuCommon,
-            NimiLinkuUncommon,
-            NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
-            NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
+            # awkward but efficient syntax
+            NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
             NimiUCSUR,
             Miscellaneous,
         ),
@@ -132,7 +128,7 @@ CorpusConfig: IloConfig = {
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -142,7 +138,7 @@ LazyConfig: IloConfig = {
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+    "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
@@ -162,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
 }


-DiscordConfig: IloConfig = {
-    "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
-    "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
-        And(LongSyllabic, Not(FalsePosSyllabic)),
-        LongProperName,
-        And(LongAlphabetic, Not(FalsePosAlphabetic)),
-    ],
-    "scorer": SoftScaling,
-    "passing_score": 0.8,
-}
-
-TelegramConfig: IloConfig = deepcopy(PrefConfig)
-ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
-    "DiscordConfig",
-    "ForumConfig",
     "IloConfig",
     "LazyConfig",
     "PrefConfig",
-    "TelegramConfig",
 ]
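
Note for downstream users: `DiscordConfig`, `TelegramConfig`, and `ForumConfig` are gone in 0.7.0. The latter two were plain copies of `PrefConfig`, so restoring the old names is a two-line shim; a minimal sketch, not part of sonatoki:

```python
# Downstream compatibility shim, not part of sonatoki 0.7.0.
from copy import deepcopy

from sonatoki.Configs import IloConfig, PrefConfig

# In 0.6.3 both of these were exact deepcopies of PrefConfig.
TelegramConfig: IloConfig = deepcopy(PrefConfig)
ForumConfig: IloConfig = deepcopy(PrefConfig)
```

`DiscordConfig` had its own filter list (visible in the removed lines above) and would have to be copied out wholesale if still needed.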
sonatoki/Filters.py CHANGED
@@ -2,37 +2,32 @@
 import re
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import Set, List, Type, Optional
+from typing import Set, List, Type, Union, Literal, Optional
 from functools import lru_cache as cache  # cache comes in 3.9

 # PDM
 import regex
-from typing_extensions import override, deprecated
+from typing_extensions import override

 # LOCAL
+from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
 from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
-    NIMI_PU,
     ALPHABET,
     ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
     NIMI_UCSUR,
-    NIMI_KU_LILI,
-    NIMI_KU_SULI,
-    NIMI_LINKU_CORE,
     NIMI_PU_SYNONYMS,
-    NIMI_LINKU_COMMON,
     FALSE_POS_SYLLABIC,
-    NIMI_LINKU_OBSCURE,
-    NIMI_LINKU_SANDBOX,
     NOT_IN_PUNCT_CLASS,
-    NIMI_LINKU_UNCOMMON,
     ALL_PUNCT_RANGES_STR,
     FALSE_POS_ALPHABETIC,
     UCSUR_PUNCT_RANGES_STR,
     EMOJI_VARIATION_SELECTOR_RANGES_STR,
+    words_by_tag,
+    words_by_usage,
 )

 regex.DEFAULT_VERSION = regex.VERSION1
@@ -170,40 +165,46 @@ class LongProperName(MinLen, ProperName):
     length = 2  # reject "names" of length 1


-class NimiPu(MemberFilter):
-    tokens = prep_dictionary(NIMI_PU)
-
-
-class NimiPuSynonyms(MemberFilter):
-    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
-
-
-class NimiKuSuli(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_SULI)
-
-
-class NimiKuLili(MemberFilter):
-    tokens = prep_dictionary(NIMI_KU_LILI)
+class NimiLinkuByUsage:
+    def __new__(
+        cls,
+        usage: int,
+        date: Optional[LinkuUsageDate] = None,
+    ) -> Type[MemberFilter]:
+        words = words_by_usage(usage, date)

+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)

-class NimiLinkuCore(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_CORE)
+        return AnonLinkuMemberFilter


-class NimiLinkuCommon(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_COMMON)
+class NimiLinkuByTag:
+    def __new__(
+        cls,
+        tag: Union[Literal["usage_category"], Literal["book"]],
+        category: Union[LinkuUsageCategory, LinkuBooks],
+    ) -> Type[MemberFilter]:
+        words = words_by_tag(tag, category)

+        class AnonLinkuMemberFilter(MemberFilter):
+            tokens = prep_dictionary(words)

-class NimiLinkuUncommon(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+        return AnonLinkuMemberFilter


-class NimiLinkuObscure(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
+NimiPu = NimiLinkuByTag("book", "pu")
+NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+NimiKuLili = NimiLinkuByTag("book", "ku lili")
+NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


-class NimiLinkuSandbox(MemberFilter):
-    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)


 class NimiUCSUR(MemberFilter):
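
The two classes above are filter factories: their `__new__` returns a `MemberFilter` *subclass*, not an instance, so each call mints a filter class that drops in wherever the old named filters did. A sketch of how they compose, relying only on what the diff shows (`tokens` comes from `MemberFilter`; the trailing `(sub=...)` call is the pattern `CorpusConfig` uses):

```python
from sonatoki.Filters import NimiLinkuByTag, NimiLinkuByUsage

# NimiPu above is defined as exactly this: a filter class for pu's word list.
PuWords = NimiLinkuByTag("book", "pu")

# Words at or above 30% usage on the default (latest hardcoded) survey date,
# as PrefConfig now uses.
CommonWords = NimiLinkuByUsage(30)
print(len(CommonWords.tokens))  # the prepared dictionary backing the filter

# The "awkward but efficient syntax" from CorpusConfig: the first call builds
# the class, the second applies substitutions to it.
# (my_substitutions is a hypothetical dict, not defined by sonatoki)
# CorpusWords = NimiLinkuByUsage(0)(sub=my_substitutions)
```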
sonatoki/Preprocessors.py CHANGED
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
     pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


+class Codeblock(RegexPreprocessor):
+    """Remove codeblocks marked by a set of three backticks on their own lines.
+
+    Subset of what would be removed by Backticks, but may be preferable.
+    """
+
+    pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
 class Spoilers(RegexPreprocessor):
     """Remove paired double bars and their contents `||like this||`"""

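
The difference between `Backticks` and the new `Codeblock` is easiest to see by applying the two patterns directly. A small illustration with plain `re`, using the exact patterns from above:

```python
import re

backticks = re.compile(r"`[^`]+`", flags=re.DOTALL)
codeblock = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)

fence = "`" * 3
msg = f"toki `sina` pona\n{fence}\nprint('ni li toki ala')\n{fence}\n"

# Codeblock strips only the fenced block, keeping the inline `sina` span:
print(codeblock.sub("", msg))  # -> "toki `sina` pona\n\n"

# Backticks also eats the inline span, and its single-backtick match leaves
# stray fence characters behind on a triple-backtick block:
print(backticks.sub("", msg))
```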
sonatoki/Scorers.py CHANGED
@@ -1,17 +1,15 @@
 # STL
 import math
 from abc import ABC, abstractmethod
-from typing import Dict, List, Type, Union
+from typing import List, Type

 # PDM
 from typing_extensions import override

 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter

-Number = Union[int, float]
-Weights = Dict[str, Number]
-

 class Scorer(ABC):
     @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
     scoring."""


-# class Logarithmic(Scorer): ...
+class SentenceScorer(ABC):
+    @classmethod
+    @abstractmethod
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        """Re-score a list of sentences (scorecards, sentences with all their
+        metadata) and return them."""
+        raise NotImplementedError
+
+
+class SentNoOp(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return scorecards


-__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
+class SentAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        total = sum(card["score"] for card in scorecards)
+        avg = total / len(scorecards)
+        for card in scorecards:
+            card["score"] = avg
+        return scorecards
+
+
+class SentWeightedAvg(SentenceScorer):
+    @classmethod
+    @override
+    def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+        if not scorecards:
+            return scorecards
+
+        weighted_total = 0
+        total_len = 0
+        for card in scorecards:
+            cardlen = len(card["cleaned"])
+            cardscore = card["score"]
+
+            weighted_total += cardlen * cardscore
+            total_len += cardlen
+
+        weighted_avg = weighted_total / total_len
+        for card in scorecards:
+            card["score"] = weighted_avg
+        return scorecards
+
+
+__all__ = [
+    "PassFail",
+    "Scaling",
+    "SoftPassFail",
+    "SoftScaling",
+    "Soften",
+    "SentAvg",
+    "SentWeightedAvg",
+]
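
`SentWeightedAvg` weights each sentence's score by the length of its cleaned token list, so long sentences dominate short interjections. A worked sketch; the dicts are minimal stand-ins carrying only the fields these scorers actually read:

```python
from sonatoki.Scorers import SentWeightedAvg

cards = [
    {"cleaned": ["toki", "pona"], "score": 1.0},                # 2 tokens
    {"cleaned": ["a", "b", "c", "d", "e", "f"], "score": 0.5},  # 6 tokens
]

SentWeightedAvg.score(cards)  # mutates the cards in place
# weighted avg = (2 * 1.0 + 6 * 0.5) / (2 + 6) = 5.0 / 8 = 0.625
print([card["score"] for card in cards])  # [0.625, 0.625]
```

For the same input, `SentAvg` would assign the plain mean, (1.0 + 0.5) / 2 = 0.75, to both cards.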
sonatoki/constants.py CHANGED
@@ -1,11 +1,16 @@
 # STL
 import json
-from typing import Set, Dict
+from typing import Set, Dict, Optional
 from pathlib import Path

 # LOCAL
+from sonatoki.types import LinkuWord, LinkuUsageDate
 from sonatoki.utils import find_unicode_chars, find_unicode_ranges

+LATEST_DATE = "2023-09"
+# hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
 # `\p{Punctuation}` character class
 # https://www.compart.com/en/unicode/category
 # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
     "iluminate",
     "imense",
     "imitate",
+    "inanimate",
     "injoke",
     "insane",
     "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


-def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
-    return {d["word"] for d in data.values() if d[key] == value}
+def linku_data() -> Dict[str, LinkuWord]:
+    # NOTE: this does open+read+parse two files each time you construct a filter
+    # but i expect users to construct filters only at the start of runtime
+    # there is no reason to waste your RAM by leaving the linku data in it
+    with open(LINKU) as f:
+        linku: Dict[str, LinkuWord] = json.loads(f.read())
+    with open(SANDBOX) as f:
+        sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+    return {**linku, **sandbox}
+

+def words_by_tag(tag: str, value: str) -> Set[str]:
+    data = linku_data()
+    return {d["word"] for d in data.values() if d[tag] == value}

-with open(LINKU) as f:
-    linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU = category_helper(linku, "book", "pu")
-    NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

-    NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
-    NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+def words_by_usage(
+    usage: int,
+    date: Optional[LinkuUsageDate] = None,
+) -> Set[str]:
+    if not date:
+        date = LATEST_DATE
+    data = linku_data()

-    NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
-    NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
-    NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
-    NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+    result: Set[str] = set()
+    for word in data.values():
+        usages = word["usage"]
+        if date in usages and usages[date] >= usage:
+            result.add(word["word"])
+
+    return result
+
+
+NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

-with open(SANDBOX) as f:
-    sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

 # with open(SYLLABICS) as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
 # with open(ALPHABETICS) as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}

-del linku
-del sandbox
-
 __all__ = [
     "ALLOWABLES",
     "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
     "CONSONANTS",
     "EMOJI_VARIATION_SELECTOR_RANGES",
     "EMOJI_VARIATION_SELECTOR_RANGES_STR",
-    "NIMI_KU_LILI",
-    "NIMI_KU_SULI",
-    "NIMI_LINKU_COMMON",
-    "NIMI_LINKU_CORE",
-    "NIMI_LINKU_OBSCURE",
-    "NIMI_LINKU_SANDBOX",
-    "NIMI_LINKU_UNCOMMON",
-    "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
     "POSIX_PUNCT_RANGES",
sonatoki/ilo.py CHANGED
@@ -1,17 +1,14 @@
 # STL
-from typing import List, Type, Tuple
+from typing import List, Type

 # LOCAL
+from sonatoki.types import Number, Scorecard
 from sonatoki.Filters import Filter
-from sonatoki.Scorers import Number, Scorer
+from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
 from sonatoki.Cleaners import Cleaner
 from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor

-# tokenized, filtered, cleaned, score, result
-Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
-# TODO: scorecard kinda sucks as a name
-

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
+    __sentence_scorer: Type[SentenceScorer]
     __passing_score: Number

     def __init__(
@@ -31,6 +29,7 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
+        sentence_scorer: Type[SentenceScorer] = SentNoOp,
         word_tokenizer: Type[Tokenizer] = WordTokenizer,
         sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
@@ -43,6 +42,7 @@
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
+        self.__sentence_scorer = sentence_scorer
         self.__passing_score = passing_score

     def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@
         return self.__word_tokenizer.tokenize(msg)

     def sent_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__sent_tokenizer.tokenize(msg)

     def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)

+    def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+        return self.__sentence_scorer.score(scorecards)
+
     def _is_toki_pona(self, message: str) -> Scorecard:
         """Process a message into its tokens, then filters, cleans, and scores
-        them. Returns all parts. Message must already be preprocessed, normally
-        done in `self.is_toki_pona(message)`.
-
-        Returns all components of the processing algorithm except preprocessing:
-        - Tokenized message (list[str])
-        - Filtered message (list[str])
-        - Cleaned message (list[str])
-        - Score (float)
-        - Result (bool)
+        them. Message must already be preprocessed, normally done in
+        `self.is_toki_pona(message)`.
+
+        Returns a `Scorecard` with all changes to the input text and a score.
         """
         tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
-        result = score >= self.__passing_score

-        return tokenized, filtered, cleaned, score, result
+        scorecard: Scorecard = {
+            "text": message,
+            "tokenized": tokenized,
+            "filtered": filtered,
+            "cleaned": cleaned,
+            "score": score,
+        }
+
+        return scorecard

     def is_toki_pona(self, message: str) -> bool:
-        """Determines whether a single statement is or is not Toki Pona."""
+        """Determines whether a text is or is not Toki Pona."""
         message = self.preprocess(message)
-        *_, result = self._is_toki_pona(message)
-        return result
+        scorecard = self._is_toki_pona(message)
+        return scorecard["score"] >= self.__passing_score

     def _are_toki_pona(self, message: str) -> List[Scorecard]:
-        """Split a message into sentences, then return a list each sentence's
-        results via `self._is_toki_pona()`.
+        """Split a message into sentences, then return a list with each
+        sentence's scorecard from `self._is_toki_pona()`.

         Message must already be preprocessed, normally done in
         `self.are_toki_pona(message)`.
         """
-        results: List[Scorecard] = list()
+        scorecards: List[Scorecard] = list()
         for sentence in self.sent_tokenize(message):
             result = self._is_toki_pona(sentence)
-            results.append(result)
-        return results
+            scorecards.append(result)
+        scorecards = self.score_sentences(scorecards)
+        return scorecards

     def are_toki_pona(self, message: str) -> List[bool]:
         """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@
         ```
         """
         message = self.preprocess(message)
-        results = self._are_toki_pona(message)
-        return [res[-1] for res in results]
+        scorecards = self._are_toki_pona(message)
+        return [card["score"] >= self.__passing_score for card in scorecards]
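
With `sentence_scorer` in place, an `Ilo` re-scores the sentence list after splitting (a no-op by default). A sketch of the 0.7.0 call path, assuming an `IloConfig` unpacks directly into the constructor, as its keys suggest:

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig
from sonatoki.Scorers import SentWeightedAvg

# sentence_scorer defaults to SentNoOp; override it to smooth scores across
# sentences before the per-sentence verdicts are taken.
ilo = Ilo(**PrefConfig, sentence_scorer=SentWeightedAvg)

print(ilo.is_toki_pona("mi olin e sina"))        # one bool for the whole text
print(ilo.are_toki_pona("mi moku. sina lape."))  # one bool per sentence
```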
sonatoki/types.py ADDED
@@ -0,0 +1,60 @@
+# STL
+from typing import Dict, List, Union, Literal, TypedDict
+
+Number = Union[int, float]
+
+
+# TODO: scorecard kinda sucks as a name
+class Scorecard(TypedDict):
+    text: str
+    tokenized: List[str]
+    filtered: List[str]
+    cleaned: List[str]
+    score: Number
+
+
+LinkuUsageDate = Union[
+    Literal["2020-04"],
+    Literal["2021-10"],
+    Literal["2022-08"],
+    Literal["2023-09"],
+    # Literal["2024-09"],
+]
+
+LinkuUsageCategory = Union[
+    Literal["core"],
+    Literal["common"],
+    Literal["uncommon"],
+    Literal["obscure"],
+    Literal["sandbox"],
+]
+
+LinkuBooks = Union[
+    Literal["pu"],
+    Literal["ku suli"],
+    Literal["ku lili"],
+    Literal["none"],
+]
+
+
+class LinkuWord(TypedDict):
+    id: str
+    author_verbatim: str
+    author_verbatim_source: str
+    book: str
+    coined_era: str
+    coined_year: str
+    creator: List[str]
+    ku_data: Dict[str, int]
+    see_also: List[str]
+    resources: Dict[str, str]
+    representations: Dict[str, Union[str, List[str]]]
+    source_language: str
+    usage_category: LinkuUsageCategory
+    word: str
+    deprecated: bool
+    etymology: List[Dict[str, str]]
+    audio: List[Dict[str, str]]
+    pu_verbatim: Dict[str, str]
+    usage: Dict[LinkuUsageDate, int]
+    translations: Dict[str, Dict[str, str]]
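
A `Scorecard` replaces the old anonymous 5-tuple from `ilo.py`; the pass/fail bool is no longer stored, only derived from `score`. A minimal sketch of building and judging one, with the 0.8 threshold borrowed from the configs above:

```python
from sonatoki.types import Scorecard

# Hand-built card; Ilo._is_toki_pona produces these in practice.
card: Scorecard = {
    "text": "mi olin e sina",
    "tokenized": ["mi", "olin", "e", "sina"],
    "filtered": ["mi", "olin", "e", "sina"],
    "cleaned": ["mi", "olin", "e", "sina"],
    "score": 1.0,
}

PASSING_SCORE = 0.8
print(card["score"] >= PASSING_SCORE)  # True: the old `result`, derived
```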
sonatoki-0.6.3.dist-info/METADATA → sonatoki-0.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.6.3
+Version: 0.7.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
sonatoki-0.6.3.dist-info/RECORD → sonatoki-0.7.0.dist-info/RECORD CHANGED
@@ -1,20 +1,21 @@
-sonatoki-0.6.3.dist-info/METADATA,sha256=AWtjziHObR8LdeB-QwIXaqWe-k8YQj9C0yDpa1_Y0Q0,6517
-sonatoki-0.6.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
-sonatoki-0.6.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.7.0.dist-info/METADATA,sha256=s6w7_WaARQijvFIFIWtg8hL2WzAkj19N7-DsKgfhi3s,6517
+sonatoki-0.7.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.7.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=l0OTEpbq6_IcNburV5pPTzRxsQ7_UCIugGv02adT8R8,5550
-sonatoki/Filters.py,sha256=3daBdOagJtkb4Qx6p5F2cCUd21FfMIY62UDWrR6Jj2Q,12131
-sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
-sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
+sonatoki/Configs.py,sha256=rIvrkYjeJeCuWwJIjvmJX6keRZcUJ0pt7h7KdYT5IFI,4766
+sonatoki/Filters.py,sha256=cJ5skX9yeqd4HvjzPxIAswigRWvO0ZV2nepQksFedtk,12575
+sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
+sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
 sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=mPbU-X9PNzelOHVZn-8ZqR_ewKYNjDA6lj2XQpnuoRw,19212
-sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
+sonatoki/constants.py,sha256=BxE_MME2XZUZLg9ZezPirUO2sxw4JkujsrKoENeYORc,19313
+sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
+sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.6.3.dist-info/RECORD,,
+sonatoki-0.7.0.dist-info/RECORD,,