sonatoki 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
@@ -1,49 +1,74 @@
  # STL
- from copy import deepcopy
- from typing import Set, List, Type, TypedDict, cast
+ from typing import List, Type, TypedDict

  # PDM
  from typing_extensions import NotRequired

  # LOCAL
+ from sonatoki.types import Number
  from sonatoki.Filters import (
  Or,
  And,
  Not,
  Filter,
  Numeric,
- Syllabic,
  NimiUCSUR,
  Alphabetic,
  NimiKuLili,
  NimiKuSuli,
  ProperName,
- Phonotactic,
  Punctuation,
  LongSyllabic,
  Miscellaneous,
- NimiLinkuCore,
  LongAlphabetic,
  LongProperName,
- NimiLinkuCommon,
  FalsePosSyllabic,
+ NimiLinkuByUsage,
  NimiLinkuObscure,
  NimiLinkuSandbox,
  NimiLinkuUncommon,
  FalsePosAlphabetic,
  )
- from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
  from sonatoki.Preprocessors import (
  URLs,
  Emoji,
- Backticks,
+ Codeblock,
  Reference,
  Preprocessor,
  AngleBracketObject,
  )

+ __DICT_PHONOMATCHES = {
+ # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+ # In this case, all of these appear more often in English by a factor of at least 10.
+ "aka", # also known as
+ "an", # article
+ "api", # API
+ "i", # 1st person
+ "kana", # japanese script
+ "me", # 1st person singular, english
+ "ne", # "no" in several languages
+ "nu", # "new" in english, "now" in dutch
+ "se", # spanish particle, english "see"
+ "take", # acquire, perhaps forcefully or without permission
+ "ten", # 10
+ "to", # to, too
+ "je", # 1st person pronoun, french
+ "u", # no u
+ "we", # 1st person plural, english
+ "wi", # wii and discussions of syllables
+ "sole", # singular, of shoe
+ # unexplored candidates for removal
+ # "omen", # ominous
+ # "papa", # father
+ # "lo", # "lo" and "loo"
+ # "ewe", # sheep
+ # "pa", # father- eh?
+ }
+

  class IloConfig(TypedDict):
  preprocessors: List[Type[Preprocessor]]
@@ -69,11 +94,11 @@ BaseConfig: IloConfig = {


  PrefConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
+ Or(NimiLinkuByUsage(30), NimiUCSUR),
  And(LongSyllabic, Not(FalsePosSyllabic)),
  # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
  LongProperName,
@@ -84,16 +109,13 @@ PrefConfig: IloConfig = {
  }

  CorpusConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
  Or(
- NimiLinkuCore,
- NimiLinkuCommon,
- NimiLinkuUncommon,
- NimiLinkuObscure,
- NimiLinkuSandbox,
+ # awkward but efficient syntax
+ NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
  NimiUCSUR,
  Miscellaneous,
  ),
@@ -104,43 +126,9 @@ CorpusConfig: IloConfig = {
  "scorer": SoftScaling,
  "passing_score": 0.8,
  }
-
- # TODO: create a mechanism to omit tokens from a filter with more granularity
- __corpus_tokens_dict: Set[str] = cast(
- Set[str],
- CorpusConfig["scoring_filters"][
- 0
- ].tokens, # pyright: ignore[reportAttributeAccessIssue]
- )
- __corpus_tokens_dict -= {
- # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
- # In this case, all of these appear more often in English by a factor of at least 10.
- "aka", # also known as
- "an", # article
- "api", # API
- "i", # 1st person
- "kana", # japanese script
- "me", # 1st person
- "ne", # "no" in several languages
- "nu", # "new", now in dutch
- "se", # spanish particle, "see"
- "take", # acquire, perhaps forcefully or without permission
- "ten", # 10
- "to", # to, too
- "u", # no u
- "we", # 1st person plural
- "wi", # wii and discussions of syllables
- "sole", # singular, of shoe
- # unexplored candidates for removal
- # "omen", # ominous
- # "papa", # father
- # "lo", # "lo" and "loo"
- # "ewe", # sheep
- # "pa", # father- eh?
- }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -150,7 +138,7 @@ LazyConfig: IloConfig = {
  }
  """This is extremely silly."""
  IsipinEpikuConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
@@ -170,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
  }


- DiscordConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
- "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numeric, Punctuation],
- "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
- And(LongSyllabic, Not(FalsePosSyllabic)),
- LongProperName,
- And(LongAlphabetic, Not(FalsePosAlphabetic)),
- ],
- "scorer": SoftScaling,
- "passing_score": 0.8,
- }
-
- TelegramConfig: IloConfig = deepcopy(PrefConfig)
- ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
  __all__ = [
  "BaseConfig",
  "CorpusConfig",
- "DiscordConfig",
- "ForumConfig",
  "IloConfig",
  "LazyConfig",
  "PrefConfig",
- "TelegramConfig",
  ]
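For context on the config changes above: `NimiLinkuByUsage(30)` and `NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES)` evaluate to filter *classes*, so each config remains a plain list of filter types. A minimal sketch of consuming one of these configs, assuming the usual `Ilo(**config)` construction (the example sentences and printed results are illustrative only):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

# IloConfig's keys mirror Ilo.__init__'s parameters, so a config can be
# unpacked directly into the constructor.
ilo = Ilo(**PrefConfig)

print(ilo.is_toki_pona("mi olin e sina"))               # expected: True
print(ilo.is_toki_pona("this is an English sentence"))  # expected: False
```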
sonatoki/Filters.py CHANGED
@@ -1,37 +1,33 @@
  # STL
  import re
  from abc import ABC, abstractmethod
- from typing import Set, List, Type
+ from copy import deepcopy
+ from typing import Set, List, Type, Union, Literal, Optional
  from functools import lru_cache as cache # cache comes in 3.9

  # PDM
  import regex
- from typing_extensions import override, deprecated
+ from typing_extensions import override

  # LOCAL
+ from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
  from sonatoki.utils import prep_dictionary
  from sonatoki.constants import (
  VOWELS,
- NIMI_PU,
  ALPHABET,
  ALL_PUNCT,
  ALLOWABLES,
  CONSONANTS,
  NIMI_UCSUR,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
  NOT_IN_PUNCT_CLASS,
- NIMI_LINKU_UNCOMMON,
  ALL_PUNCT_RANGES_STR,
  FALSE_POS_ALPHABETIC,
  UCSUR_PUNCT_RANGES_STR,
  EMOJI_VARIATION_SELECTOR_RANGES_STR,
+ words_by_tag,
+ words_by_usage,
  )

  regex.DEFAULT_VERSION = regex.VERSION1
@@ -101,6 +97,20 @@ class MemberFilter(Filter):
  def filter(cls, token: str) -> bool:
  return token.lower() in cls.tokens

+ def __new__(
+ cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+ ) -> Type[Filter]:
+ parent_tokens = deepcopy(cls.tokens)
+ if add:
+ parent_tokens = parent_tokens.union(add)
+ if sub:
+ parent_tokens -= sub
+
+ class AnonMemberFilter(MemberFilter):
+ tokens = parent_tokens
+
+ return AnonMemberFilter
+

  class SubsetFilter(Filter):
  tokens: Set[str]
@@ -155,40 +165,46 @@ class LongProperName(MinLen, ProperName):
  length = 2 # reject "names" of length 1


- class NimiPu(MemberFilter):
- tokens = prep_dictionary(NIMI_PU)
-
-
- class NimiPuSynonyms(MemberFilter):
- tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+ class NimiLinkuByUsage:
+ def __new__(
+ cls,
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Type[MemberFilter]:
+ words = words_by_usage(usage, date)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiKuSuli(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_SULI)
+ return AnonLinkuMemberFilter


- class NimiKuLili(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_LILI)
+ class NimiLinkuByTag:
+ def __new__(
+ cls,
+ tag: Union[Literal["usage_category"], Literal["book"]],
+ category: Union[LinkuUsageCategory, LinkuBooks],
+ ) -> Type[MemberFilter]:
+ words = words_by_tag(tag, category)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiLinkuCore(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_CORE)
+ return AnonLinkuMemberFilter


- class NimiLinkuCommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_COMMON)
+ NimiPu = NimiLinkuByTag("book", "pu")
+ NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+ NimiKuLili = NimiLinkuByTag("book", "ku lili")
+ NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+ NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+ NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+ NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+ NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


- class NimiLinkuUncommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
-
-
- class NimiLinkuObscure(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
-
-
- class NimiLinkuSandbox(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+ class NimiPuSynonyms(MemberFilter):
+ tokens = prep_dictionary(NIMI_PU_SYNONYMS)


  class NimiUCSUR(MemberFilter):
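The key change in Filters.py is that the word-list filters are now produced by factories: `NimiLinkuByUsage` and `NimiLinkuByTag` return `MemberFilter` subclasses, and `MemberFilter.__new__` lets any such class be "called" again with `add`/`sub` sets to get a tweaked copy. A small sketch under those assumptions (the exclusion set here is hypothetical):

```python
from sonatoki.Filters import NimiLinkuByUsage, NimiLinkuByTag

# Both factories return MemberFilter subclasses, not instances.
CommonWords = NimiLinkuByUsage(60)        # words at or above a usage score of 60
PuWords = NimiLinkuByTag("book", "pu")    # same construction the module uses for NimiPu

# Because MemberFilter.__new__ accepts add/sub, a returned class can be called
# again to produce a copy with a modified token set.
TrimmedWords = CommonWords(sub={"a", "anu"})  # hypothetical exclusions

print(PuWords.filter("toki"))       # True: "toki" is a pu word
print(TrimmedWords.filter("a"))     # False: explicitly subtracted above
```

This is the same pattern CorpusConfig relies on with `NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES)`: build the broadest dictionary, then subtract known English lookalikes.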
sonatoki/Preprocessors.py CHANGED
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
  pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


+ class Codeblock(RegexPreprocessor):
+ """Remove codeblocks marked by a set of three backticks on their own lines.
+
+ Subset of what would be removed by Backticks, but may be preferable.
+ """
+
+ pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
  class Spoilers(RegexPreprocessor):
  """Remove paired double bars and their contents `||like this||`"""
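The practical difference between the two preprocessors can be seen by applying their patterns directly. This sketch reuses the regexes verbatim rather than going through the RegexPreprocessor machinery, which is not shown in this diff:

```python
import re

backticks = re.compile(r"`[^`]+`", flags=re.DOTALL)                  # Backticks
codeblock = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)  # Codeblock

msg = "toki! `ilo li pona`\n```\nprint('hello')\n```\n"

# Backticks strips any paired-backtick span, including the interior of the
# fenced block; Codeblock strips only full ```-fenced blocks, so short inline
# code is left in place for the scoring filters to judge.
print(backticks.sub(" ", msg))
print(codeblock.sub(" ", msg))
```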
sonatoki/Scorers.py CHANGED
@@ -1,17 +1,15 @@
  # STL
  import math
  from abc import ABC, abstractmethod
- from typing import Dict, List, Type, Union
+ from typing import List, Type

  # PDM
  from typing_extensions import override

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter

- Number = Union[int, float]
- Weights = Dict[str, Number]
-

  class Scorer(ABC):
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
  scoring."""


- # class Logarithmic(Scorer): ...
+ class SentenceScorer(ABC):
+ @classmethod
+ @abstractmethod
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ """Re-score a list of sentences (scorecards, sentences with all their
+ metadata) and return them."""
+ raise NotImplementedError
+
+
+ class SentNoOp(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return scorecards


- __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
+ class SentAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ total = sum(card["score"] for card in scorecards)
+ avg = total / len(scorecards)
+ for card in scorecards:
+ card["score"] = avg
+ return scorecards
+
+
+ class SentWeightedAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ weighted_total = 0
+ total_len = 0
+ for card in scorecards:
+ cardlen = len(card["cleaned"])
+ cardscore = card["score"]
+
+ weighted_total += cardlen * cardscore
+ total_len += cardlen
+
+ weighted_avg = weighted_total / total_len
+ for card in scorecards:
+ card["score"] = weighted_avg
+ return scorecards
+
+
+ __all__ = [
+ "PassFail",
+ "Scaling",
+ "SoftPassFail",
+ "SoftScaling",
+ "Soften",
+ "SentAvg",
+ "SentWeightedAvg",
+ ]
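The new sentence scorers operate on whole scorecards rather than token lists. A small self-contained sketch of the difference between `SentAvg` and `SentWeightedAvg`; the scorecard dicts here are hand-built stand-ins for Ilo output:

```python
from sonatoki.Scorers import SentAvg, SentWeightedAvg

cards = [
    {"text": "mi moku", "tokenized": ["mi", "moku"], "filtered": ["mi", "moku"],
     "cleaned": ["mi", "moku"], "score": 1.0},
    {"text": "a cat sat down", "tokenized": ["a", "cat", "sat", "down"],
     "filtered": ["a", "cat", "sat", "down"], "cleaned": ["a", "cat", "sat", "down"],
     "score": 0.25},
]

# SentAvg assigns every sentence the plain mean: (1.0 + 0.25) / 2 = 0.625.
# SentWeightedAvg weights by cleaned-token count: (2*1.0 + 4*0.25) / 6 = 0.5.
# Both mutate and return the scorecards, so copies are passed here.
print([c["score"] for c in SentAvg.score([dict(c) for c in cards])])
print([c["score"] for c in SentWeightedAvg.score([dict(c) for c in cards])])
```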
sonatoki/constants.py CHANGED
@@ -1,11 +1,16 @@
  # STL
  import json
- from typing import Set, Dict
+ from typing import Set, Dict, Optional
  from pathlib import Path

  # LOCAL
+ from sonatoki.types import LinkuWord, LinkuUsageDate
  from sonatoki.utils import find_unicode_chars, find_unicode_ranges

+ LATEST_DATE = "2023-09"
+ # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
  # `\p{Punctuation}` character class
  # https://www.compart.com/en/unicode/category
  # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
  "iluminate",
  "imense",
  "imitate",
+ "inanimate",
  "injoke",
  "insane",
  "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
- return {d["word"] for d in data.values() if d[key] == value}
+ def linku_data() -> Dict[str, LinkuWord]:
+ # NOTE: this does open+read+parse two files each time you construct a filter
+ # but i expect users to construct filters only at the start of runtime
+ # there is no reason to waste your RAM by leaving the linku data in it
+ with open(LINKU) as f:
+ linku: Dict[str, LinkuWord] = json.loads(f.read())
+ with open(SANDBOX) as f:
+ sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+ return {**linku, **sandbox}
+

+ def words_by_tag(tag: str, value: str) -> Set[str]:
+ data = linku_data()
+ return {d["word"] for d in data.values() if d[tag] == value}

- with open(LINKU) as f:
- linku: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_PU = category_helper(linku, "book", "pu")
- NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
- NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+ def words_by_usage(
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Set[str]:
+ if not date:
+ date = LATEST_DATE
+ data = linku_data()

- NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
- NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
- NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
- NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+ result: Set[str] = set()
+ for word in data.values():
+ usages = word["usage"]
+ if date in usages and usages[date] >= usage:
+ result.add(word["word"])
+
+ return result
+
+
+ NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- with open(SANDBOX) as f:
- sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

  # with open(SYLLABICS) as f:
  # FALSE_POS_SYLLABIC = {line.strip() for line in f}
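These two helpers replace the old precomputed NIMI_* sets: they read the bundled linku.json and sandbox.json on each call and return plain sets of words. A short sketch of how they are presumably queried; the exact counts depend on the bundled Linku data:

```python
from sonatoki.constants import words_by_tag, words_by_usage

pu_words = words_by_tag("book", "pu")   # words tagged as belonging to the pu book
frequent = words_by_usage(90)           # usage score >= 90 at LATEST_DATE
older = words_by_usage(90, "2022-08")   # same threshold at an earlier survey date

print(len(pu_words), len(frequent), len(frequent & pu_words))
```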
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
  # with open(ALPHABETICS) as f:
  # FALSE_POS_ALPHABETIC = {line.strip() for line in f}

- del linku
- del sandbox
-
  __all__ = [
  "ALLOWABLES",
  "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
  "CONSONANTS",
  "EMOJI_VARIATION_SELECTOR_RANGES",
  "EMOJI_VARIATION_SELECTOR_RANGES_STR",
- "NIMI_KU_LILI",
- "NIMI_KU_SULI",
- "NIMI_LINKU_COMMON",
- "NIMI_LINKU_CORE",
- "NIMI_LINKU_OBSCURE",
- "NIMI_LINKU_SANDBOX",
- "NIMI_LINKU_UNCOMMON",
- "NIMI_PU",
  "NIMI_PU_SYNONYMS",
  "POSIX_PUNCT",
  "POSIX_PUNCT_RANGES",
sonatoki/ilo.py CHANGED
@@ -1,17 +1,14 @@
  # STL
- from typing import List, Type, Tuple
+ from typing import List, Type

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter
- from sonatoki.Scorers import Number, Scorer
+ from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
  from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
  from sonatoki.Preprocessors import Preprocessor

- # tokenized, filtered, cleaned, score, result
- Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
- # TODO: scorecard kinda sucks as a name
-

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
  __ignoring_filters: List[Type[Filter]]
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
+ __sentence_scorer: Type[SentenceScorer]
  __passing_score: Number

  def __init__(
@@ -31,6 +29,7 @@
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
  passing_score: Number,
+ sentence_scorer: Type[SentenceScorer] = SentNoOp,
  word_tokenizer: Type[Tokenizer] = WordTokenizer,
  sent_tokenizer: Type[Tokenizer] = SentTokenizer,
  ):
@@ -43,6 +42,7 @@
  self.__ignoring_filters = [*ignoring_filters]
  self.__scoring_filters = [*scoring_filters]
  self.__scorer = scorer
+ self.__sentence_scorer = sentence_scorer
  self.__passing_score = passing_score

  def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@
  return self.__word_tokenizer.tokenize(msg)

  def sent_tokenize(self, msg: str) -> List[str]:
+ """It is *highly* recommended that you run `ilo.preprocess` first."""
  return self.__sent_tokenizer.tokenize(msg)

  def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@
  def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

+ def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return self.__sentence_scorer.score(scorecards)
+
  def _is_toki_pona(self, message: str) -> Scorecard:
  """Process a message into its tokens, then filters, cleans, and scores
- them. Returns all parts. Message must already be preprocessed, normally
- done in `self.is_toki_pona(message)`.
-
- Returns all components of the processing algorithm except preprocessing:
- - Tokenized message (list[str])
- - Filtered message (list[str])
- - Cleaned message (list[str])
- - Score (float)
- - Result (bool)
+ them. Message must already be preprocessed, normally done in
+ `self.is_toki_pona(message)`.
+
+ Returns a `Scorecard` with all changes to the input text and a score.
  """
  tokenized = self.word_tokenize(message)
  filtered = self.filter_tokens(tokenized)
  cleaned = self.clean_tokens(filtered)
  score = self.score_tokens(cleaned)
- result = score >= self.__passing_score

- return tokenized, filtered, cleaned, score, result
+ scorecard: Scorecard = {
+ "text": message,
+ "tokenized": tokenized,
+ "filtered": filtered,
+ "cleaned": cleaned,
+ "score": score,
+ }
+
+ return scorecard

  def is_toki_pona(self, message: str) -> bool:
- """Determines whether a single statement is or is not Toki Pona."""
+ """Determines whether a text is or is not Toki Pona."""
  message = self.preprocess(message)
- *_, result = self._is_toki_pona(message)
- return result
+ scorecard = self._is_toki_pona(message)
+ return scorecard["score"] >= self.__passing_score

  def _are_toki_pona(self, message: str) -> List[Scorecard]:
- """Split a message into sentences, then return a list each sentence's
- results via `self._is_toki_pona()`.
+ """Split a message into sentences, then return a list with each
+ sentence's scorecard from `self._is_toki_pona()`.

  Message must already be preprocessed, normally done in
  `self.are_toki_pona(message)`.
  """
- results: List[Scorecard] = list()
+ scorecards: List[Scorecard] = list()
  for sentence in self.sent_tokenize(message):
  result = self._is_toki_pona(sentence)
- results.append(result)
- return results
+ scorecards.append(result)
+ scorecards = self.score_sentences(scorecards)
+ return scorecards

  def are_toki_pona(self, message: str) -> List[bool]:
  """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
  ```
  """
  message = self.preprocess(message)
- results = self._are_toki_pona(message)
- return [res[-1] for res in results]
+ scorecards = self._are_toki_pona(message)
+ return [card["score"] >= self.__passing_score for card in scorecards]
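Putting the pieces together, the sentence scorer slots into Ilo construction as an optional argument, with `SentNoOp` as the default. A hedged sketch that merges a stock config with a non-default sentence scorer:

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import CorpusConfig
from sonatoki.Scorers import SentWeightedAvg

# Merge in dict form so sentence_scorer is not passed twice if a future
# config version ever sets it itself.
config = {**CorpusConfig, "sentence_scorer": SentWeightedAvg}
ilo = Ilo(**config)

# One bool per detected sentence; with SentWeightedAvg every sentence shares
# a single length-weighted score, so the sentences pass or fail together.
print(ilo.are_toki_pona("mi moku e kili. a cat sat on the mat."))
```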
sonatoki/types.py ADDED
@@ -0,0 +1,60 @@
+ # STL
+ from typing import Dict, List, Union, Literal, TypedDict
+
+ Number = Union[int, float]
+
+
+ # TODO: scorecard kinda sucks as a name
+ class Scorecard(TypedDict):
+ text: str
+ tokenized: List[str]
+ filtered: List[str]
+ cleaned: List[str]
+ score: Number
+
+
+ LinkuUsageDate = Union[
+ Literal["2020-04"],
+ Literal["2021-10"],
+ Literal["2022-08"],
+ Literal["2023-09"],
+ # Literal["2024-09"],
+ ]
+
+ LinkuUsageCategory = Union[
+ Literal["core"],
+ Literal["common"],
+ Literal["uncommon"],
+ Literal["obscure"],
+ Literal["sandbox"],
+ ]
+
+ LinkuBooks = Union[
+ Literal["pu"],
+ Literal["ku suli"],
+ Literal["ku lili"],
+ Literal["none"],
+ ]
+
+
+ class LinkuWord(TypedDict):
+ id: str
+ author_verbatim: str
+ author_verbatim_source: str
+ book: str
+ coined_era: str
+ coined_year: str
+ creator: List[str]
+ ku_data: Dict[str, int]
+ see_also: List[str]
+ resources: Dict[str, str]
+ representations: Dict[str, Union[str, List[str]]]
+ source_language: str
+ usage_category: LinkuUsageCategory
+ word: str
+ deprecated: bool
+ etymology: List[Dict[str, str]]
+ audio: List[Dict[str, str]]
+ pu_verbatim: Dict[str, str]
+ usage: Dict[LinkuUsageDate, int]
+ translations: Dict[str, Dict[str, str]]
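Since `Scorecard` is an ordinary TypedDict, downstream code can consume the per-sentence results with plain dict access. A hypothetical helper, purely for illustration:

```python
from typing import List

from sonatoki.types import Scorecard


def worst_sentence(cards: List[Scorecard]) -> str:
    """Return the original text of the lowest-scoring sentence."""
    return min(cards, key=lambda card: card["score"])["text"]
```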
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.6.2
+ Version: 0.7.0
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
@@ -1,20 +1,21 @@
- sonatoki-0.6.2.dist-info/METADATA,sha256=DfNErx2CBzvtmLA-ANWO6LeGNyR3bywqb_ITgOhc4ew,6517
- sonatoki-0.6.2.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
- sonatoki-0.6.2.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+ sonatoki-0.7.0.dist-info/METADATA,sha256=s6w7_WaARQijvFIFIWtg8hL2WzAkj19N7-DsKgfhi3s,6517
+ sonatoki-0.7.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+ sonatoki-0.7.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
  sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
- sonatoki/Configs.py,sha256=RD6YUYW45pwIFx8ebJgGs5PhIhL9sjn_VqIg4zf3VUE,5697
- sonatoki/Filters.py,sha256=nVSmw5M4sEYA_8KI1fI53rMHkd9KO6yWbKfdxxExxN8,11700
- sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
- sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
+ sonatoki/Configs.py,sha256=rIvrkYjeJeCuWwJIjvmJX6keRZcUJ0pt7h7KdYT5IFI,4766
+ sonatoki/Filters.py,sha256=cJ5skX9yeqd4HvjzPxIAswigRWvO0ZV2nepQksFedtk,12575
+ sonatoki/Preprocessors.py,sha256=nY0_cmF4aEmGZxXc7ZEvhvf2BZO6GnrMUC8IqDwu47A,6034
+ sonatoki/Scorers.py,sha256=aCU3p9rD4QOy-uu851FGGw-ARqUCG_l4V_z5rtRL420,5236
  sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
  sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
  sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
- sonatoki/constants.py,sha256=mPbU-X9PNzelOHVZn-8ZqR_ewKYNjDA6lj2XQpnuoRw,19212
- sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
+ sonatoki/constants.py,sha256=BxE_MME2XZUZLg9ZezPirUO2sxw4JkujsrKoENeYORc,19313
+ sonatoki/ilo.py,sha256=Dsn0yagkwjqpAQoCj6mkZ6NqWeanRF2lxNDNoqjWGLo,5993
  sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
  sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
  sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
+ sonatoki/types.py,sha256=zoVJeaDLOPstREiHtoD9pv-AOCsJq2C4_GG3nTYd114,1267
  sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
- sonatoki-0.6.2.dist-info/RECORD,,
+ sonatoki-0.7.0.dist-info/RECORD,,