sonatoki 0.6.3__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.6.3 → sonatoki-0.8.0}/PKG-INFO +1 -1
  2. {sonatoki-0.6.3 → sonatoki-0.8.0}/pyproject.toml +1 -1
  3. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py +13 -38
  4. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py +60 -36
  5. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py +9 -0
  6. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py +61 -6
  7. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py +38 -27
  8. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py +34 -27
  9. sonatoki-0.8.0/src/sonatoki/types.py +60 -0
  10. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py +40 -36
  11. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py +54 -5
  12. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py +20 -0
  13. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py +12 -22
  14. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py +2 -2
  15. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py +2 -5
  16. {sonatoki-0.6.3 → sonatoki-0.8.0}/LICENSE +0 -0
  17. {sonatoki-0.6.3 → sonatoki-0.8.0}/README.md +0 -0
  18. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Cleaners.py +0 -0
  19. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Tokenizers.py +0 -0
  20. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__init__.py +0 -0
  21. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/__main__.py +0 -0
  22. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/alphabetic.txt +0 -0
  23. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/linku.json +0 -0
  24. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/py.typed +0 -0
  25. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/sandbox.json +0 -0
  26. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/syllabic.txt +0 -0
  27. {sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/utils.py +0 -0
  28. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/__init__.py +0 -0
  29. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_cleaners.py +0 -0
  30. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.6.3 → sonatoki-0.8.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.6.3 → sonatoki-0.8.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.6.3
+ Version: 0.8.0
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
{sonatoki-0.6.3 → sonatoki-0.8.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.6.3"
+ version = "0.8.0"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Configs.py
@@ -1,42 +1,41 @@
  # STL
- from copy import deepcopy
  from typing import List, Type, TypedDict

  # PDM
  from typing_extensions import NotRequired

  # LOCAL
+ from sonatoki.types import Number
  from sonatoki.Filters import (
  Or,
  And,
  Not,
  Filter,
+ PuName,
  Numeric,
  NimiUCSUR,
  Alphabetic,
  NimiKuLili,
  NimiKuSuli,
- ProperName,
  Punctuation,
  LongSyllabic,
  Miscellaneous,
- NimiLinkuCore,
  LongAlphabetic,
  LongProperName,
- NimiLinkuCommon,
  FalsePosSyllabic,
+ NimiLinkuByUsage,
  NimiLinkuObscure,
  NimiLinkuSandbox,
  NimiLinkuUncommon,
  FalsePosAlphabetic,
  )
- from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
  from sonatoki.Preprocessors import (
  URLs,
  Emoji,
- Backticks,
+ Codeblock,
  Reference,
  Preprocessor,
  AngleBracketObject,
@@ -95,11 +94,11 @@ BaseConfig: IloConfig = {


  PrefConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
+ Or(NimiLinkuByUsage(30), NimiUCSUR),
  And(LongSyllabic, Not(FalsePosSyllabic)),
  # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
  LongProperName,
@@ -110,16 +109,13 @@ PrefConfig: IloConfig = {
  }

  CorpusConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
  Or(
- NimiLinkuCore,
- NimiLinkuCommon,
- NimiLinkuUncommon,
- NimiLinkuObscure(sub=__DICT_PHONOMATCHES),
- NimiLinkuSandbox(sub=__DICT_PHONOMATCHES),
+ # awkward but efficient syntax
+ NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
  NimiUCSUR,
  Miscellaneous,
  ),
@@ -132,17 +128,17 @@ CorpusConfig: IloConfig = {
  }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
- "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
+ "scoring_filters": [Alphabetic, NimiUCSUR, PuName, Miscellaneous],
  "scorer": SoftPassFail,
  "passing_score": 0.8,
  "word_tokenizer": WordTokenizerRe, # mimics old tokenizer
  }
  """This is extremely silly."""
  IsipinEpikuConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
@@ -162,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
  }


- DiscordConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
- "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numeric, Punctuation],
- "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
- And(LongSyllabic, Not(FalsePosSyllabic)),
- LongProperName,
- And(LongAlphabetic, Not(FalsePosAlphabetic)),
- ],
- "scorer": SoftScaling,
- "passing_score": 0.8,
- }
-
- TelegramConfig: IloConfig = deepcopy(PrefConfig)
- ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
  __all__ = [
  "BaseConfig",
  "CorpusConfig",
- "DiscordConfig",
- "ForumConfig",
  "IloConfig",
  "LazyConfig",
  "PrefConfig",
- "TelegramConfig",
  ]
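Note: `PrefConfig` now builds its dictionary filter from `NimiLinkuByUsage(30)` instead of the fixed core/common/uncommon sets. A minimal usage sketch, not part of the diff; it assumes the `IloConfig` mapping unpacks directly into `Ilo`'s constructor, which the matching parameter names in ilo.py suggest:

```python
# Hypothetical usage sketch, assuming IloConfig unpacks into Ilo's constructor.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# In 0.8.0 the word list behind PrefConfig comes from NimiLinkuByUsage(30),
# i.e. every Linku word at or above 30% reported usage.
print(ilo.is_toki_pona("mi olin e sina"))  # expected True for plain toki pona
```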
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Filters.py
@@ -2,37 +2,32 @@
  import re
  from abc import ABC, abstractmethod
  from copy import deepcopy
- from typing import Set, List, Type, Optional
+ from typing import Set, List, Type, Union, Literal, Optional
  from functools import lru_cache as cache # cache comes in 3.9

  # PDM
  import regex
- from typing_extensions import override, deprecated
+ from typing_extensions import override

  # LOCAL
+ from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
  from sonatoki.utils import prep_dictionary
  from sonatoki.constants import (
  VOWELS,
- NIMI_PU,
  ALPHABET,
  ALL_PUNCT,
  ALLOWABLES,
  CONSONANTS,
  NIMI_UCSUR,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
  NOT_IN_PUNCT_CLASS,
- NIMI_LINKU_UNCOMMON,
  ALL_PUNCT_RANGES_STR,
  FALSE_POS_ALPHABETIC,
  UCSUR_PUNCT_RANGES_STR,
  EMOJI_VARIATION_SELECTOR_RANGES_STR,
+ words_by_tag,
+ words_by_usage,
  )

  regex.DEFAULT_VERSION = regex.VERSION1
@@ -146,8 +141,27 @@ class FalsePosAlphabetic(MemberFilter):


  class ProperName(Filter):
- """Determines if a given token is a valid name (also called a loan word).
- When Toki Pona is written with the Latin alphabet, names are generally
+ """Determine if a given token is a valid name based on a reasonable weakening of
+ the rules given in Toki Pona: The Language of Good. A token matches if it has a capital
+ letter at its start and is **not** fully capitalized.
+
+ This corrects an issue with PuName, where scripts lacking a case distinction are
+ errantly counted"""
+
+ @classmethod
+ @override
+ @cache(maxsize=None)
+ def filter(cls, token: str) -> bool:
+ first_capitalized = token[0].isupper()
+ all_caps = token.isupper()
+
+ return first_capitalized and not all_caps
+
+
+ class PuName(Filter):
+ """Determine if a given token is a valid name (also called a loan word) based on
+ the rules given in Toki Pona: The Language of Good.
+ When Toki Pona is written with the Latin alphabet, names are
  capitalized at their start. This filter identifies those tokens.

  Note that this alone cannot determine if a token is a valid name,
@@ -161,6 +175,9 @@ class ProperName(Filter):
  @override
  @cache(maxsize=None)
  def filter(cls, token: str) -> bool:
+ # first_capitalized = token[0].isupper()
+ # rest_capitalized = token[1:] == token[1:].upper()
+ # return first_capitalized and not rest_capitalized
  return token == token.capitalize()
  # TODO: If the token is in a script which doesn't have a case distinction,
  # this will errantly match.
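The old capitalization check is kept as `PuName`, while `ProperName` relaxes it: a token passes with a leading capital as long as it is not fully capitalized. A small sketch of the difference, restating the two `filter` bodies above as standalone functions rather than the library API:

```python
# Standalone restatement of the two checks shown in the hunks above.
def proper_name(token: str) -> bool:
    # leading capital, but not ALL CAPS
    return token[0].isupper() and not token.isupper()


def pu_name(token: str) -> bool:
    # pu's stricter reading: "Mysql"-style capitalization only
    return token == token.capitalize()


assert proper_name("MySQL") and not pu_name("MySQL")  # mixed-case loan words now pass
assert pu_name("21st") and not proper_name("21st")    # caseless tokens no longer pass
assert proper_name("Kekan") and pu_name("Kekan")      # ordinary names pass both
```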
@@ -170,40 +187,46 @@ class LongProperName(MinLen, ProperName):
  length = 2 # reject "names" of length 1


- class NimiPu(MemberFilter):
- tokens = prep_dictionary(NIMI_PU)
-
-
- class NimiPuSynonyms(MemberFilter):
- tokens = prep_dictionary(NIMI_PU_SYNONYMS)
-
-
- class NimiKuSuli(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_SULI)
-
-
- class NimiKuLili(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_LILI)
+ class NimiLinkuByUsage:
+ def __new__(
+ cls,
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Type[MemberFilter]:
+ words = words_by_usage(usage, date)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiLinkuCore(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_CORE)
+ return AnonLinkuMemberFilter


- class NimiLinkuCommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_COMMON)
+ class NimiLinkuByTag:
+ def __new__(
+ cls,
+ tag: Union[Literal["usage_category"], Literal["book"]],
+ category: Union[LinkuUsageCategory, LinkuBooks],
+ ) -> Type[MemberFilter]:
+ words = words_by_tag(tag, category)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiLinkuUncommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+ return AnonLinkuMemberFilter


- class NimiLinkuObscure(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
+ NimiPu = NimiLinkuByTag("book", "pu")
+ NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+ NimiKuLili = NimiLinkuByTag("book", "ku lili")
+ NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+ NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+ NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+ NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+ NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


- class NimiLinkuSandbox(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+ class NimiPuSynonyms(MemberFilter):
+ tokens = prep_dictionary(NIMI_PU_SYNONYMS)


  class NimiUCSUR(MemberFilter):
@@ -444,6 +467,7 @@ __all__ = [
  "Or",
  "Phonotactic",
  "ProperName",
+ "PuName",
  "Punctuation",
  "Syllabic",
  ]
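`NimiLinkuByUsage` and `NimiLinkuByTag` are factories: calling one returns a new `MemberFilter` subclass, and the familiar named filters are now just module-level aliases built through `NimiLinkuByTag`. A hedged sketch of direct use:

```python
# Sketch of the factory pattern above; the returned objects are filter *classes*.
from sonatoki.Filters import NimiLinkuByTag, NimiLinkuByUsage

KuSuliWords = NimiLinkuByTag("book", "ku suli")  # the same set NimiKuSuli now aliases
Common2022 = NimiLinkuByUsage(60, "2022-08")     # words at >= 60% usage in that survey
CommonNow = NimiLinkuByUsage(60)                 # defaults to LATEST_DATE from constants.py

print(KuSuliWords.filter("kipisi"))  # plain membership test, True/False
print(CommonNow.filter("tonsi"))     # result depends on the bundled Linku usage data
```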
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Preprocessors.py
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
  pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


+ class Codeblock(RegexPreprocessor):
+ """Remove codeblocks marked by a set of three backticks on their own lines.
+
+ Subset of what would be removed by Backticks, but may be preferable.
+ """
+
+ pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
  class Spoilers(RegexPreprocessor):
  """Remove paired double bars and their contents `||like this||`"""
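Unlike Backticks, which strips any single-backtick span, Codeblock only removes three-backtick fenced blocks. A small sketch using the `process` entry point that the new test_Codeblock test (test_preprocessors.py, below) exercises:

```python
# Sketch of the new preprocessor; Codeblock.process is the same classmethod the
# new test_Codeblock test calls.
from sonatoki.Preprocessors import Backticks, Codeblock

msg = "o lukin e ni:\n```\nprint('ni li nasa')\n```\nni li `pona`"
print(Codeblock.process(msg))  # fenced block removed; the inline `pona` span stays
print(Backticks.process(msg))  # Backticks would strip the inline span as well
```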
 
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/Scorers.py
@@ -1,17 +1,15 @@
  # STL
  import math
  from abc import ABC, abstractmethod
- from typing import Dict, List, Type, Union
+ from typing import List, Type

  # PDM
  from typing_extensions import override

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter

- Number = Union[int, float]
- Weights = Dict[str, Number]
-

  class Scorer(ABC):
  @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
  scoring."""


- # class Logarithmic(Scorer): ...
+ class SentenceScorer(ABC):
+ @classmethod
+ @abstractmethod
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ """Re-score a list of sentences (scorecards, sentences with all their
+ metadata) and return them."""
+ raise NotImplementedError
+
+
+ class SentNoOp(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return scorecards


- __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
+ class SentAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ total = sum(card["score"] for card in scorecards)
+ avg = total / len(scorecards)
+ for card in scorecards:
+ card["score"] = avg
+ return scorecards
+
+
+ class SentWeightedAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ weighted_total = 0
+ total_len = 0
+ for card in scorecards:
+ cardlen = len(card["cleaned"])
+ cardscore = card["score"]
+
+ weighted_total += cardlen * cardscore
+ total_len += cardlen
+
+ weighted_avg = weighted_total / total_len
+ for card in scorecards:
+ card["score"] = weighted_avg
+ return scorecards
+
+
+ __all__ = [
+ "PassFail",
+ "Scaling",
+ "SoftPassFail",
+ "SoftScaling",
+ "Soften",
+ "SentAvg",
+ "SentWeightedAvg",
+ ]
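`SentenceScorer` adds a second pass over per-sentence scorecards: `SentAvg` assigns every sentence the plain mean, while `SentWeightedAvg` weights each sentence's score by its number of cleaned tokens. A hand-worked sketch of the weighted variant, with scorecards built inline purely for illustration:

```python
# Sketch: only the "cleaned" and "score" keys matter to SentWeightedAvg.
from sonatoki.Scorers import SentWeightedAvg

cards = [
    {"text": "toki a", "tokenized": [], "filtered": [],
     "cleaned": ["toki", "a"], "score": 1.0},
    {"text": "hello there friend", "tokenized": [], "filtered": [],
     "cleaned": ["hello", "there", "friend"], "score": 0.0},
]
SentWeightedAvg.score(cards)
# weighted average = (2 * 1.0 + 3 * 0.0) / (2 + 3) = 0.4, written back to every card
print([card["score"] for card in cards])  # [0.4, 0.4]
```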
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/constants.py
@@ -1,11 +1,16 @@
  # STL
  import json
- from typing import Set, Dict
+ from typing import Set, Dict, Optional
  from pathlib import Path

  # LOCAL
+ from sonatoki.types import LinkuWord, LinkuUsageDate
  from sonatoki.utils import find_unicode_chars, find_unicode_ranges

+ LATEST_DATE = "2023-09"
+ # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
  # `\p{Punctuation}` character class
  # https://www.compart.com/en/unicode/category
  # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
  "iluminate",
  "imense",
  "imitate",
+ "inanimate",
  "injoke",
  "insane",
  "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]


- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
- return {d["word"] for d in data.values() if d[key] == value}
+ def linku_data() -> Dict[str, LinkuWord]:
+ # NOTE: this does open+read+parse two files each time you construct a filter
+ # but i expect users to construct filters only at the start of runtime
+ # there is no reason to waste your RAM by leaving the linku data in it
+ with open(LINKU) as f:
+ linku: Dict[str, LinkuWord] = json.loads(f.read())
+ with open(SANDBOX) as f:
+ sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+ return {**linku, **sandbox}
+

+ def words_by_tag(tag: str, value: str) -> Set[str]:
+ data = linku_data()
+ return {d["word"] for d in data.values() if d[tag] == value}


- with open(LINKU) as f:
- linku: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_PU = category_helper(linku, "book", "pu")
- NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
- NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+ def words_by_usage(
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Set[str]:
+ if not date:
+ date = LATEST_DATE
+ data = linku_data()

- NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
- NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
- NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
- NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+ result: Set[str] = set()
+ for word in data.values():
+ usages = word["usage"]
+ if date in usages and usages[date] >= usage:
+ result.add(word["word"])
+
+ return result
+
+
+ NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- with open(SANDBOX) as f:
- sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

  # with open(SYLLABICS) as f:
  # FALSE_POS_SYLLABIC = {line.strip() for line in f}
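The module-level NIMI_* sets are gone (see the __all__ hunk below); word sets are now built on demand from the bundled Linku data. A short migration sketch; the actual word counts depend on the linku.json and sandbox.json shipped with the package:

```python
# Sketch of the replacement lookups defined above.
from sonatoki.constants import words_by_tag, words_by_usage

nimi_pu = words_by_tag("book", "pu")                # roughly the old NIMI_PU
nimi_core = words_by_tag("usage_category", "core")  # roughly the old NIMI_LINKU_CORE
common_enough = words_by_usage(30)                  # >= 30% usage at LATEST_DATE
snapshot_2022 = words_by_usage(30, "2022-08")       # or pin an earlier survey date

print("toki" in nimi_pu, len(common_enough))
```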
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
  # with open(ALPHABETICS) as f:
  # FALSE_POS_ALPHABETIC = {line.strip() for line in f}

- del linku
- del sandbox
-
  __all__ = [
  "ALLOWABLES",
  "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
  "CONSONANTS",
  "EMOJI_VARIATION_SELECTOR_RANGES",
  "EMOJI_VARIATION_SELECTOR_RANGES_STR",
- "NIMI_KU_LILI",
- "NIMI_KU_SULI",
- "NIMI_LINKU_COMMON",
- "NIMI_LINKU_CORE",
- "NIMI_LINKU_OBSCURE",
- "NIMI_LINKU_SANDBOX",
- "NIMI_LINKU_UNCOMMON",
- "NIMI_PU",
  "NIMI_PU_SYNONYMS",
  "POSIX_PUNCT",
  "POSIX_PUNCT_RANGES",
{sonatoki-0.6.3 → sonatoki-0.8.0}/src/sonatoki/ilo.py
@@ -1,17 +1,14 @@
  # STL
- from typing import List, Type, Tuple
+ from typing import List, Type

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter
- from sonatoki.Scorers import Number, Scorer
+ from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
  from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
  from sonatoki.Preprocessors import Preprocessor

- # tokenized, filtered, cleaned, score, result
- Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
- # TODO: scorecard kinda sucks as a name
-

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
  __ignoring_filters: List[Type[Filter]]
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
+ __sentence_scorer: Type[SentenceScorer]
  __passing_score: Number

  def __init__(
@@ -31,6 +29,7 @@ class Ilo:
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
  passing_score: Number,
+ sentence_scorer: Type[SentenceScorer] = SentNoOp,
  word_tokenizer: Type[Tokenizer] = WordTokenizer,
  sent_tokenizer: Type[Tokenizer] = SentTokenizer,
  ):
@@ -43,6 +42,7 @@ class Ilo:
  self.__ignoring_filters = [*ignoring_filters]
  self.__scoring_filters = [*scoring_filters]
  self.__scorer = scorer
+ self.__sentence_scorer = sentence_scorer
  self.__passing_score = passing_score

  def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@ class Ilo:
  return self.__word_tokenizer.tokenize(msg)

  def sent_tokenize(self, msg: str) -> List[str]:
+ """It is *highly* recommended that you run `ilo.preprocess` first."""
  return self.__sent_tokenizer.tokenize(msg)

  def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@ class Ilo:
  def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

+ def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return self.__sentence_scorer.score(scorecards)
+
  def _is_toki_pona(self, message: str) -> Scorecard:
  """Process a message into its tokens, then filters, cleans, and scores
- them. Returns all parts. Message must already be preprocessed, normally
- done in `self.is_toki_pona(message)`.
-
- Returns all components of the processing algorithm except preprocessing:
- - Tokenized message (list[str])
- - Filtered message (list[str])
- - Cleaned message (list[str])
- - Score (float)
- - Result (bool)
+ them. Message must already be preprocessed, normally done in
+ `self.is_toki_pona(message)`.
+
+ Returns a `Scorecard` with all changes to the input text and a score.
  """
  tokenized = self.word_tokenize(message)
  filtered = self.filter_tokens(tokenized)
  cleaned = self.clean_tokens(filtered)
  score = self.score_tokens(cleaned)
- result = score >= self.__passing_score

- return tokenized, filtered, cleaned, score, result
+ scorecard: Scorecard = {
+ "text": message,
+ "tokenized": tokenized,
+ "filtered": filtered,
+ "cleaned": cleaned,
+ "score": score,
+ }
+
+ return scorecard

  def is_toki_pona(self, message: str) -> bool:
- """Determines whether a single statement is or is not Toki Pona."""
+ """Determines whether a text is or is not Toki Pona."""
  message = self.preprocess(message)
- *_, result = self._is_toki_pona(message)
- return result
+ scorecard = self._is_toki_pona(message)
+ return scorecard["score"] >= self.__passing_score

  def _are_toki_pona(self, message: str) -> List[Scorecard]:
- """Split a message into sentences, then return a list each sentence's
- results via `self._is_toki_pona()`.
+ """Split a message into sentences, then return a list with each
+ sentence's scorecard from `self._is_toki_pona()`.

  Message must already be preprocessed, normally done in
  `self.are_toki_pona(message)`.
  """
- results: List[Scorecard] = list()
+ scorecards: List[Scorecard] = list()
  for sentence in self.sent_tokenize(message):
  result = self._is_toki_pona(sentence)
- results.append(result)
- return results
+ scorecards.append(result)
+ scorecards = self.score_sentences(scorecards)
+ return scorecards

  def are_toki_pona(self, message: str) -> List[bool]:
  """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
  ```
  """
  message = self.preprocess(message)
- results = self._are_toki_pona(message)
- return [res[-1] for res in results]
+ scorecards = self._are_toki_pona(message)
+ return [card["score"] >= self.__passing_score for card in scorecards]
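`_is_toki_pona` now returns a `Scorecard` dict instead of a five-element tuple, and `_are_toki_pona` runs the optional `sentence_scorer` over the per-sentence results. A hedged end-to-end sketch; it assumes `PrefConfig` unpacks into `Ilo` as in the earlier Configs.py note:

```python
# Sketch of the 0.8.0 flow; field names match the Scorecard TypedDict in types.py below.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig
from sonatoki.Scorers import SentWeightedAvg

ilo = Ilo(**PrefConfig, sentence_scorer=SentWeightedAvg)

text = "mi olin e sina. this sentence is english."
for card in ilo._are_toki_pona(ilo.preprocess(text)):
    print(card["text"], card["score"])  # also available: tokenized, filtered, cleaned

print(ilo.are_toki_pona(text))  # per-sentence booleans against passing_score
```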
sonatoki-0.8.0/src/sonatoki/types.py
@@ -0,0 +1,60 @@
+ # STL
+ from typing import Dict, List, Union, Literal, TypedDict
+
+ Number = Union[int, float]
+
+
+ # TODO: scorecard kinda sucks as a name
+ class Scorecard(TypedDict):
+ text: str
+ tokenized: List[str]
+ filtered: List[str]
+ cleaned: List[str]
+ score: Number
+
+
+ LinkuUsageDate = Union[
+ Literal["2020-04"],
+ Literal["2021-10"],
+ Literal["2022-08"],
+ Literal["2023-09"],
+ # Literal["2024-09"],
+ ]
+
+ LinkuUsageCategory = Union[
+ Literal["core"],
+ Literal["common"],
+ Literal["uncommon"],
+ Literal["obscure"],
+ Literal["sandbox"],
+ ]
+
+ LinkuBooks = Union[
+ Literal["pu"],
+ Literal["ku suli"],
+ Literal["ku lili"],
+ Literal["none"],
+ ]
+
+
+ class LinkuWord(TypedDict):
+ id: str
+ author_verbatim: str
+ author_verbatim_source: str
+ book: str
+ coined_era: str
+ coined_year: str
+ creator: List[str]
+ ku_data: Dict[str, int]
+ see_also: List[str]
+ resources: Dict[str, str]
+ representations: Dict[str, Union[str, List[str]]]
+ source_language: str
+ usage_category: LinkuUsageCategory
+ word: str
+ deprecated: bool
+ etymology: List[Dict[str, str]]
+ audio: List[Dict[str, str]]
+ pu_verbatim: Dict[str, str]
+ usage: Dict[LinkuUsageDate, int]
+ translations: Dict[str, Dict[str, str]]
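The new module centralizes the `Number` alias, the `Scorecard` TypedDict, and the Linku-related typing so they can be imported by downstream code. A small, hypothetical sketch of using them as annotations:

```python
# Hypothetical helper showing the new types used as annotations.
from typing import List

from sonatoki.types import Number, Scorecard, LinkuUsageDate


def passing_cards(cards: List[Scorecard], cutoff: Number) -> List[Scorecard]:
    """Keep only the sentences whose score meets the cutoff."""
    return [card for card in cards if card["score"] >= cutoff]


snapshot: LinkuUsageDate = "2023-09"  # one of the survey dates Linku publishes
```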
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_filters.py
@@ -11,12 +11,12 @@ from sonatoki.Filters import (
  And,
  Not,
  NimiPu,
+ PuName,
  Numeric,
  Syllabic,
  Alphabetic,
  NimiKuLili,
  NimiKuSuli,
- ProperName,
  Phonotactic,
  Punctuation,
  AlphabeticRe,
@@ -34,23 +34,13 @@ from sonatoki.Filters import (
  NimiLinkuUncommon,
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
- from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
- NIMI_LINKU_COMMON,
- FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
- )
+ from sonatoki.constants import FALSE_POS_SYLLABIC, words_by_tag

  # FILESYSTEM
  from .test_utils import PROPER_NAME_RE


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  @example("lukin")
  @example("selo")
  @example("li")
@@ -59,14 +49,14 @@ def test_NimiPu(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_CORE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "core"))))
  @example("pona")
  def test_NimiLinkuCore(s: str):
  res = NimiLinkuCore.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_COMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "common"))))
  @example("n")
  @example("tonsi")
  @example("kipisi")
@@ -75,19 +65,21 @@ def test_NimiLinkuCommon(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "uncommon"))))
  def test_NimiLinkuUncommon(s: str):
  res = NimiLinkuUncommon.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "obscure"))))
+ @example("pake")
+ @example("san")
  def test_NimiLinkuObscure(s: str):
  res = NimiLinkuObscure.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "sandbox"))))
  @example("kalamARR")
  @example("Pingo")
  def test_NimiLinkuSandbox(s: str):
@@ -152,7 +144,7 @@ def test_AlphabeticRe(s: str):

  @given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
  def test_ProperName(s: str):
- res = ProperName.filter(s)
+ res = PuName.filter(s)
  assert res, repr(s)


@@ -207,7 +199,11 @@ def test_OrFilter(s: str):
  # NOTE: No subset filter test because A | B is not the same as A combined with B.
  # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
  # but would incorrectly pass a combined filter.
- @given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "pu") | words_by_tag("usage_category", "obscure"))
+ )
+ )
  def test_MemberFilters_OrFilter(s: str):
  filter = Or(NimiPu, NimiLinkuObscure)
  assert issubclass(filter, MemberFilter)
@@ -221,11 +217,11 @@ def test_MemberFilters_OrFilter(s: str):
  @given(
  st.sampled_from(
  list(
- NIMI_KU_SULI
- | NIMI_KU_LILI
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
+ words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
  ),
  )
  )
@@ -248,14 +244,14 @@ def test_OrFilter_IsipinEpiku(s: str):
  )


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_AndFilter(s: str):
  s = s.capitalize()
- f = And(ProperName, NimiPu)
+ f = And(PuName, NimiPu)
  assert f.filter(s)


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_NotFilter(s: str):
  f = Not(NimiPu)
  assert not f.filter(s)
@@ -282,13 +278,21 @@ def test_AndNotFilter(s: str):
  assert not res_composed


- @given(st.sampled_from(list(NIMI_PU | NIMI_KU_SULI)))
+ @given(
+ st.sampled_from(list(words_by_tag("book", "pu") | words_by_tag("book", "ku suli")))
+ )
  def test_AddTokensToMemberFilter(s: str):
  PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
  assert PuEnKuSuliFilter.filter(s)


- @given(st.sampled_from(list(NIMI_LINKU_SANDBOX | NIMI_KU_LILI)))
+ @given(
+ st.sampled_from(
+ list(
+ words_by_tag("usage_category", "sandbox") | words_by_tag("book", "ku lili")
+ )
+ )
+ )
  def test_AddTokensToMemberFilterNegative(s: str):
  PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
  assert not PuEnKuSuliFilter.filter(s)
@@ -297,12 +301,12 @@ def test_AddTokensToMemberFilterNegative(s: str):
  @given(
  st.sampled_from(
  list(
- NIMI_PU
- | NIMI_KU_SULI
- | NIMI_KU_LILI
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
+ words_by_tag("book", "pu")
+ | words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
  ),
  )
  | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_ilo.py
@@ -1,3 +1,6 @@
+ # STL
+ from typing import List, Tuple
+
  # PDM
  import pytest

@@ -35,6 +38,10 @@ ALL_VALID = [
  "󱥄󱥬󱥩󱤴", # "o toki tawa mi" in UCSUR
  "󱤴󱤧󱤑󱥍󱦗󱤖󱥡󱦘󱤬󱥭‍󱥡󱥚",
  "󱤑󱦐󱥗󱦜󱦈󱦜󱥉󱦜󱦑󱥄󱤤󱤂󱤉󱥆󱤀",
+ "o lukin, 󱤴󱥬󱥩󱤴󱤧wawa",
+ "ni li sona kiwen",
+ "nimi namako li toki e ale",
+ "mi open mute a", # mostly eng words
  ]

  IGNORABLES = [
@@ -55,10 +62,9 @@ IGNORABLES = [
  "❤️", # heart
  "😊",
  "👨‍👩‍👧‍👧", # family emoji with zwj
- # every non-emoji in
+ # every non-emoji in the writables
  "🄀🄁🄂🄃🄄🄅🄆🄇🄈🄉🄊🄋🄌🄍🄎🄏🄐🄑🄒🄓🄔🄕🄖🄗🄘🄙🄚🄛🄜🄝🄞🄟🄠🄡🄢🄣🄤🄥🄦🄧🄨🄩🄪🄫🄬🄭🄮🄯🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉🅊🅋🅌🅍🅎🅏🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩🅪🅫🅬🅭🅮🅯🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉🆊🆋🆌🆍🆏🆐 🆛🆜🆝🆞🆟🆠🆡🆢🆣🆤🆥🆦🆧🆨🆩🆪🆫🆬🆭🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿",
  "🅰️🅱️🅾️🅱️🅰️", # blood type emojis
- # "😃⃢👍", # sincerely, no idea, but it came up
  ]

  SYLLABIC_MATCHES = [
@@ -88,6 +94,9 @@ NAME_MATCHES = [
  "toki Kanse li lon",
  "toki Lojban li nasa e lawa mi",
  "ilo Firefox",
+ "ilo FaceBook li nasa",
+ "mi kepeken ilo MySQL",
+ "poki li nasin SQLite",
  "mi musi Space Station 13",
  "jan Tepo en jan Salo en jan Lakuse en pipi Kewapi en soweli Eweke en mi li musi",
  ]
@@ -108,7 +117,7 @@ CORPUS_SPECIFIC = [
  "Pingo",
  "we Luke li alente wa",
  ]
- CORPUS_SPECIFIC_XFAIL = []
+ CORPUS_SPECIFIC_XFAIL: List[str] = []


  EXCESSIVE_SYLLABICS = [
@@ -129,7 +138,6 @@ EXCESSIVE_SYLLABICS = [
  ]

  EXCESSIVE_ALPHABETICS = [
- "21st", # candidate for xfails?
  "wen i tok usin onli notes in toki pona i look silli. ",
  "I wait, I sulk, as a tool I make stoops to ineptness.",
  "aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
@@ -155,6 +163,7 @@ EXCESSIVE_ENGLISH = [
  "i'm online all the time",
  "How to Cut a Kiwi",
  "a e i o u",
+ "21st", # previous false positive; fixed by ProperName change
  ]

  NON_MATCHES = [
@@ -193,10 +202,20 @@ FALSE_NEGATIVES = [
  "mtue",
  "mi nasa B^)", # emoticon
  "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+ "😃⃢👍", # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
  ]

  FALSE_POSITIVES = [
- "Knowing a little toki pona",
+ "Knowing a little toki pona", # name, dict, alphabet, dict, dict- damn, that's hard.
+ ]
+
+ IGNORABLE_PAIRS: List[Tuple[str, str]] = [
+ ("o lukin e ni: https://example.com/", "o lukin e ni:"),
+ ("ni li nasa anu seme <:musiwawa:198591138591>", "ni li nasa anu seme"),
+ ("seme la ni li toki pona ala https://example.com/", "seme la ni li toki pona ala"),
+ ("```\ndef bad():\n pass\n``` o lukin e ni", "o lukin e ni"),
+ ("mi tawa tomo telo 💦💦", "mi tawa tomo telo"),
+ ("o lukin e lipu ni: [[wp:Canvassing]]", "o lukin e lipu ni:"),
  ]


@@ -254,3 +273,33 @@ def test_false_negatives_pref(ilo: Ilo, text: str):
  @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
  def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
  assert not corpus_ilo.is_toki_pona(text)
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_pref_ignorable_doesnt_change_score(ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = ilo.preprocess(with_ignorable)
+ without_ignorable = ilo.preprocess(without_ignorable)
+ score_with = ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_lazy_ignorable_doesnt_change_score(lazy_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = lazy_ilo.preprocess(with_ignorable)
+ without_ignorable = lazy_ilo.preprocess(without_ignorable)
+ score_with = lazy_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = lazy_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_corpus_ignorable_doesnt_change_score(corpus_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = corpus_ilo.preprocess(with_ignorable)
+ without_ignorable = corpus_ilo.preprocess(without_ignorable)
+ score_with = corpus_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = corpus_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_preprocessors.py
@@ -8,6 +8,7 @@ from sonatoki.Preprocessors import (
  Spoilers,
  AllQuotes,
  Backticks,
+ Codeblock,
  Reference,
  ArrowQuote,
  ColonEmotes,
@@ -48,6 +49,25 @@ def test_Backticks(s: str):
  assert res == "", (repr(s), repr(res))


+ @given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+ @example(
+ """```
+ ```"""
+ )
+ @example(
+ """```
+ blocky message
+ ```
+
+ ```
+ second blocky message
+ ```"""
+ )
+ def test_Codeblock(s: str):
+ res = Codeblock.process(s).strip()
+ assert res == "", (repr(s), repr(res))
+
+
  @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
  @example("> base")
  @example("> newline\n> newline")
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_properties.py
@@ -19,45 +19,35 @@ from sonatoki.Filters import (
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
  from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
  FALSE_POS_ALPHABETIC,
+ words_by_tag,
+ words_by_usage,
  )


- @given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu") | NIMI_PU_SYNONYMS)))
  def test_pu_filters_non_overlap(s: str):
  res_pu = NimiPu.filter(s)
  res_synonyms = NimiPuSynonyms.filter(s)
  assert (res_pu + res_synonyms) == 1


- @given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "ku suli") | words_by_tag("book", "ku lili"))
+ )
+ )
  def test_ku_filters_non_overlap(s: str):
+ s = Lowercase.clean(s)
+ s = ConsecutiveDuplicates.clean(s)
  res_ku_suli = NimiKuSuli.filter(s)
  res_ku_lili = NimiKuLili.filter(s)
  assert (res_ku_suli + res_ku_lili) == 1


- @given(
- st.sampled_from(
- list(
- NIMI_LINKU_CORE
- | NIMI_LINKU_COMMON
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
- )
- )
- )
+ @given(st.sampled_from(list(words_by_usage(0))))
  def test_linku_filters_non_overlap(s: str):
  _ = assume(s != "su")

@@ -73,7 +63,7 @@ def test_linku_filters_non_overlap(s: str):
  assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1


- @given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_usage(30))))
  def test_nimi_linku_properties(s: str):
  assert ConsecutiveDuplicates.clean(s) == s, repr(s)
  assert Alphabetic.filter(s), repr(s)
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_scorers.py
@@ -10,10 +10,10 @@ from hypothesis import given, example
  from sonatoki.Filters import (
  Filter,
  NimiPu,
+ PuName,
  Numeric,
  Syllabic,
  Alphabetic,
- ProperName,
  Phonotactic,
  NimiLinkuCore,
  PunctuationRe,
@@ -31,7 +31,7 @@ FILTERS = [
  NimiLinkuCore,
  NimiLinkuCommon,
  Alphabetic,
- ProperName,
+ PuName,
  Phonotactic,
  PunctuationRe,
  ]
{sonatoki-0.6.3 → sonatoki-0.8.0}/tests/test_utils.py
@@ -1,17 +1,14 @@
- # STL
- import re
-
  # PDM
  import hypothesis.strategies as st

  # LOCAL
  from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
- from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
+ from sonatoki.constants import words_by_usage

  PROPER_NAME_RE = r"[A-Z][a-z]*"

  token_strategy = (
- st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
+ st.sampled_from(list(words_by_usage(60)))
  | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
  | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
  | st.from_regex(PROPER_NAME_RE, fullmatch=True)