sonatoki 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {sonatoki-0.6.2 → sonatoki-0.7.0}/PKG-INFO +1 -1
  2. {sonatoki-0.6.2 → sonatoki-0.7.0}/pyproject.toml +1 -1
  3. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Configs.py +40 -73
  4. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Filters.py +50 -34
  5. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Preprocessors.py +9 -0
  6. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Scorers.py +61 -6
  7. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/constants.py +38 -27
  8. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/ilo.py +34 -27
  9. sonatoki-0.7.0/src/sonatoki/types.py +60 -0
  10. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_filters.py +62 -25
  11. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_ilo.py +50 -4
  12. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_preprocessors.py +20 -0
  13. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_properties.py +12 -22
  14. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_utils.py +2 -5
  15. {sonatoki-0.6.2 → sonatoki-0.7.0}/LICENSE +0 -0
  16. {sonatoki-0.6.2 → sonatoki-0.7.0}/README.md +0 -0
  17. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Cleaners.py +0 -0
  18. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/Tokenizers.py +0 -0
  19. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/__init__.py +0 -0
  20. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/__main__.py +0 -0
  21. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/alphabetic.txt +0 -0
  22. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/linku.json +0 -0
  23. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/py.typed +0 -0
  24. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/sandbox.json +0 -0
  25. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/syllabic.txt +0 -0
  26. {sonatoki-0.6.2 → sonatoki-0.7.0}/src/sonatoki/utils.py +0 -0
  27. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/__init__.py +0 -0
  28. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_cleaners.py +0 -0
  29. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  32. {sonatoki-0.6.2 → sonatoki-0.7.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.6.2
+ Version: 0.7.0
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.6.2"
+ version = "0.7.0"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -1,49 +1,74 @@
  # STL
- from copy import deepcopy
- from typing import Set, List, Type, TypedDict, cast
+ from typing import List, Type, TypedDict

  # PDM
  from typing_extensions import NotRequired

  # LOCAL
+ from sonatoki.types import Number
  from sonatoki.Filters import (
  Or,
  And,
  Not,
  Filter,
  Numeric,
- Syllabic,
  NimiUCSUR,
  Alphabetic,
  NimiKuLili,
  NimiKuSuli,
  ProperName,
- Phonotactic,
  Punctuation,
  LongSyllabic,
  Miscellaneous,
- NimiLinkuCore,
  LongAlphabetic,
  LongProperName,
- NimiLinkuCommon,
  FalsePosSyllabic,
+ NimiLinkuByUsage,
  NimiLinkuObscure,
  NimiLinkuSandbox,
  NimiLinkuUncommon,
  FalsePosAlphabetic,
  )
- from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+ from sonatoki.Scorers import Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
  from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
  from sonatoki.Preprocessors import (
  URLs,
  Emoji,
- Backticks,
+ Codeblock,
  Reference,
  Preprocessor,
  AngleBracketObject,
  )

+ __DICT_PHONOMATCHES = {
+ # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+ # In this case, all of these appear more often in English by a factor of at least 10.
+ "aka", # also known as
+ "an", # article
+ "api", # API
+ "i", # 1st person
+ "kana", # japanese script
+ "me", # 1st person singular, english
+ "ne", # "no" in several languages
+ "nu", # "new" in english, "now" in dutch
+ "se", # spanish particle, english "see"
+ "take", # acquire, perhaps forcefully or without permission
+ "ten", # 10
+ "to", # to, too
+ "je", # 1st person pronoun, french
+ "u", # no u
+ "we", # 1st person plural, english
+ "wi", # wii and discussions of syllables
+ "sole", # singular, of shoe
+ # unexplored candidates for removal
+ # "omen", # ominous
+ # "papa", # father
+ # "lo", # "lo" and "loo"
+ # "ewe", # sheep
+ # "pa", # father- eh?
+ }
+

  class IloConfig(TypedDict):
  preprocessors: List[Type[Preprocessor]]
@@ -69,11 +94,11 @@ BaseConfig: IloConfig = {


  PrefConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
+ Or(NimiLinkuByUsage(30), NimiUCSUR),
  And(LongSyllabic, Not(FalsePosSyllabic)),
  # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
  LongProperName,
@@ -84,16 +109,13 @@ PrefConfig: IloConfig = {
  }

  CorpusConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
  Or(
- NimiLinkuCore,
- NimiLinkuCommon,
- NimiLinkuUncommon,
- NimiLinkuObscure,
- NimiLinkuSandbox,
+ # awkward but efficient syntax
+ NimiLinkuByUsage(0)(sub=__DICT_PHONOMATCHES),
  NimiUCSUR,
  Miscellaneous,
  ),
@@ -104,43 +126,9 @@ CorpusConfig: IloConfig = {
  "scorer": SoftScaling,
  "passing_score": 0.8,
  }
-
- # TODO: create a mechanism to omit tokens from a filter with more granularity
- __corpus_tokens_dict: Set[str] = cast(
- Set[str],
- CorpusConfig["scoring_filters"][
- 0
- ].tokens, # pyright: ignore[reportAttributeAccessIssue]
- )
- __corpus_tokens_dict -= {
- # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
- # In this case, all of these appear more often in English by a factor of at least 10.
- "aka", # also known as
- "an", # article
- "api", # API
- "i", # 1st person
- "kana", # japanese script
- "me", # 1st person
- "ne", # "no" in several languages
- "nu", # "new", now in dutch
- "se", # spanish particle, "see"
- "take", # acquire, perhaps forcefully or without permission
- "ten", # 10
- "to", # to, too
- "u", # no u
- "we", # 1st person plural
- "wi", # wii and discussions of syllables
- "sole", # singular, of shoe
- # unexplored candidates for removal
- # "omen", # ominous
- # "papa", # father
- # "lo", # "lo" and "loo"
- # "ewe", # sheep
- # "pa", # father- eh?
- }
  """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
@@ -150,7 +138,7 @@ LazyConfig: IloConfig = {
  }
  """This is extremely silly."""
  IsipinEpikuConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
+ "preprocessors": [Emoji, Codeblock, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
  "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
@@ -170,31 +158,10 @@ IsipinEpikuConfig: IloConfig = {
  }


- DiscordConfig: IloConfig = {
- "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
- "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numeric, Punctuation],
- "scoring_filters": [
- Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
- And(LongSyllabic, Not(FalsePosSyllabic)),
- LongProperName,
- And(LongAlphabetic, Not(FalsePosAlphabetic)),
- ],
- "scorer": SoftScaling,
- "passing_score": 0.8,
- }
-
- TelegramConfig: IloConfig = deepcopy(PrefConfig)
- ForumConfig: IloConfig = deepcopy(PrefConfig)
-
-
  __all__ = [
  "BaseConfig",
  "CorpusConfig",
- "DiscordConfig",
- "ForumConfig",
  "IloConfig",
  "LazyConfig",
  "PrefConfig",
- "TelegramConfig",
  ]
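
For readers of the CorpusConfig change above, a minimal sketch of how the chained call decomposes (names come from this diff; the subtracted words are illustrative stand-ins for __DICT_PHONOMATCHES):

    from sonatoki.Filters import NimiLinkuByUsage

    # First call: build a MemberFilter subclass from Linku usage data.
    # usage=0 keeps every dictionary and sandbox word.
    EveryLinkuWord = NimiLinkuByUsage(0)
    # Second call: MemberFilter.__new__ derives a further filter with tokens
    # removed, which is what CorpusConfig does with __DICT_PHONOMATCHES.
    WithoutFalseFriends = EveryLinkuWord(sub={"aka", "an", "api"})
    assert not WithoutFalseFriends.filter("aka")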
@@ -1,37 +1,33 @@
  # STL
  import re
  from abc import ABC, abstractmethod
- from typing import Set, List, Type
+ from copy import deepcopy
+ from typing import Set, List, Type, Union, Literal, Optional
  from functools import lru_cache as cache # cache comes in 3.9

  # PDM
  import regex
- from typing_extensions import override, deprecated
+ from typing_extensions import override

  # LOCAL
+ from sonatoki.types import LinkuBooks, LinkuUsageDate, LinkuUsageCategory
  from sonatoki.utils import prep_dictionary
  from sonatoki.constants import (
  VOWELS,
- NIMI_PU,
  ALPHABET,
  ALL_PUNCT,
  ALLOWABLES,
  CONSONANTS,
  NIMI_UCSUR,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
  NOT_IN_PUNCT_CLASS,
- NIMI_LINKU_UNCOMMON,
  ALL_PUNCT_RANGES_STR,
  FALSE_POS_ALPHABETIC,
  UCSUR_PUNCT_RANGES_STR,
  EMOJI_VARIATION_SELECTOR_RANGES_STR,
+ words_by_tag,
+ words_by_usage,
  )

  regex.DEFAULT_VERSION = regex.VERSION1
@@ -101,6 +97,20 @@ class MemberFilter(Filter):
  def filter(cls, token: str) -> bool:
  return token.lower() in cls.tokens

+ def __new__(
+ cls, add: Optional[Set[str]] = None, sub: Optional[Set[str]] = None
+ ) -> Type[Filter]:
+ parent_tokens = deepcopy(cls.tokens)
+ if add:
+ parent_tokens = parent_tokens.union(add)
+ if sub:
+ parent_tokens -= sub
+
+ class AnonMemberFilter(MemberFilter):
+ tokens = parent_tokens
+
+ return AnonMemberFilter
+

  class SubsetFilter(Filter):
  tokens: Set[str]
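
The `__new__` override above is what lets a MemberFilter be "called" to derive a modified copy. A short sketch of the pattern the new tests exercise (kijetesantakalu is a ku suli word, used here only as an illustration):

    from sonatoki.Filters import NimiPu, NimiKuSuli

    # A new anonymous MemberFilter subclass; neither parent is mutated.
    PuEnKuSuli = NimiPu(add=NimiKuSuli.tokens)
    assert PuEnKuSuli.filter("kijetesantakalu")
    assert not NimiPu.filter("kijetesantakalu")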
@@ -155,40 +165,46 @@ class LongProperName(MinLen, ProperName):
  length = 2 # reject "names" of length 1


- class NimiPu(MemberFilter):
- tokens = prep_dictionary(NIMI_PU)
-
-
- class NimiPuSynonyms(MemberFilter):
- tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+ class NimiLinkuByUsage:
+ def __new__(
+ cls,
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Type[MemberFilter]:
+ words = words_by_usage(usage, date)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiKuSuli(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_SULI)
+ return AnonLinkuMemberFilter


- class NimiKuLili(MemberFilter):
- tokens = prep_dictionary(NIMI_KU_LILI)
+ class NimiLinkuByTag:
+ def __new__(
+ cls,
+ tag: Union[Literal["usage_category"], Literal["book"]],
+ category: Union[LinkuUsageCategory, LinkuBooks],
+ ) -> Type[MemberFilter]:
+ words = words_by_tag(tag, category)

+ class AnonLinkuMemberFilter(MemberFilter):
+ tokens = prep_dictionary(words)

- class NimiLinkuCore(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_CORE)
+ return AnonLinkuMemberFilter


- class NimiLinkuCommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_COMMON)
+ NimiPu = NimiLinkuByTag("book", "pu")
+ NimiKuSuli = NimiLinkuByTag("book", "ku suli")
+ NimiKuLili = NimiLinkuByTag("book", "ku lili")
+ NimiLinkuCore = NimiLinkuByTag("usage_category", "core")
+ NimiLinkuCommon = NimiLinkuByTag("usage_category", "common")
+ NimiLinkuUncommon = NimiLinkuByTag("usage_category", "uncommon")
+ NimiLinkuObscure = NimiLinkuByTag("usage_category", "obscure")
+ NimiLinkuSandbox = NimiLinkuByTag("usage_category", "sandbox")


- class NimiLinkuUncommon(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
-
-
- class NimiLinkuObscure(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
-
-
- class NimiLinkuSandbox(MemberFilter):
- tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+ class NimiPuSynonyms(MemberFilter):
+ tokens = prep_dictionary(NIMI_PU_SYNONYMS)


  class NimiUCSUR(MemberFilter):
@@ -143,6 +143,15 @@ class Backticks(RegexPreprocessor):
  pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)


+ class Codeblock(RegexPreprocessor):
+ """Remove codeblocks marked by a set of three backticks on their own lines.
+
+ Subset of what would be removed by Backticks, but may be preferable.
+ """
+
+ pattern = re.compile(r"```\n(?:(?!```).*?)?```", flags=re.DOTALL)
+
+
  class Spoilers(RegexPreprocessor):
  """Remove paired double bars and their contents `||like this||`"""

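A rough sketch of how the new Codeblock preprocessor differs from Backticks (the message is made up):

    from sonatoki.Preprocessors import Backticks, Codeblock

    msg = "o lukin e ni:\n```\nprint('test')\n```\nni li `pona`"
    # Codeblock only strips triple-backtick fences whose opening ``` is
    # immediately followed by a newline, so the inline `pona` span survives.
    print(Codeblock.process(msg))
    # Backticks strips any paired backticks, inline spans included.
    print(Backticks.process(msg))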
@@ -1,17 +1,15 @@
  # STL
  import math
  from abc import ABC, abstractmethod
- from typing import Dict, List, Type, Union
+ from typing import List, Type

  # PDM
  from typing_extensions import override

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter

- Number = Union[int, float]
- Weights = Dict[str, Number]
-

  class Scorer(ABC):
  @classmethod
@@ -124,7 +122,64 @@ class SoftScaling(Soften, Scaling):
  scoring."""


- # class Logarithmic(Scorer): ...
+ class SentenceScorer(ABC):
+ @classmethod
+ @abstractmethod
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ """Re-score a list of sentences (scorecards, sentences with all their
+ metadata) and return them."""
+ raise NotImplementedError
+
+
+ class SentNoOp(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return scorecards


- __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
+ class SentAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ total = sum(card["score"] for card in scorecards)
+ avg = total / len(scorecards)
+ for card in scorecards:
+ card["score"] = avg
+ return scorecards
+
+
+ class SentWeightedAvg(SentenceScorer):
+ @classmethod
+ @override
+ def score(cls, scorecards: List[Scorecard]) -> List[Scorecard]:
+ if not scorecards:
+ return scorecards
+
+ weighted_total = 0
+ total_len = 0
+ for card in scorecards:
+ cardlen = len(card["cleaned"])
+ cardscore = card["score"]
+
+ weighted_total += cardlen * cardscore
+ total_len += cardlen
+
+ weighted_avg = weighted_total / total_len
+ for card in scorecards:
+ card["score"] = weighted_avg
+ return scorecards
+
+
+ __all__ = [
+ "PassFail",
+ "Scaling",
+ "SoftPassFail",
+ "SoftScaling",
+ "Soften",
+ "SentAvg",
+ "SentWeightedAvg",
+ ]
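
A minimal sketch of what the new sentence scorers do to a list of scorecards (the dicts below carry only the fields the scorers read, with made-up values):

    from sonatoki.Scorers import SentAvg, SentWeightedAvg

    cards = [
        {"text": "toki", "tokenized": [], "filtered": [], "cleaned": ["toki"], "score": 1.0},
        {"text": "thing", "tokenized": [], "filtered": [], "cleaned": ["thing"], "score": 0.0},
    ]
    # SentAvg overwrites every score with the plain mean (0.5 here);
    # SentWeightedAvg weights each score by the number of cleaned tokens first.
    print(SentAvg.score(cards)[0]["score"])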
@@ -1,11 +1,16 @@
  # STL
  import json
- from typing import Set, Dict
+ from typing import Set, Dict, Optional
  from pathlib import Path

  # LOCAL
+ from sonatoki.types import LinkuWord, LinkuUsageDate
  from sonatoki.utils import find_unicode_chars, find_unicode_ranges

+ LATEST_DATE = "2023-09"
+ # hardcoding this seems bad, but it means the parser is stable w.r.t. Linku!
+
+
  # `\p{Punctuation}` character class
  # https://www.compart.com/en/unicode/category
  # https://unicode.org/Public/UNIDATA/UnicodeData.txt
@@ -638,6 +643,7 @@ FALSE_POS_SYLLABIC = {
  "iluminate",
  "imense",
  "imitate",
+ "inanimate",
  "injoke",
  "insane",
  "insolate",
@@ -689,26 +695,42 @@ NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
  # NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\U000F1978-\U000F197A"]


- def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> Set[str]:
- return {d["word"] for d in data.values() if d[key] == value}
+ def linku_data() -> Dict[str, LinkuWord]:
+ # NOTE: this does open+read+parse two files each time you construct a filter
+ # but i expect users to construct filters only at the start of runtime
+ # there is no reason to waste your RAM by leaving the linku data in it
+ with open(LINKU) as f:
+ linku: Dict[str, LinkuWord] = json.loads(f.read())
+ with open(SANDBOX) as f:
+ sandbox: Dict[str, LinkuWord] = json.loads(f.read())
+
+ return {**linku, **sandbox}
+

+ def words_by_tag(tag: str, value: str) -> Set[str]:
+ data = linku_data()
+ return {d["word"] for d in data.values() if d[tag] == value}

- with open(LINKU) as f:
- linku: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_PU = category_helper(linku, "book", "pu")
- NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
- NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+ def words_by_usage(
+ usage: int,
+ date: Optional[LinkuUsageDate] = None,
+ ) -> Set[str]:
+ if not date:
+ date = LATEST_DATE
+ data = linku_data()

- NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
- NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
- NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
- NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
+ result: Set[str] = set()
+ for word in data.values():
+ usages = word["usage"]
+ if date in usages and usages[date] >= usage:
+ result.add(word["word"])
+
+ return result
+
+
+ NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}

- with open(SANDBOX) as f:
- sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
- NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}

  # with open(SYLLABICS) as f:
  # FALSE_POS_SYLLABIC = {line.strip() for line in f}
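
A short sketch of how the new helpers are meant to be called (the printed lengths are whatever the bundled Linku data yields, not fixed numbers):

    from sonatoki.constants import words_by_tag, words_by_usage

    pu_words = words_by_tag("book", "pu")    # what the old NIMI_PU constant held
    common = words_by_usage(30)              # usage >= 30 at LATEST_DATE
    older = words_by_usage(30, "2022-08")    # or at an explicit survey date
    print(len(pu_words), len(common), len(older))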
@@ -716,9 +738,6 @@ with open(SANDBOX) as f:
  # with open(ALPHABETICS) as f:
  # FALSE_POS_ALPHABETIC = {line.strip() for line in f}

- del linku
- del sandbox
-
  __all__ = [
  "ALLOWABLES",
  "ALL_PUNCT",
@@ -727,14 +746,6 @@ __all__ = [
  "CONSONANTS",
  "EMOJI_VARIATION_SELECTOR_RANGES",
  "EMOJI_VARIATION_SELECTOR_RANGES_STR",
- "NIMI_KU_LILI",
- "NIMI_KU_SULI",
- "NIMI_LINKU_COMMON",
- "NIMI_LINKU_CORE",
- "NIMI_LINKU_OBSCURE",
- "NIMI_LINKU_SANDBOX",
- "NIMI_LINKU_UNCOMMON",
- "NIMI_PU",
  "NIMI_PU_SYNONYMS",
  "POSIX_PUNCT",
  "POSIX_PUNCT_RANGES",
@@ -1,17 +1,14 @@
  # STL
- from typing import List, Type, Tuple
+ from typing import List, Type

  # LOCAL
+ from sonatoki.types import Number, Scorecard
  from sonatoki.Filters import Filter
- from sonatoki.Scorers import Number, Scorer
+ from sonatoki.Scorers import Scorer, SentNoOp, SentenceScorer
  from sonatoki.Cleaners import Cleaner
  from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
  from sonatoki.Preprocessors import Preprocessor

- # tokenized, filtered, cleaned, score, result
- Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
- # TODO: scorecard kinda sucks as a name
-

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
@@ -21,6 +18,7 @@ class Ilo:
  __ignoring_filters: List[Type[Filter]]
  __scoring_filters: List[Type[Filter]]
  __scorer: Type[Scorer]
+ __sentence_scorer: Type[SentenceScorer]
  __passing_score: Number

  def __init__(
@@ -31,6 +29,7 @@ class Ilo:
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
  passing_score: Number,
+ sentence_scorer: Type[SentenceScorer] = SentNoOp,
  word_tokenizer: Type[Tokenizer] = WordTokenizer,
  sent_tokenizer: Type[Tokenizer] = SentTokenizer,
  ):
@@ -43,6 +42,7 @@ class Ilo:
  self.__ignoring_filters = [*ignoring_filters]
  self.__scoring_filters = [*scoring_filters]
  self.__scorer = scorer
+ self.__sentence_scorer = sentence_scorer
  self.__passing_score = passing_score

  def preprocess(self, msg: str) -> str:
@@ -55,6 +55,7 @@ class Ilo:
  return self.__word_tokenizer.tokenize(msg)

  def sent_tokenize(self, msg: str) -> List[str]:
+ """It is *highly* recommended that you run `ilo.preprocess` first."""
  return self.__sent_tokenizer.tokenize(msg)

  def clean_token(self, token: str) -> str:
@@ -93,44 +94,50 @@ class Ilo:
  def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

+ def score_sentences(self, scorecards: List[Scorecard]) -> List[Scorecard]:
+ return self.__sentence_scorer.score(scorecards)
+
  def _is_toki_pona(self, message: str) -> Scorecard:
  """Process a message into its tokens, then filters, cleans, and scores
- them. Returns all parts. Message must already be preprocessed, normally
- done in `self.is_toki_pona(message)`.
-
- Returns all components of the processing algorithm except preprocessing:
- - Tokenized message (list[str])
- - Filtered message (list[str])
- - Cleaned message (list[str])
- - Score (float)
- - Result (bool)
+ them. Message must already be preprocessed, normally done in
+ `self.is_toki_pona(message)`.
+
+ Returns a `Scorecard` with all changes to the input text and a score.
  """
  tokenized = self.word_tokenize(message)
  filtered = self.filter_tokens(tokenized)
  cleaned = self.clean_tokens(filtered)
  score = self.score_tokens(cleaned)
- result = score >= self.__passing_score

- return tokenized, filtered, cleaned, score, result
+ scorecard: Scorecard = {
+ "text": message,
+ "tokenized": tokenized,
+ "filtered": filtered,
+ "cleaned": cleaned,
+ "score": score,
+ }
+
+ return scorecard

  def is_toki_pona(self, message: str) -> bool:
- """Determines whether a single statement is or is not Toki Pona."""
+ """Determines whether a text is or is not Toki Pona."""
  message = self.preprocess(message)
- *_, result = self._is_toki_pona(message)
- return result
+ scorecard = self._is_toki_pona(message)
+ return scorecard["score"] >= self.__passing_score

  def _are_toki_pona(self, message: str) -> List[Scorecard]:
- """Split a message into sentences, then return a list each sentence's
- results via `self._is_toki_pona()`.
+ """Split a message into sentences, then return a list with each
+ sentence's scorecard from `self._is_toki_pona()`.

  Message must already be preprocessed, normally done in
  `self.are_toki_pona(message)`.
  """
- results: List[Scorecard] = list()
+ scorecards: List[Scorecard] = list()
  for sentence in self.sent_tokenize(message):
  result = self._is_toki_pona(sentence)
- results.append(result)
- return results
+ scorecards.append(result)
+ scorecards = self.score_sentences(scorecards)
+ return scorecards

  def are_toki_pona(self, message: str) -> List[bool]:
  """Splits a statement into sentences, then determines if each is or is not Toki Pona.
@@ -148,5 +155,5 @@ class Ilo:
  ```
  """
  message = self.preprocess(message)
- results = self._are_toki_pona(message)
- return [res[-1] for res in results]
+ scorecards = self._are_toki_pona(message)
+ return [card["score"] >= self.__passing_score for card in scorecards]
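
Taken together, a hedged end-to-end sketch of the new flow; `Ilo(**PrefConfig)` is assumed to be the usual way to build an ilo from an IloConfig, and the sentence scorer is optional (it defaults to SentNoOp):

    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig
    from sonatoki.Scorers import SentAvg

    ilo = Ilo(**PrefConfig, sentence_scorer=SentAvg)  # assumed constructor usage
    print(ilo.is_toki_pona("mi olin e sina"))                     # bool for the whole text
    print(ilo.are_toki_pona("mi olin e sina. this is english."))  # one bool per sentence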
@@ -0,0 +1,60 @@
+ # STL
+ from typing import Dict, List, Union, Literal, TypedDict
+
+ Number = Union[int, float]
+
+
+ # TODO: scorecard kinda sucks as a name
+ class Scorecard(TypedDict):
+ text: str
+ tokenized: List[str]
+ filtered: List[str]
+ cleaned: List[str]
+ score: Number
+
+
+ LinkuUsageDate = Union[
+ Literal["2020-04"],
+ Literal["2021-10"],
+ Literal["2022-08"],
+ Literal["2023-09"],
+ # Literal["2024-09"],
+ ]
+
+ LinkuUsageCategory = Union[
+ Literal["core"],
+ Literal["common"],
+ Literal["uncommon"],
+ Literal["obscure"],
+ Literal["sandbox"],
+ ]
+
+ LinkuBooks = Union[
+ Literal["pu"],
+ Literal["ku suli"],
+ Literal["ku lili"],
+ Literal["none"],
+ ]
+
+
+ class LinkuWord(TypedDict):
+ id: str
+ author_verbatim: str
+ author_verbatim_source: str
+ book: str
+ coined_era: str
+ coined_year: str
+ creator: List[str]
+ ku_data: Dict[str, int]
+ see_also: List[str]
+ resources: Dict[str, str]
+ representations: Dict[str, Union[str, List[str]]]
+ source_language: str
+ usage_category: LinkuUsageCategory
+ word: str
+ deprecated: bool
+ etymology: List[Dict[str, str]]
+ audio: List[Dict[str, str]]
+ pu_verbatim: Dict[str, str]
+ usage: Dict[LinkuUsageDate, int]
+ translations: Dict[str, Dict[str, str]]
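
For orientation, a hand-written example of the Scorecard shape that `Ilo._is_toki_pona` now returns (values are illustrative, not real output):

    from sonatoki.types import Scorecard

    card: Scorecard = {
        "text": "ni li pona",
        "tokenized": ["ni", "li", "pona"],
        "filtered": ["ni", "li", "pona"],
        "cleaned": ["ni", "li", "pona"],
        "score": 1.0,
    }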
@@ -34,23 +34,13 @@ from sonatoki.Filters import (
  NimiLinkuUncommon,
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
- from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
- NIMI_LINKU_COMMON,
- FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
- )
+ from sonatoki.constants import FALSE_POS_SYLLABIC, words_by_tag

  # FILESYSTEM
  from .test_utils import PROPER_NAME_RE


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  @example("lukin")
  @example("selo")
  @example("li")
@@ -59,14 +49,14 @@ def test_NimiPu(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_CORE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "core"))))
  @example("pona")
  def test_NimiLinkuCore(s: str):
  res = NimiLinkuCore.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_COMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "common"))))
  @example("n")
  @example("tonsi")
  @example("kipisi")
@@ -75,19 +65,21 @@ def test_NimiLinkuCommon(s: str):
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "uncommon"))))
  def test_NimiLinkuUncommon(s: str):
  res = NimiLinkuUncommon.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_OBSCURE)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "obscure"))))
+ @example("pake")
+ @example("san")
  def test_NimiLinkuObscure(s: str):
  res = NimiLinkuObscure.filter(s)
  assert res, repr(s)


- @given(st.sampled_from(list(NIMI_LINKU_SANDBOX)))
+ @given(st.sampled_from(list(words_by_tag("usage_category", "sandbox"))))
  @example("kalamARR")
  @example("Pingo")
  def test_NimiLinkuSandbox(s: str):
@@ -207,7 +199,11 @@ def test_OrFilter(s: str):
  # NOTE: No subset filter test because A | B is not the same as A combined with B.
  # e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
  # but would incorrectly pass a combined filter.
- @given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "pu") | words_by_tag("usage_category", "obscure"))
+ )
+ )
  def test_MemberFilters_OrFilter(s: str):
  filter = Or(NimiPu, NimiLinkuObscure)
  assert issubclass(filter, MemberFilter)
@@ -221,11 +217,11 @@ def test_MemberFilters_OrFilter(s: str):
  @given(
  st.sampled_from(
  list(
- NIMI_KU_SULI
- | NIMI_KU_LILI
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
+ words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
  ),
  )
  )
@@ -248,14 +244,14 @@ def test_OrFilter_IsipinEpiku(s: str):
  )


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_AndFilter(s: str):
  s = s.capitalize()
  f = And(ProperName, NimiPu)
  assert f.filter(s)


- @given(st.sampled_from(list(NIMI_PU)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu"))))
  def test_NotFilter(s: str):
  f = Not(NimiPu)
  assert not f.filter(s)
@@ -280,3 +276,44 @@ def test_AndNotFilter(s: str):
  if res_fp:
  # syl matched- but if fp matches, then the composed filter should not match
  assert not res_composed
+
+
+ @given(
+ st.sampled_from(list(words_by_tag("book", "pu") | words_by_tag("book", "ku suli")))
+ )
+ def test_AddTokensToMemberFilter(s: str):
+ PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+ assert PuEnKuSuliFilter.filter(s)
+
+
+ @given(
+ st.sampled_from(
+ list(
+ words_by_tag("usage_category", "sandbox") | words_by_tag("book", "ku lili")
+ )
+ )
+ )
+ def test_AddTokensToMemberFilterNegative(s: str):
+ PuEnKuSuliFilter = NimiPu(add=NimiKuSuli.tokens)
+ assert not PuEnKuSuliFilter.filter(s)
+
+
+ @given(
+ st.sampled_from(
+ list(
+ words_by_tag("book", "pu")
+ | words_by_tag("book", "ku suli")
+ | words_by_tag("book", "ku lili")
+ | words_by_tag("usage_category", "uncommon")
+ | words_by_tag("usage_category", "obscure")
+ | words_by_tag("usage_category", "sandbox")
+ ),
+ )
+ | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
+ )
+ def test_SubTokensFromMemberFilter(s: str):
+ NimiAlaFilter = NimiLinkuCore(sub=NimiPu.tokens)
+ # core is a strict subset of pu
+ # if kin becomes core, needs to be corrected
+
+ assert not NimiAlaFilter.filter(s)
@@ -1,3 +1,6 @@
+ # STL
+ from typing import List, Tuple
+
  # PDM
  import pytest

@@ -35,6 +38,10 @@ ALL_VALID = [
  "󱥄󱥬󱥩󱤴", # "o toki tawa mi" in UCSUR
  "󱤴󱤧󱤑󱥍󱦗󱤖󱥡󱦘󱤬󱥭‍󱥡󱥚",
  "󱤑󱦐󱥗󱦜󱦈󱦜󱥉󱦜󱦑󱥄󱤤󱤂󱤉󱥆󱤀",
+ "o lukin, 󱤴󱥬󱥩󱤴󱤧wawa",
+ "ni li sona kiwen",
+ "nimi namako li toki e ale",
+ "mi open mute a", # mostly eng words
  ]

  IGNORABLES = [
@@ -55,10 +62,9 @@ IGNORABLES = [
  "❤️", # heart
  "😊",
  "👨‍👩‍👧‍👧", # family emoji with zwj
- # every non-emoji in
+ # every non-emoji in the writables
  "🄀🄁🄂🄃🄄🄅🄆🄇🄈🄉🄊🄋🄌🄍🄎🄏🄐🄑🄒🄓🄔🄕🄖🄗🄘🄙🄚🄛🄜🄝🄞🄟🄠🄡🄢🄣🄤🄥🄦🄧🄨🄩🄪🄫🄬🄭🄮🄯🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉🅊🅋🅌🅍🅎🅏🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩🅪🅫🅬🅭🅮🅯🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉🆊🆋🆌🆍🆏🆐 🆛🆜🆝🆞🆟🆠🆡🆢🆣🆤🆥🆦🆧🆨🆩🆪🆫🆬🆭🇦🇧🇨🇩🇪🇫🇬🇭🇮🇯🇰🇱🇲🇳🇴🇵🇶🇷🇸🇹🇺🇻🇼🇽🇾🇿",
  "🅰️🅱️🅾️🅱️🅰️", # blood type emojis
- # "😃⃢👍", # sincerely, no idea, but it came up
  ]

  SYLLABIC_MATCHES = [
@@ -108,7 +114,7 @@ CORPUS_SPECIFIC = [
  "Pingo",
  "we Luke li alente wa",
  ]
- CORPUS_SPECIFIC_XFAIL = []
+ CORPUS_SPECIFIC_XFAIL: List[str] = []


  EXCESSIVE_SYLLABICS = [
@@ -193,10 +199,20 @@ FALSE_NEGATIVES = [
  "mtue",
  "mi nasa B^)", # emoticon
  "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+ "😃⃢👍", # sincerely, no idea, but it came up and it should be omitted by emojis but isn't
  ]

  FALSE_POSITIVES = [
- "Knowing a little toki pona",
+ "Knowing a little toki pona", # name, dict, alphabet, dict, dict- damn, that's hard.
+ ]
+
+ IGNORABLE_PAIRS: List[Tuple[str, str]] = [
+ ("o lukin e ni: https://example.com/", "o lukin e ni:"),
+ ("ni li nasa anu seme <:musiwawa:198591138591>", "ni li nasa anu seme"),
+ ("seme la ni li toki pona ala https://example.com/", "seme la ni li toki pona ala"),
+ ("```\ndef bad():\n pass\n``` o lukin e ni", "o lukin e ni"),
+ ("mi tawa tomo telo 💦💦", "mi tawa tomo telo"),
+ ("o lukin e lipu ni: [[wp:Canvassing]]", "o lukin e lipu ni:"),
  ]


@@ -254,3 +270,33 @@ def test_false_negatives_pref(ilo: Ilo, text: str):
  @pytest.mark.parametrize("text", CORPUS_SPECIFIC_XFAIL)
  def test_false_positives_corpus(corpus_ilo: Ilo, text: str):
  assert not corpus_ilo.is_toki_pona(text)
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_pref_ignorable_doesnt_change_score(ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = ilo.preprocess(with_ignorable)
+ without_ignorable = ilo.preprocess(without_ignorable)
+ score_with = ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_lazy_ignorable_doesnt_change_score(lazy_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = lazy_ilo.preprocess(with_ignorable)
+ without_ignorable = lazy_ilo.preprocess(without_ignorable)
+ score_with = lazy_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = lazy_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
+
+
+ @pytest.mark.parametrize("pair", IGNORABLE_PAIRS)
+ def test_corpus_ignorable_doesnt_change_score(corpus_ilo: Ilo, pair: Tuple[str, str]):
+ with_ignorable, without_ignorable = pair
+ with_ignorable = corpus_ilo.preprocess(with_ignorable)
+ without_ignorable = corpus_ilo.preprocess(without_ignorable)
+ score_with = corpus_ilo._is_toki_pona(with_ignorable)["score"]
+ score_without = corpus_ilo._is_toki_pona(without_ignorable)["score"]
+ assert score_with == score_without
@@ -8,6 +8,7 @@ from sonatoki.Preprocessors import (
  Spoilers,
  AllQuotes,
  Backticks,
+ Codeblock,
  Reference,
  ArrowQuote,
  ColonEmotes,
@@ -48,6 +49,25 @@ def test_Backticks(s: str):
  assert res == "", (repr(s), repr(res))


+ @given(st.from_regex(Codeblock.pattern.pattern, fullmatch=True))
+ @example(
+ """```
+ ```"""
+ )
+ @example(
+ """```
+ blocky message
+ ```
+
+ ```
+ second blocky message
+ ```"""
+ )
+ def test_Codeblock(s: str):
+ res = Codeblock.process(s).strip()
+ assert res == "", (repr(s), repr(res))
+
+
  @given(st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True))
  @example("> base")
  @example("> newline\n> newline")
@@ -19,45 +19,35 @@ from sonatoki.Filters import (
  )
  from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
  from sonatoki.constants import (
- NIMI_PU,
- NIMI_KU_LILI,
- NIMI_KU_SULI,
- NIMI_LINKU_CORE,
  NIMI_PU_SYNONYMS,
- NIMI_LINKU_COMMON,
  FALSE_POS_SYLLABIC,
- NIMI_LINKU_OBSCURE,
- NIMI_LINKU_SANDBOX,
- NIMI_LINKU_UNCOMMON,
  FALSE_POS_ALPHABETIC,
+ words_by_tag,
+ words_by_usage,
  )


- @given(st.sampled_from(list(NIMI_PU | NIMI_PU_SYNONYMS)))
+ @given(st.sampled_from(list(words_by_tag("book", "pu") | NIMI_PU_SYNONYMS)))
  def test_pu_filters_non_overlap(s: str):
  res_pu = NimiPu.filter(s)
  res_synonyms = NimiPuSynonyms.filter(s)
  assert (res_pu + res_synonyms) == 1


- @given(st.sampled_from(list(NIMI_KU_SULI | NIMI_KU_LILI)))
+ @given(
+ st.sampled_from(
+ list(words_by_tag("book", "ku suli") | words_by_tag("book", "ku lili"))
+ )
+ )
  def test_ku_filters_non_overlap(s: str):
+ s = Lowercase.clean(s)
+ s = ConsecutiveDuplicates.clean(s)
  res_ku_suli = NimiKuSuli.filter(s)
  res_ku_lili = NimiKuLili.filter(s)
  assert (res_ku_suli + res_ku_lili) == 1


- @given(
- st.sampled_from(
- list(
- NIMI_LINKU_CORE
- | NIMI_LINKU_COMMON
- | NIMI_LINKU_UNCOMMON
- | NIMI_LINKU_OBSCURE
- | NIMI_LINKU_SANDBOX
- )
- )
- )
+ @given(st.sampled_from(list(words_by_usage(0))))
  def test_linku_filters_non_overlap(s: str):
  _ = assume(s != "su")

@@ -73,7 +63,7 @@ def test_linku_filters_non_overlap(s: str):
  assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1


- @given(st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON | NIMI_LINKU_UNCOMMON)))
+ @given(st.sampled_from(list(words_by_usage(30))))
  def test_nimi_linku_properties(s: str):
  assert ConsecutiveDuplicates.clean(s) == s, repr(s)
  assert Alphabetic.filter(s), repr(s)
@@ -1,17 +1,14 @@
- # STL
- import re
-
  # PDM
  import hypothesis.strategies as st

  # LOCAL
  from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
- from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
+ from sonatoki.constants import words_by_usage

  PROPER_NAME_RE = r"[A-Z][a-z]*"

  token_strategy = (
- st.sampled_from(list(NIMI_LINKU_CORE | NIMI_LINKU_COMMON))
+ st.sampled_from(list(words_by_usage(60)))
  | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
  | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
  | st.from_regex(PROPER_NAME_RE, fullmatch=True)