sonatoki 0.3.1.tar.gz → 0.3.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {sonatoki-0.3.1 → sonatoki-0.3.3}/PKG-INFO +1 -1
  2. {sonatoki-0.3.1 → sonatoki-0.3.3}/pyproject.toml +2 -3
  3. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Cleaners.py +4 -1
  4. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Configs.py +52 -31
  5. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Filters.py +96 -33
  6. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Preprocessors.py +12 -6
  7. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Scorers.py +54 -51
  8. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/constants.py +21 -29
  9. sonatoki-0.3.3/src/sonatoki/linku.json +1 -0
  10. sonatoki-0.3.3/src/sonatoki/sandbox.json +1 -0
  11. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/utils.py +23 -5
  12. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_cleaners.py +9 -2
  13. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_filters.py +44 -57
  14. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_ilo.py +31 -11
  15. sonatoki-0.3.3/tests/test_properties.py +78 -0
  16. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_utils.py +1 -11
  17. sonatoki-0.3.1/src/sonatoki/linku.json +0 -1
  18. sonatoki-0.3.1/src/sonatoki/sandbox.json +0 -1
  19. {sonatoki-0.3.1 → sonatoki-0.3.3}/LICENSE +0 -0
  20. {sonatoki-0.3.1 → sonatoki-0.3.3}/README.md +0 -0
  21. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Tokenizers.py +0 -0
  22. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/__init__.py +0 -0
  23. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/__main__.py +0 -0
  24. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/ilo.py +0 -0
  25. {sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/py.typed +0 -0
  26. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/__init__.py +0 -0
  27. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_scorers.py +0 -0
  29. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/test_tokenize.py +0 -0
  30. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  31. {sonatoki-0.3.1 → sonatoki-0.3.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.1 → sonatoki-0.3.3}/PKG-INFO +1 -1

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.1
+Version: 0.3.3
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
{sonatoki-0.3.1 → sonatoki-0.3.3}/pyproject.toml +2 -3

@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.3.1"
+version = "0.3.3"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -16,8 +16,6 @@ readme = "README.md"
 [project.license]
 text = "AGPL-3.0-or-later"
 
-[project.optional-dependencies]
-
 [build-system]
 requires = [
     "pdm-backend",
@@ -40,6 +38,7 @@ test = [
 lint = [
     "black>=23.9.1",
     "isort>=5.12.0",
+    "docformatter>=1.7.5",
 ]
 doc = [
     "sphinx>=7.1.2",
{sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Cleaners.py +4 -1

@@ -10,6 +10,7 @@ class Cleaner(ABC):
     @classmethod
     @abstractmethod
     def clean(cls, token: str) -> str:
+        """Transform a token to remove some undesirable part."""
         raise NotImplementedError
 
 
@@ -33,7 +34,8 @@ class ConsecutiveDuplicates(Cleaner):
     may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".
 
     This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
-    incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""
+    incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+    """
 
     @classmethod
     @override
@@ -69,4 +71,5 @@ class Lowercase(Cleaner):
 
 __all__ = [
     "ConsecutiveDuplicates",
+    "Lowercase",
 ]
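To illustrate the behavior the `ConsecutiveDuplicates` docstring above describes (this is not part of the diff), a minimal sketch using the docstring's own examples and assuming `clean` is called as a classmethod, as the `Cleaner` ABC shows:

```python
# Sketch of ConsecutiveDuplicates, per its docstring above.
from sonatoki.Cleaners import ConsecutiveDuplicates

print(ConsecutiveDuplicates.clean("sonaaaa"))  # "sona": emphasis collapsed
print(ConsecutiveDuplicates.clean("AAAAAA"))   # "A"
print(ConsecutiveDuplicates.clean("わわ"))      # "わ": the noted Hiragana false positive
```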
{sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Configs.py +52 -31

@@ -5,22 +5,23 @@ from typing import List, Type, TypedDict
 # LOCAL
 from sonatoki.Filters import (
     Filter,
-    NimiPu,
     Numeric,
-    OrFilter,
     Syllabic,
     NimiUCSUR,
     Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
     ProperName,
-    Phonotactic,
     Punctuation,
+    LongSyllabic,
+    Miscellaneous,
     NimiLinkuCore,
-    NimiPuSynonyms,
+    LongAlphabetic,
+    LongProperName,
     OrMemberFilter,
     NimiLinkuCommon,
     NimiLinkuObscure,
     NimiLinkuSandbox,
-    EnglishIgnorables,
     NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
@@ -28,12 +29,9 @@ from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
 from sonatoki.Tokenizers import Tokenizer, WordTokenizer
 from sonatoki.Preprocessors import (
     URLs,
+    Backticks,
     Reference,
     Preprocessor,
-    DiscordEmotes,
-    DiscordSpecial,
-    DiscordChannels,
-    DiscordMentions,
     AngleBracketObject,
 )
 
@@ -48,7 +46,7 @@ class IloConfig(TypedDict):
     passing_score: Number
 
 
-# TODO: branching configs?
+# TODO: branching configs? config builder?
 
 BaseConfig: IloConfig = {
     "preprocessors": [URLs],
@@ -62,14 +60,14 @@ BaseConfig: IloConfig = {
 
 
 PrefConfig: IloConfig = {
-    "preprocessors": [URLs, Reference],
+    "preprocessors": [Backticks, URLs, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -77,9 +75,9 @@ PrefConfig: IloConfig = {
 }
 
 CorpusConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
         OrMemberFilter(
             NimiLinkuCore,
@@ -88,36 +86,58 @@ CorpusConfig: IloConfig = {
             NimiLinkuObscure,
             NimiLinkuSandbox,
             NimiUCSUR,
+            Miscellaneous,
         ),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
-
-
+"""Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
-    "preprocessors": [URLs],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
-    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
     "word_tokenizer": WordTokenizer,
 }
+"""This is extremely silly."""
+IsipinEpikuConfig: IloConfig = {
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [
+        OrMemberFilter(
+            NimiKuSuli,
+            NimiKuLili,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+        ),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
 
 DiscordConfig: IloConfig = {
-    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
     "cleaners": [ConsecutiveDuplicates],
-    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
-        Syllabic,
-        ProperName,
-        Alphabetic,
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        LongSyllabic,
+        LongProperName,
+        LongAlphabetic,
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -127,6 +147,7 @@ DiscordConfig: IloConfig = {
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
 ForumConfig: IloConfig = deepcopy(PrefConfig)
 
+
 __all__ = [
     "BaseConfig",
     "CorpusConfig",
{sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Filters.py +96 -33

@@ -42,6 +42,33 @@ class Filter(ABC):
         raise NotImplementedError
 
 
+class MinLen(Filter):
+    """
+    Meta filter meant to be inherited by another filter to add a length requirement.
+    Multiple-inherit with `MinLen` as the first argument so `super()` resolves correctly.
+    You may also construct any other filter with a minimum length filter like so:
+
+    ```
+    MinLen(Alphabetic, 3)
+    ```
+    """
+
+    length = 0
+
+    @classmethod
+    @cache(maxsize=None)
+    def filter(cls, token: str) -> bool:
+        if len(token) < cls.length:
+            return False
+        return super().filter(token)
+
+    def __new__(cls, filter: Type[Filter], length_: int) -> Type[Filter]:
+        class MinLenFilter(MinLen, Filter):
+            length = length_
+
+        return MinLenFilter
+
+
 class RegexFilter(Filter):
     pattern: "re.Pattern[str]"
 
@@ -83,11 +110,16 @@ class SubsetFilter(Filter):
 
 
 class Miscellaneous(MemberFilter):
-    tokens = set(ALLOWABLES)
+    tokens = prep_dictionary(ALLOWABLES)
 
 
 class EnglishIgnorables(MemberFilter):
-    tokens = set(IGNORABLES)
+    """NOTE: Not recommended for use.
+    It is better to use a Long* filter such as LongSyllabic than to use this filter.
+    This filter hides words from scoring rather than scoring them poorly,
+    which is more of a benefit than a loss for a word you would like to omit."""
+
+    tokens = prep_dictionary(IGNORABLES)
 
 
 class ProperName(Filter):
@@ -95,9 +127,11 @@ class ProperName(Filter):
     When Toki Pona is written with the Latin alphabet, names are generally
     capitalized at their start. This filter identifies those tokens.
 
-    Note that this alone cannot determine if a token is a valid name, because
-    a standalone name is considered invalid in Toki Pona- names generally have head nouns.
-    This tool only examines one token at a time, so cannot detect names any better than identifying their capital letter.
+    Note that this alone cannot determine if a token is a valid name,
+    because a standalone name is considered invalid in Toki Pona- names
+    generally have head nouns. This tool only examines one token at a
+    time, so cannot detect names any better than identifying their
+    capital letter.
     """
 
     @classmethod
@@ -109,6 +143,10 @@ class ProperName(Filter):
         # this will errantly match.
 
 
+class LongProperName(MinLen, ProperName):
+    length = 2  # reject "names" of length 1
+
+
 class NimiPu(MemberFilter):
     tokens = prep_dictionary(NIMI_PU)
 
@@ -151,12 +189,14 @@ class NimiUCSUR(MemberFilter):
 
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+
     Excludes both consecutive nasals and the illegal syllables:
     - "nm", "nn"
     - "wu", "wo", "ji", "ti"
 
     Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
-    "nn" cannot be found."""
+    "nn" cannot be found.
+    """
 
     pattern = re.compile(
         rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
@@ -166,10 +206,16 @@ class Phonotactic(RegexFilter):
     )
 
 
+class LongPhonotactic(MinLen, Phonotactic):
+    length = 3
+
+
 class Syllabic(RegexFilter):
     """Determines if a given token is syllabically valid Toki Pona (or `n`).
-    Words must have correctly ordered vowels and consonants, but the phonotactic
-    exceptions are not considered."""
+
+    Words must have correctly ordered vowels and consonants, but the
+    phonotactic exceptions are not considered.
+    """
 
     # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
     # Alterative I was exploring takes ~15% more steps
@@ -179,6 +225,10 @@ class Syllabic(RegexFilter):
     )
 
 
+class LongSyllabic(MinLen, Syllabic):
+    length = 3
+
+
 class Alphabetic(SubsetFilter):
     tokens = set(ALPHABET)
 
@@ -187,19 +237,19 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
-class TwoOrMoreAlphabetic(Filter):
-    # TODO: alphabetic implementation that ignores single characters
-    pass
+class LongAlphabetic(MinLen, Alphabetic):
+    length = 3
 
 
 class Numeric(Filter):
-    """Determine if a given token is entirely numeric.
-    Covers all numeric symbols in Unicode.
+    """Determine if a given token is entirely numeric. Covers all numeric
+    symbols in Unicode.
 
     This will fail to find numeric tokens such as "1.111" or "-42",
     but if used with the aggressive tokenizer designed for `tok`, these will be
     split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
-    numeric tokens will be split from their punctuation."""
+    numeric tokens will be split from their punctuation.
+    """
 
     @classmethod
     @override
@@ -209,13 +259,17 @@ class Numeric(Filter):
 
 
 class Punctuation(SubsetFilter):
-    """Identify whether a token is entirely punctuation. Fastest implementation."""
+    """Identify whether a token is entirely punctuation.
+
+    Fastest implementation.
+    """
 
     tokens = set(ALL_PUNCT)
 
 
 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
+
     Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
     """
 
@@ -223,7 +277,8 @@ class PunctuationRe(RegexFilter):
 
 
 class PunctuationRe1(Regex1Filter):
-    """Reference implementation for identifying tokens made entirely of punctuation."""
+    """Reference implementation for identifying tokens made entirely of
+    punctuation."""
 
     pattern = regex.compile(
         rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
@@ -235,14 +290,16 @@ class OrFilter:
     returning True when any individual filter matches or False otherwise.
     Requires at least two filters.
 
-    OrFilter exists as a compromise between the need to score some filters equally,
-    while not adding custom behavior to scorers.
-    I could have allowed a position to have a list of filters instead of one filter,
-    but this would require cleaning the user's input, and nested handling of lists.
-    It also would not have been as powerful- I would need another param for the and/or switch,
-    or to not give users the choice.
+    OrFilter exists as a compromise between the need to score some
+    filters equally, while not adding custom behavior to scorers. I
+    could have allowed a position to have a list of filters instead of
+    one filter, but this would require cleaning the user's input, and
+    nested handling of lists. It also would not have been as powerful- I
+    would need another param for the and/or switch, or to not give users
+    the choice.
 
-    Instead, the user is responsible for building an OrFilter out of their desired filters.
+    Instead, the user is responsible for building an OrFilter out of
+    their desired filters.
     """
 
     @staticmethod
@@ -266,11 +323,9 @@ class OrFilter:
         if not len(filters) >= 2:
            raise ValueError("Provide at least two Filters to OrFilter.")
 
-        subset_filters = [f for f in filters if issubclass(f, MemberFilter)]
-        if len(subset_filters) >= 2:
-            raise Warning(
-                "Prefer OrMemberFilter for combining two or more MemberFilters."
-            )
+        member_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(member_filters) >= 2:
+            raise Warning("Use OrMemberFilter for combining two or more MemberFilters.")
 
         filter = cls.__generic_filter(*filters)
 
@@ -279,7 +334,7 @@ class OrFilter:
 
 class OrMemberFilter:
     @staticmethod
-    def __subset_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+    def __member_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
         all_token_sets: List[Set[str]] = [f.tokens for f in filters]
         all_tokens: Set[str] = set().union(*all_token_sets)
 
@@ -291,14 +346,17 @@
     def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
         if not len(filters_) >= 2:
             raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
-        filter = cls.__subset_filter(*filters_)
+        filter = cls.__member_filter(*filters_)
         return filter
 
 
-class AndFilter(Filter):
+class AndFilter:
     """Instantiate with more than one filter to compose them into one filter,
-    returning False when any individual filter fails to match or True otherwise.
-    Requires at least two filters."""
+    returning False when any individual filter fails to match or True
+    otherwise.
+
+    Requires at least two filters.
+    """
 
     def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
         if not len(filters_) >= 2:
@@ -323,6 +381,11 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
+    "LongAlphabetic",
+    "LongPhonotactic",
+    "LongProperName",
+    "LongSyllabic",
+    "MinLen",
     "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
{sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Preprocessors.py +12 -6

@@ -2,7 +2,7 @@
 "Preprocessors" are classes which strip content from a given string prior to tokenization.
 There are currently two distinct types of Preprocessor:
 
-- Remove a token from a string which would be difficult to identify after tokenization.
+- Remove a token from a string which would be difficult to identify after tokenization.
   - URLs
   - DiscordEmotes
 - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
@@ -61,21 +61,24 @@ Ignorables are tokens which do not count toward the accepted number of tokens
 or the total number of tokens.
 This is generally because they are considered external to Toki Pona.
 
-It is likely that every user will want to use these.
+It is likely that every user will want to use these.
 Not having them will cause many false negatives, such as when a URL is divided
 into its parts and checked as a token.
 """
 
 
 class URLs(RegexPreprocessor):
-    """Remove http(s) protocol URLs"""
+    """Remove http(s) protocol URLs."""
 
     pattern = re.compile(r"https?:\/\/\S+")
 
 
 class Reference(RegexPreprocessor):
     """Remove text contained in double brackets.
-    Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+    Often used to fetch articles on Wikipedia, or Magic the Gathering
+    cards.
+    """
 
     pattern = re.compile(r"\[\[.+\]\]")
 
 
@@ -100,7 +103,10 @@ class DiscordSpecial(RegexPreprocessor):
 
 class AngleBracketObject(RegexPreprocessor):
     """A generalized version of the Discord-specific angle bracket objects.
-    Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+    Removes any contiguous (not broken by whitespace) text in angle
+    brackets.
+    """
 
     pattern = re.compile(r"<[^<>\s]+>")
 
@@ -111,7 +117,7 @@ The following classes are Containers.
 Containers are a special case of Ignorables, where an entire segment of an input
 may be removed and not counted toward the accepted or total number of tokens.
 
-Some users may prefer to use these so that they may quote third parties who
+Some users may prefer to use these so that they may quote third parties who
 would likely be using a language other than Toki Pona.
 """
{sonatoki-0.3.1 → sonatoki-0.3.3}/src/sonatoki/Scorers.py +54 -51

@@ -13,22 +13,52 @@ Number = Union[int, float]
 Weights = Dict[str, Number]
 
 
-def sigmoid(n: int) -> Number:
-    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-    # n-1 makes sigmoid(1) == 0.5
-    # 0.30 softens scaling in favor of short input
-    # return n / (1+abs(n)) # too weak in 0.7+
-
-
 class Scorer(ABC):
     @classmethod
     @abstractmethod
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        """Score a list of tokens using the given `Filter`s, returning a
+        `Number` between 0 and 1 inclusive."""
         raise NotImplementedError
 
 
+class Soften(Scorer):
+    """Meta `Scorer` which scales the scores of short messages to reduce the
+    impact of shortness on scoring.
+
+    The scores of short messages are scaled by mapping the token count
+    to [0.5, 1.0] via the sigmoid function, then raising the score to
+    the resultant power.
+
+    For example, a single token scoring 0.64 will score 0.8 instead.
+    """
+
+    @staticmethod
+    def sigmoid(n: int) -> Number:
+        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+        # n-1 makes sigmoid(1) == 0.5
+        # 0.30 softens scaling in favor of short input
+        # return n / (1+abs(n)) # too weak in 0.7+
+
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        percentage = super().score(tokens, filters)  # type: ignore [abstractmethod]
+        len_tokens = len(tokens)
+        percentage **= cls.sigmoid(len_tokens)
+        return percentage
+
+    def __new__(cls, scorer: Type[Scorer]) -> Type[Scorer]:
+        class SoftenedScorer(Soften, scorer): ...
+
+        return SoftenedScorer
+
+
 class PassFail(Scorer):
-    """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
+    """If a token matches any filter, it scores 1.
+
+    Otherwise, it scores 0.
+    """
 
     @classmethod
     def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
@@ -50,28 +80,17 @@ class PassFail(Scorer):
         return total_score / len_tokens if len_tokens else 0
 
 
-class SoftPassFail(PassFail):
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
-
-        total_score = 0
-        len_tokens = len(tokens)
-        for token in tokens:
-            total_score += cls.score_token(token, filters)
-
-        percentage = total_score / len_tokens if len_tokens else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
-
-
-class Scaling(Scorer):
-    """
-    The sooner a token matches a filter, the higher its score.
-    In other words, filter order matters, weighing earlier listed filters higher than later ones.
-    This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
+class Scaling(Scorer):
+    """Tokens score 1 for matching the first filter, and a linearly reduced
+    amount for matching later filters based on how many filters there are.
+
+    For example, if there are 4 filters, a token scores 1.0, 0.75, 0.50,
+    and 0.25 for matching each respectively.
+
+    In other words, filter order matters, weighing earlier listed
+    filters higher than later ones. This is desirable to avoid messages
+    which would only match weaker filters, as these are less likely to
+    be Toki Pona.
     """
 
     @classmethod
@@ -95,33 +114,17 @@ class Scaling(Scorer):
         return total_score / max_score if max_score else 0
 
 
-class SoftScaling(Scaling):
-    """Shorter messages are subject to less harsh scoring
-    by mapping the token count to [0.5, 1.0] via the sigmoid function,
-    then raising the score to the resultant power.
-    For example, a single token scoring 0.64 will now score 0.8.
-    """
+class SoftPassFail(Soften, PassFail):
+    """Same as `PassFail`, but shorter messages are subject to less harsh
+    scoring."""
 
-    @classmethod
-    @override
-    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
-        if not tokens:
-            return 1
 
-        total_score = 0
-        len_filters = len(filters)
-        len_tokens = len(tokens)
-
-        max_score = len_tokens * len_filters
-        for token in tokens:
-            total_score += cls.score_token(token, filters, len_filters)
-
-        percentage = total_score / max_score if max_score else 0
-        percentage **= sigmoid(len_tokens)
-        return percentage
+class SoftScaling(Soften, Scaling):
+    """Same as `Scaling`, but shorter messages are subject to less harsh
+    scoring."""
 
 
-class Logarithmic(Scorer): ...
+# class Logarithmic(Scorer): ...
 
 
 __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
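The `Soften` docstring's worked example can be checked directly against the `sigmoid` shown in this hunk: with a single token the exponent is 0.5, so a raw score of 0.64 is softened to 0.8:

```python
# Check of the Soften docstring's example using the sigmoid defined above.
import math

def sigmoid(n: int) -> float:
    return 1 / (1 + math.exp(-(0.30 * (n - 1))))

print(sigmoid(1))          # 0.5: the exponent for a one-token message
print(0.64 ** sigmoid(1))  # 0.8: the softened score from the docstring
```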