sonatoki 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Cleaners.py CHANGED
@@ -10,6 +10,7 @@ class Cleaner(ABC):
  @classmethod
  @abstractmethod
  def clean(cls, token: str) -> str:
+ """Transform a token to remove some undesirable part."""
  raise NotImplementedError


@@ -33,7 +34,8 @@ class ConsecutiveDuplicates(Cleaner):
  may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".

  This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
- incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""
+ incorrectly reduced to `わ`. This does preserve phonotactic validity, though.
+ """

  @classmethod
  @override
@@ -69,4 +71,5 @@ class Lowercase(Cleaner):

  __all__ = [
  "ConsecutiveDuplicates",
+ "Lowercase",
  ]
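For readers unfamiliar with the `Cleaner` interface documented above, here is a minimal sketch of how the two exported cleaners are expected to behave. It is an illustration inferred from the docstrings in this diff, not part of the package itself.

```
# Illustration only: expected Cleaner behavior, inferred from the docstrings above.
from sonatoki.Cleaners import ConsecutiveDuplicates, Lowercase

# Emphasis-style repetition is collapsed to a single character.
assert ConsecutiveDuplicates.clean("sonaaaa") == "sona"
# Lowercase is assumed to simply fold case.
assert Lowercase.clean("SONA") == "sona"
```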
sonatoki/Configs.py CHANGED
@@ -2,6 +2,9 @@
  from copy import deepcopy
  from typing import List, Type, TypedDict

+ # PDM
+ from typing_extensions import NotRequired
+
  # LOCAL
  from sonatoki.Filters import (
  Filter,
@@ -9,6 +12,8 @@ from sonatoki.Filters import (
  Syllabic,
  NimiUCSUR,
  Alphabetic,
+ NimiKuLili,
+ NimiKuSuli,
  ProperName,
  Punctuation,
  LongSyllabic,
@@ -20,12 +25,11 @@ from sonatoki.Filters import (
  NimiLinkuCommon,
  NimiLinkuObscure,
  NimiLinkuSandbox,
- EnglishIgnorables,
  NimiLinkuUncommon,
  )
  from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
- from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+ from sonatoki.Tokenizers import Tokenizer
  from sonatoki.Preprocessors import (
  URLs,
  Backticks,
@@ -37,15 +41,16 @@ from sonatoki.Preprocessors import (

  class IloConfig(TypedDict):
  preprocessors: List[Type[Preprocessor]]
- word_tokenizer: Type[Tokenizer]
  cleaners: List[Type[Cleaner]]
  ignoring_filters: List[Type[Filter]]
  scoring_filters: List[Type[Filter]]
  scorer: Type[Scorer]
  passing_score: Number
+ word_tokenizer: NotRequired[Type[Tokenizer]]
+ sent_tokenizer: NotRequired[Type[Tokenizer]]


- # TODO: branching configs?
+ # TODO: branching configs? config builder?

  BaseConfig: IloConfig = {
  "preprocessors": [URLs],
@@ -54,7 +59,6 @@ BaseConfig: IloConfig = {
  "scoring_filters": [],
  "scorer": PassFail,
  "passing_score": 0.8,
- "word_tokenizer": WordTokenizer,
  }


@@ -70,7 +74,6 @@ PrefConfig: IloConfig = {
  ],
  "scorer": SoftScaling,
  "passing_score": 0.8,
- "word_tokenizer": WordTokenizer,
  }

  CorpusConfig: IloConfig = {
@@ -93,13 +96,8 @@ CorpusConfig: IloConfig = {
  ],
  "scorer": SoftScaling,
  "passing_score": 0.8,
- "word_tokenizer": WordTokenizer,
  }
-
-
- """
- Mimics the previous implementation of ilo pi toki pona taso
- """
+ """Mimics the previous implementation of ilo pi toki pona taso."""
  LazyConfig: IloConfig = {
  "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
@@ -107,27 +105,47 @@ LazyConfig: IloConfig = {
  "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
  "scorer": SoftPassFail,
  "passing_score": 0.8,
- "word_tokenizer": WordTokenizer,
  }
+ """This is extremely silly."""
+ IsipinEpikuConfig: IloConfig = {
+ "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
+ "cleaners": [ConsecutiveDuplicates],
+ "ignoring_filters": [Numeric, Punctuation],
+ "scoring_filters": [
+ OrMemberFilter(
+ NimiKuSuli,
+ NimiKuLili,
+ NimiLinkuUncommon,
+ NimiLinkuObscure,
+ NimiLinkuSandbox,
+ ),
+ LongSyllabic,
+ LongProperName,
+ LongAlphabetic,
+ ],
+ "scorer": SoftScaling,
+ "passing_score": 0.8,
+ }
+

  DiscordConfig: IloConfig = {
  "preprocessors": [Backticks, URLs, AngleBracketObject, Reference],
  "cleaners": [ConsecutiveDuplicates],
- "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+ "ignoring_filters": [Numeric, Punctuation],
  "scoring_filters": [
- OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+ OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
  LongSyllabic,
  LongProperName,
  LongAlphabetic,
  ],
  "scorer": SoftScaling,
  "passing_score": 0.8,
- "word_tokenizer": WordTokenizer,
  }

  TelegramConfig: IloConfig = deepcopy(PrefConfig)
  ForumConfig: IloConfig = deepcopy(PrefConfig)

+
  __all__ = [
  "BaseConfig",
  "CorpusConfig",
sonatoki/Filters.py CHANGED
@@ -127,9 +127,11 @@ class ProperName(Filter):
  When Toki Pona is written with the Latin alphabet, names are generally
  capitalized at their start. This filter identifies those tokens.

- Note that this alone cannot determine if a token is a valid name, because
- a standalone name is considered invalid in Toki Pona- names generally have head nouns.
- This tool only examines one token at a time, so cannot detect names any better than identifying their capital letter.
+ Note that this alone cannot determine if a token is a valid name,
+ because a standalone name is considered invalid in Toki Pona- names
+ generally have head nouns. This tool only examines one token at a
+ time, so cannot detect names any better than identifying their
+ capital letter.
  """

  @classmethod
@@ -187,12 +189,14 @@ class NimiUCSUR(MemberFilter):

  class Phonotactic(RegexFilter):
  """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+
  Excludes both consecutive nasals and the illegal syllables:
  - "nm", "nn"
  - "wu", "wo", "ji", "ti"

  Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
- "nn" cannot be found."""
+ "nn" cannot be found.
+ """

  pattern = re.compile(
  rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
@@ -208,8 +212,10 @@ class LongPhonotactic(MinLen, Phonotactic):

  class Syllabic(RegexFilter):
  """Determines if a given token is syllabically valid Toki Pona (or `n`).
- Words must have correctly ordered vowels and consonants, but the phonotactic
- exceptions are not considered."""
+
+ Words must have correctly ordered vowels and consonants, but the
+ phonotactic exceptions are not considered.
+ """

  # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
  # Alterative I was exploring takes ~15% more steps
@@ -236,13 +242,14 @@ class LongAlphabetic(MinLen, Alphabetic):


  class Numeric(Filter):
- """Determine if a given token is entirely numeric.
- Covers all numeric symbols in Unicode.
+ """Determine if a given token is entirely numeric. Covers all numeric
+ symbols in Unicode.

  This will fail to find numeric tokens such as "1.111" or "-42",
  but if used with the aggressive tokenizer designed for `tok`, these will be
  split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
- numeric tokens will be split from their punctuation."""
+ numeric tokens will be split from their punctuation.
+ """

  @classmethod
  @override
@@ -252,13 +259,17 @@ class Numeric(Filter):


  class Punctuation(SubsetFilter):
- """Identify whether a token is entirely punctuation. Fastest implementation."""
+ """Identify whether a token is entirely punctuation.
+
+ Fastest implementation.
+ """

  tokens = set(ALL_PUNCT)


  class PunctuationRe(RegexFilter):
  """Faster implementation of `PunctuationRe1`.
+
  Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
  """

@@ -266,7 +277,8 @@ class PunctuationRe(RegexFilter):


  class PunctuationRe1(Regex1Filter):
- """Reference implementation for identifying tokens made entirely of punctuation."""
+ """Reference implementation for identifying tokens made entirely of
+ punctuation."""

  pattern = regex.compile(
  rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
@@ -278,14 +290,16 @@ class OrFilter:
  returning True when any individual filter matches or False otherwise.
  Requires at least two filters.

- OrFilter exists as a compromise between the need to score some filters equally,
- while not adding custom behavior to scorers.
- I could have allowed a position to have a list of filters instead of one filter,
- but this would require cleaning the user's input, and nested handling of lists.
- It also would not have been as powerful- I would need another param for the and/or switch,
- or to not give users the choice.
+ OrFilter exists as a compromise between the need to score some
+ filters equally, while not adding custom behavior to scorers. I
+ could have allowed a position to have a list of filters instead of
+ one filter, but this would require cleaning the user's input, and
+ nested handling of lists. It also would not have been as powerful- I
+ would need another param for the and/or switch, or to not give users
+ the choice.

- Instead, the user is responsible for building an OrFilter out of their desired filters.
+ Instead, the user is responsible for building an OrFilter out of
+ their desired filters.
  """

  @staticmethod
@@ -336,10 +350,13 @@ class OrMemberFilter:
  return filter


- class AndFilter(Filter):
+ class AndFilter:
  """Instantiate with more than one filter to compose them into one filter,
- returning False when any individual filter fails to match or True otherwise.
- Requires at least two filters."""
+ returning False when any individual filter fails to match or True
+ otherwise.
+
+ Requires at least two filters.
+ """

  def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
  if not len(filters_) >= 2:
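The OrFilter docstring above says the user composes their desired filters into one; the configs in Configs.py use `OrMemberFilter` the same way. A minimal sketch of that composition pattern, assuming the single-token `filter(token) -> bool` classmethod interface that the scorers rely on; the `NimiKu` name and the sample token are illustrative only.

```
# Sketch of the composition described above (assumes Filter.filter(token) -> bool).
from sonatoki.Filters import OrMemberFilter, NimiKuSuli, NimiKuLili, NimiLinkuUncommon

# One combined filter, so a match against any member set is scored identically.
NimiKu = OrMemberFilter(NimiKuSuli, NimiKuLili, NimiLinkuUncommon)
print(NimiKu.filter("kipisi"))  # True if "kipisi" appears in any member set
```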
sonatoki/Preprocessors.py CHANGED
@@ -2,7 +2,7 @@
  "Preprocessors" are classes which strip content from a given string prior to tokenization.
  There are currently two distinct types of Preprocessor:

- - Remove a token from a string which would be difficult to identify after tokenization.
+ - Remove a token from a string which would be difficult to identify after tokenization.
  - URLs
  - DiscordEmotes
  - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
@@ -61,21 +61,24 @@ Ignorables are tokens which do not count toward the accepted number of tokens
  or the total number of tokens.
  This is generally because they are considered external to Toki Pona.

- It is likely that every user will want to use these.
+ It is likely that every user will want to use these.
  Not having them will cause many false negatives, such as when a URL is divided
  into its parts and checked as a token.
  """


  class URLs(RegexPreprocessor):
- """Remove http(s) protocol URLs"""
+ """Remove http(s) protocol URLs."""

  pattern = re.compile(r"https?:\/\/\S+")


  class Reference(RegexPreprocessor):
  """Remove text contained in double brackets.
- Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
+
+ Often used to fetch articles on Wikipedia, or Magic the Gathering
+ cards.
+ """

  pattern = re.compile(r"\[\[.+\]\]")

@@ -100,7 +103,10 @@ class DiscordSpecial(RegexPreprocessor):

  class AngleBracketObject(RegexPreprocessor):
  """A generalized version of the Discord-specific angle bracket objects.
- Removes any contiguous (not broken by whitespace) text in angle brackets."""
+
+ Removes any contiguous (not broken by whitespace) text in angle
+ brackets.
+ """

  pattern = re.compile(r"<[^<>\s]+>")

@@ -111,7 +117,7 @@ The following classes are Containers.
  Containers are a special case of Ignorables, where an entire segment of an input
  may be removed and not counted toward the accepted or total number of tokens.

- Some users may prefer to use these so that they may quote third parties who
+ Some users may prefer to use these so that they may quote third parties who
  would likely be using a language other than Toki Pona.
  """

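The regex patterns shown in this file can be exercised directly to see what each preprocessor is meant to strip. The following quick check uses `re` on the exact patterns from the diff; it is an illustration only, since the package wraps these in `RegexPreprocessor` classes whose substitution behavior may differ slightly.

```
# Quick check of the Reference and AngleBracketObject patterns shown above.
import re

reference = re.compile(r"\[\[.+\]\]")
angle_bracket = re.compile(r"<[^<>\s]+>")

text = "sina lukin e [[Toki Pona]] anu <:wave:12345>?"
text = reference.sub(" ", text)      # drops the double-bracketed reference
text = angle_bracket.sub(" ", text)  # drops the Discord-style emote
print(text)  # "sina lukin e   anu  ?"
```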
sonatoki/Scorers.py CHANGED
@@ -13,22 +13,52 @@ Number = Union[int, float]
  Weights = Dict[str, Number]


- def sigmoid(n: int) -> Number:
- return 1 / (1 + math.exp(-(0.30 * (n - 1))))
- # n-1 makes sigmoid(1) == 0.5
- # 0.30 softens scaling in favor of short input
- # return n / (1+abs(n)) # too weak in 0.7+
-
-
  class Scorer(ABC):
  @classmethod
  @abstractmethod
  def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+ """Score a list of tokens using the given `Filter`s, returning a
+ `Number` between 0 and 1 inclusive."""
  raise NotImplementedError


+ class Soften(Scorer):
+ """Meta `Scorer` which scales the scores of short messages to reduce the
+ impact of shortness on scoring.
+
+ The scores of short messages are scaled by mapping the token count
+ to [0.5, 1.0] via the sigmoid function, then raising the score to
+ the resultant power.
+
+ For example, a single token scoring 0.64 will score 0.8 instead.
+ """
+
+ @staticmethod
+ def sigmoid(n: int) -> Number:
+ return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+ # n-1 makes sigmoid(1) == 0.5
+ # 0.30 softens scaling in favor of short input
+ # return n / (1+abs(n)) # too weak in 0.7+
+
+ @classmethod
+ @override
+ def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+ percentage = super().score(tokens, filters) # type: ignore [abstractmethod]
+ len_tokens = len(tokens)
+ percentage **= cls.sigmoid(len_tokens)
+ return percentage
+
+ def __new__(cls, scorer: Type[Scorer]) -> Type[Scorer]:
+ class SoftenedScorer(Soften, scorer): ...
+
+ return SoftenedScorer
+
+
  class PassFail(Scorer):
- """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
+ """If a token matches any filter, it scores 1.
+
+ Otherwise, it scores 0.
+ """

  @classmethod
  def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
@@ -50,28 +80,17 @@ class PassFail(Scorer):
  return total_score / len_tokens if len_tokens else 0


- class SoftPassFail(PassFail):
- @classmethod
- @override
- def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
- if not tokens:
- return 1
-
- total_score = 0
- len_tokens = len(tokens)
- for token in tokens:
- total_score += cls.score_token(token, filters)
-
- percentage = total_score / len_tokens if len_tokens else 0
- percentage **= sigmoid(len_tokens)
- return percentage
+ class Scaling(Scorer):
+ """Tokens score 1 for matching the first filter, and a linearly reduced
+ amount for matching later filters based on how many filters there are.

+ For example, if there are 4 filters, a token scores 1.0, 0.75, 0.50,
+ and 0.25 for matching each respectively.

- class Scaling(Scorer):
- """
- The sooner a token matches a filter, the higher its score.
- In other words, filter order matters, weighing earlier listed filters higher than later ones.
- This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
+ In other words, filter order matters, weighing earlier listed
+ filters higher than later ones. This is desirable to avoid messages
+ which would only match weaker filters, as these are less likely to
+ be Toki Pona.
  """

  @classmethod
@@ -95,33 +114,17 @@ class Scaling(Scorer):
  return total_score / max_score if max_score else 0


- class SoftScaling(Scaling):
- """Shorter messages are subject to less harsh scoring
- by mapping the token count to [0.5, 1.0] via the sigmoid function,
- then raising the score to the resultant power.
- For example, a single token scoring 0.64 will now score 0.8.
- """
+ class SoftPassFail(Soften, PassFail):
+ """Same as `PassFail`, but shorter messages are subject to less harsh
+ scoring."""

- @classmethod
- @override
- def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
- if not tokens:
- return 1

- total_score = 0
- len_filters = len(filters)
- len_tokens = len(tokens)
-
- max_score = len_tokens * len_filters
- for token in tokens:
- total_score += cls.score_token(token, filters, len_filters)
-
- percentage = total_score / max_score if max_score else 0
- percentage **= sigmoid(len_tokens)
- return percentage
+ class SoftScaling(Soften, Scaling):
+ """Same as `Scaling`, but shorter messages are subject to less harsh
+ scoring."""


- class Logarithmic(Scorer): ...
+ # class Logarithmic(Scorer): ...


  __all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
sonatoki/constants.py CHANGED
@@ -380,62 +380,29 @@ CONSONANTS = "jklmnpstw"
  ALPHABET = VOWELS + CONSONANTS

  LANGUAGE = "english" # for NLTK
-
- """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
+ """Commonly occurring strings which are some kind of valid Toki Pona or
+ external token."""
  ALLOWABLES = {
  "x", # ala
  "y", # anu
  "kxk", # ken ala ken
  "wxw", # wile ala wile
+ "msa",
  }

  PHONOMATCHES = {
- # "a", # ignore
- # "an", # against
- # "i", # against
- # "in", # against
+ "non",
+ "nope",
  "some",
- "like", # against
- # "me", # against
- # "no", # against
- # "on", # against
- # "se", # against
- # "so", # against
- # "some", # against
- "to", # ignore
- # "u", # against
- # "un", # against
- "use", # against
- # "we", # against
+ "like",
+ "use",
+ "imo",
+ "time",
+ "man",
+ "also",
  }

- ALPHABETIC_MATCHES = PHONOMATCHES | {
- "a",
- # "am",
- # "as",
- # "at",
- # "aw", # aww
- # "ek", # eek
- # "ew",
- # "ik",
- # "il", # ill
- # "im",
- # "im",
- # "ip",
- # "is",
- # "it",
- # "l", # they'll
- # "m", # i'm
- # "ok",
- # "op",
- # "ow",
- # "s", # let's
- # "t", # don't
- # "up",
- # "us",
- # "ut",
- # "uw",
- }
+ ALPHABETIC_MATCHES: Set[str] = set()

  IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES

sonatoki/ilo.py CHANGED
@@ -5,12 +5,17 @@ from typing import List, Type, Tuple
  from sonatoki.Filters import Filter
  from sonatoki.Scorers import Number, Scorer
  from sonatoki.Cleaners import Cleaner
- from sonatoki.Tokenizers import Tokenizer
+ from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
  from sonatoki.Preprocessors import Preprocessor

+ # tokenized, filtered, cleaned, score, result
+ Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
+ # TODO: scorecard kinda sucks as a name
+

  class Ilo:
  __preprocessors: List[Type[Preprocessor]]
+ __sent_tokenizer: Type[Tokenizer]
  __word_tokenizer: Type[Tokenizer]
  __cleaners: List[Type[Cleaner]]
  __ignoring_filters: List[Type[Filter]]
@@ -26,11 +31,13 @@ class Ilo:
  scoring_filters: List[Type[Filter]],
  scorer: Type[Scorer],
  passing_score: Number,
- word_tokenizer: Type[Tokenizer],
+ word_tokenizer: Type[Tokenizer] = WordTokenizer,
+ sent_tokenizer: Type[Tokenizer] = SentTokenizer,
  ):
  super().__init__()
  # avoid keeping a ref to user's list just in case
  self.__preprocessors = [*preprocessors]
+ self.__sent_tokenizer = sent_tokenizer
  self.__word_tokenizer = word_tokenizer
  self.__cleaners = [*cleaners]
  self.__ignoring_filters = [*ignoring_filters]
@@ -47,6 +54,9 @@ class Ilo:
  """It is *highly* recommended that you run `ilo.preprocess` first."""
  return self.__word_tokenizer.tokenize(msg)

+ def sent_tokenize(self, msg: str) -> List[str]:
+ return self.__sent_tokenizer.tokenize(msg)
+
  def clean_token(self, token: str) -> str:
  for c in self.__cleaners:
  token = c.clean(token)
@@ -83,26 +93,60 @@ class Ilo:
  def score_tokens(self, tokens: List[str]) -> float:
  return self.__scorer.score(tokens, self.__scoring_filters)

- def _is_toki_pona(
- self, message: str
- ) -> Tuple[str, List[str], List[str], List[str], Number, bool]:
- """Returns all components of the processing algorithm:
- - Preprocessed message (str)
+ def _is_toki_pona(self, message: str) -> Scorecard:
+ """Process a message into its tokens, then filters, cleans, and scores
+ them. Returns all parts. Message must already be preprocessed, normally
+ done in `self.is_toki_pona(message)`.
+
+ Returns all components of the processing algorithm except preprocessing:
  - Tokenized message (list[str])
  - Filtered message (list[str])
  - Cleaned message (list[str])
  - Score (float)
- - Result (bool)"""
- preprocessed = self.preprocess(message)
- tokenized = self.word_tokenize(preprocessed)
+ - Result (bool)
+ """
+ tokenized = self.word_tokenize(message)
  filtered = self.filter_tokens(tokenized)
  cleaned = self.clean_tokens(filtered)
  score = self.score_tokens(cleaned)
  result = score >= self.__passing_score

- return preprocessed, tokenized, filtered, cleaned, score, result
+ return tokenized, filtered, cleaned, score, result

  def is_toki_pona(self, message: str) -> bool:
  """Determines whether a single statement is or is not Toki Pona."""
+ message = self.preprocess(message)
  *_, result = self._is_toki_pona(message)
  return result
+
+ def _are_toki_pona(self, message: str):
+ """Split a message into sentences, then return a list each sentence's
+ results via `self._is_toki_pona()`.
+
+ Message must already be preprocessed, normally done in
+ `self.are_toki_pona(message)`.
+ """
+ results: List[Scorecard] = list()
+ for sentence in self.sent_tokenize(message):
+ result = self._is_toki_pona(sentence)
+ results.append(result)
+ return results
+
+ def are_toki_pona(self, message: str) -> List[bool]:
+ """Splits a statement into sentences, then determines if each is or is not Toki Pona.
+ NOTE: You will need to decide how to score the result. Examples:
+
+ ```
+ def all_must_pass(message: str) -> bool:
+ return all(ILO.are_toki_pona(message))
+
+ def portion_must_pass(message: str, score: Number = 0.8) -> bool:
+ results = ILO.are_toki_pona(message)
+ sent_count = len(results)
+ passing = results.count(True)
+ return (passing / sent_count) >= score
+ ```
+ """
+ message = self.preprocess(message)
+ results = self._are_toki_pona(message)
+ return [res[-1] for res in results]
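The new sentence-level API preprocesses the whole message, splits it with `sent_tokenize`, and scores each sentence independently, returning one verdict per sentence. A sketch of how a caller might use it, following the combination patterns from the `are_toki_pona` docstring; it assumes the `Ilo(**PrefConfig)` construction pattern, and the sample message and printed values are illustrative since the actual splits and verdicts depend on the configured tokenizer, filters, and scorer.

```
# Sketch of the sentence-level API added in this release.
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
message = "mi toki e ni. this sentence is English. ni li pona."
verdicts = ilo.are_toki_pona(message)  # one bool per detected sentence
print(verdicts)       # e.g. [True, False, True], depending on sentence splits
print(all(verdicts))  # strictest combination, as in the docstring example
```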