sonatoki 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -2,6 +2,9 @@
 from copy import deepcopy
 from typing import List, Type, TypedDict
 
+# PDM
+from typing_extensions import NotRequired
+
 # LOCAL
 from sonatoki.Filters import (
     Filter,
@@ -26,7 +29,7 @@ from sonatoki.Filters import (
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
     Backticks,
@@ -38,12 +41,13 @@ from sonatoki.Preprocessors import (
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
-    word_tokenizer: Type[Tokenizer]
     cleaners: List[Type[Cleaner]]
     ignoring_filters: List[Type[Filter]]
     scoring_filters: List[Type[Filter]]
     scorer: Type[Scorer]
     passing_score: Number
+    word_tokenizer: NotRequired[Type[Tokenizer]]
+    sent_tokenizer: NotRequired[Type[Tokenizer]]
 
 
 # TODO: branching configs? config builder?
@@ -55,7 +59,6 @@ BaseConfig: IloConfig = {
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
@@ -71,7 +74,6 @@ PrefConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 CorpusConfig: IloConfig = {
@@ -94,7 +96,6 @@ CorpusConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
@@ -104,7 +105,6 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -125,7 +125,6 @@ IsipinEpikuConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
@@ -141,7 +140,6 @@ DiscordConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
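
With `word_tokenizer` now `NotRequired` (and `sent_tokenizer` joining it), a config may omit both keys and let `Ilo` fall back to its defaults. A minimal sketch of such a config; the component choices below are illustrative, not recommendations:

```
# Sketch: both tokenizer keys omitted, since IloConfig marks them NotRequired.
from sonatoki.Configs import IloConfig
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Preprocessors import URLs
from sonatoki.Scorers import PassFail

my_config: IloConfig = {
    "preprocessors": [URLs],
    "cleaners": [ConsecutiveDuplicates],
    "ignoring_filters": [],
    "scoring_filters": [],
    "scorer": PassFail,
    "passing_score": 0.8,
    # "word_tokenizer" and "sent_tokenizer" absent: Ilo supplies
    # WordTokenizer and SentTokenizer as defaults (see ilo.py below).
}
```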
sonatoki/ilo.py CHANGED
@@ -5,12 +5,17 @@ from typing import List, Type, Tuple
 from sonatoki.Filters import Filter
 from sonatoki.Scorers import Number, Scorer
 from sonatoki.Cleaners import Cleaner
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
+# tokenized, filtered, cleaned, score, result
+Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
+# TODO: scorecard kinda sucks as a name
+
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __sent_tokenizer: Type[Tokenizer]
     __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
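
The new `Scorecard` alias fixes the shape of what `_is_toki_pona` returns. A sketch of unpacking one, assuming configs are meant to be splatted into the constructor (the sample sentence is arbitrary):

```
from sonatoki.Configs import PrefConfig
from sonatoki.ilo import Ilo

ilo = Ilo(**PrefConfig)
# _is_toki_pona now expects preprocessed text, so preprocess explicitly.
msg = ilo.preprocess("mi olin e sina!")
# Scorecard order: tokenized, filtered, cleaned, score, result
tokenized, filtered, cleaned, score, result = ilo._is_toki_pona(msg)
```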
@@ -26,11 +31,13 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
-        word_tokenizer: Type[Tokenizer],
+        word_tokenizer: Type[Tokenizer] = WordTokenizer,
+        sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__sent_tokenizer = sent_tokenizer
         self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
@@ -47,6 +54,9 @@ class Ilo:
         """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__word_tokenizer.tokenize(msg)
 
+    def sent_tokenize(self, msg: str) -> List[str]:
+        return self.__sent_tokenizer.tokenize(msg)
+
     def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
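
Because both tokenizer parameters now carry defaults, an `Ilo` can be constructed without naming either, and the new `sent_tokenize` pass-through is available immediately. A sketch with deliberately minimal, illustrative components:

```
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Scorers import PassFail
from sonatoki.ilo import Ilo

ilo = Ilo(
    preprocessors=[],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[],
    scoring_filters=[],
    scorer=PassFail,
    passing_score=0.8,
    # word_tokenizer and sent_tokenizer omitted: they default to
    # WordTokenizer and SentTokenizer.
)
sentences = ilo.sent_tokenize("toki! mi pona.")  # delegates to SentTokenizer
```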
@@ -83,26 +93,60 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
-    def _is_toki_pona(
-        self, message: str
-    ) -> Tuple[str, List[str], List[str], List[str], Number, bool]:
-        """Returns all components of the processing algorithm:
-        - Preprocessed message (str)
+    def _is_toki_pona(self, message: str) -> Scorecard:
+        """Process a message into its tokens, then filter, clean, and score
+        them. Returns all parts. The message must already be preprocessed,
+        normally done in `self.is_toki_pona(message)`.
+
+        Returns all components of the processing algorithm except preprocessing:
         - Tokenized message (list[str])
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)"""
-        preprocessed = self.preprocess(message)
-        tokenized = self.word_tokenize(preprocessed)
+        - Result (bool)
+        """
+        tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score
 
-        return preprocessed, tokenized, filtered, cleaned, score, result
+        return tokenized, filtered, cleaned, score, result
 
     def is_toki_pona(self, message: str) -> bool:
         """Determines whether a single statement is or is not Toki Pona."""
+        message = self.preprocess(message)
         *_, result = self._is_toki_pona(message)
         return result
+
+    def _are_toki_pona(self, message: str) -> List[Scorecard]:
+        """Split a message into sentences, then return a list of each
+        sentence's results via `self._is_toki_pona()`.
+
+        The message must already be preprocessed, normally done in
+        `self.are_toki_pona(message)`.
+        """
+        results: List[Scorecard] = list()
+        for sentence in self.sent_tokenize(message):
+            result = self._is_toki_pona(sentence)
+            results.append(result)
+        return results
+
+    def are_toki_pona(self, message: str) -> List[bool]:
+        """Splits a statement into sentences, then determines if each is or is not Toki Pona.
+        NOTE: You will need to decide how to score the result. Examples:
+
+        ```
+        def all_must_pass(message: str) -> bool:
+            return all(ILO.are_toki_pona(message))
+
+        def portion_must_pass(message: str, score: Number = 0.8) -> bool:
+            results = ILO.are_toki_pona(message)
+            sent_count = len(results)
+            passing = results.count(True)
+            return (passing / sent_count) >= score
+        ```
+        """
+        message = self.preprocess(message)
+        results = self._are_toki_pona(message)
+        return [res[-1] for res in results]
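
Two caller-facing consequences follow from this hunk: `_is_toki_pona` no longer preprocesses its input (and no longer returns the preprocessed string), and `are_toki_pona` yields one verdict per sentence, leaving aggregation policy to the caller. A sketch of both, assuming `PrefConfig`:

```
from sonatoki.Configs import PrefConfig
from sonatoki.ilo import Ilo

ilo = Ilo(**PrefConfig)

# 0.3.3: preprocessed, tokenized, filtered, cleaned, score, result = ilo._is_toki_pona(raw)
# 0.4.0: preprocess explicitly, then unpack the five-element Scorecard.
msg = ilo.preprocess("mi pona. how about you?")
tokenized, filtered, cleaned, score, result = ilo._is_toki_pona(msg)

# Per-sentence verdicts; aggregate however suits your application,
# e.g. portion_must_pass from the docstring above.
verdicts = ilo.are_toki_pona("mi pona. how about you?")
passing_ratio = verdicts.count(True) / len(verdicts)
```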
sonatoki-0.3.3.dist-info/METADATA → sonatoki-0.4.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.3
+Version: 0.4.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
sonatoki-0.3.3.dist-info/RECORD → sonatoki-0.4.0.dist-info/RECORD CHANGED
@@ -1,8 +1,8 @@
-sonatoki-0.3.3.dist-info/METADATA,sha256=b78h2-lsc4aBzkEWrzXTslfvJb-ZVbFjCpgPOF_kYrg,6341
-sonatoki-0.3.3.dist-info/WHEEL,sha256=SOP-4bEE0jbVaCHQGVvF08uWxk5rcSsfEybvoQVHlD8,90
-sonatoki-0.3.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.4.0.dist-info/METADATA,sha256=Z89tIHyGG9RRAgcr_3E4XW2IMX9NyT9mawcCeMQfXPU,6341
+sonatoki-0.4.0.dist-info/WHEEL,sha256=SOP-4bEE0jbVaCHQGVvF08uWxk5rcSsfEybvoQVHlD8,90
+sonatoki-0.4.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=ZNKJGeAxgolAxqNSeS7iADvQEcN4X3tY0iaoiAaL95U,4160
+sonatoki/Configs.py,sha256=tOeJSlYXMBHbRPBxERGWGT5AjvCxNb3ZGu8GA4BYve4,4034
 sonatoki/Filters.py,sha256=mpJBl-YPMF-Yl6mKFXf0D6DwkPR6H424RlvrkSeh4Dc,10714
 sonatoki/Preprocessors.py,sha256=nvAzxpWP9WwT6gOCKcuiz5F8xYDdKIt9bOVUvy9o-G0,4459
 sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
@@ -10,9 +10,9 @@ sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
 sonatoki/constants.py,sha256=wH3iR32-Ic7vSkrMjAZIvmIysTtkJ-KBVU5zv3Oamqs,12656
-sonatoki/ilo.py,sha256=yyLgNPI0Hmb4f1BzX6IRHr11FPChfL2xDR_9odlr8_8,3849
+sonatoki/ilo.py,sha256=7KwTZgczzU2gbhC69yZbxtpTHy_fGtg_MnG_bDpiSxM,5639
 sonatoki/linku.json,sha256=fm4-dks5s9x1bs7q82GNngAedVCWilMPCQ_o-j35QL0,270950
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=zPtZgJ_CpJa-2Den0gTNlk52f-YEwFVcjMarQXeeu5U,77563
 sonatoki/utils.py,sha256=L984aXxvzfJaZ6GSWRKs7LweOGZYTLK11CdAhpLQr0g,4067
-sonatoki-0.3.3.dist-info/RECORD,,
+sonatoki-0.4.0.dist-info/RECORD,,