sonatoki 0.3.3__tar.gz → 0.4.0__tar.gz
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {sonatoki-0.3.3 → sonatoki-0.4.0}/PKG-INFO +1 -1
- {sonatoki-0.3.3 → sonatoki-0.4.0}/pyproject.toml +1 -1
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Configs.py +6 -8
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/ilo.py +55 -11
- {sonatoki-0.3.3 → sonatoki-0.4.0}/LICENSE +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/README.md +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/__init__.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_filters.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_ilo.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_properties.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_scorers.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_tokenize.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/test_utils.py +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.3.3 → sonatoki-0.4.0}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/Configs.py

```diff
@@ -2,6 +2,9 @@
 from copy import deepcopy
 from typing import List, Type, TypedDict
 
+# PDM
+from typing_extensions import NotRequired
+
 # LOCAL
 from sonatoki.Filters import (
     Filter,
@@ -26,7 +29,7 @@ from sonatoki.Filters import (
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer
 from sonatoki.Preprocessors import (
     URLs,
     Backticks,
@@ -38,12 +41,13 @@ from sonatoki.Preprocessors import (
 
 class IloConfig(TypedDict):
     preprocessors: List[Type[Preprocessor]]
-    word_tokenizer: Type[Tokenizer]
     cleaners: List[Type[Cleaner]]
     ignoring_filters: List[Type[Filter]]
     scoring_filters: List[Type[Filter]]
     scorer: Type[Scorer]
     passing_score: Number
+    word_tokenizer: NotRequired[Type[Tokenizer]]
+    sent_tokenizer: NotRequired[Type[Tokenizer]]
 
 
 # TODO: branching configs? config builder?
```
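`IloConfig` now treats both tokenizers as optional: `word_tokenizer` becomes `NotRequired`, and a `sent_tokenizer` slot is added alongside it. `NotRequired` is imported from `typing_extensions` because it only landed in the stdlib `typing` module in Python 3.11. A minimal sketch of a config that leans on the new defaults (the cleaner, filter, and scorer choices here are illustrative, not taken from the diff):

```python
from sonatoki.Configs import IloConfig
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Filters import Alphabetic, ProperName
from sonatoki.Scorers import SoftScaling

# Both tokenizer keys may be omitted now that they are NotRequired;
# Ilo(**my_config) will fall back to its default tokenizers.
my_config: IloConfig = {
    "preprocessors": [],
    "cleaners": [ConsecutiveDuplicates],
    "ignoring_filters": [],
    "scoring_filters": [Alphabetic, ProperName],
    "scorer": SoftScaling,
    "passing_score": 0.8,
}
```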
```diff
@@ -55,7 +59,6 @@ BaseConfig: IloConfig = {
     "scoring_filters": [],
     "scorer": PassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
@@ -71,7 +74,6 @@ PrefConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 CorpusConfig: IloConfig = {
@@ -94,7 +96,6 @@ CorpusConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
@@ -104,7 +105,6 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -125,7 +125,6 @@ IsipinEpikuConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 
@@ -141,7 +140,6 @@ DiscordConfig: IloConfig = {
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
-    "word_tokenizer": WordTokenizer,
 }
 
 TelegramConfig: IloConfig = deepcopy(PrefConfig)
```
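Every bundled config drops its explicit `"word_tokenizer": WordTokenizer` entry, since the `Ilo` constructor (below) now defaults to `WordTokenizer` itself. Unpacking a config into `Ilo` works exactly as before; a quick sketch (the example sentence and expected output are assumptions, not from the diff):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

# PrefConfig no longer names a tokenizer; Ilo supplies the defaults.
ilo = Ilo(**PrefConfig)
print(ilo.is_toki_pona("mi olin e sina"))  # should print True for this sentence
```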
{sonatoki-0.3.3 → sonatoki-0.4.0}/src/sonatoki/ilo.py

```diff
@@ -5,12 +5,17 @@ from typing import List, Type, Tuple
 from sonatoki.Filters import Filter
 from sonatoki.Scorers import Number, Scorer
 from sonatoki.Cleaners import Cleaner
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, SentTokenizer, WordTokenizer
 from sonatoki.Preprocessors import Preprocessor
 
+# tokenized, filtered, cleaned, score, result
+Scorecard = Tuple[List[str], List[str], List[str], Number, bool]
+# TODO: scorecard kinda sucks as a name
+
 
 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __sent_tokenizer: Type[Tokenizer]
     __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
```
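`Scorecard` is a plain tuple alias, so the result of `_is_toki_pona` unpacks positionally in the order noted in the comment above the alias. A sketch, assuming you are willing to call the underscore-private method directly and preprocess first, as its docstring requires:

```python
from sonatoki.ilo import Ilo, Scorecard
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
# Unpacks as (tokenized, filtered, cleaned, score, result), per the alias comment.
card: Scorecard = ilo._is_toki_pona(ilo.preprocess("o lukin e ni"))
tokenized, filtered, cleaned, score, result = card
```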
```diff
@@ -26,11 +31,13 @@ class Ilo:
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
         passing_score: Number,
-        word_tokenizer: Type[Tokenizer],
+        word_tokenizer: Type[Tokenizer] = WordTokenizer,
+        sent_tokenizer: Type[Tokenizer] = SentTokenizer,
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__sent_tokenizer = sent_tokenizer
         self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
```
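Both tokenizer parameters become optional keyword arguments, so 0.3.3-style call sites that pass `word_tokenizer` keep working while new ones can omit both. A hand-rolled construction sketch (the cleaner, filter, and scorer choices are illustrative):

```python
from sonatoki.ilo import Ilo
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Filters import Alphabetic
from sonatoki.Scorers import PassFail

# No word_tokenizer or sent_tokenizer arguments: the constructor
# falls back to WordTokenizer and SentTokenizer respectively.
ilo = Ilo(
    preprocessors=[],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[],
    scoring_filters=[Alphabetic],
    scorer=PassFail,
    passing_score=0.8,
)
```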
```diff
@@ -47,6 +54,9 @@ class Ilo:
         """It is *highly* recommended that you run `ilo.preprocess` first."""
         return self.__word_tokenizer.tokenize(msg)
 
+    def sent_tokenize(self, msg: str) -> List[str]:
+        return self.__sent_tokenizer.tokenize(msg)
+
     def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
```
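The new `sent_tokenize` mirrors `word_tokenize`, delegating to the configured sentence tokenizer. As with `word_tokenize`, preprocessing first is the safe order; the exact sentence boundaries below depend on `SentTokenizer` and are only indicative:

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
text = ilo.preprocess("toki! sina pilin seme? mi pilin pona.")
for sentence in ilo.sent_tokenize(text):
    print(sentence)  # likely one line per sentence: "toki!", "sina pilin seme?", ...
```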
```diff
@@ -83,26 +93,60 @@ class Ilo:
     def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)
 
-    def _is_toki_pona(
-
-
-
-
+    def _is_toki_pona(self, message: str) -> Scorecard:
+        """Process a message into its tokens, then filters, cleans, and scores
+        them. Returns all parts. Message must already be preprocessed, normally
+        done in `self.is_toki_pona(message)`.
+
+        Returns all components of the processing algorithm except preprocessing:
         - Tokenized message (list[str])
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)
-
-        tokenized = self.word_tokenize(
+        - Result (bool)
+        """
+        tokenized = self.word_tokenize(message)
         filtered = self.filter_tokens(tokenized)
         cleaned = self.clean_tokens(filtered)
         score = self.score_tokens(cleaned)
         result = score >= self.__passing_score
 
-        return
+        return tokenized, filtered, cleaned, score, result
 
     def is_toki_pona(self, message: str) -> bool:
         """Determines whether a single statement is or is not Toki Pona."""
+        message = self.preprocess(message)
         *_, result = self._is_toki_pona(message)
         return result
+
+    def _are_toki_pona(self, message: str):
+        """Split a message into sentences, then return a list of each sentence's
+        results via `self._is_toki_pona()`.
+
+        Message must already be preprocessed, normally done in
+        `self.are_toki_pona(message)`.
+        """
+        results: List[Scorecard] = list()
+        for sentence in self.sent_tokenize(message):
+            result = self._is_toki_pona(sentence)
+            results.append(result)
+        return results
+
+    def are_toki_pona(self, message: str) -> List[bool]:
+        """Splits a statement into sentences, then determines if each is or is not Toki Pona.
+        NOTE: You will need to decide how to score the result. Examples:
+
+        ```
+        def all_must_pass(message: str) -> bool:
+            return all(ILO.are_toki_pona(message))
+
+        def portion_must_pass(message: str, score: Number = 0.8) -> bool:
+            results = ILO.are_toki_pona(message)
+            sent_count = len(results)
+            passing = results.count(True)
+            return (passing / sent_count) >= score
+        ```
+        """
+        message = self.preprocess(message)
+        results = self._are_toki_pona(message)
+        return [res[-1] for res in results]
```
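`are_toki_pona` is the sentence-level counterpart to `is_toki_pona`: preprocess once, split into sentences, score each, and return only the booleans. Beyond the aggregation recipes in the docstring, the verdicts can be paired back with their sentences, since `_are_toki_pona` walks `sent_tokenize` over the same preprocessed text. A sketch (the example message and its verdicts are assumptions, not from the diff):

```python
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
message = "mi toki pona. I am speaking English."
verdicts = ilo.are_toki_pona(message)  # plausibly [True, False]
# sent_tokenize over the preprocessed text matches the sentences scored above.
for sentence, ok in zip(ilo.sent_tokenize(ilo.preprocess(message)), verdicts):
    print(ok, sentence)
```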