sonatoki 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- sonatoki/Configs.py +80 -0
- sonatoki/Filters.py +5 -0
- sonatoki/Preprocessors.py +4 -4
- sonatoki/Scorers.py +28 -11
- sonatoki/Tokenizers.py +43 -31
- sonatoki/constants.py +10 -0
- sonatoki/ilo.py +30 -30
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -0
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.3.dist-info}/METADATA +30 -24
- sonatoki-0.1.3.dist-info/RECORD +16 -0
- sonatoki-0.1.1.dist-info/RECORD +0 -14
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.3.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.1.dist-info → sonatoki-0.1.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
ADDED
@@ -0,0 +1,80 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, TypedDict
+
+# PDM
+from typing_extensions import NotRequired
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numerics,
+    Syllabic,
+    NimiLinku,
+    NimiPuAle,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    NimiLinkuAle,
+    Punctuations,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+from sonatoki.Preprocessors import (
+    URLs,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numerics, Punctuations],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerTok,
+}
+
+
+PrefConfig: IloConfig = deepcopy(BaseConfig)
+PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+PrefConfig["scorer"] = SoftScaling
+
+
+LazyConfig: IloConfig = deepcopy(BaseConfig)
+LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+LazyConfig["scorer"] = SoftPassFail
+
+DiscordConfig: IloConfig = deepcopy(PrefConfig)
+DiscordConfig["preprocessors"].extend(
+    [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+)
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "IloConfig",
+    "BaseConfig",
+    "PrefConfig",
+    "LazyConfig",
+    "DiscordConfig",
+    "TelegramConfig",
+    "ForumConfig",
+]

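The keys of IloConfig mirror the keyword arguments of the Ilo constructor (see the ilo.py changes below), so a config can presumably be unpacked straight into Ilo. A minimal sketch, not part of the diff, assuming Ilo is importable from sonatoki.ilo:

    # Sketch only: assumes IloConfig keys match Ilo.__init__'s keyword arguments,
    # as the ilo.py changes later in this diff suggest.
    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)  # SoftScaling scorer; NimiLinku, Syllabic, ProperName, Alphabetic filters
    print(ilo.is_toki_pona("mi olin e sina"))  # expected to pass (True)
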
sonatoki/Filters.py
CHANGED
@@ -17,6 +17,7 @@ from sonatoki.constants import (
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    NIMI_LINKU_SANDBOX_SET,
 )

 re.DEFAULT_VERSION = re.VERSION1

@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
     tokens = NIMI_LINKU_ALE_SET


+class NimiLinkuSandbox(SetFilter):
+    tokens = NIMI_LINKU_SANDBOX_SET
+
+
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
     Excludes both consecutive nasals and the illegal syllables:

sonatoki/Preprocessors.py
CHANGED
@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
 - ArrowQuote

 Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
-
+It is up to the user to order them appropriately.
 """

 # STL

@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


 class Preprocessor(ABC):
-    precedence: int = 0
-
     @classmethod  # order matters
     @abstractmethod
     def process(cls, msg: str) -> str:

@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""

-    precedence = -10
     pattern = re.compile(r"`[^`]+`", flags=re.S)


@@ -121,6 +118,9 @@ class ArrowQuote(RegexPreprocessor):


 __all__ = [
+    "DiscordChannels",
+    "DiscordMentions",
+    "DiscordSpecial",
     "DiscordEmotes",
     "SingleQuotes",
     "DoubleQuotes",

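With the precedence attribute gone, the caller decides the order in which preprocessors run; per the docstring above, order only matters when delimiters overlap. A small sketch, not part of the diff, using two classes from this module (the exact output depends on each pattern's substitution, so none is asserted here):

    # Sketch only: ordering matters when containers overlap.
    from sonatoki.Preprocessors import Backticks, DoubleQuotes

    s = 'toki "sina ` pona " a`'
    print(DoubleQuotes.process(Backticks.process(s)))  # backtick span removed first
    print(Backticks.process(DoubleQuotes.process(s)))  # quoted span removed first; typically a different result
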
sonatoki/Scorers.py
CHANGED
@@ -16,6 +16,13 @@ Number = Union[int, float]
 Weights = Dict[str, Number]


+def sigmoid(n: int) -> Number:
+    return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+    # n-1 makes sigmoid(1) == 0.5
+    # 0.30 softens scaling in favor of short input
+    # return n / (1+abs(n)) # too weak in 0.7+
+
+
 class Scorer(ABC):
     @classmethod
     @abstractmethod

@@ -27,7 +34,7 @@ class PassFail(Scorer):
     """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""

     @classmethod
-    def
+    def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
         for f in filters:
             if f.filter(token):
                 score = 1

@@ -47,10 +54,27 @@ class PassFail(Scorer):
         total_score = 0
         len_tokens = len(tokens)
         for token in tokens:
-            total_score += cls.
+            total_score += cls.score_token(token, filters)
         return total_score / len_tokens if len_tokens else 0


+class SoftPassFail(PassFail):
+    @classmethod
+    @override
+    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+        if not tokens:
+            return 1
+
+        total_score = 0
+        len_tokens = len(tokens)
+        for token in tokens:
+            total_score += cls.score_token(token, filters)
+
+        percentage = total_score / len_tokens if len_tokens else 0
+        percentage **= sigmoid(len_tokens)
+        return percentage
+
+
 class Scaling(Scorer):
     """
     The sooner a token matches a filter, the higher its score.

@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
     For example, a single token scoring 0.64 will now score 0.8.
     """

-    @staticmethod
-    def sigmoid(n: int) -> Number:
-        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
-        # n-1 makes sigmoid(1) == 0.5
-        # 0.30 softens scaling in favor of short input
-        # return n / (1+abs(n)) # too weak in 0.7+
-
     @classmethod
     @override
     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:

@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
             total_score += cls.score_token(token, filters, len_filters)

         percentage = total_score / max_score if max_score else 0
-        percentage **=
+        percentage **= sigmoid(len_tokens)
         return percentage


 class Logarithmic(Scorer): ...


-__all__ = ["PassFail", "Scaling", "SoftScaling"]
+__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]

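The module-level sigmoid now serves both SoftScaling and the new SoftPassFail: the raw percentage is raised to the power sigmoid(len_tokens), which is 0.5 for a single token and climbs toward 1 for longer messages, so short messages are penalized less. Checking the numbers (a sketch, not package code):

    import math

    def sigmoid(n: int) -> float:
        return 1 / (1 + math.exp(-(0.30 * (n - 1))))

    print(sigmoid(1))             # 0.5
    print(0.64 ** sigmoid(1))     # 0.8, the example in the SoftScaling docstring
    print(round(sigmoid(10), 3))  # 0.937: longer input, exponent near 1, little softening
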
sonatoki/Tokenizers.py
CHANGED
@@ -1,8 +1,10 @@
 # STL
-from
+from abc import ABC, abstractmethod
+from typing import List

 # PDM
 import regex as re
+from typing_extensions import override

 try:
     # PDM

@@ -15,50 +17,60 @@ except ImportError as e:

 LANGUAGE = "english"  # for NLTK

-SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
-SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)

-
-
+class Tokenizer(ABC):
+    @classmethod
+    @abstractmethod
+    def tokenize(cls, s: str) -> List[str]: ...


-
-
+class NoOpTokenizer(Tokenizer):
+    """This is a special case that you do not want or need."""

-
-
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [s]

-Tokenizer = Callable[[str], List[str]]

+class RegexTokenizer(Tokenizer):
+    pattern: "re.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]

-if not isinstance(nltk, ImportError):

-
-
+class WordTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose

-    def word_tokenize_nltk(s: str) -> List[str]:
-        return __word_tokenize_nltk(text=s, language=LANGUAGE)

+class SentTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")

-def sent_tokenize_re(s: str) -> List[str]:
-    return [
-        clean
-        for sent in re.findall(SENT_DELIMS_RE, s)
-        if (clean := sent[0].strip() or sent[1].strip())
-    ]

+class WordTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!;:'"-])""")

-def word_tokenize_re(s: str) -> List[str]:
-    return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]

+class SentTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")

-def sent_tokenize_tok(s: str) -> List[str]:
-    return [
-        clean
-        for sent in re.findall(SENT_DELIMS_TOK, s)
-        if (clean := sent[0].strip() or sent[1].strip())
-    ]

+if not isinstance(nltk, ImportError):

-
-
+    class WordTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+    class SentTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __sent_tokenize_nltk(text=s, language=LANGUAGE)

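Tokenizers are now classes exposing a tokenize classmethod rather than bare functions. Because WordTokenizerTok's pattern wraps the punctuation/whitespace alternation in a capturing group, re.split keeps punctuation runs as tokens while whitespace-only pieces strip to empty and are dropped. A quick sketch of the expected behaviour:

    # Sketch only: the new class-based tokenizer API.
    from sonatoki.Tokenizers import WordTokenizerTok

    print(WordTokenizerTok.tokenize("mi olin e sina!"))
    # punctuation is kept as its own token and whitespace is dropped,
    # so something like ['mi', 'olin', 'e', 'sina', '!'] is expected
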
sonatoki/constants.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List
 from pathlib import Path

 LINKU = Path(__file__).resolve().parent / Path("linku.json")
+SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"

@@ -29,10 +30,16 @@ with open(LINKU) as f:
     ]
     NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+with open(SANDBOX) as f:
+    r: Dict[str, Dict[str, str]] = json.loads(f.read())
+    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
 NIMI_PU_SET = set(NIMI_PU)
 NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
 NIMI_LINKU_SET = set(NIMI_LINKU)
 NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
 ALLOWABLES_SET = set(ALLOWABLES)

 __all__ = [

@@ -54,4 +61,7 @@ __all__ = [
     #
     "NIMI_LINKU_ALE",
     "NIMI_LINKU_ALE_SET",
+    #
+    "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_SANDBOX_SET",
 ]

sonatoki/ilo.py
CHANGED
@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
-    __tokenize: Tokenizer
     __passing_score: Number
-    logging_threshold: Number = 1
+    logging_threshold: Number = -1

     def __init__(
         self,

@@ -29,61 +29,62 @@ class Ilo:
         ignoring_filters: List[Type[Filter]],
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
-        tokenizer: Tokenizer,  # NOTE: no wrapper needed?
         passing_score: Number,
+        word_tokenizer: Type[Tokenizer],
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
-        self.__tokenize = tokenizer
         self.__passing_score = passing_score

-    def
+    def preprocess(self, msg: str) -> str:
         for p in self.__preprocessors:
             msg = p.process(msg)
         return msg

-    def
+    def word_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
+        return self.__word_tokenizer.tokenize(msg)
+
+    def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
         return token

-    def
-        # NOTE: tested, making a new list with a for loop *is* faster than
-        # -
-        # - generator comps
-        # - in-place replacement/removal
-        # - in place replacement with result of generator comp
+    def clean_tokens(self, tokens: List[str]) -> List[str]:
+        # NOTE: tested, making a new list with a for loop *is* faster than:
+        # list comp, generator comp, in-place replacement
         cleaned_tokens: List[str] = list()
         for token in tokens:
-            cleaned_token = self.
+            cleaned_token = self.clean_token(token)
            if not cleaned_token:
                 # TODO: warn user?
                 continue
             cleaned_tokens.append(cleaned_token)
         return cleaned_tokens

-    def
+    def _filter_token(self, token: str) -> bool:
         for f in self.__ignoring_filters:
             if f.filter(token):
                 return True
         return False

-    def
+    def filter_tokens(self, tokens: List[str]) -> List[str]:
         filtered_tokens: List[str] = []
         for token in tokens:
-            if self.
+            if self._filter_token(token):
                 continue
             # the ignoring filter is true if the token matches
             # the user wants to ignore these so keep non-matching tokens
             filtered_tokens.append(token)
         return filtered_tokens

-    def
+    def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)

     def _is_toki_pona(

@@ -95,26 +96,25 @@
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)
-
-
-
-
-
-        score = self.__score_tokens(cleaned)
+        - Result (bool)"""
+        preprocessed = self.preprocess(message)
+        tokenized = self.word_tokenize(preprocessed)
+        filtered = self.filter_tokens(tokenized)
+        cleaned = self.clean_tokens(filtered)
+        score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        # NOTE: this method may break if above funcs start sharing a list
         if score <= self.logging_threshold:
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
+            LOG.debug("msg: %.2f %s", score, repr(message))
+            LOG.debug("preproc: %s", repr(preprocessed))
+            LOG.debug("tokenized: %s", tokenized)
+            LOG.debug("filtered: %s", filtered)
+            LOG.debug("cleaned: %s", cleaned)
         # TODO: Move to each function? Loses ability to control when logging occurs by threshold

         return preprocessed, tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
+        """Determines whether a single statement is or is not Toki Pona."""
         *_, result = self._is_toki_pona(message)
         return result

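The renamed methods expose each pipeline stage publicly, so a message can also be walked through step by step. A sketch, not part of the diff, reusing PrefConfig from Configs.py above:

    # Sketch only: each stage of the Ilo pipeline called individually.
    from sonatoki.ilo import Ilo
    from sonatoki.Configs import PrefConfig

    ilo = Ilo(**PrefConfig)
    msg = "ilo Discord o, mi mu e sina!"
    pre = ilo.preprocess(msg)         # preprocessors (URLs) run first
    tokens = ilo.word_tokenize(pre)   # WordTokenizerTok from the config
    kept = ilo.filter_tokens(tokens)  # drop tokens matching Numerics/Punctuations
    cleaned = ilo.clean_tokens(kept)  # ConsecutiveDuplicates cleaner
    print(ilo.score_tokens(cleaned))  # SoftScaling score in [0, 1]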
|