sonatoki-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Cleaners.py ADDED
@@ -0,0 +1,42 @@
+ # STL
+ import re
+ from abc import ABC, abstractmethod
+
+ # PDM
+ from typing_extensions import override
+
+
+ class Cleaner(ABC):
+     @classmethod
+     @abstractmethod
+     def clean(cls, token: str) -> str:
+         raise NotImplementedError
+
+
+ class RegexCleaner(Cleaner):
+     pattern: "re.Pattern[str]"
+     replace: str
+
+     @classmethod
+     @override
+     def clean(cls, token: str) -> str:
+         return re.sub(cls.pattern, cls.replace, token)
+
+
+ class ConsecutiveDuplicates(RegexCleaner):
+     """Remove consecutive duplicates from an input string, ignoring case.
+
+     Each run of 2+ of the same character becomes `\\1`, preserving initial case.
+     For example, `FfFoo` will reduce to `Fo`, and `bBAR` will reduce to `bAR`.
+
+     This is desirable for Toki Pona written with the Latin alphabet because strings
+     may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".
+
+     This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
+     incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""
+
+     pattern = re.compile(r"(.)\1+", flags=re.IGNORECASE)
+     replace = r"\1"
+
+
+ __all__ = ["ConsecutiveDuplicates"]
sonatoki/Filters.py ADDED
@@ -0,0 +1,159 @@
+ # STL
+ from abc import ABC, abstractmethod
+ from typing import Set
+ from functools import lru_cache as cache  # cache comes in 3.9
+
+ # PDM
+ import regex as re
+ from typing_extensions import override
+
+ # LOCAL
+ from sonatoki.constants import (
+     VOWELS,
+     CONSONANTS,
+     NIMI_PU_SET,
+     ALPHABET_SET,
+     ALLOWABLES_SET,
+     NIMI_LINKU_SET,
+     NIMI_PU_ALE_SET,
+     NIMI_LINKU_ALE_SET,
+ )
+
+ re.DEFAULT_VERSION = re.VERSION1
+
+
+ class Filter(ABC):
+     @classmethod
+     @abstractmethod
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         raise NotImplementedError
+
+
+ class RegexFilter(Filter):
+     pattern: "re.Pattern[str]"
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         return not not re.fullmatch(cls.pattern, token)
+
+
+ class SetFilter(Filter):
+     tokens: Set[str]
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         return token.lower() in cls.tokens
+
+
+ class Miscellaneous(SetFilter):
+     tokens = ALLOWABLES_SET
+
+
+ class ProperName(Filter):
+     """Determines if a given token is a valid name (also called a loan word).
+     When Toki Pona is written with the Latin alphabet, names are generally
+     capitalized at their start. This filter identifies those tokens.
+
+     Note that this alone cannot determine if a token is a valid name, because
+     a standalone name is considered invalid in Toki Pona; names generally have head nouns.
+     This tool only examines one token at a time, so it cannot detect names any better than by identifying their capital letter.
+     """
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         return token == token.capitalize()
+
+
+ class NimiPu(SetFilter):
+     tokens = NIMI_PU_SET
+
+
+ class NimiPuAle(SetFilter):
+     tokens = NIMI_PU_ALE_SET
+
+
+ class NimiLinku(SetFilter):
+     tokens = NIMI_LINKU_SET
+
+
+ class NimiLinkuAle(SetFilter):
+     tokens = NIMI_LINKU_ALE_SET
+
+
+ class Phonotactic(RegexFilter):
+     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
+     Excludes both consecutive nasals and the illegal syllables:
+     - "nm", "nn"
+     - "wu", "wo", "ji", "ti"
+
+     Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
+     "nn" cannot be found."""
+
+     pattern = re.compile(
+         rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
+         # Can't split initial vowel group off like in Syllabics because of
+         # consecutive nasal detection; it is costly to duplicate
+         flags=re.IGNORECASE,
+     )
+
+
+ class Syllabic(RegexFilter):
+     """Determines if a given token is syllabically valid Toki Pona (or `n`).
+     Words must have correctly ordered vowels and consonants, but the phonotactic
+     exceptions are not considered."""
+
+     # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
+     # Alternative I was exploring takes ~15% more steps
+     pattern = re.compile(
+         rf"^(?:^[{VOWELS}]n?)?(?:[{CONSONANTS}][{VOWELS}]n?)*$|^n$",
+         flags=re.IGNORECASE,
+     )
+
+
+ class Alphabetic(Filter):
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, token: str) -> bool:
+         # Faster than regex version
+         return set(token.lower()).issubset(ALPHABET_SET)
+
+
+ class Numerics(Filter):
+     """Determine if a given token is entirely numeric.
+     Covers all numeric symbols in Unicode.
+
+     This will fail to find numeric tokens such as "1.111" or "-42",
+     but if used with the aggressive tokenizer designed for `tok`, these will be
+     split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
+     numeric tokens will be split from their punctuation."""
+
+     @classmethod
+     @override
+     @cache(maxsize=None)
+     def filter(cls, msg: str) -> bool:
+         return msg.isnumeric()
+
+
+ class Punctuations(RegexFilter):
+     pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
+
+
+ __all__ = [
+     "NimiPu",
+     "NimiLinku",
+     "NimiLinkuAle",
+     "Phonotactic",
+     "Syllabic",
+     "Alphabetic",
+     "ProperName",
+     "Punctuations",
+     "Numerics",
+ ]
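For reference, a few hedged examples of the filters above applied to single tokens; the results follow from the word lists and regexes in this file:

from sonatoki.Filters import NimiPu, Syllabic, Phonotactic, ProperName, Numerics

NimiPu.filter("toki")               # True: a word from the original book
Syllabic.filter("kijetesantakalu")  # True: well-ordered consonants and vowels
Phonotactic.filter("wuwojiti")      # False: "wu", "wo", "ji", "ti" are excluded
ProperName.filter("Sonja")          # True: capitalized like a name
Numerics.filter("42")               # True: entirely numeric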
sonatoki/Preprocessors.py ADDED
@@ -0,0 +1,131 @@
+ """
+ "Preprocessors" are classes which strip content from a given string prior to tokenization.
+ There are currently two distinct types of Preprocessor:
+
+ - Remove a token from a string which would be difficult to identify after tokenization.
+     - URLs
+     - DiscordEmotes
+ - Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers".
+     - SingleQuotes
+     - DoubleQuotes
+     - Backticks
+     - Spoilers
+     - ArrowQuote
+
+ Order does not generally matter, but if there were two overlapping containers, such as in the string "|| spoiler ` monospace || `", order would matter.
+ As such, each Preprocessor exposes a .precedence attribute which may be used to order them. Lower precedence means it should be applied first.
+ """
+
+ # STL
+ from abc import ABC, abstractmethod
+
+ # PDM
+ import regex as re
+ from typing_extensions import override
+
+ re.DEFAULT_VERSION = re.VERSION1
+
+
+ class Preprocessor(ABC):
+     precedence: int = 0
+
+     @classmethod  # order matters
+     @abstractmethod
+     def process(cls, msg: str) -> str:
+         raise NotImplementedError
+
+
+ class RegexPreprocessor(Preprocessor):
+     pattern: "re.Pattern[str]"
+     replace: str = " "
+
+     @classmethod
+     @override
+     def process(cls, msg: str) -> str:
+         return re.sub(cls.pattern, cls.replace, msg)
+
+
+ """
+ The following classes are Ignorables.
+
+ Ignorables are tokens which do not count toward the accepted number of tokens
+ or the total number of tokens.
+ This is generally because they are considered external to Toki Pona.
+
+ It is likely that every user will want to use these.
+ Not having them will cause many false negatives, such as when a URL is divided
+ into its parts and checked as a token.
+ """
+
+
+ class URLs(RegexPreprocessor):
+     """Remove http(s) protocol URLs"""
+
+     pattern = re.compile(r"https?:\/\/\S+")
+
+
+ class DiscordEmotes(RegexPreprocessor):
+     """Remove text-formatted Discord emotes `<flags:name:id>`"""
+
+     pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")
+
+
+ class DiscordMentions(RegexPreprocessor):
+     pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")
+
+
+ class DiscordChannels(RegexPreprocessor):
+     pattern = re.compile(r"<#[0-9]{2,}>")
+
+
+ class DiscordSpecial(RegexPreprocessor):
+     pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
+
+
+ """
+ The following classes are Containers.
+
+ Containers are a special case of Ignorables, where an entire segment of an input
+ may be removed and not counted toward the accepted or total number of tokens.
+
+ Some users may prefer to use these so that they may quote third parties who
+ would likely be using a language other than Toki Pona.
+ """
+
+
+ class SingleQuotes(RegexPreprocessor):
+     pattern = re.compile(r"'[^']+'", flags=re.S)  # . matches newline
+
+
+ class DoubleQuotes(RegexPreprocessor):
+     pattern = re.compile(r'"[^"]+"', flags=re.S)
+
+
+ class Backticks(RegexPreprocessor):
+     """Remove paired backticks and their contents `like this`"""
+
+     precedence = -10
+     pattern = re.compile(r"`[^`]+`", flags=re.S)
+
+
+ class Spoilers(RegexPreprocessor):
+     """Remove paired double bars and their contents `||like this||`"""
+
+     pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)
+
+
+ class ArrowQuote(RegexPreprocessor):
+     """Remove lines beginning with `> `"""
+
+     pattern = re.compile(r"^>\ .+$", re.MULTILINE)
+
+
+ __all__ = [
+     "DiscordEmotes",
+     "SingleQuotes",
+     "DoubleQuotes",
+     "ArrowQuote",
+     "Backticks",
+     "Spoilers",
+     "URLs",
+ ]
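As a hedged sketch, chaining a few of the preprocessors above on a Discord-style message; each removed span is replaced by a single space, per RegexPreprocessor.replace, and the message itself is illustrative:

from sonatoki.Preprocessors import URLs, Backticks, Spoilers, DiscordEmotes

msg = "toki! ||ike|| `print('toki')` https://example.com <a:wave:123456789012345678>"
for p in sorted([URLs, Backticks, Spoilers, DiscordEmotes], key=lambda p: p.precedence):
    msg = p.process(msg)
# Backticks runs first (precedence -10); msg is now "toki!" plus leftover spaces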
sonatoki/Scorers.py ADDED
@@ -0,0 +1,123 @@
+ # STL
+ import math
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Type, Union
+
+ # PDM
+ from typing_extensions import override
+
+ # LOCAL
+ from sonatoki.Filters import Filter
+
+ Number = Union[int, float]
+ Weights = Dict[str, Number]
+
+
+ class Scorer(ABC):
+     weights: Weights
+
+     # @classmethod
+     # def __score(cls, token: str, filters: List[Type[Filter]]) -> Tuple[int, Number]:
+     #     for filter in filters:
+     #         if not filter.filter(token):
+     #             continue
+     #         # NOTE: We assume the filters are ordered by their score
+     #         # Thus the first match is also the highest scoring
+     #         return filter.counts, cls.weights[filter.__name__]
+     #     # TODO: override weight if count is 0?
+     #     return 1, 0
+
+     @classmethod
+     @abstractmethod
+     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+         raise NotImplementedError
+
+
+ class PassFail(Scorer):
+     """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
+
+     @classmethod
+     def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
+         for f in filters:
+             if f.filter(token):
+                 return 1
+         return 0
+
+     @classmethod
+     @override
+     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+         if not tokens:
+             return 1
+
+         total_score = 0
+         len_tokens = len(tokens)
+         for token in tokens:
+             total_score += cls.__score(token, filters)
+         return total_score / len_tokens if len_tokens else 0
+
+
+ class Scaling(Scorer):
+     """
+     The sooner a token matches a filter, the higher its score.
+     In other words, filter order matters: earlier listed filters are weighted more heavily than later ones.
+     This is desirable to penalize messages which would only match weaker filters, as these are less likely to be Toki Pona.
+     """
+
+     @classmethod
+     def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
+         for i, f in enumerate(filters):
+             if f.filter(token):
+                 return scale - i
+         return 0
+
+     @classmethod
+     @override
+     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+         if not tokens:
+             return 1
+
+         total_score = 0
+         len_filters = len(filters)
+         max_score = len(tokens) * len_filters
+         for token in tokens:
+             total_score += cls.score_token(token, filters, len_filters)
+         return total_score / max_score if max_score else 0
+
+
+ class SoftScaling(Scaling):
+     """Shorter messages are subject to less harsh scoring
+     by mapping the token count to [0.5, 1.0] via the sigmoid function,
+     then raising the score to the resultant power.
+     For example, a single token scoring 0.64 will instead score 0.8.
+     """
+
+     @staticmethod
+     def sigmoid(n: int) -> Number:
+         return 1 / (1 + math.exp(-(0.30 * (n - 1))))
+         # n-1 makes sigmoid(1) == 0.5
+         # 0.30 softens scaling against input
+         # return n / (1+abs(n))  # too weak in 0.7+
+
+     @classmethod
+     @override
+     def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
+         if not tokens:
+             return 1
+
+         total_score = 0
+         len_filters = len(filters)
+         len_tokens = len(tokens)
+
+         max_score = len_tokens * len_filters
+         for token in tokens:
+             total_score += cls.score_token(token, filters, len_filters)
+
+         percentage = total_score / max_score if max_score else 0
+         percentage **= cls.sigmoid(len_tokens)
+         return percentage
+
+
+ class Logarithmic(Scorer): ...
+
+
+ __all__ = ["PassFail", "Scaling", "SoftScaling"]
sonatoki/Tokenizers.py ADDED
@@ -0,0 +1,64 @@
+ # STL
+ from typing import List, Callable
+
+ # PDM
+ import regex as re
+
+ try:
+     # PDM
+     import nltk
+     from nltk.tokenize import sent_tokenize as __sent_tokenize_nltk
+     from nltk.tokenize import word_tokenize as __word_tokenize_nltk
+ except ImportError as e:
+     nltk = e
+
+
+ LANGUAGE = "english"  # for NLTK
+
+ SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
+ SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)
+
+ SENT_DELIMS_TOK = r"""(.*?[.?!;:-])|(.+?$)"""
+ SENT_DELIMS_TOK = re.compile(SENT_DELIMS_TOK)
+
+
+ WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
+ WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
+
+ WORD_DELIMS_TOK = r"([\p{Punctuation}\p{posix_punct}]+|\s+)"
+ WORD_DELIMS_TOK = re.compile(WORD_DELIMS_TOK)
+
+ Tokenizer = Callable[[str], List[str]]
+
+
+ if not isinstance(nltk, ImportError):
+
+     def sent_tokenize_nltk(s: str) -> List[str]:
+         return __sent_tokenize_nltk(text=s, language=LANGUAGE)
+
+     def word_tokenize_nltk(s: str) -> List[str]:
+         return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+
+ def sent_tokenize_re(s: str) -> List[str]:
+     return [
+         clean
+         for sent in re.findall(SENT_DELIMS_RE, s)
+         if (clean := sent[0].strip() or sent[1].strip())
+     ]
+
+
+ def word_tokenize_re(s: str) -> List[str]:
+     return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]
+
+
+ def sent_tokenize_tok(s: str) -> List[str]:
+     return [
+         clean
+         for sent in re.findall(SENT_DELIMS_TOK, s)
+         if (clean := sent[0].strip() or sent[1].strip())
+     ]
+
+
+ def word_tokenize_tok(s: str) -> List[str]:
+     return [clean for word in re.split(WORD_DELIMS_TOK, s) if (clean := word.strip())]
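A short sketch of the regex-based ("tok") tokenizers above; the expected outputs follow from the delimiter patterns and are illustrative rather than taken from the package's tests:

from sonatoki.Tokenizers import sent_tokenize_tok, word_tokenize_tok

sent_tokenize_tok("toki! sina pona")
# -> ["toki!", "sina pona"]                (splits after .?!;:- and keeps the trailing remainder)
word_tokenize_tok("toki! mi jan Sonja.")
# -> ["toki", "!", "mi", "jan", "Sonja", "."]   (punctuation becomes its own tokens)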
sonatoki/__init__.py ADDED
File without changes
sonatoki/__main__.py ADDED
@@ -0,0 +1,9 @@
+ #!/bin/env python3
+
+
+ def open():
+     pass
+
+
+ if __name__ == "__main__":
+     open()
sonatoki/constants.py ADDED
@@ -0,0 +1,57 @@
+ # STL
+ import json
+ from typing import Dict, List
+ from pathlib import Path
+
+ LINKU = Path(__file__).resolve().parent / Path("linku.json")
+
+ VOWELS = "aeiou"
+ CONSONANTS = "jklmnpstw"
+ ALPHABET = VOWELS + CONSONANTS
+ ALPHABET_SET = set(ALPHABET)
+
+ """Commonly occurring strings which are some kind of valid Toki Pona or external token"""
+ ALLOWABLES = {
+     "cw",  # Content Warning
+     "x",  # ala
+     "y",  # anu
+     "kxk",  # ken ala ken
+     "wxw",  # wile ala wile
+ }
+
+
+ with open(LINKU) as f:
+     r: Dict[str, Dict[str, str]] = json.loads(f.read())
+     NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
+     NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
+     NIMI_LINKU: List[str] = [
+         d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
+     ]
+     NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]
+
+ NIMI_PU_SET = set(NIMI_PU)
+ NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
+ NIMI_LINKU_SET = set(NIMI_LINKU)
+ NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+ ALLOWABLES_SET = set(ALLOWABLES)
+
+ __all__ = [
+     "VOWELS",
+     #
+     "CONSONANTS",
+     #
+     "ALPHABET",
+     "ALPHABET_SET",
+     #
+     "NIMI_PU",
+     "NIMI_PU_SET",
+     #
+     "NIMI_PU_ALE",
+     "NIMI_PU_ALE_SET",
+     #
+     "NIMI_LINKU",
+     "NIMI_LINKU_SET",
+     #
+     "NIMI_LINKU_ALE",
+     "NIMI_LINKU_ALE_SET",
+ ]
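A hedged illustration of what the derived constants contain; membership follows from how the sets are built from the bundled linku.json:

from sonatoki.constants import ALPHABET_SET, NIMI_PU_SET, NIMI_LINKU_ALE_SET

"toki" in NIMI_PU_SET                  # True: listed under the book "pu"
"kxk" in NIMI_PU_SET                   # False: abbreviations live in ALLOWABLES instead
set("sitelen").issubset(ALPHABET_SET)  # True: only the fourteen Toki Pona letters
NIMI_PU_SET <= NIMI_LINKU_ALE_SET      # True: every pu word is also in the full Linku list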
sonatoki/ilo.py ADDED
@@ -0,0 +1,101 @@
+ # STL
+ from typing import List, Type
+
+ # LOCAL
+ from sonatoki.Filters import Filter
+ from sonatoki.Scorers import Number, Scorer
+ from sonatoki.Cleaners import Cleaner
+ from sonatoki.Tokenizers import Tokenizer
+ from sonatoki.Preprocessors import Preprocessor
+
+
+ class Ilo:
+     __preprocessors: List[Type[Preprocessor]]
+     __cleaners: List[Type[Cleaner]]
+     __ignoring_filters: List[Type[Filter]]
+     __scoring_filters: List[Type[Filter]]
+     __scorer: Type[Scorer]
+     __tokenize: Tokenizer
+     __passing_score: Number
+     debug: bool = False
+
+     def __init__(
+         self,
+         preprocessors: List[Type[Preprocessor]],
+         cleaners: List[Type[Cleaner]],
+         ignoring_filters: List[Type[Filter]],
+         scoring_filters: List[Type[Filter]],
+         scorer: Type[Scorer],
+         tokenizer: Tokenizer,  # NOTE: no wrapper needed?
+         passing_score: Number,
+     ):
+         super().__init__()
+         # avoid keeping a ref to user's list just in case
+         self.__preprocessors = [*preprocessors]
+         self.__cleaners = [*cleaners]
+         self.__ignoring_filters = [*ignoring_filters]
+         self.__scoring_filters = [*scoring_filters]
+         self.__scorer = scorer
+         self.__tokenize = tokenizer
+         self.__passing_score = passing_score
+
+     def __preprocess(self, msg: str) -> str:
+         for p in self.__preprocessors:
+             msg = p.process(msg)
+         return msg
+
+     def __clean_token(self, token: str) -> str:
+         for c in self.__cleaners:
+             token = c.clean(token)
+         return token
+
+     def __clean_tokens(self, tokens: List[str]) -> List[str]:
+         # NOTE: tested, making a new list with a for loop *is* faster than
+         # - list comps
+         # - generator comps
+         # - in-place replacement/removal
+         # - in place replacement with result of generator comp
+         cleaned_tokens: List[str] = list()
+         for token in tokens:
+             cleaned_token = self.__clean_token(token)
+             if not cleaned_token:
+                 # TODO: warn user?
+                 continue
+             cleaned_tokens.append(cleaned_token)
+         return cleaned_tokens
+
+     def __filter_token(self, token: str) -> bool:
+         for f in self.__ignoring_filters:
+             if f.filter(token):
+                 return True
+         return False
+
+     def __filter_tokens(self, tokens: List[str]) -> List[str]:
+         filtered_tokens: List[str] = []
+         for token in tokens:
+             if self.__filter_token(token):
+                 continue
+             # an ignoring filter returns True when the token matches;
+             # the user wants to ignore these, so keep non-matching tokens
+             filtered_tokens.append(token)
+         return filtered_tokens
+
+     def __score_tokens(self, tokens: List[str]) -> float:
+         return self.__scorer.score(tokens, self.__scoring_filters)
+
+     def is_toki_pona(self, message: str) -> bool:
+         preprocessed = self.__preprocess(message)
+         tokenized = self.__tokenize(preprocessed)
+         filtered = self.__filter_tokens(tokenized)
+         cleaned = self.__clean_tokens(filtered)
+         score = self.__score_tokens(cleaned)
+
+         if self.debug:
+             print("msg: %.2f %s" % (score, repr(message)))
+             print("Preproc: %s" % repr(preprocessed))
+             print("Tokenized: %s" % tokenized)
+             print("Filtered: %s" % filtered)
+             print("Cleaned: %s" % cleaned)
+             print()
+
+         return score >= self.__passing_score
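Tying the modules together, a minimal sketch of constructing an Ilo from the pieces defined in this package; the particular component mix and passing_score are illustrative choices, not prescribed defaults:

from sonatoki.ilo import Ilo
from sonatoki.Preprocessors import URLs, Backticks
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Filters import Numerics, Punctuations, NimiLinku, Syllabic, ProperName
from sonatoki.Scorers import SoftScaling
from sonatoki.Tokenizers import word_tokenize_tok

ilo = Ilo(
    preprocessors=[URLs, Backticks],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[Numerics, Punctuations],
    scoring_filters=[NimiLinku, Syllabic, ProperName],
    scorer=SoftScaling,
    tokenizer=word_tokenize_tok,
    passing_score=0.8,
)
ilo.is_toki_pona("mi olin e sina!")  # True with this configuration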