sonatoki 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Cleaners.py +42 -0
- sonatoki/Filters.py +159 -0
- sonatoki/Preprocessors.py +131 -0
- sonatoki/Scorers.py +123 -0
- sonatoki/Tokenizers.py +64 -0
- sonatoki/__init__.py +0 -0
- sonatoki/__main__.py +9 -0
- sonatoki/constants.py +57 -0
- sonatoki/ilo.py +101 -0
- sonatoki/linku.json +1 -0
- sonatoki-0.1.0.dist-info/METADATA +84 -0
- sonatoki-0.1.0.dist-info/RECORD +14 -0
- sonatoki-0.1.0.dist-info/WHEEL +4 -0
- sonatoki-0.1.0.dist-info/licenses/LICENSE +661 -0
sonatoki/Cleaners.py
ADDED
@@ -0,0 +1,42 @@
# STL
import re
from abc import ABC, abstractmethod

# PDM
from typing_extensions import override


class Cleaner(ABC):
    @classmethod
    @abstractmethod
    def clean(cls, token: str) -> str:
        raise NotImplementedError


class RegexCleaner(Cleaner):
    pattern: "re.Pattern[str]"
    replace: str

    @classmethod
    @override
    def clean(cls, token: str) -> str:
        return re.sub(cls.pattern, cls.replace, token)


class ConsecutiveDuplicates(RegexCleaner):
    """Remove consecutive duplicates from an input string, ignoring case.

    The first match of any 2+ will become `\\1`, preserving initial case.
    For example, `FfFoo` will reduce to `Foo`, and `bBAR` will reduce to `bAR`.

    This is desirable for Toki Pona written with the Latin alphabet because strings
    may be altered for emphasis or effect, such as in "sonaaaa" or "AAAAAA".

    This may be undesirable for moraic scripts like Hiragana, where `わわ` would be
    incorrectly reduced to `わ`. This does preserve phonotactic validity, though."""

    pattern = re.compile(r"(.)\1+", flags=re.IGNORECASE)
    replace = r"\1"


__all__ = ["ConsecutiveDuplicates"]
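
For illustration, a minimal usage sketch of the cleaner above (an example, not one of the wheel's files); the expected results follow directly from the docstring:

from sonatoki.Cleaners import ConsecutiveDuplicates

ConsecutiveDuplicates.clean("sonaaaa")  # -> "sona"
ConsecutiveDuplicates.clean("AAAAAA")   # -> "A"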
sonatoki/Filters.py
ADDED
@@ -0,0 +1,159 @@
# STL
from abc import ABC, abstractmethod
from typing import Set
from functools import lru_cache as cache  # cache comes in 3.9

# PDM
import regex as re
from typing_extensions import override

# LOCAL
from sonatoki.constants import (
    VOWELS,
    CONSONANTS,
    NIMI_PU_SET,
    ALPHABET_SET,
    ALLOWABLES_SET,
    NIMI_LINKU_SET,
    NIMI_PU_ALE_SET,
    NIMI_LINKU_ALE_SET,
)

re.DEFAULT_VERSION = re.VERSION1


class Filter(ABC):
    @classmethod
    @abstractmethod
    @cache(maxsize=None)
    def filter(cls, token: str) -> bool:
        raise NotImplementedError


class RegexFilter(Filter):
    pattern: "re.Pattern[str]"

    @classmethod
    @override
    @cache(maxsize=None)
    def filter(cls, token: str) -> bool:
        return not not re.fullmatch(cls.pattern, token)


class SetFilter(Filter):
    tokens: Set[str]

    @classmethod
    @override
    @cache(maxsize=None)
    def filter(cls, token: str) -> bool:
        return token.lower() in cls.tokens


class Miscellaneous(SetFilter):
    tokens = ALLOWABLES_SET


class ProperName(Filter):
    """Determines if a given token is a valid name (also called a loan word).
    When Toki Pona is written with the Latin alphabet, names are generally
    capitalized at their start. This filter identifies those tokens.

    Note that this alone cannot determine if a token is a valid name, because
    a standalone name is considered invalid in Toki Pona; names generally have head nouns.
    This tool only examines one token at a time, so cannot detect names any better than identifying their capital letter.
    """

    @classmethod
    @override
    @cache(maxsize=None)
    def filter(cls, token: str) -> bool:
        return token == token.capitalize()


class NimiPu(SetFilter):
    tokens = NIMI_PU_SET


class NimiPuAle(SetFilter):
    tokens = NIMI_PU_ALE_SET


class NimiLinku(SetFilter):
    tokens = NIMI_LINKU_SET


class NimiLinkuAle(SetFilter):
    tokens = NIMI_LINKU_ALE_SET


class Phonotactic(RegexFilter):
    """Determines if a given token is phonotactically valid Toki Pona (or `n`).
    Excludes both consecutive nasals and the illegal syllables:
    - "nm", "nn"
    - "wu", "wo", "ji", "ti"

    Note that if this validator is used after `Cleaners.ConsecutiveDuplicates`,
    "nn" cannot be found."""

    pattern = re.compile(
        rf"^((^[{VOWELS}]|[klmnps][{VOWELS}]|[jt][aeou]|[w][aei])(n(?![mn]))?)+$|^n$",
        # Can't split initial vowel group off like in Syllabics because of
        # consecutive nasal detection; it is costly to duplicate
        flags=re.IGNORECASE,
    )


class Syllabic(RegexFilter):
    """Determines if a given token is syllabically valid Toki Pona (or `n`).
    Words must have correctly ordered vowels and consonants, but the phonotactic
    exceptions are not considered."""

    # rf"^((^[{VOWELS}]|[{CONSONANTS}][{VOWELS}])n?)+$|^n$"
    # Alternative I was exploring takes ~15% more steps
    pattern = re.compile(
        rf"^(?:^[{VOWELS}]n?)?(?:[{CONSONANTS}][{VOWELS}]n?)*$|^n$",
        flags=re.IGNORECASE,
    )


class Alphabetic(Filter):
    @classmethod
    @override
    @cache(maxsize=None)
    def filter(cls, token: str) -> bool:
        # Faster than regex version
        return set(token.lower()).issubset(ALPHABET_SET)


class Numerics(Filter):
    """Determine if a given token is entirely numeric.
    Covers all numeric symbols in Unicode.

    This will fail to find numeric tokens such as "1.111" or "-42",
    but if used with the aggressive tokenizer designed for `tok`, these will be
    split into `["1", ".", "111"]` and `["-", "42"]` respectively. As such, the
    numeric tokens will be split from their punctuation."""

    @classmethod
    @override
    @cache(maxsize=None)
    def filter(cls, msg: str) -> bool:
        return msg.isnumeric()


class Punctuations(RegexFilter):
    pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")


__all__ = [
    "NimiPu",
    "NimiLinku",
    "NimiLinkuAle",
    "Phonotactic",
    "Syllabic",
    "Alphabetic",
    "ProperName",
    "Punctuations",
    "Numerics",
]
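
For illustration, a brief sketch of how the filters above judge individual tokens (an example, not one of the wheel's files):

from sonatoki.Filters import NimiLinku, Syllabic, ProperName, Numerics

NimiLinku.filter("toki")             # True: in the Linku "core"/"common" word set
Syllabic.filter("kijetesantakalu")   # True: well-ordered consonant-vowel syllables
ProperName.filter("Sonja")           # True: token is capitalized
Numerics.filter("42")                # True: entirely numeric
Syllabic.filter("hello")             # False: "h" and "ll" do not fit the syllable pattern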
sonatoki/Preprocessors.py
ADDED
@@ -0,0 +1,131 @@
"""
"Preprocessors" are classes which strip content from a given string prior to tokenization.
There are currently two distinct types of Preprocessor:

- Remove a token from a string which would be difficult to identify after tokenization.
    - URLs
    - DiscordEmotes
- Remove a section of a string which is contained in or marked by certain character(s). Also called "Containers"
    - SingleQuotes
    - DoubleQuotes
    - Backticks
    - Spoilers
    - ArrowQuote

Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
As such, each Preprocessor exposes a .precedence attribute which is optionally usable for ordering them. Lower precedence means it should be applied first.
"""

# STL
from abc import ABC, abstractmethod

# PDM
import regex as re
from typing_extensions import override

re.DEFAULT_VERSION = re.VERSION1


class Preprocessor(ABC):
    precedence: int = 0

    @classmethod  # order matters
    @abstractmethod
    def process(cls, msg: str) -> str:
        raise NotImplementedError


class RegexPreprocessor(Preprocessor):
    pattern: "re.Pattern[str]"
    replace: str = " "

    @classmethod
    @override
    def process(cls, msg: str) -> str:
        return re.sub(cls.pattern, cls.replace, msg)


"""
The following classes are Ignorables.

Ignorables are tokens which do not count toward the accepted number of tokens
or the total number of tokens.
This is generally because they are considered external to Toki Pona.

It is likely that every user will want to use these.
Not having them will cause many false negatives, such as when a URL is divided
into its parts and checked as a token.
"""


class URLs(RegexPreprocessor):
    """Remove http(s) protocol URLs"""

    pattern = re.compile(r"https?:\/\/\S+")


class DiscordEmotes(RegexPreprocessor):
    """Remove text-formatted Discord emotes `<flags:name:id>`"""

    pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")


class DiscordMentions(RegexPreprocessor):
    pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")


class DiscordChannels(RegexPreprocessor):
    pattern = re.compile(r"<#[0-9]{2,}>")


class DiscordSpecial(RegexPreprocessor):
    pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")


"""
The following classes are Containers.

Containers are a special case of Ignorables, where an entire segment of an input
may be removed and not counted toward the accepted or total number of tokens.

Some users may prefer to use these so that they may quote third parties who
would likely be using a language other than Toki Pona.
"""


class SingleQuotes(RegexPreprocessor):
    pattern = re.compile(r"'[^']+'", flags=re.S)  # . matches newline


class DoubleQuotes(RegexPreprocessor):
    pattern = re.compile(r'"[^"]+"', flags=re.S)


class Backticks(RegexPreprocessor):
    """Remove paired backticks and their contents `like this`"""

    precedence = -10
    pattern = re.compile(r"`[^`]+`", flags=re.S)


class Spoilers(RegexPreprocessor):
    """Remove paired double bars and their contents `||like this||`"""

    pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.S)


class ArrowQuote(RegexPreprocessor):
    """Remove lines beginning with `> `"""

    pattern = re.compile(r"^>\ .+$", re.MULTILINE)


__all__ = [
    "DiscordEmotes",
    "SingleQuotes",
    "DoubleQuotes",
    "ArrowQuote",
    "Backticks",
    "Spoilers",
    "URLs",
]
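
As a sketch of the intended use of the preprocessors above (an example, not one of the wheel's files), containers and Ignorables are stripped before tokenization, applied in precedence order:

from sonatoki.Preprocessors import URLs, Backticks

msg = "ni li pona `not toki pona` lukin e https://example.com"
for p in sorted([URLs, Backticks], key=lambda cls: cls.precedence):
    msg = p.process(msg)
# the backticked span and the URL are each replaced with a space,
# leaving roughly "ni li pona   lukin e  " for the tokenizer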
sonatoki/Scorers.py
ADDED
@@ -0,0 +1,123 @@
# STL
import math
from abc import ABC, abstractmethod
from typing import Dict, List, Type, Union

# PDM
from typing_extensions import override

# LOCAL
from sonatoki.Filters import Filter

Number = Union[int, float]
Weights = Dict[str, Number]


class Scorer(ABC):
    weights: Weights

    # @classmethod
    # def __score(cls, token: str, filters: List[Type[Filter]]) -> Tuple[int, Number]:
    #     for filter in filters:
    #         if not filter.filter(token):
    #             continue
    #         # NOTE: We assume the filters are ordered by their score
    #         # Thus the first match is also the highest scoring
    #         return filter.counts, cls.weights[filter.__name__]
    #     # TODO: override weight if count is 0?
    #     return 1, 0

    @classmethod
    @abstractmethod
    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
        raise NotImplementedError


class PassFail(Scorer):
    """The token passes any filter or fails all of them, scoring 1 or 0 respectively."""

    @classmethod
    def __score(cls, token: str, filters: List[Type[Filter]]) -> Number:
        for f in filters:
            if f.filter(token):
                return 1
        return 0

    @classmethod
    @override
    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
        if not tokens:
            return 1

        total_score = 0
        len_tokens = len(tokens)
        for token in tokens:
            total_score += cls.__score(token, filters)
        return total_score / len_tokens if len_tokens else 0


class Scaling(Scorer):
    """
    The sooner a token matches a filter, the higher its score.
    In other words, filter order matters, weighing earlier listed filters higher than later ones.
    This is desirable to avoid messages which would only match weaker filters, as these are less likely to be Toki Pona.
    """

    @classmethod
    def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
        for i, f in enumerate(filters):
            if f.filter(token):
                return scale - i
        return 0

    @classmethod
    @override
    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
        if not tokens:
            return 1

        total_score = 0
        len_filters = len(filters)
        max_score = len(tokens) * len_filters
        for token in tokens:
            total_score += cls.score_token(token, filters, len_filters)
        return total_score / max_score if max_score else 0


class SoftScaling(Scaling):
    """Shorter messages are subject to less harsh scoring
    by mapping the token count to [0.5, 1.0] via the sigmoid function,
    then raising the score to the resultant power.
    For example, a single token scoring 0.64 will now score 0.8.
    """

    @staticmethod
    def sigmoid(n: int) -> Number:
        return 1 / (1 + math.exp(-(0.30 * (n - 1))))
        # n-1 makes sigmoid(1) == 0.5
        # 0.30 softens scaling against input
        # return n / (1+abs(n))  # too weak in 0.7+

    @classmethod
    @override
    def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
        if not tokens:
            return 1

        total_score = 0
        len_filters = len(filters)
        len_tokens = len(tokens)

        max_score = len_tokens * len_filters
        for token in tokens:
            total_score += cls.score_token(token, filters, len_filters)

        percentage = total_score / max_score if max_score else 0
        percentage **= cls.sigmoid(len_tokens)
        return percentage


class Logarithmic(Scorer): ...


__all__ = ["PassFail", "Scaling", "SoftScaling"]
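
A quick worked example of the SoftScaling math described above (illustration only): sigmoid(1) = 1 / (1 + exp(-0.30 * 0)) = 0.5, so a one-token message with a raw score of 0.64 is raised to the 0.5 power and becomes 0.8, while long messages get exponents near 1.0 and are left almost unchanged.

from sonatoki.Filters import NimiLinku, Syllabic
from sonatoki.Scorers import SoftScaling

SoftScaling.sigmoid(1)    # 0.5
SoftScaling.sigmoid(20)   # ~0.997
SoftScaling.score(["toki", "pona"], [NimiLinku, Syllabic])  # 1.0: both tokens match the first filter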
sonatoki/Tokenizers.py
ADDED
@@ -0,0 +1,64 @@
# STL
from typing import List, Callable

# PDM
import regex as re

try:
    # PDM
    import nltk
    from nltk.tokenize import sent_tokenize as __sent_tokenize_nltk
    from nltk.tokenize import word_tokenize as __word_tokenize_nltk
except ImportError as e:
    nltk = e


LANGUAGE = "english"  # for NLTK

SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
SENT_DELIMS_RE = re.compile(SENT_DELIMS_RE)

SENT_DELIMS_TOK = r"""(.*?[.?!;:-])|(.+?$)"""
SENT_DELIMS_TOK = re.compile(SENT_DELIMS_TOK)

WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)

WORD_DELIMS_TOK = r"([\p{Punctuation}\p{posix_punct}]+|\s+)"
WORD_DELIMS_TOK = re.compile(WORD_DELIMS_TOK)

Tokenizer = Callable[[str], List[str]]


if not isinstance(nltk, ImportError):

    def sent_tokenize_nltk(s: str) -> List[str]:
        return __sent_tokenize_nltk(text=s, language=LANGUAGE)

    def word_tokenize_nltk(s: str) -> List[str]:
        return __word_tokenize_nltk(text=s, language=LANGUAGE)


def sent_tokenize_re(s: str) -> List[str]:
    return [
        clean
        for sent in re.findall(SENT_DELIMS_RE, s)
        if (clean := sent[0].strip() or sent[1].strip())
    ]


def word_tokenize_re(s: str) -> List[str]:
    return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]


def sent_tokenize_tok(s: str) -> List[str]:
    return [
        clean
        for sent in re.findall(SENT_DELIMS_TOK, s)
        if (clean := sent[0].strip() or sent[1].strip())
    ]


def word_tokenize_tok(s: str) -> List[str]:
    return [clean for word in re.split(WORD_DELIMS_TOK, s) if (clean := word.strip())]
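
For illustration, the aggressive `tok` word tokenizer above splits punctuation away from words (an example, not one of the wheel's files):

from sonatoki.Tokenizers import word_tokenize_tok

word_tokenize_tok("toki! sina pilin seme?")
# -> ["toki", "!", "sina", "pilin", "seme", "?"]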
sonatoki/__init__.py
ADDED
File without changes
sonatoki/__main__.py
ADDED
sonatoki/constants.py
ADDED
@@ -0,0 +1,57 @@
# STL
import json
from typing import Dict, List
from pathlib import Path

LINKU = Path(__file__).resolve().parent / Path("linku.json")

VOWELS = "aeiou"
CONSONANTS = "jklmnpstw"
ALPHABET = VOWELS + CONSONANTS
ALPHABET_SET = set(ALPHABET)

"""Commonly occurring strings which are some kind of valid Toki Pona or external token"""
ALLOWABLES = {
    "cw",  # Content Warning
    "x",  # ala
    "y",  # anu
    "kxk",  # ken ala ken
    "wxw",  # wile ala wile
}


with open(LINKU) as f:
    r: Dict[str, Dict[str, str]] = json.loads(f.read())
    NIMI_PU: List[str] = [d["word"] for d in r.values() if d["book"] == "pu"]
    NIMI_PU_ALE: List[str] = NIMI_PU + ["namako", "kin", "oko"]
    NIMI_LINKU: List[str] = [
        d["word"] for d in r.values() if d["usage_category"] in ["core", "common"]
    ]
    NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

NIMI_PU_SET = set(NIMI_PU)
NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
NIMI_LINKU_SET = set(NIMI_LINKU)
NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
ALLOWABLES_SET = set(ALLOWABLES)

__all__ = [
    "VOWELS",
    #
    "CONSONANTS",
    #
    "ALPHABET",
    "ALPHABET_SET",
    #
    "NIMI_PU",
    "NIMI_PU_SET",
    #
    "NIMI_PU_ALE",
    "NIMI_PU_ALE_SET",
    #
    "NIMI_LINKU",
    "NIMI_LINKU_SET",
    #
    "NIMI_LINKU_ALE",
    "NIMI_LINKU_ALE_SET",
]
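
For illustration, a membership check against the word sets built above (an example; whether a given word lands in NIMI_LINKU_SET depends on its usage_category in the bundled linku.json):

from sonatoki.constants import NIMI_PU_SET, NIMI_LINKU_SET

"toki" in NIMI_PU_SET                # True: listed in pu
"kijetesantakalu" in NIMI_PU_SET     # False: not a pu word
"kijetesantakalu" in NIMI_LINKU_SET  # True only if its usage_category is "core" or "common"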
sonatoki/ilo.py
ADDED
@@ -0,0 +1,101 @@
# STL
from typing import List, Type

# LOCAL
from sonatoki.Filters import Filter
from sonatoki.Scorers import Number, Scorer
from sonatoki.Cleaners import Cleaner
from sonatoki.Tokenizers import Tokenizer
from sonatoki.Preprocessors import Preprocessor


class Ilo:
    __preprocessors: List[Type[Preprocessor]]
    __cleaners: List[Type[Cleaner]]
    __ignoring_filters: List[Type[Filter]]
    __scoring_filters: List[Type[Filter]]
    __scorer: Type[Scorer]
    __tokenize: Tokenizer
    __passing_score: Number
    debug: bool = False

    def __init__(
        self,
        preprocessors: List[Type[Preprocessor]],
        cleaners: List[Type[Cleaner]],
        ignoring_filters: List[Type[Filter]],
        scoring_filters: List[Type[Filter]],
        scorer: Type[Scorer],
        tokenizer: Tokenizer,  # NOTE: no wrapper needed?
        passing_score: Number,
    ):
        super().__init__()
        # avoid keeping a ref to user's list just in case
        self.__preprocessors = [*preprocessors]
        self.__cleaners = [*cleaners]
        self.__ignoring_filters = [*ignoring_filters]
        self.__scoring_filters = [*scoring_filters]
        self.__scorer = scorer
        self.__tokenize = tokenizer
        self.__passing_score = passing_score

    def __preprocess(self, msg: str) -> str:
        for p in self.__preprocessors:
            msg = p.process(msg)
        return msg

    def __clean_token(self, token: str) -> str:
        for c in self.__cleaners:
            token = c.clean(token)
        return token

    def __clean_tokens(self, tokens: List[str]) -> List[str]:
        # NOTE: tested, making a new list with a for loop *is* faster than
        # - list comps
        # - generator comps
        # - in-place replacement/removal
        # - in place replacement with result of generator comp
        cleaned_tokens: List[str] = list()
        for token in tokens:
            cleaned_token = self.__clean_token(token)
            if not cleaned_token:
                # TODO: warn user?
                continue
            cleaned_tokens.append(cleaned_token)
        return cleaned_tokens

    def __filter_token(self, token: str) -> bool:
        for f in self.__ignoring_filters:
            if f.filter(token):
                return True
        return False

    def __filter_tokens(self, tokens: List[str]) -> List[str]:
        filtered_tokens: List[str] = []
        for token in tokens:
            if self.__filter_token(token):
                continue
            # the ignoring filter is true if the token matches
            # the user wants to ignore these so keep non-matching tokens
            filtered_tokens.append(token)
        return filtered_tokens

    def __score_tokens(self, tokens: List[str]) -> float:
        return self.__scorer.score(tokens, self.__scoring_filters)

    def is_toki_pona(self, message: str) -> bool:
        preprocessed = self.__preprocess(message)
        tokenized = self.__tokenize(preprocessed)
        filtered = self.__filter_tokens(tokenized)
        cleaned = self.__clean_tokens(filtered)
        score = self.__score_tokens(cleaned)

        if self.debug:
            print("msg: %.2f %s" % (score, repr(message)))
            print("Preproc: %s" % repr(preprocessed))
            print("Tokenized: %s" % tokenized)
            print("Filtered: %s" % filtered)
            print("Cleaned: %s" % cleaned)
            print()

        return score >= self.__passing_score
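
To close, a hedged sketch of wiring the pieces above into an Ilo (an example configuration, not one shipped by the package; the filter order and passing score here are arbitrary choices):

from sonatoki.ilo import Ilo
from sonatoki.Cleaners import ConsecutiveDuplicates
from sonatoki.Filters import Numerics, Punctuations, NimiLinku, Syllabic, ProperName
from sonatoki.Scorers import SoftScaling
from sonatoki.Tokenizers import word_tokenize_tok
from sonatoki.Preprocessors import URLs, DiscordEmotes

ilo = Ilo(
    preprocessors=[URLs, DiscordEmotes],
    cleaners=[ConsecutiveDuplicates],
    ignoring_filters=[Numerics, Punctuations],
    scoring_filters=[NimiLinku, Syllabic, ProperName],
    scorer=SoftScaling,
    tokenizer=word_tokenize_tok,
    passing_score=0.8,
)
ilo.is_toki_pona("mi olin e sina!")  # True: every scored token matches NimiLinku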