sonatoki 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +80 -0
- sonatoki/Filters.py +5 -0
- sonatoki/Preprocessors.py +1 -4
- sonatoki/Tokenizers.py +46 -28
- sonatoki/constants.py +10 -0
- sonatoki/ilo.py +30 -30
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -0
- {sonatoki-0.1.2.dist-info → sonatoki-0.1.3.dist-info}/METADATA +30 -24
- sonatoki-0.1.3.dist-info/RECORD +16 -0
- sonatoki-0.1.2.dist-info/RECORD +0 -14
- {sonatoki-0.1.2.dist-info → sonatoki-0.1.3.dist-info}/WHEEL +0 -0
- {sonatoki-0.1.2.dist-info → sonatoki-0.1.3.dist-info}/licenses/LICENSE +0 -0
sonatoki/Configs.py
ADDED
@@ -0,0 +1,80 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, TypedDict
+
+# PDM
+from typing_extensions import NotRequired
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numerics,
+    Syllabic,
+    NimiLinku,
+    NimiPuAle,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    NimiLinkuAle,
+    Punctuations,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+from sonatoki.Preprocessors import (
+    URLs,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numerics, Punctuations],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerTok,
+}
+
+
+PrefConfig: IloConfig = deepcopy(BaseConfig)
+PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+PrefConfig["scorer"] = SoftScaling
+
+
+LazyConfig: IloConfig = deepcopy(BaseConfig)
+LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+LazyConfig["scorer"] = SoftPassFail
+
+DiscordConfig: IloConfig = deepcopy(PrefConfig)
+DiscordConfig["preprocessors"].extend(
+    [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+)
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "IloConfig",
+    "BaseConfig",
+    "PrefConfig",
+    "LazyConfig",
+    "DiscordConfig",
+    "TelegramConfig",
+    "ForumConfig",
+]
sonatoki/Filters.py
CHANGED
@@ -17,6 +17,7 @@ from sonatoki.constants import (
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    NIMI_LINKU_SANDBOX_SET,
 )

 re.DEFAULT_VERSION = re.VERSION1
@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
     tokens = NIMI_LINKU_ALE_SET


+class NimiLinkuSandbox(SetFilter):
+    tokens = NIMI_LINKU_SANDBOX_SET
+
+
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
     Excludes both consecutive nasals and the illegal syllables:
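Elsewhere in the package (ilo.py below) filters are invoked as classmethods via f.filter(token), so the new sandbox filter is presumably queried the same way. A small sketch under that assumption; whether a given word such as "kiki" is actually present depends on the bundled sandbox.json:

from sonatoki.Filters import NimiLinku, NimiLinkuSandbox

# SetFilter subclasses are used directly on the class, no instantiation needed.
print(NimiLinku.filter("toki"))          # True: "toki" is in the main Linku word list
print(NimiLinkuSandbox.filter("kiki"))   # True only if "kiki" appears in sandbox.json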
sonatoki/Preprocessors.py
CHANGED
@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
 - ArrowQuote

 Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
-
+It is up to the user to order them appropriately.
 """

 # STL
@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


 class Preprocessor(ABC):
-    precedence: int = 0
-
     @classmethod  # order matters
     @abstractmethod
     def process(cls, msg: str) -> str:
@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""

-    precedence = -10
     pattern = re.compile(r"`[^`]+`", flags=re.S)


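With precedence gone, ordering is left entirely to the caller, which is what the added docstring line above says. A short sketch of calling preprocessors in a caller-chosen order; the exact output depends on the URLs preprocessor's pattern, which is not shown in this diff:

from sonatoki.Preprocessors import URLs, Backticks

msg = "toki `ignore this` pona https://example.com"
for p in [Backticks, URLs]:  # order chosen by the caller, no precedence attribute anymore
    msg = p.process(msg)
print(msg)  # the backtick span is removed; the URL is presumably removed as well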
sonatoki/Tokenizers.py
CHANGED
@@ -1,10 +1,10 @@
 # STL
-from
+from abc import ABC, abstractmethod
+from typing import List

 # PDM
 import regex as re
-
-# TODO: Entire module should be reworked to match the class scheme of the rest of the module, imo
+from typing_extensions import override

 try:
     # PDM
@@ -17,42 +17,60 @@ except ImportError as e:

 LANGUAGE = "english"  # for NLTK

-SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
-SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
-# TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
-# TODO: do the typography characters matter?
-# NOTE: | / and , are *not* sentence delimiters for my purpose

-
-
+class Tokenizer(ABC):
+    @classmethod
+    @abstractmethod
+    def tokenize(cls, s: str) -> List[str]: ...

-Tokenizer = Callable[[str], List[str]]

+class NoOpTokenizer(Tokenizer):
+    """This is a special case that you do not want or need."""

-
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [s]
+
+
+class RegexTokenizer(Tokenizer):
+    pattern: "re.Pattern[str]"

-
-
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]

-def word_tokenize_nltk(s: str) -> List[str]:
-    return __word_tokenize_nltk(text=s, language=LANGUAGE)

+class WordTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose

-def sent_tokenize_re(s: str) -> List[str]:
-    return [
-        clean
-        for sent in re.findall(SENT_DELIMS_RE, s)
-        if (clean := sent[0].strip() or sent[1].strip())
-    ]

+class SentTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")

-def word_tokenize_re(s: str) -> List[str]:
-    return [clean for word in re.split(WORD_DELIMS_RE, s) if (clean := word.strip())]

+class WordTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!;:'"-])""")

-def sent_tokenize_tok(s: str) -> List[str]:
-    return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]

+class SentTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+
+
+if not isinstance(nltk, ImportError):

-
-
+    class WordTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+    class SentTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __sent_tokenize_nltk(text=s, language=LANGUAGE)
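The free functions are replaced by classes exposing a tokenize classmethod, so callers now go through the class rather than a bare callable. A minimal sketch of the new call shape; the token lists in the comment are inferred from the regexes above and may differ slightly in practice:

from sonatoki.Tokenizers import WordTokenizerTok, SentTokenizerTok

words = WordTokenizerTok.tokenize("mi moku, li pona!")
# punctuation is kept as its own token by the capturing split,
# so this should be roughly ["mi", "moku", ",", "li", "pona", "!"]
sents = SentTokenizerTok.tokenize("toki! sina pona. mi tawa")
print(words, sents)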
sonatoki/constants.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Dict, List
 from pathlib import Path

 LINKU = Path(__file__).resolve().parent / Path("linku.json")
+SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"
@@ -29,10 +30,16 @@ with open(LINKU) as f:
     ]
     NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+with open(SANDBOX) as f:
+    r: Dict[str, Dict[str, str]] = json.loads(f.read())
+    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
 NIMI_PU_SET = set(NIMI_PU)
 NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
 NIMI_LINKU_SET = set(NIMI_LINKU)
 NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
 ALLOWABLES_SET = set(ALLOWABLES)

 __all__ = [
@@ -54,4 +61,7 @@ __all__ = [
     #
     "NIMI_LINKU_ALE",
     "NIMI_LINKU_ALE_SET",
+    #
+    "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_SANDBOX_SET",
 ]
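The comprehension mirrors the linku.json handling above, implying sandbox.json maps entry keys to objects that each carry a "word" field. A small sketch of consuming the resulting constants; whether any particular word appears in either set depends on the bundled data files:

from sonatoki.constants import NIMI_LINKU_SET, NIMI_LINKU_SANDBOX_SET

print(len(NIMI_LINKU_SANDBOX_SET))       # number of sandbox words shipped in sandbox.json
print("toki" in NIMI_LINKU_SET)          # True: core vocabulary
print("toki" in NIMI_LINKU_SANDBOX_SET)  # presumably False, sandbox words are tracked separately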
sonatoki/ilo.py
CHANGED
@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
-    __tokenize: Tokenizer
     __passing_score: Number
-    logging_threshold: Number = 1
+    logging_threshold: Number = -1

     def __init__(
         self,
@@ -29,61 +29,62 @@ class Ilo:
         ignoring_filters: List[Type[Filter]],
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
-        tokenizer: Tokenizer,  # NOTE: no wrapper needed?
         passing_score: Number,
+        word_tokenizer: Type[Tokenizer],
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
-        self.__tokenize = tokenizer
         self.__passing_score = passing_score

-    def
+    def preprocess(self, msg: str) -> str:
         for p in self.__preprocessors:
             msg = p.process(msg)
         return msg

-    def
+    def word_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
+        return self.__word_tokenizer.tokenize(msg)
+
+    def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
         return token

-    def
-        # NOTE: tested, making a new list with a for loop *is* faster than
-        # -
-        # - generator comps
-        # - in-place replacement/removal
-        # - in place replacement with result of generator comp
+    def clean_tokens(self, tokens: List[str]) -> List[str]:
+        # NOTE: tested, making a new list with a for loop *is* faster than:
+        # list comp, generator comp, in-place replacement
         cleaned_tokens: List[str] = list()
         for token in tokens:
-            cleaned_token = self.
+            cleaned_token = self.clean_token(token)
             if not cleaned_token:
                 # TODO: warn user?
                 continue
             cleaned_tokens.append(cleaned_token)
         return cleaned_tokens

-    def
+    def _filter_token(self, token: str) -> bool:
         for f in self.__ignoring_filters:
             if f.filter(token):
                 return True
         return False

-    def
+    def filter_tokens(self, tokens: List[str]) -> List[str]:
         filtered_tokens: List[str] = []
         for token in tokens:
-            if self.
+            if self._filter_token(token):
                 continue
             # the ignoring filter is true if the token matches
             # the user wants to ignore these so keep non-matching tokens
             filtered_tokens.append(token)
         return filtered_tokens

-    def
+    def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)

     def _is_toki_pona(
@@ -95,26 +96,25 @@ class Ilo:
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)
-
-
-
-
-
-        score = self.__score_tokens(cleaned)
+        - Result (bool)"""
+        preprocessed = self.preprocess(message)
+        tokenized = self.word_tokenize(preprocessed)
+        filtered = self.filter_tokens(tokenized)
+        cleaned = self.clean_tokens(filtered)
+        score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        # NOTE: this method may break if above funcs start sharing a list
         if score <= self.logging_threshold:
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
+            LOG.debug("msg: %.2f %s", score, repr(message))
+            LOG.debug("preproc: %s", repr(preprocessed))
+            LOG.debug("tokenized: %s", tokenized)
+            LOG.debug("filtered: %s", filtered)
+            LOG.debug("cleaned: %s", cleaned)
         # TODO: Move to each function? Loses ability to control when logging occurs by threshold

         return preprocessed, tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
+        """Determines whether a single statement is or is not Toki Pona."""
         *_, result = self._is_toki_pona(message)
         return result
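Since the renamed helpers are now public, the scoring pipeline can be driven step by step in the same order _is_toki_pona uses. A sketch under the assumption that an IloConfig splats cleanly into the constructor (see Configs.py above); the message is illustrative:

from sonatoki.ilo import Ilo
from sonatoki.Configs import LazyConfig

ilo = Ilo(**LazyConfig)
msg = "ni li toki pona!"
preprocessed = ilo.preprocess(msg)
tokens = ilo.word_tokenize(preprocessed)  # the docstring recommends preprocessing first
kept = ilo.filter_tokens(tokens)
cleaned = ilo.clean_tokens(kept)
print(ilo.score_tokens(cleaned))          # compared against passing_score inside is_toki_pona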