sonatoki 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.1.2 → sonatoki-0.1.3}/PKG-INFO +30 -24
- {sonatoki-0.1.2 → sonatoki-0.1.3}/README.md +29 -23
- {sonatoki-0.1.2 → sonatoki-0.1.3}/pyproject.toml +1 -1
- sonatoki-0.1.3/src/sonatoki/Configs.py +80 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Filters.py +5 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Preprocessors.py +1 -4
- sonatoki-0.1.3/src/sonatoki/Tokenizers.py +76 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/constants.py +10 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/ilo.py +30 -30
- sonatoki-0.1.3/src/sonatoki/linku.json +1 -0
- sonatoki-0.1.3/src/sonatoki/sandbox.json +1 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_ilo.py +3 -30
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_tokenize.py +11 -11
- sonatoki-0.1.2/src/sonatoki/Tokenizers.py +0 -58
- sonatoki-0.1.2/src/sonatoki/linku.json +0 -1
- {sonatoki-0.1.2 → sonatoki-0.1.3}/LICENSE +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/__init__.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_cleaners.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_filters.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_scorers.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/test_utils.py +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_words.yml +0 -0
- {sonatoki-0.1.2 → sonatoki-0.1.3}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.1.2 → sonatoki-0.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.1.2
+Version: 0.1.3
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -20,9 +20,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

 I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.

-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.

-This project "solves" that complex problem by offering a highly configurable
+This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.

 ## Quick Start

@@ -36,28 +36,11 @@ pdm add sonatoki
 Then get started with a script along these lines:

 ```py
-from sonatoki.Filters import (
-    Numerics,
-    Syllabic,
-    NimiLinku,
-    Alphabetic,
-    ProperName,
-    Punctuations,
-)
-from sonatoki.Scorers import SoftScaling
-from sonatoki.Cleaners import ConsecutiveDuplicates
-from sonatoki.Tokenizers import word_tokenize_tok
-from sonatoki.Preprocessors import URLs, DiscordEmotes
+from sonatoki.ilo import Ilo
+from sonatoki.Configs import PrefConfig

 def main():
-    ilo = Ilo(
-        preprocessors=[URLs, DiscordEmotes],
-        ignoring_filters=[Numerics, Punctuations],
-        scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
-        cleaners=[ConsecutiveDuplicates],
-        scorer=SoftScaling,
-        tokenizer=word_tokenize_tok,
-    )
+    ilo = Ilo(**PrefConfig)
     ilo.is_toki_pona("imagine how is touch the sky") # False
     ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
     ilo.is_toki_pona("I Think I Can Evade Detection") # False
@@ -66,7 +49,30 @@ if __name__ == "__main__":
     main()
 ```

-
+Or if you'd prefer to configure on your own:
+
+```py
+from copy import deepcopy
+from sonatoki.ilo import Ilo
+from sonatoki.Configs import BaseConfig
+from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+from sonatoki.Scorers import SoftPassFail
+
+def main():
+    config = deepcopy(BaseConfig)
+    config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+    config["scorer"] = SoftPassFail
+
+    ilo = Ilo(**config)
+    ilo.is_toki_pona("mu mu!") # True
+    ilo.is_toki_pona("mi namako e moku mi") # True
+    ilo.is_toki_pona("ma wulin") # False
+
+if __name__ == "__main__":
+    main()
+```
+
+`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.

 ## Development

{sonatoki-0.1.2 → sonatoki-0.1.3}/README.md

@@ -6,9 +6,9 @@ This library, "Language Knowledge," helps you identify whether a message is in T

 I wrote it with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool will be rewritten to use this library shortly.

-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language, and this question applies to Toki Pona too.

-This project "solves" that complex problem by offering a highly configurable
+This project "solves" that complex problem by offering a highly configurable parser, so you can tune it to your preferences and goals.

 ## Quick Start

@@ -22,28 +22,11 @@ pdm add sonatoki
 Then get started with a script along these lines:

 ```py
-from sonatoki.Filters import (
-    Numerics,
-    Syllabic,
-    NimiLinku,
-    Alphabetic,
-    ProperName,
-    Punctuations,
-)
-from sonatoki.Scorers import SoftScaling
-from sonatoki.Cleaners import ConsecutiveDuplicates
-from sonatoki.Tokenizers import word_tokenize_tok
-from sonatoki.Preprocessors import URLs, DiscordEmotes
+from sonatoki.ilo import Ilo
+from sonatoki.Configs import PrefConfig

 def main():
-    ilo = Ilo(
-        preprocessors=[URLs, DiscordEmotes],
-        ignoring_filters=[Numerics, Punctuations],
-        scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
-        cleaners=[ConsecutiveDuplicates],
-        scorer=SoftScaling,
-        tokenizer=word_tokenize_tok,
-    )
+    ilo = Ilo(**PrefConfig)
     ilo.is_toki_pona("imagine how is touch the sky") # False
     ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
     ilo.is_toki_pona("I Think I Can Evade Detection") # False
@@ -52,7 +35,30 @@ if __name__ == "__main__":
     main()
 ```

-
+Or if you'd prefer to configure on your own:
+
+```py
+from copy import deepcopy
+from sonatoki.ilo import Ilo
+from sonatoki.Configs import BaseConfig
+from sonatoki.Filters import NimiPuAle, Phonotactic, ProperName
+from sonatoki.Scorers import SoftPassFail
+
+def main():
+    config = deepcopy(BaseConfig)
+    config["scoring_filters"].extend([NimiPuAle, Phonotactic, ProperName])
+    config["scorer"] = SoftPassFail
+
+    ilo = Ilo(**config)
+    ilo.is_toki_pona("mu mu!") # True
+    ilo.is_toki_pona("mi namako e moku mi") # True
+    ilo.is_toki_pona("ma wulin") # False
+
+if __name__ == "__main__":
+    main()
+```
+
+`Ilo` is highly configurable by necessity, so I recommend looking through the premade configs in `Configs` as well as the individual `Preprocessors`, `Filters`, and `Scorers`. The `Cleaners` module only contains one cleaner, which I recommend always using. Similarly, the `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `WordTokenizerTok`.

 ## Development

sonatoki-0.1.3/src/sonatoki/Configs.py (new file)

@@ -0,0 +1,80 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, TypedDict
+
+# PDM
+from typing_extensions import NotRequired
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numerics,
+    Syllabic,
+    NimiLinku,
+    NimiPuAle,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    NimiLinkuAle,
+    Punctuations,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerTok
+from sonatoki.Preprocessors import (
+    URLs,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numerics, Punctuations],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerTok,
+}
+
+
+PrefConfig: IloConfig = deepcopy(BaseConfig)
+PrefConfig["scoring_filters"].extend([NimiLinku, Syllabic, ProperName, Alphabetic])
+PrefConfig["scorer"] = SoftScaling
+
+
+LazyConfig: IloConfig = deepcopy(BaseConfig)
+LazyConfig["scoring_filters"].extend([Alphabetic, ProperName])
+LazyConfig["scorer"] = SoftPassFail
+
+DiscordConfig: IloConfig = deepcopy(PrefConfig)
+DiscordConfig["preprocessors"].extend(
+    [DiscordEmotes, DiscordMentions, DiscordChannels, DiscordSpecial]
+)
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "IloConfig",
+    "BaseConfig",
+    "PrefConfig",
+    "LazyConfig",
+    "DiscordConfig",
+    "TelegramConfig",
+    "ForumConfig",
+]
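The premade configs above drop straight into `Ilo` the same way the README uses `PrefConfig`. A minimal sketch of picking one of the new ones (the sample message and emote markup are illustrative, not taken from the package's docs):

```py
from sonatoki.ilo import Ilo
from sonatoki.Configs import DiscordConfig

# DiscordConfig is PrefConfig plus the Discord-specific preprocessors,
# so emotes, mentions, channels, and special markup are stripped before scoring
ilo = Ilo(**DiscordConfig)
ilo.is_toki_pona("sina pona a <:wave:1234567890>")
```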
{sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Filters.py

@@ -17,6 +17,7 @@ from sonatoki.constants import (
     NIMI_LINKU_SET,
     NIMI_PU_ALE_SET,
     NIMI_LINKU_ALE_SET,
+    NIMI_LINKU_SANDBOX_SET,
 )

 re.DEFAULT_VERSION = re.VERSION1

@@ -87,6 +88,10 @@ class NimiLinkuAle(SetFilter):
     tokens = NIMI_LINKU_ALE_SET


+class NimiLinkuSandbox(SetFilter):
+    tokens = NIMI_LINKU_SANDBOX_SET
+
+
 class Phonotactic(RegexFilter):
     """Determines if a given token is phonotactically valid Toki Pona (or `n`).
     Excludes both consecutive nasals and the illegal syllables:
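The new `NimiLinkuSandbox` filter is a `SetFilter` like its neighbors, so it can be appended to a config's `scoring_filters` using the copy-and-extend pattern the README shows. A sketch, assuming you want sandbox words counted as recognized vocabulary:

```py
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig
from sonatoki.Filters import NimiLinkuSandbox

config = deepcopy(PrefConfig)
config["scoring_filters"].append(NimiLinkuSandbox)  # sandbox words now count as scoring matches
ilo = Ilo(**config)
```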
{sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/Preprocessors.py

@@ -13,7 +13,7 @@ There are currently two distinct types of Preprocessor:
 - ArrowQuote

 Order does not generally matter, but if there were two overlapping containers such as in the string "|| spoiler ` monospace || `", order would matter.
-
+It is up to the user to order them appropriately.
 """

 # STL

@@ -27,8 +27,6 @@ re.DEFAULT_VERSION = re.VERSION1


 class Preprocessor(ABC):
-    precedence: int = 0
-
     @classmethod # order matters
     @abstractmethod
     def process(cls, msg: str) -> str:

@@ -104,7 +102,6 @@ class DoubleQuotes(RegexPreprocessor):
 class Backticks(RegexPreprocessor):
     """Remove paired backticks and their contents `like this`"""

-    precedence = -10
     pattern = re.compile(r"`[^`]+`", flags=re.S)


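With `precedence` removed, ordering comes entirely from the order in which preprocessors are applied, as the updated docstring notes. A small sketch of running them by hand in a caller-chosen order (the sample string is illustrative):

```py
from sonatoki.Preprocessors import URLs, Backticks

msg = "o lukin e ni: `https://example.com/` anu https://example.com/ante"
for p in [Backticks, URLs]:  # Backticks first removes the backticked URL along with its quotes
    msg = p.process(msg)
```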
sonatoki-0.1.3/src/sonatoki/Tokenizers.py (new file)

@@ -0,0 +1,76 @@
+# STL
+from abc import ABC, abstractmethod
+from typing import List
+
+# PDM
+import regex as re
+from typing_extensions import override
+
+try:
+    # PDM
+    import nltk
+    from nltk.tokenize import sent_tokenize as __sent_tokenize_nltk
+    from nltk.tokenize import word_tokenize as __word_tokenize_nltk
+except ImportError as e:
+    nltk = e
+
+
+LANGUAGE = "english" # for NLTK
+
+
+class Tokenizer(ABC):
+    @classmethod
+    @abstractmethod
+    def tokenize(cls, s: str) -> List[str]: ...
+
+
+class NoOpTokenizer(Tokenizer):
+    """This is a special case that you do not want or need."""
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [s]
+
+
+class RegexTokenizer(Tokenizer):
+    pattern: "re.Pattern[str]"
+
+    @classmethod
+    @override
+    def tokenize(cls, s: str) -> List[str]:
+        return [clean for word in re.split(cls.pattern, s) if (clean := word.strip())]
+
+
+class WordTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""([\p{Punctuation}\p{posix_punct}]+|\s+)""")
+    # TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
+    # TODO: do the typography characters matter?
+    # NOTE: | / and , are *not* sentence delimiters for my purpose
+
+
+class SentTokenizerTok(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
+
+
+class WordTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(?<=[.?!;:'"-])""")
+
+
+class SentTokenizerRe(RegexTokenizer):
+    pattern = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
+
+
+if not isinstance(nltk, ImportError):
+
+    class WordTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __word_tokenize_nltk(text=s, language=LANGUAGE)
+
+    class SentTokenizerNLTK(Tokenizer):
+        @classmethod
+        @override
+        def tokenize(cls, s: str) -> List[str]:
+            return __sent_tokenize_nltk(text=s, language=LANGUAGE)
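The class-based tokenizers are used through their `tokenize` classmethod; `WordTokenizerTok` splits on runs of punctuation or whitespace and keeps punctuation runs as their own tokens. A sketch (the expected output is my reading of the regex, not quoted from the package's tests):

```py
from sonatoki.Tokenizers import WordTokenizerTok

tokens = WordTokenizerTok.tokenize("mi moku, li pona!")
# roughly ['mi', 'moku', ',', 'li', 'pona', '!']: whitespace is dropped, punctuation survives
```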
{sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/constants.py

@@ -4,6 +4,7 @@ from typing import Dict, List
 from pathlib import Path

 LINKU = Path(__file__).resolve().parent / Path("linku.json")
+SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")

 VOWELS = "aeiou"
 CONSONANTS = "jklmnpstw"

@@ -29,10 +30,16 @@ with open(LINKU) as f:
 ]
 NIMI_LINKU_ALE: List[str] = [d["word"] for d in r.values()]

+with open(SANDBOX) as f:
+    r: Dict[str, Dict[str, str]] = json.loads(f.read())
+    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in r.values()]
+
+
 NIMI_PU_SET = set(NIMI_PU)
 NIMI_PU_ALE_SET = set(NIMI_PU_ALE)
 NIMI_LINKU_SET = set(NIMI_LINKU)
 NIMI_LINKU_ALE_SET = set(NIMI_LINKU_ALE)
+NIMI_LINKU_SANDBOX_SET = set(NIMI_LINKU_SANDBOX)
 ALLOWABLES_SET = set(ALLOWABLES)

 __all__ = [

@@ -54,4 +61,7 @@ __all__ = [
     #
     "NIMI_LINKU_ALE",
     "NIMI_LINKU_ALE_SET",
+    #
+    "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_SANDBOX_SET",
 ]
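The sandbox list is loaded the same way as the main Linku list and exposed as both a list and a set. A quick membership check as a sketch (the word here is a hypothetical sandbox entry; substitute any word from `sandbox.json`):

```py
from sonatoki.constants import NIMI_LINKU_SET, NIMI_LINKU_SANDBOX_SET

word = "linluwi"  # hypothetical example of a sandbox word
print(word in NIMI_LINKU_SET, word in NIMI_LINKU_SANDBOX_SET)
```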
{sonatoki-0.1.2 → sonatoki-0.1.3}/src/sonatoki/ilo.py

@@ -14,13 +14,13 @@ LOG = logging.getLogger(__name__)

 class Ilo:
     __preprocessors: List[Type[Preprocessor]]
+    __word_tokenizer: Type[Tokenizer]
     __cleaners: List[Type[Cleaner]]
     __ignoring_filters: List[Type[Filter]]
     __scoring_filters: List[Type[Filter]]
     __scorer: Type[Scorer]
-    __tokenize: Tokenizer
     __passing_score: Number
-    logging_threshold: Number = 1
+    logging_threshold: Number = -1

     def __init__(
         self,
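Note `logging_threshold` changing from 1 to -1: scores fall between 0 and 1, so the per-message debug breakdown is now off by default. A sketch of opting back in by setting the attribute on an instance (my assumption about the intended knob, based on it being a plain class attribute):

```py
import logging

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

logging.basicConfig(level=logging.DEBUG)
ilo = Ilo(**PrefConfig)
ilo.logging_threshold = 1  # any score <= 1 triggers the LOG.debug breakdown again
ilo.is_toki_pona("ni li pona mute")
```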
@@ -29,61 +29,62 @@ class Ilo:
         ignoring_filters: List[Type[Filter]],
         scoring_filters: List[Type[Filter]],
         scorer: Type[Scorer],
-        tokenizer: Tokenizer, # NOTE: no wrapper needed?
         passing_score: Number,
+        word_tokenizer: Type[Tokenizer],
     ):
         super().__init__()
         # avoid keeping a ref to user's list just in case
         self.__preprocessors = [*preprocessors]
+        self.__word_tokenizer = word_tokenizer
         self.__cleaners = [*cleaners]
         self.__ignoring_filters = [*ignoring_filters]
         self.__scoring_filters = [*scoring_filters]
         self.__scorer = scorer
-        self.__tokenize = tokenizer
         self.__passing_score = passing_score

-    def
+    def preprocess(self, msg: str) -> str:
         for p in self.__preprocessors:
             msg = p.process(msg)
         return msg

-    def
+    def word_tokenize(self, msg: str) -> List[str]:
+        """It is *highly* recommended that you run `ilo.preprocess` first."""
+        return self.__word_tokenizer.tokenize(msg)
+
+    def clean_token(self, token: str) -> str:
         for c in self.__cleaners:
             token = c.clean(token)
         return token

-    def
-        # NOTE: tested, making a new list with a for loop *is* faster than
-        # -
-        # - generator comps
-        # - in-place replacement/removal
-        # - in place replacement with result of generator comp
+    def clean_tokens(self, tokens: List[str]) -> List[str]:
+        # NOTE: tested, making a new list with a for loop *is* faster than:
+        # list comp, generator comp, in-place replacement
         cleaned_tokens: List[str] = list()
         for token in tokens:
-            cleaned_token = self.
+            cleaned_token = self.clean_token(token)
             if not cleaned_token:
                 # TODO: warn user?
                 continue
             cleaned_tokens.append(cleaned_token)
         return cleaned_tokens

-    def
+    def _filter_token(self, token: str) -> bool:
         for f in self.__ignoring_filters:
             if f.filter(token):
                 return True
         return False

-    def
+    def filter_tokens(self, tokens: List[str]) -> List[str]:
         filtered_tokens: List[str] = []
         for token in tokens:
-            if self.
+            if self._filter_token(token):
                 continue
             # the ignoring filter is true if the token matches
             # the user wants to ignore these so keep non-matching tokens
             filtered_tokens.append(token)
         return filtered_tokens

-    def
+    def score_tokens(self, tokens: List[str]) -> float:
         return self.__scorer.score(tokens, self.__scoring_filters)

     def _is_toki_pona(
@@ -95,26 +96,25 @@
         - Filtered message (list[str])
         - Cleaned message (list[str])
         - Score (float)
-        - Result (bool)
-
-
-
-
-
-        score = self.__score_tokens(cleaned)
+        - Result (bool)"""
+        preprocessed = self.preprocess(message)
+        tokenized = self.word_tokenize(preprocessed)
+        filtered = self.filter_tokens(tokenized)
+        cleaned = self.clean_tokens(filtered)
+        score = self.score_tokens(cleaned)
         result = score >= self.__passing_score

-        # NOTE: this method may break if above funcs start sharing a list
         if score <= self.logging_threshold:
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
-            LOG.debug("
+            LOG.debug("msg: %.2f %s", score, repr(message))
+            LOG.debug("preproc: %s", repr(preprocessed))
+            LOG.debug("tokenized: %s", tokenized)
+            LOG.debug("filtered: %s", filtered)
+            LOG.debug("cleaned: %s", cleaned)
         # TODO: Move to each function? Loses ability to control when logging occurs by threshold

         return preprocessed, tokenized, filtered, cleaned, score, result

     def is_toki_pona(self, message: str) -> bool:
+        """Determines whether a single statement is or is not Toki Pona."""
         *_, result = self._is_toki_pona(message)
         return result
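Since the pipeline helpers are now public (`preprocess`, `word_tokenize`, `filter_tokens`, `clean_tokens`, `score_tokens`), the stages of `_is_toki_pona` can also be run one at a time. A sketch in the same order the method itself uses (the sample message is illustrative):

```py
from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)
msg = "ona li pona tawa mi https://example.com/"

preprocessed = ilo.preprocess(msg)           # PrefConfig's URLs preprocessor strips the link
tokenized = ilo.word_tokenize(preprocessed)  # run preprocess first, as the docstring recommends
filtered = ilo.filter_tokens(tokenized)      # drop tokens matched by the ignoring filters
cleaned = ilo.clean_tokens(filtered)         # apply the config's cleaners (ConsecutiveDuplicates)
score = ilo.score_tokens(cleaned)            # SoftScaling under PrefConfig
print(score >= 0.8)                          # PrefConfig keeps BaseConfig's passing_score of 0.8
```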