sonatoki 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.1.0 → sonatoki-0.1.2}/PKG-INFO +29 -6
- {sonatoki-0.1.0 → sonatoki-0.1.2}/README.md +28 -5
- {sonatoki-0.1.0 → sonatoki-0.1.2}/pyproject.toml +1 -1
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/Preprocessors.py +3 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/Scorers.py +43 -26
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/Tokenizers.py +10 -16
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/ilo.py +30 -11
- sonatoki-0.1.2/tests/test_ilo.py +185 -0
- sonatoki-0.1.2/tests/tokenize_cases/tokenize_sentences_tok.yml +37 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words.yml +0 -4
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_words_tok.yml +27 -0
- sonatoki-0.1.0/tests/test_ilo.py +0 -53
- sonatoki-0.1.0/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -20
- {sonatoki-0.1.0 → sonatoki-0.1.2}/LICENSE +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/__init__.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_cleaners.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_filters.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_scorers.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_tokenize.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/test_utils.py +0 -0
- {sonatoki-0.1.0 → sonatoki-0.1.2}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sonatoki
|
3
|
-
Version: 0.1.0
|
3
|
+
Version: 0.1.2
|
4
4
|
Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
|
5
5
|
Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
|
6
6
|
License: AGPL-3.0-or-later
|
@@ -44,7 +44,7 @@ from sonatoki.Filters import (
|
|
44
44
|
ProperName,
|
45
45
|
Punctuations,
|
46
46
|
)
|
47
|
-
from sonatoki.Scorers import
|
47
|
+
from sonatoki.Scorers import SoftScaling
|
48
48
|
from sonatoki.Cleaners import ConsecutiveDuplicates
|
49
49
|
from sonatoki.Tokenizers import word_tokenize_tok
|
50
50
|
from sonatoki.Preprocessors import URLs, DiscordEmotes
|
@@ -55,22 +55,23 @@ def main():
|
|
55
55
|
ignoring_filters=[Numerics, Punctuations],
|
56
56
|
scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
|
57
57
|
cleaners=[ConsecutiveDuplicates],
|
58
|
-
scorer=
|
58
|
+
scorer=SoftScaling,
|
59
59
|
tokenizer=word_tokenize_tok,
|
60
60
|
)
|
61
61
|
ilo.is_toki_pona("imagine how is touch the sky") # False
|
62
62
|
ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
|
63
|
+
ilo.is_toki_pona("I Think I Can Evade Detection") # False
|
63
64
|
|
64
65
|
if __name__ == "__main__":
|
65
66
|
main()
|
66
67
|
```
|
67
68
|
|
68
|
-
`Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I
|
69
|
+
`Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I recommend using. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `word_tokenize_tok`.
|
69
70
|
|
70
71
|
## Development
|
71
72
|
|
72
73
|
1. Install [pdm](https://github.com/pdm-project/pdm)
|
73
|
-
1. `pdm
|
74
|
+
1. `pdm install --dev`
|
74
75
|
1. Open any file you like!
|
75
76
|
|
76
77
|
## FAQ
|
@@ -81,4 +82,26 @@ The intent is to show our methodology to the Unicode Consortium, particularly to
|
|
81
82
|
|
82
83
|
After our proposal has been examined and a result given by the committee, I will translate this file and library into Toki Pona, with a note left behind for those who do not understand it.
|
83
84
|
|
84
|
-
###
|
85
|
+
### What's the deal with the tokenizers?
|
86
|
+
|
87
|
+
The Toki Pona tokenizer `word_tokenize_tok` is very specific: it always separates writing characters from punctuation, and it leaves contiguous punctuation contiguous. This is a level of precision that NLTK's English tokenizer does not want for several reasons, one being that English words can have "punctuation" characters in them.
|
88
|
+
|
89
|
+
Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
|
90
|
+
|
91
|
+
The other tokenizers are provided as a comparison case more than anything. I do not recommend their use.
|
92
|
+
|
93
|
+
### Aren't there a lot of false positives?
|
94
|
+
|
95
|
+
Yes. It's up to you to use this tool responsibly on input you've done your best to clean and, better yet, to use stronger filters before weaker ones. For now, though, here's a list of relevant false positives:
|
96
|
+
|
97
|
+
- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially inflating the scores.
|
98
|
+
- `Alphabetic` will match a _lot_ of undesirable text; it essentially allows any text written with only the 14 letters of the English alphabet that Toki Pona uses.
|
99
|
+
|
100
|
+
### Don't some of the cleaners/filters conflict?
|
101
|
+
|
102
|
+
Yes. Some do, in the following ways:
|
103
|
+
|
104
|
+
- `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
|
105
|
+
- `ConsecutiveDuplicates` will not work correctly with syllabaries (scripts in which each character represents a consonant-vowel pair rather than a single sound).
|
106
|
+
|
107
|
+
You'll notice a _lot_ of these are troubles regarding the application of Latin alphabet filters to non-Latin text. Working on it!
|
@@ -30,7 +30,7 @@ from sonatoki.Filters import (
|
|
30
30
|
ProperName,
|
31
31
|
Punctuations,
|
32
32
|
)
|
33
|
-
from sonatoki.Scorers import
|
33
|
+
from sonatoki.Scorers import SoftScaling
|
34
34
|
from sonatoki.Cleaners import ConsecutiveDuplicates
|
35
35
|
from sonatoki.Tokenizers import word_tokenize_tok
|
36
36
|
from sonatoki.Preprocessors import URLs, DiscordEmotes
|
@@ -41,22 +41,23 @@ def main():
|
|
41
41
|
ignoring_filters=[Numerics, Punctuations],
|
42
42
|
scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
|
43
43
|
cleaners=[ConsecutiveDuplicates],
|
44
|
-
scorer=
|
44
|
+
scorer=SoftScaling,
|
45
45
|
tokenizer=word_tokenize_tok,
|
46
46
|
)
|
47
47
|
ilo.is_toki_pona("imagine how is touch the sky") # False
|
48
48
|
ilo.is_toki_pona("o pilin insa e ni: sina pilin e sewi") # True
|
49
|
+
ilo.is_toki_pona("I Think I Can Evade Detection") # False
|
49
50
|
|
50
51
|
if __name__ == "__main__":
|
51
52
|
main()
|
52
53
|
```
|
53
54
|
|
54
|
-
`Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I
|
55
|
+
`Ilo` is highly configurable by design, so I recommend exploring the `Preprocessors`, `Filters`, and `Scorers` modules. The `Cleaners` module only contains one cleaner, which I recommend using. The `Tokenizers` module contains several other word tokenizers, but their performance will be worse than the dedicated Toki Pona tokenizer `word_tokenize_tok`.
|
55
56
|
|
56
57
|
## Development
|
57
58
|
|
58
59
|
1. Install [pdm](https://github.com/pdm-project/pdm)
|
59
|
-
1. `pdm
|
60
|
+
1. `pdm install --dev`
|
60
61
|
1. Open any file you like!
|
61
62
|
|
62
63
|
## FAQ
|
@@ -67,4 +68,26 @@ The intent is to show our methodology to the Unicode Consortium, particularly to
|
|
67
68
|
|
68
69
|
After our proposal has been examined and a result given by the committee, I will translate this file and library into Toki Pona, with a note left behind for those who do not understand it.
|
69
70
|
|
70
|
-
###
|
71
|
+
### What's the deal with the tokenizers?
|
72
|
+
|
73
|
+
The Toki Pona tokenizer `word_tokenize_tok` is very specific: it always separates writing characters from punctuation, and it leaves contiguous punctuation contiguous. This is a level of precision that NLTK's English tokenizer does not want for several reasons, one being that English words can have "punctuation" characters in them.
|
74
|
+
|
75
|
+
Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
|
76
|
+
|
77
|
+
The other tokenizers are provided as a comparison case more than anything. I do not recommend their use.
|
78
|
+
|
79
|
+
### Aren't there a lot of false positives?
|
80
|
+
|
81
|
+
Yes. It's up to you to use this tool responsibly on input you've done your best to clean and, better yet, to use stronger filters before weaker ones. For now, though, here's a list of relevant false positives:
|
82
|
+
|
83
|
+
- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially inflating the scores.
|
84
|
+
- `Alphabetic` will match a _lot_ of undesirable text; it essentially allows any text written with only the 14 letters of the English alphabet that Toki Pona uses.
|
85
|
+
|
86
|
+
### Don't some of the cleaners/filters conflict?
|
87
|
+
|
88
|
+
Yes. Some do, in the following ways:
|
89
|
+
|
90
|
+
- `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
|
91
|
+
- `ConsecutiveDuplicates` will not work correctly with syllabaries (scripts in which each character represents a consonant-vowel pair rather than a single sound).
|
92
|
+
|
93
|
+
You'll notice a _lot_ of these are troubles regarding the application of Latin alphabet filters to non-Latin text. Working on it!
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
import math
|
3
|
+
import logging
|
3
4
|
from abc import ABC, abstractmethod
|
4
5
|
from typing import Dict, List, Type, Union
|
5
6
|
|
@@ -9,24 +10,20 @@ from typing_extensions import override
|
|
9
10
|
# LOCAL
|
10
11
|
from sonatoki.Filters import Filter
|
11
12
|
|
13
|
+
LOG = logging.getLogger(__name__)
|
14
|
+
|
12
15
|
Number = Union[int, float]
|
13
16
|
Weights = Dict[str, Number]
|
14
17
|
|
15
18
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
#
|
20
|
-
#
|
21
|
-
|
22
|
-
# if not filter.filter(token):
|
23
|
-
# continue
|
24
|
-
# # NOTE: We assume the filters are ordered by their score
|
25
|
-
# # Thus the first match is also the highest scoring
|
26
|
-
# return filter.counts, cls.weights[filter.__name__]
|
27
|
-
# # TODO: override weight if count is 0?
|
28
|
-
# return 1, 0
|
19
|
+
def sigmoid(n: int) -> Number:
|
20
|
+
return 1 / (1 + math.exp(-(0.30 * (n - 1))))
|
21
|
+
# n-1 makes sigmoid(1) == 0.5
|
22
|
+
# 0.30 softens scaling in favor of short input
|
23
|
+
# return n / (1+abs(n)) # too weak in 0.7+
|
24
|
+
|
29
25
|
|
26
|
+
class Scorer(ABC):
|
30
27
|
@classmethod
|
31
28
|
@abstractmethod
|
32
29
|
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
@@ -37,10 +34,15 @@ class PassFail(Scorer):
|
|
37
34
|
"""The token passes any filter or fails all of them, scoring 1 or 0 respectively."""
|
38
35
|
|
39
36
|
@classmethod
|
40
|
-
def
|
37
|
+
def score_token(cls, token: str, filters: List[Type[Filter]]) -> Number:
|
41
38
|
for f in filters:
|
42
39
|
if f.filter(token):
|
43
|
-
|
40
|
+
score = 1
|
41
|
+
LOG.debug(
|
42
|
+
"%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
|
43
|
+
)
|
44
|
+
return score
|
45
|
+
LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
|
44
46
|
return 0
|
45
47
|
|
46
48
|
@classmethod
|
@@ -52,10 +54,27 @@ class PassFail(Scorer):
|
|
52
54
|
total_score = 0
|
53
55
|
len_tokens = len(tokens)
|
54
56
|
for token in tokens:
|
55
|
-
total_score += cls.
|
57
|
+
total_score += cls.score_token(token, filters)
|
56
58
|
return total_score / len_tokens if len_tokens else 0
|
57
59
|
|
58
60
|
|
61
|
+
class SoftPassFail(PassFail):
|
62
|
+
@classmethod
|
63
|
+
@override
|
64
|
+
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
65
|
+
if not tokens:
|
66
|
+
return 1
|
67
|
+
|
68
|
+
total_score = 0
|
69
|
+
len_tokens = len(tokens)
|
70
|
+
for token in tokens:
|
71
|
+
total_score += cls.score_token(token, filters)
|
72
|
+
|
73
|
+
percentage = total_score / len_tokens if len_tokens else 0
|
74
|
+
percentage **= sigmoid(len_tokens)
|
75
|
+
return percentage
|
76
|
+
|
77
|
+
|
59
78
|
class Scaling(Scorer):
|
60
79
|
"""
|
61
80
|
The sooner a token matches a filter, the higher its score.
|
@@ -67,7 +86,12 @@ class Scaling(Scorer):
|
|
67
86
|
def score_token(cls, token: str, filters: List[Type[Filter]], scale: int):
|
68
87
|
for i, f in enumerate(filters):
|
69
88
|
if f.filter(token):
|
70
|
-
|
89
|
+
score = scale - i
|
90
|
+
LOG.debug(
|
91
|
+
"%12s.%s('%s') = %.2f", cls.__name__, f.__name__, token, score
|
92
|
+
)
|
93
|
+
return score
|
94
|
+
LOG.debug("%12s('%s') = 0.00", cls.__name__, token)
|
71
95
|
return 0
|
72
96
|
|
73
97
|
@classmethod
|
@@ -91,13 +115,6 @@ class SoftScaling(Scaling):
|
|
91
115
|
For example, a single token scoring 0.64 will now score 0.8.
|
92
116
|
"""
|
93
117
|
|
94
|
-
@staticmethod
|
95
|
-
def sigmoid(n: int) -> Number:
|
96
|
-
return 1 / (1 + math.exp(-(0.30 * (n - 1))))
|
97
|
-
# n-1 makes sigmoid(1) == 0.5
|
98
|
-
# 0.30 softens scaling against input
|
99
|
-
# return n / (1+abs(n)) # too weak in 0.7+
|
100
|
-
|
101
118
|
@classmethod
|
102
119
|
@override
|
103
120
|
def score(cls, tokens: List[str], filters: List[Type[Filter]]) -> Number:
|
@@ -113,11 +130,11 @@ class SoftScaling(Scaling):
|
|
113
130
|
total_score += cls.score_token(token, filters, len_filters)
|
114
131
|
|
115
132
|
percentage = total_score / max_score if max_score else 0
|
116
|
-
percentage **=
|
133
|
+
percentage **= sigmoid(len_tokens)
|
117
134
|
return percentage
|
118
135
|
|
119
136
|
|
120
137
|
class Logarithmic(Scorer): ...
|
121
138
|
|
122
139
|
|
123
|
-
__all__ = ["PassFail", "Scaling", "SoftScaling"]
|
140
|
+
__all__ = ["PassFail", "SoftPassFail", "Scaling", "SoftScaling"]
|
@@ -4,6 +4,8 @@ from typing import List, Callable
|
|
4
4
|
# PDM
|
5
5
|
import regex as re
|
6
6
|
|
7
|
+
# TODO: This entire module should be reworked to match the class scheme used by the rest of the package, imo
|
8
|
+
|
7
9
|
try:
|
8
10
|
# PDM
|
9
11
|
import nltk
|
@@ -15,18 +17,14 @@ except ImportError as e:
|
|
15
17
|
|
16
18
|
LANGUAGE = "english" # for NLTK
|
17
19
|
|
18
|
-
SENT_DELIMS_RE = r"""(.*?[.?!;:])|(.+?$)"""
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
WORD_DELIMS_RE = r"""\s+|(?=[.?!;:'"-])"""
|
26
|
-
WORD_DELIMS_RE = re.compile(WORD_DELIMS_RE)
|
20
|
+
SENT_DELIMS_RE = re.compile(r"""(.*?[.?!;:])|(.+?$)""")
|
21
|
+
SENT_DELIMS_TOK = re.compile(r"""(?<=[.?!:;·…“”"'()\[\]\-]|$)""")
|
22
|
+
# TODO: are <> or {} that common as *sentence* delims? [] are already a stretch
|
23
|
+
# TODO: do the typography characters matter?
|
24
|
+
# NOTE: | / and , are *not* sentence delimiters for my purpose
|
27
25
|
|
28
|
-
|
29
|
-
WORD_DELIMS_TOK = re.compile(
|
26
|
+
WORD_DELIMS_RE = re.compile(r"""\s+|(?=[.?!;:'"-])""")
|
27
|
+
WORD_DELIMS_TOK = re.compile(r"([\p{Punctuation}\p{posix_punct}]+|\s+)")
|
30
28
|
|
31
29
|
Tokenizer = Callable[[str], List[str]]
|
32
30
|
|
@@ -53,11 +51,7 @@ def word_tokenize_re(s: str) -> List[str]:
|
|
53
51
|
|
54
52
|
|
55
53
|
def sent_tokenize_tok(s: str) -> List[str]:
|
56
|
-
return [
|
57
|
-
clean
|
58
|
-
for sent in re.findall(SENT_DELIMS_TOK, s)
|
59
|
-
if (clean := sent[0].strip() or sent[1].strip())
|
60
|
-
]
|
54
|
+
return [clean for sent in re.split(SENT_DELIMS_TOK, s) if (clean := sent.strip())]
|
61
55
|
|
62
56
|
|
63
57
|
def word_tokenize_tok(s: str) -> List[str]:
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# STL
|
2
|
-
|
2
|
+
import logging
|
3
|
+
from typing import List, Type, Tuple
|
3
4
|
|
4
5
|
# LOCAL
|
5
6
|
from sonatoki.Filters import Filter
|
@@ -8,6 +9,8 @@ from sonatoki.Cleaners import Cleaner
|
|
8
9
|
from sonatoki.Tokenizers import Tokenizer
|
9
10
|
from sonatoki.Preprocessors import Preprocessor
|
10
11
|
|
12
|
+
LOG = logging.getLogger(__name__)
|
13
|
+
|
11
14
|
|
12
15
|
class Ilo:
|
13
16
|
__preprocessors: List[Type[Preprocessor]]
|
@@ -17,7 +20,7 @@ class Ilo:
|
|
17
20
|
__scorer: Type[Scorer]
|
18
21
|
__tokenize: Tokenizer
|
19
22
|
__passing_score: Number
|
20
|
-
|
23
|
+
logging_threshold: Number = 1.0
|
21
24
|
|
22
25
|
def __init__(
|
23
26
|
self,
|
@@ -83,19 +86,35 @@ class Ilo:
|
|
83
86
|
def __score_tokens(self, tokens: List[str]) -> float:
|
84
87
|
return self.__scorer.score(tokens, self.__scoring_filters)
|
85
88
|
|
86
|
-
def
|
89
|
+
def _is_toki_pona(
|
90
|
+
self, message: str
|
91
|
+
) -> Tuple[str, List[str], List[str], List[str], Number, bool]:
|
92
|
+
"""Returns all components of the processing algorithm:
|
93
|
+
- Preprocessed message (str)
|
94
|
+
- Tokenized message (list[str])
|
95
|
+
- Filtered message (list[str])
|
96
|
+
- Cleaned message (list[str])
|
97
|
+
- Score (float)
|
98
|
+
- Result (bool)
|
99
|
+
"""
|
87
100
|
preprocessed = self.__preprocess(message)
|
88
101
|
tokenized = self.__tokenize(preprocessed)
|
89
102
|
filtered = self.__filter_tokens(tokenized)
|
90
103
|
cleaned = self.__clean_tokens(filtered)
|
91
104
|
score = self.__score_tokens(cleaned)
|
105
|
+
result = score >= self.__passing_score
|
92
106
|
|
93
|
-
if
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
107
|
+
# NOTE: this method may break if above funcs start sharing a list
|
108
|
+
if score <= self.logging_threshold:
|
109
|
+
LOG.debug("Msg: %.2f %s", score, repr(message))
|
110
|
+
LOG.debug("Preproc: %s", repr(preprocessed))
|
111
|
+
LOG.debug("Tokenized: %s", tokenized)
|
112
|
+
LOG.debug("Filtered: %s", filtered)
|
113
|
+
LOG.debug("Cleaned: %s", cleaned)
|
114
|
+
# TODO: Move to each function? Loses ability to control when logging occurs by threshold
|
100
115
|
|
101
|
-
return score
|
116
|
+
return preprocessed, tokenized, filtered, cleaned, score, result
|
117
|
+
|
118
|
+
def is_toki_pona(self, message: str) -> bool:
|
119
|
+
*_, result = self._is_toki_pona(message)
|
120
|
+
return result
|
@@ -0,0 +1,185 @@
|
|
1
|
+
# PDM
|
2
|
+
import pytest
|
3
|
+
|
4
|
+
# LOCAL
|
5
|
+
from sonatoki.ilo import Ilo
|
6
|
+
from sonatoki.Filters import (
|
7
|
+
Numerics,
|
8
|
+
Syllabic,
|
9
|
+
NimiLinku,
|
10
|
+
Alphabetic,
|
11
|
+
ProperName,
|
12
|
+
Punctuations,
|
13
|
+
)
|
14
|
+
from sonatoki.Scorers import SoftScaling, SoftPassFail
|
15
|
+
from sonatoki.Cleaners import ConsecutiveDuplicates
|
16
|
+
from sonatoki.Tokenizers import word_tokenize_tok
|
17
|
+
from sonatoki.Preprocessors import URLs
|
18
|
+
|
19
|
+
|
20
|
+
@pytest.fixture
|
21
|
+
def ilo():
|
22
|
+
ilo = Ilo(
|
23
|
+
preprocessors=[URLs],
|
24
|
+
ignoring_filters=[Numerics, Punctuations],
|
25
|
+
scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
|
26
|
+
cleaners=[ConsecutiveDuplicates],
|
27
|
+
scorer=SoftScaling,
|
28
|
+
tokenizer=word_tokenize_tok,
|
29
|
+
passing_score=0.8,
|
30
|
+
)
|
31
|
+
# ilo.logging_threshold = 0.8
|
32
|
+
return ilo
|
33
|
+
|
34
|
+
|
35
|
+
@pytest.fixture()
|
36
|
+
def lazy_ilo():
|
37
|
+
ilo = Ilo(
|
38
|
+
preprocessors=[URLs],
|
39
|
+
ignoring_filters=[Numerics, Punctuations],
|
40
|
+
scoring_filters=[Alphabetic, ProperName],
|
41
|
+
cleaners=[ConsecutiveDuplicates],
|
42
|
+
scorer=SoftPassFail,
|
43
|
+
tokenizer=word_tokenize_tok,
|
44
|
+
passing_score=0.8,
|
45
|
+
)
|
46
|
+
# ilo.logging_threshold = 0.8
|
47
|
+
return ilo
|
48
|
+
|
49
|
+
|
50
|
+
ALL_VALID = [
|
51
|
+
"mi unpa e mama sina",
|
52
|
+
"mama sina li lon seme? mi wile toki tawa ona",
|
53
|
+
"sina sike pakala",
|
54
|
+
" sina seme e mi ?",
|
55
|
+
"AAAAAAAAAAA",
|
56
|
+
"muuuu MUUU muUuUuU",
|
57
|
+
"wawa mute. " * 10,
|
58
|
+
]
|
59
|
+
|
60
|
+
IGNORABLES = [
|
61
|
+
"",
|
62
|
+
" ",
|
63
|
+
"2+2=5",
|
64
|
+
"kiwen moli 42",
|
65
|
+
"https://mun.la/sona",
|
66
|
+
"https://example.com/",
|
67
|
+
"mi wile e ni: <https://example.com> li pona",
|
68
|
+
"lipu https://example.com li kama pona",
|
69
|
+
"...",
|
70
|
+
" ⟨·⟩, a",
|
71
|
+
"·····",
|
72
|
+
]
|
73
|
+
|
74
|
+
SYLLABIC_MATCHES = [
|
75
|
+
"ni li tenpo penpo",
|
76
|
+
"sipisi",
|
77
|
+
"walawa malama walama malama mupi",
|
78
|
+
"mi sona ala e nimi sunopatikuna",
|
79
|
+
"kalama wuwojiti li pana e sona",
|
80
|
+
"jan Awaja en jan Alasali en jan Akesinu li pona", # syllables match before names here
|
81
|
+
]
|
82
|
+
|
83
|
+
ALPHABETIC_MATCHES = [
|
84
|
+
"mi mtue o kama sona",
|
85
|
+
"mi mute o kma son",
|
86
|
+
"ni li tptpt",
|
87
|
+
"mi wile pana lon sptp",
|
88
|
+
"tmo tawa mi li pona mute la mi kepeken ona lon tenpo mute",
|
89
|
+
"mi pakla lon nimi pi mute lili, taso ale li pona tan ni: mi toki mute",
|
90
|
+
]
|
91
|
+
|
92
|
+
NAME_MATCHES = [
|
93
|
+
"musi Homestuck li ike tawa mi",
|
94
|
+
"ilo Google li sona ala e nimi Emoticon la mi wile utala e ona",
|
95
|
+
"toki Kanse li lon",
|
96
|
+
"toki Lojban li nasa e lawa mi",
|
97
|
+
]
|
98
|
+
|
99
|
+
SOME_INVALID = ["kulupu xerox li ike", "mi tawa ma ohio"]
|
100
|
+
|
101
|
+
|
102
|
+
EXCESSIVE_SYLLABICS = [
|
103
|
+
"manama manama namana namana majani makala",
|
104
|
+
]
|
105
|
+
|
106
|
+
EXCESSIVE_ALPHABETICS = [
|
107
|
+
"21st", # candidate for xfails?
|
108
|
+
"tok",
|
109
|
+
"mut",
|
110
|
+
"mtue",
|
111
|
+
"I wait, I sulk, as a tool I make stoops to ineptness.",
|
112
|
+
"aaa i non-saw usa's most multiple element-set. it's as asinine as in `e`-less speak",
|
113
|
+
"mi pakla ln tepo mtue ls mi kn ala tok poan aun seem",
|
114
|
+
"so, to atone like papa—an awesome anon (no-name) sin man—i ate an asinine lemon-limelike tomato jalapeno isotope. 'nonsense!' amen. note to Oman: take mine katana to imitate a ninja in pantomime. atomise one nuke? 'insane misuse!' same. likewise, Susan, awaken a pepino melon in a linen pipeline. (penile) emanate semen. joke: manipulate a tame toneme to elope online tonite",
|
115
|
+
]
|
116
|
+
|
117
|
+
EXCESSIVE_NAMES = [
|
118
|
+
"I Want To Evade The Filter",
|
119
|
+
"If You Do This The Bot Can't See You",
|
120
|
+
"This Is A Statement In Perfect Toki Pona, I Guarantee",
|
121
|
+
]
|
122
|
+
|
123
|
+
NON_MATCHES = [
|
124
|
+
"bong",
|
125
|
+
"super bruh moment 64",
|
126
|
+
"homestuck",
|
127
|
+
"homestuck Homestuck",
|
128
|
+
]
|
129
|
+
|
130
|
+
XFAILS = [
|
131
|
+
"lete li ike x.x", # emoticon should not be a problem
|
132
|
+
]
|
133
|
+
|
134
|
+
|
135
|
+
@pytest.mark.parametrize(
|
136
|
+
"text",
|
137
|
+
ALL_VALID
|
138
|
+
+ SYLLABIC_MATCHES
|
139
|
+
+ ALPHABETIC_MATCHES
|
140
|
+
+ NAME_MATCHES
|
141
|
+
+ SOME_INVALID
|
142
|
+
+ IGNORABLES,
|
143
|
+
)
|
144
|
+
def test_known_good(ilo: Ilo, lazy_ilo: Ilo, text: str):
|
145
|
+
assert ilo.is_toki_pona(text), text
|
146
|
+
|
147
|
+
|
148
|
+
@pytest.mark.parametrize(
|
149
|
+
"text", EXCESSIVE_SYLLABICS + EXCESSIVE_ALPHABETICS + EXCESSIVE_NAMES + NON_MATCHES
|
150
|
+
)
|
151
|
+
def test_known_bad(ilo: Ilo, text: str):
|
152
|
+
assert not ilo.is_toki_pona(text), text
|
153
|
+
|
154
|
+
|
155
|
+
@pytest.mark.parametrize(
|
156
|
+
"text",
|
157
|
+
ALL_VALID
|
158
|
+
+ SYLLABIC_MATCHES
|
159
|
+
+ ALPHABETIC_MATCHES
|
160
|
+
+ NAME_MATCHES
|
161
|
+
+ SOME_INVALID
|
162
|
+
+ IGNORABLES,
|
163
|
+
)
|
164
|
+
def test_known_good_lazy(lazy_ilo: Ilo, text: str):
|
165
|
+
assert lazy_ilo.is_toki_pona(text), text
|
166
|
+
# assumption: lazy ilo should pass anything the more strict ilo does
|
167
|
+
|
168
|
+
|
169
|
+
@pytest.mark.parametrize("text", NON_MATCHES)
|
170
|
+
def test_known_bad_lazy(lazy_ilo: Ilo, text: str):
|
171
|
+
assert not lazy_ilo.is_toki_pona(text), text
|
172
|
+
|
173
|
+
|
174
|
+
@pytest.mark.parametrize(
|
175
|
+
"text", EXCESSIVE_SYLLABICS + EXCESSIVE_ALPHABETICS + EXCESSIVE_NAMES
|
176
|
+
)
|
177
|
+
def test_weakness_of_lazy(lazy_ilo: Ilo, text: str):
|
178
|
+
# NOTE: This is demonstrative, not preferential
|
179
|
+
assert lazy_ilo.is_toki_pona(text), text
|
180
|
+
|
181
|
+
|
182
|
+
@pytest.mark.xfail
|
183
|
+
@pytest.mark.parametrize("text", XFAILS)
|
184
|
+
def test_known_xfails(ilo: Ilo, text: str):
|
185
|
+
assert ilo.is_toki_pona(text)
|
@@ -0,0 +1,37 @@
|
|
1
|
+
---
|
2
|
+
- name: "basic1"
|
3
|
+
input: "mu. mu."
|
4
|
+
output:
|
5
|
+
- "mu."
|
6
|
+
- "mu."
|
7
|
+
- name: "basic2"
|
8
|
+
input: "mu! mu!"
|
9
|
+
output:
|
10
|
+
- "mu!"
|
11
|
+
- "mu!"
|
12
|
+
- name: "basic3"
|
13
|
+
input: "mu? mu?"
|
14
|
+
output:
|
15
|
+
- "mu?"
|
16
|
+
- "mu?"
|
17
|
+
- name: "basic4"
|
18
|
+
input: "mi mu. mi wawa."
|
19
|
+
output:
|
20
|
+
- "mi mu."
|
21
|
+
- "mi wawa."
|
22
|
+
- name: "dash"
|
23
|
+
input: "mi sona ala e ni- sina seme a"
|
24
|
+
output:
|
25
|
+
- "mi sona ala e ni-"
|
26
|
+
- "sina seme a"
|
27
|
+
- name: "comma"
|
28
|
+
input: "mi mu tawa sina, mi wawa e sina."
|
29
|
+
output:
|
30
|
+
- "mi mu tawa sina, mi wawa e sina."
|
31
|
+
- name: "quotes"
|
32
|
+
input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
|
33
|
+
output: # expected; we split on right of all sentence-ending puncts
|
34
|
+
- "toki li tan kulupu Kuko li ni:"
|
35
|
+
- "'"
|
36
|
+
- "o ike ala!"
|
37
|
+
- "'"
|
@@ -1,19 +1,15 @@
|
|
1
1
|
---
|
2
2
|
- name: "basic"
|
3
3
|
input: "mi mu mute tawa sina."
|
4
|
-
should_be_equal: true
|
5
4
|
- name: "spoilers"
|
6
5
|
input: "||ni li toki len.||"
|
7
|
-
should_be_equal: true
|
8
6
|
xfail: true # lookbehind for . breaks it
|
9
7
|
- name: "quotes"
|
10
8
|
input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
|
11
|
-
should_be_equal: true
|
12
9
|
xfail: true
|
13
10
|
- name: periods every word
|
14
11
|
input: "mi.unpa.e.mama.sina"
|
15
12
|
xfail: true # lookbehind for . breaks it
|
16
13
|
- name: "url"
|
17
14
|
input: "https://mun.la/sona/"
|
18
|
-
should_be_equal: true
|
19
15
|
xfail: true # i have no idea how to emulate the : behavior
|
@@ -73,3 +73,30 @@
|
|
73
73
|
- "are"
|
74
74
|
- "boring"
|
75
75
|
- "'"
|
76
|
+
- name: periods every word
|
77
|
+
input: "mi.unpa.e.mama.sina"
|
78
|
+
output:
|
79
|
+
- "mi"
|
80
|
+
- "."
|
81
|
+
- "unpa"
|
82
|
+
- "."
|
83
|
+
- "e"
|
84
|
+
- "."
|
85
|
+
- "mama"
|
86
|
+
- "."
|
87
|
+
- "sina"
|
88
|
+
- name: "discovered case 1"
|
89
|
+
input: "***__U T A L A__ __M U N__***"
|
90
|
+
output:
|
91
|
+
- "***__"
|
92
|
+
- "U"
|
93
|
+
- "T"
|
94
|
+
- "A"
|
95
|
+
- "L"
|
96
|
+
- "A"
|
97
|
+
- "__"
|
98
|
+
- "__"
|
99
|
+
- "M"
|
100
|
+
- "U"
|
101
|
+
- "N"
|
102
|
+
- "__***"
|
sonatoki-0.1.0/tests/test_ilo.py
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
# LOCAL
|
2
|
-
from sonatoki.ilo import Ilo
|
3
|
-
from sonatoki.Filters import (
|
4
|
-
Numerics,
|
5
|
-
Syllabic,
|
6
|
-
NimiLinku,
|
7
|
-
Alphabetic,
|
8
|
-
ProperName,
|
9
|
-
Punctuations,
|
10
|
-
)
|
11
|
-
from sonatoki.Scorers import Scaling, SoftScaling
|
12
|
-
from sonatoki.Cleaners import ConsecutiveDuplicates
|
13
|
-
from sonatoki.Tokenizers import word_tokenize_tok
|
14
|
-
from sonatoki.Preprocessors import (
|
15
|
-
URLs,
|
16
|
-
DiscordEmotes,
|
17
|
-
DiscordSpecial,
|
18
|
-
DiscordChannels,
|
19
|
-
DiscordMentions,
|
20
|
-
)
|
21
|
-
|
22
|
-
|
23
|
-
def test_constructor():
|
24
|
-
ilo = Ilo(
|
25
|
-
preprocessors=[
|
26
|
-
URLs,
|
27
|
-
DiscordEmotes,
|
28
|
-
DiscordMentions,
|
29
|
-
DiscordChannels,
|
30
|
-
DiscordSpecial,
|
31
|
-
],
|
32
|
-
ignoring_filters=[Numerics, Punctuations],
|
33
|
-
scoring_filters=[NimiLinku, Syllabic, ProperName, Alphabetic],
|
34
|
-
cleaners=[ConsecutiveDuplicates],
|
35
|
-
scorer=SoftScaling,
|
36
|
-
tokenizer=word_tokenize_tok,
|
37
|
-
passing_score=0.8,
|
38
|
-
)
|
39
|
-
ilo.debug = True
|
40
|
-
assert not ilo.is_toki_pona("super bruh moment 64")
|
41
|
-
assert ilo.is_toki_pona("mi unpa e mama sina")
|
42
|
-
assert ilo.is_toki_pona("mama sina li mu tan mi")
|
43
|
-
assert ilo.is_toki_pona("toki. sike li pona ala. o anpa.")
|
44
|
-
assert ilo.is_toki_pona("musi Homestuck li ike tawa mi")
|
45
|
-
assert ilo.is_toki_pona("mi mtue o kama sona")
|
46
|
-
assert ilo.is_toki_pona("ni li tenpo penpo")
|
47
|
-
assert ilo.is_toki_pona("ni li tptpt")
|
48
|
-
|
49
|
-
assert not ilo.is_toki_pona("I'm Trying To Evade The Filter")
|
50
|
-
assert not ilo.is_toki_pona(
|
51
|
-
"""aaa i non-saw usa's most multiple element-set
|
52
|
-
it's as asinine as in `e`-less speak"""
|
53
|
-
)
|
@@ -1,20 +0,0 @@
|
|
1
|
-
---
|
2
|
-
- name: "basic"
|
3
|
-
input: "mi mu. mi wawa."
|
4
|
-
output:
|
5
|
-
- "mi mu."
|
6
|
-
- "mi wawa."
|
7
|
-
should_be_equal: true
|
8
|
-
- name: "dash"
|
9
|
-
input: "mi sona ala e ni- sina seme a"
|
10
|
-
output:
|
11
|
-
- "mi sona ala e ni-"
|
12
|
-
- "sina seme a"
|
13
|
-
should_be_equal: true
|
14
|
-
- name: "quotes"
|
15
|
-
input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
|
16
|
-
output:
|
17
|
-
- "toki li tan kulupu Kuko li ni:"
|
18
|
-
- "'o ike ala!'"
|
19
|
-
should_be_equal: true
|
20
|
-
xfail: true
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|