sonatoki 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.1.3 → sonatoki-0.1.4}/PKG-INFO +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.4}/pyproject.toml +1 -1
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Configs.py +3 -3
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Filters.py +4 -4
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Preprocessors.py +35 -4
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_filters.py +6 -6
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_preprocessors.py +40 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_scorers.py +8 -6
- {sonatoki-0.1.3 → sonatoki-0.1.4}/LICENSE +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/README.md +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/constants.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/__init__.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_cleaners.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_ilo.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_tokenize.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/test_utils.py +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_sentences.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_words.yml +0 -0
- {sonatoki-0.1.3 → sonatoki-0.1.4}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -9,15 +9,15 @@ from typing_extensions import NotRequired
|
|
9
9
|
from sonatoki.Filters import (
|
10
10
|
Filter,
|
11
11
|
NimiPu,
|
12
|
-
|
12
|
+
Numeric,
|
13
13
|
Syllabic,
|
14
14
|
NimiLinku,
|
15
15
|
NimiPuAle,
|
16
16
|
Alphabetic,
|
17
17
|
ProperName,
|
18
18
|
Phonotactic,
|
19
|
+
Punctuation,
|
19
20
|
NimiLinkuAle,
|
20
|
-
Punctuations,
|
21
21
|
)
|
22
22
|
from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
|
23
23
|
from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
|
@@ -45,7 +45,7 @@ class IloConfig(TypedDict):
|
|
45
45
|
BaseConfig: IloConfig = {
|
46
46
|
"preprocessors": [URLs],
|
47
47
|
"cleaners": [ConsecutiveDuplicates],
|
48
|
-
"ignoring_filters": [
|
48
|
+
"ignoring_filters": [Numeric, Punctuation],
|
49
49
|
"scoring_filters": [],
|
50
50
|
"scorer": PassFail,
|
51
51
|
"passing_score": 0.8,
|
@@ -131,7 +131,7 @@ class Alphabetic(Filter):
|
|
131
131
|
return set(token.lower()).issubset(ALPHABET_SET)
|
132
132
|
|
133
133
|
|
134
|
-
class Numerics(Filter):
|
134
|
+
class Numeric(Filter):
|
135
135
|
"""Determine if a given token is entirely numeric.
|
136
136
|
Covers all numeric symbols in Unicode.
|
137
137
|
|
@@ -147,7 +147,7 @@ class Numerics(Filter):
|
|
147
147
|
return msg.isnumeric()
|
148
148
|
|
149
149
|
|
150
|
-
class Punctuations(RegexFilter):
|
150
|
+
class Punctuation(RegexFilter):
|
151
151
|
pattern = re.compile(r"[\p{Punctuation}\p{posix_punct}]+")
|
152
152
|
|
153
153
|
|
@@ -159,6 +159,6 @@ __all__ = [
|
|
159
159
|
"Syllabic",
|
160
160
|
"Alphabetic",
|
161
161
|
"ProperName",
|
162
|
-
"
|
163
|
-
"
|
162
|
+
"Punctuation",
|
163
|
+
"Numeric",
|
164
164
|
]
|
@@ -62,6 +62,13 @@ class URLs(RegexPreprocessor):
|
|
62
62
|
pattern = re.compile(r"https?:\/\/\S+")
|
63
63
|
|
64
64
|
|
65
|
+
class Reference(RegexPreprocessor):
|
66
|
+
"""Remove text contained in double brackets.
|
67
|
+
Often used to fetch articles on Wikipedia, or Magic the Gathering cards."""
|
68
|
+
|
69
|
+
pattern = re.compile(r"\[\[.+\]\]")
|
70
|
+
|
71
|
+
|
65
72
|
class DiscordEmotes(RegexPreprocessor):
|
66
73
|
"""Remove text-formatted Discord emotes `<flags:name:id>`"""
|
67
74
|
|
@@ -80,6 +87,13 @@ class DiscordSpecial(RegexPreprocessor):
|
|
80
87
|
pattern = re.compile(r"<id:[a-zA-Z0-9_]{4,}>")
|
81
88
|
|
82
89
|
|
90
|
+
class AngleBracketObject(RegexPreprocessor):
|
91
|
+
"""A generalized version of the Discord-specific angle bracket objects.
|
92
|
+
Removes any contiguous (not broken by whitespace) text in angle brackets."""
|
93
|
+
|
94
|
+
pattern = re.compile(r"<[^<>\s]+>")
|
95
|
+
|
96
|
+
|
83
97
|
"""
|
84
98
|
The following classes are Containers.
|
85
99
|
|
@@ -92,23 +106,23 @@ would likely be using a language other than Toki Pona.
|
|
92
106
|
|
93
107
|
|
94
108
|
class SingleQuotes(RegexPreprocessor):
|
95
|
-
pattern = re.compile(r"'[^']+'", flags=re.
|
109
|
+
pattern = re.compile(r"'[^']+'", flags=re.DOTALL)
|
96
110
|
|
97
111
|
|
98
112
|
class DoubleQuotes(RegexPreprocessor):
|
99
|
-
pattern = re.compile(r'"[^"]+"', flags=re.
|
113
|
+
pattern = re.compile(r'"[^"]+"', flags=re.DOTALL)
|
100
114
|
|
101
115
|
|
102
116
|
class Backticks(RegexPreprocessor):
|
103
117
|
"""Remove paired backticks and their contents `like this`"""
|
104
118
|
|
105
|
-
pattern = re.compile(r"`[^`]+`", flags=re.
|
119
|
+
pattern = re.compile(r"`[^`]+`", flags=re.DOTALL)
|
106
120
|
|
107
121
|
|
108
122
|
class Spoilers(RegexPreprocessor):
|
109
123
|
"""Remove paired double bars and their contents `||like this||`"""
|
110
124
|
|
111
|
-
pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.
|
125
|
+
pattern = re.compile(r"\|\|(?:(?!\|\|).)+\|\|", flags=re.DOTALL)
|
112
126
|
|
113
127
|
|
114
128
|
class ArrowQuote(RegexPreprocessor):
|
@@ -117,7 +131,22 @@ class ArrowQuote(RegexPreprocessor):
|
|
117
131
|
pattern = re.compile(r"^>\ .+$", re.MULTILINE)
|
118
132
|
|
119
133
|
|
134
|
+
class AllQuotes(RegexPreprocessor):
|
135
|
+
pattern = re.compile(
|
136
|
+
"|".join(
|
137
|
+
[
|
138
|
+
SingleQuotes.pattern.pattern,
|
139
|
+
DoubleQuotes.pattern.pattern,
|
140
|
+
Backticks.pattern.pattern,
|
141
|
+
ArrowQuote.pattern.pattern,
|
142
|
+
]
|
143
|
+
),
|
144
|
+
flags=re.MULTILINE | re.DOTALL,
|
145
|
+
)
|
146
|
+
|
147
|
+
|
120
148
|
__all__ = [
|
149
|
+
"AngleBracketObject",
|
121
150
|
"DiscordChannels",
|
122
151
|
"DiscordMentions",
|
123
152
|
"DiscordSpecial",
|
@@ -125,7 +154,9 @@ __all__ = [
|
|
125
154
|
"SingleQuotes",
|
126
155
|
"DoubleQuotes",
|
127
156
|
"ArrowQuote",
|
157
|
+
"AllQuotes",
|
128
158
|
"Backticks",
|
159
|
+
"Reference",
|
129
160
|
"Spoilers",
|
130
161
|
"URLs",
|
131
162
|
]
|
@@ -9,13 +9,13 @@ from hypothesis import HealthCheck, given, assume, example, settings
|
|
9
9
|
# LOCAL
|
10
10
|
from sonatoki.Filters import (
|
11
11
|
NimiPu,
|
12
|
-
|
12
|
+
Numeric,
|
13
13
|
Syllabic,
|
14
14
|
NimiLinku,
|
15
15
|
Alphabetic,
|
16
16
|
ProperName,
|
17
17
|
Phonotactic,
|
18
|
-
|
18
|
+
Punctuation,
|
19
19
|
)
|
20
20
|
from sonatoki.Cleaners import ConsecutiveDuplicates
|
21
21
|
from sonatoki.constants import NIMI_PU, NIMI_LINKU
|
@@ -90,9 +90,9 @@ def test_ProperName(s: str):
|
|
90
90
|
@example("「 」")
|
91
91
|
@example(string.punctuation)
|
92
92
|
@settings(suppress_health_check=[HealthCheck.filter_too_much]) # FIXME
|
93
|
-
def test_Punctuations(s: str):
|
94
|
-
_ = assume(re.fullmatch(Punctuations.pattern.pattern, s))
|
95
|
-
res = Punctuations.filter(s)
|
93
|
+
def test_Punctuation(s: str):
|
94
|
+
_ = assume(re.fullmatch(Punctuation.pattern.pattern, s))
|
95
|
+
res = Punctuation.filter(s)
|
96
96
|
assert res, repr(s)
|
97
97
|
|
98
98
|
|
@@ -100,5 +100,5 @@ def test_Punctuations(s: str):
|
|
100
100
|
@example("124125")
|
101
101
|
@example("99990000")
|
102
102
|
def test_Numeric(s: str):
|
103
|
-
res = Numerics.filter(s)
|
103
|
+
res = Numeric.filter(s)
|
104
104
|
assert res, repr(s)
|
@@ -6,7 +6,9 @@ from hypothesis import given, example
|
|
6
6
|
from sonatoki.Preprocessors import (
|
7
7
|
URLs,
|
8
8
|
Spoilers,
|
9
|
+
AllQuotes,
|
9
10
|
Backticks,
|
11
|
+
Reference,
|
10
12
|
ArrowQuote,
|
11
13
|
DoubleQuotes,
|
12
14
|
SingleQuotes,
|
@@ -14,6 +16,7 @@ from sonatoki.Preprocessors import (
|
|
14
16
|
DiscordSpecial,
|
15
17
|
DiscordChannels,
|
16
18
|
DiscordMentions,
|
19
|
+
AngleBracketObject,
|
17
20
|
)
|
18
21
|
|
19
22
|
|
@@ -101,3 +104,40 @@ def test_DiscordChannels(s: str):
|
|
101
104
|
def test_DiscordSpecial(s: str):
|
102
105
|
res = DiscordSpecial.process(s).strip()
|
103
106
|
assert res == "", (repr(s), repr(res))
|
107
|
+
|
108
|
+
|
109
|
+
@given(
|
110
|
+
st.from_regex(DiscordEmotes.pattern.pattern, fullmatch=True)
|
111
|
+
| st.from_regex(DiscordMentions.pattern.pattern, fullmatch=True)
|
112
|
+
| st.from_regex(DiscordChannels.pattern.pattern, fullmatch=True)
|
113
|
+
| st.from_regex(DiscordSpecial.pattern.pattern, fullmatch=True)
|
114
|
+
| st.from_regex(AngleBracketObject.pattern.pattern, fullmatch=True)
|
115
|
+
)
|
116
|
+
@example("<https://example.com>")
|
117
|
+
@example("<#123124125125>")
|
118
|
+
def test_AngleBracketObject(s: str):
|
119
|
+
res = AngleBracketObject.process(s).strip()
|
120
|
+
assert res == "", (repr(s), repr(res))
|
121
|
+
|
122
|
+
|
123
|
+
@given(
|
124
|
+
st.from_regex(SingleQuotes.pattern.pattern, fullmatch=True)
|
125
|
+
| st.from_regex(DoubleQuotes.pattern.pattern, fullmatch=True)
|
126
|
+
| st.from_regex(Backticks.pattern.pattern, fullmatch=True)
|
127
|
+
| st.from_regex(ArrowQuote.pattern.pattern, fullmatch=True)
|
128
|
+
| st.from_regex(AllQuotes.pattern.pattern, fullmatch=True)
|
129
|
+
)
|
130
|
+
@example("> bruh")
|
131
|
+
@example("`bruh`")
|
132
|
+
def test_AllQuotes(s: str):
|
133
|
+
res = AllQuotes.process(s).strip()
|
134
|
+
assert res == "", (repr(s), repr(res))
|
135
|
+
|
136
|
+
|
137
|
+
@given(st.from_regex(Reference.pattern.pattern, fullmatch=True))
|
138
|
+
@example("[[Brainstorm]]")
|
139
|
+
@example("[[Phatic Phrases]]")
|
140
|
+
@example("[[Yahoo!]]")
|
141
|
+
def test_Reference(s: str):
|
142
|
+
res = Reference.process(s).strip()
|
143
|
+
assert res == "", (repr(s), repr(res))
|
@@ -4,38 +4,39 @@ from typing import List, Type
|
|
4
4
|
# PDM
|
5
5
|
import pytest
|
6
6
|
import hypothesis.strategies as st
|
7
|
-
from hypothesis import given
|
7
|
+
from hypothesis import given, example
|
8
8
|
|
9
9
|
# LOCAL
|
10
10
|
from sonatoki.Filters import (
|
11
11
|
Filter,
|
12
12
|
NimiPu,
|
13
|
-
|
13
|
+
Numeric,
|
14
14
|
Syllabic,
|
15
15
|
NimiLinku,
|
16
16
|
Alphabetic,
|
17
17
|
ProperName,
|
18
18
|
Phonotactic,
|
19
|
-
|
19
|
+
Punctuation,
|
20
20
|
)
|
21
|
-
from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling
|
21
|
+
from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
|
22
22
|
|
23
23
|
# FILESYSTEM
|
24
24
|
from .test_utils import token_strategy
|
25
25
|
|
26
26
|
FILTERS = [
|
27
27
|
NimiPu,
|
28
|
-
|
28
|
+
Numeric,
|
29
29
|
Syllabic,
|
30
30
|
NimiLinku,
|
31
31
|
Alphabetic,
|
32
32
|
ProperName,
|
33
33
|
Phonotactic,
|
34
|
-
|
34
|
+
Punctuation,
|
35
35
|
]
|
36
36
|
|
37
37
|
SCORERS = [
|
38
38
|
PassFail,
|
39
|
+
SoftPassFail,
|
39
40
|
Scaling,
|
40
41
|
SoftScaling,
|
41
42
|
]
|
@@ -46,6 +47,7 @@ SCORERS = [
|
|
46
47
|
st.lists(st.sampled_from(FILTERS), min_size=1, unique=True),
|
47
48
|
st.lists(token_strategy, min_size=0, max_size=10),
|
48
49
|
)
|
50
|
+
@example(st.sampled_from(FILTERS), [])
|
49
51
|
def test_score_bounds(scorer: Scorer, filters: List[Type[Filter]], text: List[str]):
|
50
52
|
score = scorer.score(text, filters)
|
51
53
|
assert 0 <= score <= 1, (score, filters, text)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|