sonatoki 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.3.0 → sonatoki-0.3.1}/PKG-INFO +28 -17
- {sonatoki-0.3.0 → sonatoki-0.3.1}/README.md +27 -16
- {sonatoki-0.3.0 → sonatoki-0.3.1}/pyproject.toml +1 -1
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Configs.py +17 -7
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Filters.py +70 -19
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/constants.py +25 -14
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/utils.py +14 -1
- sonatoki-0.3.1/tests/test_filters.py +263 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_ilo.py +5 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_scorers.py +4 -2
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_utils.py +2 -2
- sonatoki-0.3.0/tests/test_filters.py +0 -127
- {sonatoki-0.3.0 → sonatoki-0.3.1}/LICENSE +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/__init__.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_cleaners.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_tokenize.py +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.3.0 → sonatoki-0.3.1}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
{sonatoki-0.3.0 → sonatoki-0.3.1}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.3.0
+Version: 0.3.1
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -12,15 +12,22 @@ Description-Content-Type: text/markdown
 
 # sona toki
 
+<div align="center">
+
+
+[](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -53,12 +60,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
```
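As a sanity check, the reworked snippet completes into the following program; a minimal sketch only, since the `ilo.is_toki_pona(...)` entry point and its boolean result come from the README's Quick Start, which is outside this diff's hunks.

```python
# Sketch: exercising the 0.3.1 Quick Start configuration end to end.
# `is_toki_pona` returning a bool is an assumption from the README's
# unchanged Quick Start section, not shown in this diff.
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
from sonatoki.Scorers import SoftPassFail


def main():
    config = deepcopy(BaseConfig)
    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
    config["scorer"] = SoftPassFail

    ilo = Ilo(**config)
    print(ilo.is_toki_pona("mi olin e sina"))  # expected: True


if __name__ == "__main__":
    main()
```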
{sonatoki-0.3.0 → sonatoki-0.3.1}/PKG-INFO (continued)

```diff
@@ -88,24 +95,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly caused by applying Latin alphabet filters to non-Latin text. Working on it!
```
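The new `prep_dictionary` bullet is the easiest of these to trip over, so here is a small sketch of the `MemberFilter`/`prep_dictionary` interaction; `MyNimi` and its word list are hypothetical, and "manna" is the FAQ's own duplicate-collapsing example.

```python
# Hypothetical custom dictionary filter, following the MemberFilter
# subclassing pattern shown in Filters.py below.
from sonatoki.Filters import MemberFilter
from sonatoki.utils import prep_dictionary


class MyNimi(MemberFilter):
    # Without prep_dictionary, "Manna" would be stored with its capital and
    # doubled letters, so no cleaned token could ever match it.
    tokens = prep_dictionary({"Manna", "mumumu"})  # stored as {"mana", "mumumu"}


assert MyNimi.filter("mana")
assert not MyNimi.filter("Manna")  # the raw, uncleaned form is not in the set
```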
{sonatoki-0.3.0 → sonatoki-0.3.1}/README.md

```diff
@@ -1,14 +1,21 @@
 # sona toki
 
+<div align="center">
+
+
+[](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -41,12 +48,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
@@ -76,24 +83,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly caused by applying Latin alphabet filters to non-Latin text. Working on it!
```
{sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Configs.py

```diff
@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import List, Type,
+from typing import List, Type, TypedDict
 
 # LOCAL
 from sonatoki.Filters import (
@@ -9,16 +9,19 @@ from sonatoki.Filters import (
     Numeric,
     OrFilter,
     Syllabic,
-    NimiLinku,
-    NimiPuAle,
     NimiUCSUR,
     Alphabetic,
     ProperName,
     Phonotactic,
     Punctuation,
-
+    NimiLinkuCore,
+    NimiPuSynonyms,
+    OrMemberFilter,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
     NimiLinkuSandbox,
     EnglishIgnorables,
+    NimiLinkuUncommon,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
@@ -63,7 +66,7 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
         Syllabic,
         ProperName,
         Alphabetic,
@@ -78,7 +81,14 @@ CorpusConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
-
+        OrMemberFilter(
+            NimiLinkuCore,
+            NimiLinkuCommon,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+            NimiUCSUR,
+        ),
         Syllabic,
         ProperName,
         Alphabetic,
@@ -104,7 +114,7 @@ DiscordConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
     "scoring_filters": [
-
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
         Syllabic,
         ProperName,
         Alphabetic,
```
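The visible effect of these config changes: each config now carries one combined dictionary filter built from several Linku word sets, rather than one scoring filter per set. A minimal sketch, assuming the 0.3.1 `OrMemberFilter` shown in Filters.py below:

```python
from sonatoki.Filters import NimiLinkuCommon, NimiLinkuCore, NimiUCSUR, OrMemberFilter

# The same combination PrefConfig and DiscordConfig use above.
Combined = OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR)

assert Combined.filter("toki")  # a core word passes the combined filter
# The combined token set is the union of the constituent sets.
assert Combined.tokens >= NimiLinkuCore.tokens | NimiLinkuCommon.tokens
```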
{sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/Filters.py

```diff
@@ -9,6 +9,7 @@ import regex
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
@@ -17,13 +18,17 @@ from sonatoki.constants import (
     ALLOWABLES,
     CONSONANTS,
     IGNORABLES,
-    NIMI_LINKU,
     NIMI_UCSUR,
-
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
     UCSUR_PUNCT_RANGES,
+    NIMI_LINKU_UNCOMMON,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -105,27 +110,43 @@ class ProperName(Filter):
 
 
 class NimiPu(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_PU)
 
 
-class
-    tokens =
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
 
 
-class
-    tokens =
+class NimiKuSuli(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_SULI)
 
 
-class
-    tokens =
+class NimiKuLili(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_LILI)
+
+
+class NimiLinkuCore(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_CORE)
+
+
+class NimiLinkuCommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_COMMON)
+
+
+class NimiLinkuUncommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
+
+
+class NimiLinkuObscure(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
 
 
 class NimiLinkuSandbox(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
 
 
 class NimiUCSUR(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_UCSUR)
 
 
 class Phonotactic(RegexFilter):
@@ -224,11 +245,10 @@ class OrFilter:
     Instead, the user is responsible for building an OrFilter out of their desired filters.
     """
 
-
-
-        raise ValueError("Must provide at least two Filters to OrFilter.")
+    @staticmethod
+    def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
 
-        class
+        class CombinedFilter(Filter):
             filters: List[Type[Filter]] = list(filters_) # TODO: tuple better?
 
             @classmethod
@@ -240,7 +260,39 @@ class OrFilter:
                         return True
                 return False
 
-        return
+        return CombinedFilter
+
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        subset_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(subset_filters) >= 2:
+            raise Warning(
+                "Prefer OrMemberFilter for combining two or more MemberFilters."
+            )
+
+        filter = cls.__generic_filter(*filters)
+
+        return filter
+
+
+class OrMemberFilter:
+    @staticmethod
+    def __subset_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+        all_token_sets: List[Set[str]] = [f.tokens for f in filters]
+        all_tokens: Set[str] = set().union(*all_token_sets)
+
+        class CombinedFilter(MemberFilter):
+            tokens = all_tokens
+
+        return CombinedFilter
+
+    def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
+        filter = cls.__subset_filter(*filters_)
+        return filter
 
 
 class AndFilter(Filter):
@@ -271,11 +323,10 @@ __all__ = [
     "Alphabetic",
     "AndFilter",
     "EnglishIgnorables",
-    "
-    "NimiLinkuAle",
+    "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
-    "
+    "NimiPuSynonyms",
     "NimiUCSUR",
     "Numeric",
     "OrFilter",
```
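Both combinators rely on the same pattern: `__new__` on the factory class returns a freshly built `Filter` subclass rather than an instance, so the result sits in a config list like any hand-written filter. A short sketch of the guard rails above, assuming the raised `Warning` propagates like any other exception:

```python
from sonatoki.Filters import NimiPu, NimiPuSynonyms, Numeric, OrFilter, Punctuation

PunctOrNum = OrFilter(Punctuation, Numeric)  # a new Filter subclass, not an instance
assert PunctOrNum.filter("..!") and PunctOrNum.filter("123")

try:
    OrFilter(Punctuation)  # fewer than two filters is rejected
except ValueError:
    pass  # "Provide at least two Filters to OrFilter."

try:
    OrFilter(NimiPu, NimiPuSynonyms)  # two MemberFilters are steered away
except Warning:
    pass  # "Prefer OrMemberFilter for combining two or more MemberFilters."
```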
{sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/constants.py

```diff
@@ -421,24 +421,31 @@ UCSUR_RANGES = [
 ]
 NIMI_UCSUR = find_unicode_chars(UCSUR_RANGES)
 
+
+# NIMI_PU_UCSUR_RANGES = ["\\U000F1900-\\U000F1977"]
+# NIMI_PU_ALE_UCSUR_RANGES = NIMI_PU_UCSUR_RANGES + ["\\U000F1978-\\U000F197A"]
+
+
+def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
+    return [d["word"] for d in data.values() if d[key] == value]
+
+
 with open(LINKU) as f:
     linku: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_PU: List[str] =
+    NIMI_PU: List[str] = category_helper(linku, "book", "pu")
     NIMI_PU_SYNONYMS: List[str] = ["namako", "kin", "oko"]
-
-
-
-
-
-
-
-
+
+    NIMI_KU_SULI = category_helper(linku, "book", "ku suli")
+    NIMI_KU_LILI = category_helper(linku, "book", "ku lili")
+
+    NIMI_LINKU_CORE = category_helper(linku, "usage_category", "core")
+    NIMI_LINKU_COMMON = category_helper(linku, "usage_category", "common")
+    NIMI_LINKU_UNCOMMON = category_helper(linku, "usage_category", "uncommon")
+    NIMI_LINKU_OBSCURE = category_helper(linku, "usage_category", "obscure")
 
 with open(SANDBOX) as f:
     sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
-    NIMI_LINKU_SANDBOX: List[str] =
-        d["word"] for d in sandbox.values()
-    ]
+    NIMI_LINKU_SANDBOX: List[str] = [d["word"] for d in sandbox.values()]
 
 del linku
 del sandbox
@@ -449,9 +456,13 @@ __all__ = [
     "ALL_PUNCT_RANGES",
     "ALPHABET",
     "CONSONANTS",
-    "
-    "
+    "NIMI_KU_LILI",
+    "NIMI_KU_SULI",
+    "NIMI_LINKU_COMMON",
+    "NIMI_LINKU_CORE",
+    "NIMI_LINKU_OBSCURE",
     "NIMI_LINKU_SANDBOX",
+    "NIMI_LINKU_UNCOMMON",
     "NIMI_PU",
     "NIMI_PU_SYNONYMS",
     "POSIX_PUNCT",
```
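`category_helper` is a plain projection over the Linku data; here is a toy run against made-up rows shaped like `linku.json` entries (the real rows carry more fields):

```python
from typing import Dict, List


def category_helper(data: Dict[str, Dict[str, str]], key: str, value: str) -> List[str]:
    # Collect the "word" field of every entry whose `key` field equals `value`.
    return [d["word"] for d in data.values() if d[key] == value]


# Illustrative rows only, not the real linku.json contents.
toy: Dict[str, Dict[str, str]] = {
    "toki": {"word": "toki", "book": "pu", "usage_category": "core"},
    "kipisi": {"word": "kipisi", "book": "ku suli", "usage_category": "common"},
}

assert category_helper(toy, "book", "pu") == ["toki"]
assert category_helper(toy, "usage_category", "common") == ["kipisi"]
```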
{sonatoki-0.3.0 → sonatoki-0.3.1}/src/sonatoki/utils.py

```diff
@@ -1,10 +1,23 @@
 # STL
 import re
-from typing import List
+from typing import Set, List, Iterable
+
+# LOCAL
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
 
 TO_ESCAPE = ["\\", "^", "[", "]", "-"]
 
 
+def prep_dictionary(words: Iterable[str]) -> Set[str]:
+    out: Set[str] = set()
+    cleaners = [Lowercase, ConsecutiveDuplicates]
+    for word in words:
+        for c in cleaners:
+            word = c.clean(word)
+        out.add(word)
+    return out
+
+
 def regex_escape(s: str) -> str:
     """Escape all characters which must be escaped when embedded in a character class."""
     for c in TO_ESCAPE:
```
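The point of `prep_dictionary`: dictionary entries are normalized by the `Lowercase` and `ConsecutiveDuplicates` cleaners up front, so entries like the sandbox's "kalamARR" and "Pingo" (called out in the new tests below) are stored in the form a cleaned token would take. A quick check of the function as written:

```python
from sonatoki.utils import prep_dictionary

# Lowercase runs first, then ConsecutiveDuplicates collapses repeated letters.
assert prep_dictionary(["kalamARR", "Pingo", "manna"]) == {"kalamar", "pingo", "mana"}
```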
sonatoki-0.3.1/tests/test_filters.py (new file)

```diff
@@ -0,0 +1,263 @@
+# STL
+import string
+
+# PDM
+import hypothesis.strategies as st
+from hypothesis import given, example
+
+# LOCAL
+from sonatoki.Filters import (
+    NimiPu,
+    Numeric,
+    OrFilter,
+    Syllabic,
+    Alphabetic,
+    NimiKuLili,
+    NimiKuSuli,
+    ProperName,
+    Phonotactic,
+    Punctuation,
+    AlphabeticRe,
+    NimiLinkuCore,
+    PunctuationRe,
+    NimiPuSynonyms,
+    OrMemberFilter,
+    PunctuationRe1,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuSandbox,
+    NimiLinkuUncommon,
+)
+from sonatoki.Cleaners import Lowercase, ConsecutiveDuplicates
+from sonatoki.constants import (
+    NIMI_PU,
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
+    NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
+    NIMI_LINKU_SANDBOX,
+    NIMI_LINKU_UNCOMMON,
+)
+
+# FILESYSTEM
+from .test_utils import PROPER_NAME_RE
+
+
+@given(st.sampled_from(NIMI_PU))
+@example("lukin")
+@example("selo")
+@example("li")
+def test_NimiPu(s: str):
+    res = NimiPu.filter(s)
+    assert res, repr(s)
+
+
+@given(st.sampled_from(NIMI_LINKU_CORE))
+@example("pona")
+def test_NimiLinkuCore(s: str):
+    res = NimiLinkuCore.filter(s)
+    assert res, repr(s)
+
+
+@given(st.sampled_from(NIMI_LINKU_COMMON))
+@example("n")
+@example("tonsi")
+@example("kipisi")
+def test_NimiLinkuCommon(s: str):
+    res = NimiLinkuCommon.filter(s)
+    assert res, repr(s)
+
+
+@given(st.sampled_from(NIMI_LINKU_UNCOMMON))
+def test_NimiLinkuUncommon(s: str):
+    res = NimiLinkuUncommon.filter(s)
+    assert res, repr(s)
+
+
+@given(st.sampled_from(NIMI_LINKU_OBSCURE))
+def test_NimiLinkuObscure(s: str):
+    res = NimiLinkuObscure.filter(s)
+    assert res, repr(s)
+
+
+@given(st.sampled_from(NIMI_LINKU_SANDBOX))
+@example("kalamARR")
+@example("Pingo")
+def test_NimiLinkuSandbox(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
+    # above two are necessary due to kalamARR and Pingo
+    res = NimiLinkuSandbox.filter(s)
+    assert res, repr(s)
+
+
+@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
+@example("kijetesantakalu")
+@example("n")
+def test_Phonotactic(s: str):
+    res = Phonotactic.filter(s)
+    assert res, repr(s)
+
+
+@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
+@example("wuwojitiwunwonjintinmanna")
+def test_Syllabic(s: str):
+    res = Syllabic.filter(s)
+    assert res, repr(s)
+
+
+@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+@example("muems")
+@example("mpptp")
+@example("tptpt")
+def test_Alphabetic(s: str):
+    res_fn = Alphabetic.filter(s)
+    res_re = AlphabeticRe.filter(s)
+    assert res_fn == res_re, repr(s)
+
+
+@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
+def test_AlphabeticRe(s: str):
+    res_re = AlphabeticRe.filter(s)
+    assert res_re, repr(s)
+
+
+@given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
+def test_ProperName(s: str):
+    res = ProperName.filter(s)
+    assert res, repr(s)
+
+
+@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+@example("[]")
+@example(r"\\")
+@example(r"\"")
+@example("⟨·⟩")
+@example("…")
+@example("「」")
+@example(string.punctuation)
+def test_PunctuationRe1(s: str):
+    res = PunctuationRe1.filter(s)
+    assert res, repr(s)
+
+
+@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+def test_PunctuationRe(s: str):
+    res_re = PunctuationRe.filter(s)
+    res_re1 = PunctuationRe1.filter(s)
+    assert res_re == res_re1, repr(s)
+
+
+@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
+@example("\U000f1990") # UCSUR char
+def test_Punctuation(s: str):
+    res_fn = Punctuation.filter(s)
+    res_re1 = PunctuationRe1.filter(s)
+    assert res_fn == res_re1, repr(s)
+
+
+@given(st.from_regex(r"\d+", fullmatch=True))
+@example("124125")
+@example("99990000")
+def test_Numeric(s: str):
+    res = Numeric.filter(s)
+    assert res, repr(s)
+
+
+@given(
+    st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True)
+    | st.from_regex(r"\d+", fullmatch=True),
+)
+def test_OrFilter(s: str):
+    filter = OrFilter(Punctuation, Numeric)
+    res = filter.filter(s)
+    res_punctuation = Punctuation.filter(s)
+    res_numeric = Numeric.filter(s)
+    assert res and (res_punctuation or res_numeric)
+
+
+# NOTE: No subset filter test because A | B is not the same as A combined with B.
+# e.g. "apple" passes Alphabetic, "..." passes Punctuation, "apple..." passes neither
+# but would incorrectly pass a combined filter.
+@given(st.sampled_from(NIMI_PU + NIMI_LINKU_OBSCURE))
+def test_OrMemberFilter(s: str):
+    filter = OrMemberFilter(NimiPu, NimiLinkuObscure)
+    res = filter.filter(s)
+    res_pu = NimiPu.filter(s)
+    res_obscure = NimiLinkuObscure.filter(s)
+    assert res and (res_pu or res_obscure)
+
+
+@given(
+    st.sampled_from(
+        NIMI_KU_SULI
+        + NIMI_KU_LILI
+        + NIMI_LINKU_UNCOMMON
+        + NIMI_LINKU_OBSCURE
+        + NIMI_LINKU_SANDBOX,
+    )
+)
+def test_OrMemberFilter_IsipinEpiku(s: str):
+    filter = OrMemberFilter(
+        NimiKuSuli, NimiKuLili, NimiLinkuUncommon, NimiLinkuObscure, NimiLinkuSandbox
+    )
+
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
+
+    res = filter.filter(s)
+    res_ku_suli = NimiKuSuli.filter(s)
+    res_ku_lili = NimiKuLili.filter(s)
+    res_uncommon = NimiLinkuUncommon.filter(s)
+    res_obscure = NimiLinkuObscure.filter(s)
+    res_sandbox = NimiLinkuSandbox.filter(s)
+    assert res and (
+        res_ku_suli or res_ku_lili or res_uncommon or res_obscure or res_sandbox
+    )
+
+
+@given(st.sampled_from(NIMI_PU + NIMI_PU_SYNONYMS))
+def test_pu_filters_non_overlap(s: str):
+    res_pu = NimiPu.filter(s)
+    res_synonyms = NimiPuSynonyms.filter(s)
+    assert (res_pu + res_synonyms) == 1
+
+
+@given(st.sampled_from(NIMI_KU_SULI + NIMI_KU_LILI))
+def test_ku_filters_non_overlap(s: str):
+    res_ku_suli = NimiKuSuli.filter(s)
+    res_ku_lili = NimiKuLili.filter(s)
+    assert (res_ku_suli + res_ku_lili) == 1
+
+
+@given(
+    st.sampled_from(
+        NIMI_LINKU_CORE
+        + NIMI_LINKU_COMMON
+        + NIMI_LINKU_UNCOMMON
+        + NIMI_LINKU_OBSCURE
+        + NIMI_LINKU_SANDBOX
+    )
+)
+def test_linku_filters_non_overlap(s: str):
+    s = Lowercase.clean(s)
+    s = ConsecutiveDuplicates.clean(s)
+
+    res_core = NimiLinkuCore.filter(s)
+    res_common = NimiLinkuCommon.filter(s)
+    res_uncommon = NimiLinkuUncommon.filter(s)
+    res_obscure = NimiLinkuObscure.filter(s)
+    res_sandbox = NimiLinkuSandbox.filter(s)
+
+    assert (res_core + res_common + res_uncommon + res_obscure + res_sandbox) == 1
+
+
+@given(st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON + NIMI_LINKU_UNCOMMON))
+def test_nimi_linku_properties(s: str):
+    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
+    assert Alphabetic.filter(s), repr(s)
+    assert Syllabic.filter(s), repr(s)
+    assert Phonotactic.filter(s), repr(s)
+    # Passing phonotactic implies all of the above
```
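The NOTE above `test_OrMemberFilter` deserves emphasis: OR-ing two regex-backed filters is not the same as merging their character classes. A self-contained sketch with stand-in patterns (not sonatoki's real ones):

```python
import re

alpha = re.compile(r"[a-z]+")      # stand-in for Alphabetic
punct = re.compile(r"[.!?]+")      # stand-in for Punctuation
merged = re.compile(r"[a-z.!?]+")  # naive merge of the two character classes

s = "apple..."
assert not alpha.fullmatch(s)  # letters then punctuation: fails
assert not punct.fullmatch(s)  # not purely punctuation: fails
assert merged.fullmatch(s)     # the merged class wrongly accepts it
```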
{sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_ilo.py

```diff
@@ -75,6 +75,8 @@ NAME_MATCHES = [
     "ilo Google li sona ala e nimi Emoticon la mi wile utala e ona",
     "toki Kanse li lon",
     "toki Lojban li nasa e lawa mi",
+    "ilo Firefox",
+    "mi musi Space Station 13",
 ]
 
 SOME_INVALID = [
@@ -88,6 +90,8 @@ SOME_INVALID = [
 CORPUS_SPECIFIC = [
     "ki le konsi si te isipin epiku le pasila to",
     'jasima omekapo, ki nimisin "jasima enko nimisin". ki enko alu linluwi Jutu alu epiku ki epiku baba is you. ki likujo "SINtelen pona", ki epiku alu "sitelen pona". ki kepen wawajete isipin, kin ki yupekosi alu lipamanka alu wawajete, kin ki enko isipin lipamanka linluwi alu wawajete',
+    "kalamARRRR",
+    "Pingo",
 ]
 
 
@@ -153,6 +157,7 @@ FALSE_NEGATIVES = [
     # emoticon should not be a problem
     "lete li ike x.x",
     # a token that is one edit off a known word should be allowed
+    "mi pnoa",
     "tok",
     "mut",
     "poan",
```
{sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_scorers.py

```diff
@@ -12,11 +12,12 @@ from sonatoki.Filters import (
     NimiPu,
     Numeric,
     Syllabic,
-    NimiLinku,
     Alphabetic,
     ProperName,
     Phonotactic,
+    NimiLinkuCore,
     PunctuationRe,
+    NimiLinkuCommon,
 )
 from sonatoki.Scorers import Scorer, Scaling, PassFail, SoftScaling, SoftPassFail
 
@@ -27,7 +28,8 @@ FILTERS = [
     NimiPu,
     Numeric,
     Syllabic,
-
+    NimiLinkuCore,
+    NimiLinkuCommon,
     Alphabetic,
     ProperName,
     Phonotactic,
```
{sonatoki-0.3.0 → sonatoki-0.3.1}/tests/test_utils.py

```diff
@@ -6,12 +6,12 @@ import hypothesis.strategies as st
 
 # LOCAL
 from sonatoki.Filters import Syllabic, Phonotactic, AlphabeticRe
-from sonatoki.constants import
+from sonatoki.constants import NIMI_LINKU_CORE, NIMI_LINKU_COMMON
 
 PROPER_NAME_RE = r"[A-Z][a-z]*"
 
 token_strategy = (
-    st.sampled_from(
+    st.sampled_from(NIMI_LINKU_CORE + NIMI_LINKU_COMMON)
     | st.from_regex(Phonotactic.pattern.pattern, fullmatch=True)
     | st.from_regex(Syllabic.pattern.pattern, fullmatch=True)
    | st.from_regex(PROPER_NAME_RE, fullmatch=True)
```
sonatoki-0.3.0/tests/test_filters.py (removed)

```diff
@@ -1,127 +0,0 @@
-# STL
-import string
-
-# PDM
-import hypothesis.strategies as st
-from hypothesis import given, example
-
-# LOCAL
-from sonatoki.Filters import (
-    NimiPu,
-    Numeric,
-    Syllabic,
-    NimiLinku,
-    Alphabetic,
-    ProperName,
-    Phonotactic,
-    Punctuation,
-    AlphabeticRe,
-    PunctuationRe,
-    PunctuationRe1,
-)
-from sonatoki.Cleaners import ConsecutiveDuplicates
-from sonatoki.constants import NIMI_PU, NIMI_LINKU
-
-# FILESYSTEM
-from .test_utils import PROPER_NAME_RE
-
-
-@given(st.sampled_from(NIMI_PU))
-@example("lukin")
-@example("selo")
-@example("li")
-def test_NimiPu(s: str):
-    res = NimiPu.filter(s)
-    assert res, repr(s)
-
-
-@given(st.sampled_from(NIMI_LINKU))
-@example("pona")
-@example("tonsi")
-@example("kipisi")
-@example("n")
-def test_NimiLinku(s: str):
-    res = NimiLinku.filter(s)
-    assert res, repr(s)
-
-
-@given(st.sampled_from(NIMI_LINKU))
-def test_nimi_linku_properties(s: str):
-    assert ConsecutiveDuplicates.clean(s) == s, repr(s)
-    assert Alphabetic.filter(s), repr(s)
-    assert Syllabic.filter(s), repr(s)
-    assert Phonotactic.filter(s), repr(s)
-    # Passing phonotactic implies all of the above
-
-
-@given(st.from_regex(Phonotactic.pattern.pattern, fullmatch=True))
-@example("kijetesantakalu")
-@example("n")
-def test_Phonotactic(s: str):
-    res = Phonotactic.filter(s)
-    assert res, repr(s)
-
-
-@given(st.from_regex(Syllabic.pattern.pattern, fullmatch=True))
-@example("wuwojitiwunwonjintinmanna")
-def test_Syllabic(s: str):
-    res = Syllabic.filter(s)
-    assert res, repr(s)
-
-
-@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
-@example("muems")
-@example("mpptp")
-@example("tptpt")
-def test_Alphabetic(s: str):
-    res_fn = Alphabetic.filter(s)
-    res_re = AlphabeticRe.filter(s)
-    assert res_fn == res_re, repr(s)
-
-
-@given(st.from_regex(AlphabeticRe.pattern.pattern, fullmatch=True))
-def test_AlphabeticRe(s: str):
-    res_re = AlphabeticRe.filter(s)
-    assert res_re, repr(s)
-
-
-@given(st.from_regex(PROPER_NAME_RE, fullmatch=True))
-def test_ProperName(s: str):
-    res = ProperName.filter(s)
-    assert res, repr(s)
-
-
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
-@example("[]")
-@example(r"\\")
-@example(r"\"")
-@example("⟨·⟩")
-@example("…")
-@example("「」")
-@example(string.punctuation)
-def test_PunctuationRe1(s: str):
-    res = PunctuationRe1.filter(s)
-    assert res, repr(s)
-
-
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
-def test_PunctuationRe(s: str):
-    res_re = PunctuationRe.filter(s)
-    res_re1 = PunctuationRe1.filter(s)
-    assert res_re == res_re1, repr(s)
-
-
-@given(st.from_regex(PunctuationRe.pattern.pattern, fullmatch=True))
-@example("\U000f1990") # UCSUR char
-def test_Punctuation(s: str):
-    res_fn = Punctuation.filter(s)
-    res_re1 = PunctuationRe1.filter(s)
-    assert res_fn == res_re1, repr(s)
-
-
-@given(st.from_regex(r"\d+", fullmatch=True))
-@example("124125")
-@example("99990000")
-def test_Numeric(s: str):
-    res = Numeric.filter(s)
-    assert res, repr(s)
```
|