sonatoki 0.2.2__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.2.2 → sonatoki-0.3.1}/PKG-INFO +28 -17
- {sonatoki-0.2.2 → sonatoki-0.3.1}/README.md +27 -16
- {sonatoki-0.2.2 → sonatoki-0.3.1}/pyproject.toml +1 -1
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/Cleaners.py +7 -0
- sonatoki-0.3.1/src/sonatoki/Configs.py +139 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/Filters.py +150 -19
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/Tokenizers.py +33 -17
- sonatoki-0.3.1/src/sonatoki/constants.py +473 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/utils.py +40 -2
- sonatoki-0.3.1/tests/__init__.py +0 -0
- sonatoki-0.3.1/tests/test_filters.py +263 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_ilo.py +97 -35
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_scorers.py +4 -2
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_tokenize.py +28 -27
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_utils.py +2 -2
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/tokenize_cases/tokenize_words_tok.yml +44 -0
- sonatoki-0.2.2/src/sonatoki/Configs.py +0 -80
- sonatoki-0.2.2/src/sonatoki/constants.py +0 -72
- sonatoki-0.2.2/tests/test_filters.py +0 -126
- {sonatoki-0.2.2 → sonatoki-0.3.1}/LICENSE +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/linku.json +0 -0
- /sonatoki-0.2.2/tests/__init__.py → /sonatoki-0.3.1/src/sonatoki/py.typed +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_cleaners.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.2.2 → sonatoki-0.3.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
--- sonatoki-0.2.2/PKG-INFO
+++ sonatoki-0.3.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.2.2
+Version: 0.3.1
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
@@ -12,15 +12,22 @@ Description-Content-Type: text/markdown
 
 # sona toki
 
+<div align="center">
+
+
+[](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -53,12 +60,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
@@ -88,24 +95,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly casued by applying latin alphabet filters to non-latin text. Working on it!
--- sonatoki-0.2.2/README.md
+++ sonatoki-0.3.1/README.md
@@ -1,14 +1,21 @@
 # sona toki
 
+<div align="center">
+
+
+[](https://pypi.org/project/sonatoki)
+
+</div>
+
 ## What is **sona toki**?
 
-This library, "Language Knowledge," helps you identify whether a message is in Toki Pona.
+This library, "Language Knowledge," helps you identify whether a message is in Toki Pona. It does so by determining whether a large enough number of words in a statement are "in Toki Pona". No grammar checking, yet.
 
-I wrote
+I wrote this library with a variety of scraps and lessons learned from a prior project, [ilo pi toki pona taso, "toki-pona-only tool"](https://github.com/gregdan3/ilo-pi-toki-pona-taso). That tool now uses this library to great success!
 
-If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language
+If you've ever worked on a similar project, you know the question "is this message in [language]" is not a consistent one- the environment, time, preferences of the speaker, and much more, can all alter whether a given message is "in" any specific language. This complexity applies to Toki Pona too.
 
-
+So, this project "solves" that complex problem by offering an opinionated tokenizer and a configurable parser, allowing you to tune its output to your preferences and goals. [Even silly ones.](https://sona.pona.la/wiki/isipin_epiku).
 
 ## Quick Start
 
@@ -41,12 +48,12 @@ Or if you'd prefer to configure on your own:
 from copy import deepcopy
 from sonatoki.ilo import Ilo
 from sonatoki.Configs import BaseConfig
-from sonatoki.Filters import
+from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
 from sonatoki.Scorers import SoftPassFail
 
 def main():
     config = deepcopy(BaseConfig)
-    config["scoring_filters"].extend([
+    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
     config["scorer"] = SoftPassFail
 
     ilo = Ilo(**config)
@@ -76,24 +83,28 @@ After our proposal has been examined and a result given by the committee, I will
 
 ### What's the deal with the tokenizers?
 
-The Toki Pona tokenizer `
-
-Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet, so a more aggressive tokenizer is highly desirable.
+The Toki Pona tokenizer `sonatoki.Tokenizers.WordTokenizer` has the goal of tokenizing statements such that every token either represents a word candidate ("toki", "mumumu") or a complete non-candidate ("..!", "123").
+This design is highly undesirable for NLTK's English tokenizer because English words can have "punctuation" characters in them.
+But Toki Pona doesn't have any mid-word symbols when rendered in the Latin alphabet or in [Private Use Area Unicode characters](https://www.kreativekorp.com/ucsur/), so a more aggressive tokenizer is highly desirable.
 
-The
+The goal of splitting into word candidates and non-candidates is important, because any [encoding of Toki Pona's logographic script](https://www.kreativekorp.com/ucsur/charts/sitelen.html) will require each character be split into its own token, where the default behavior would be to leave consecutive non-punctuation together.
 
 ### Aren't there a lot of false positives?
 
-Yes
+Yes, depending on the filter you choose and how you apply it.
+It's up to you to use this tool responsibly on input you've done your best to clean, such as by using stronger filters before weaker ones.
+For now though, here's a list of relevant false positives:
 
-- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially
-- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet.
+- `ProperName` will errantly match text in languages without a capital/lowercase distinction, artificially increasing scores.
+- `Alphabetic` will match a _lot_ of undesirable text- it essentially allows 14 letters of the English alphabet. For example, "I'm well" would match as _three_ words: "i", "m", "well".
+- `NimiPu` and other sets containing `a`, `mute`, `open`, and others will unavoidably match those words in English text too.
 
 ### Don't some of the cleaners/filters conflict?
 
-Yes
+Yes, though not terribly much.
 
 - `ConsecutiveDuplicates` may errantly change a word's validity. For example, "manna" is phonotactically invalid in Toki Pona, but would become "mana" which is valid.
-- `ConsecutiveDuplicates` will not work correctly with syllabaries
+- `ConsecutiveDuplicates` will not work correctly with syllabaries, though this should not change the validity of the analyzed word unless you attempt to dictionary match these words.
+- If you build your own `MemberFilter` with words that have capital letters or consecutive duplicates, they will never match unless you use `prep_dictionary`.
 
-You'll notice
+You'll notice these are mostly casued by applying latin alphabet filters to non-latin text. Working on it!
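Note: the Quick Start hunk above shows only the changed lines of the README's custom-configuration example. For orientation, here is that flow assembled into a self-contained sketch from the lines visible in the diff; the final `is_toki_pona` call is an assumption about the `Ilo` API drawn from the surrounding README, not something shown in this hunk.

# Sketch assembled from the Quick Start lines visible in the diff above.
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import BaseConfig
from sonatoki.Filters import NimiLinkuCore, Phonotactic, ProperName
from sonatoki.Scorers import SoftPassFail


def main():
    # Start from the shipped BaseConfig, then add scoring filters and a scorer.
    config = deepcopy(BaseConfig)
    config["scoring_filters"].extend([NimiLinkuCore, Phonotactic, ProperName])
    config["scorer"] = SoftPassFail

    ilo = Ilo(**config)
    # Assumed API, not shown in this hunk: ask whether a message is Toki Pona.
    print(ilo.is_toki_pona("mi olin e sina"))


if __name__ == "__main__":
    main()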
--- /dev/null
+++ sonatoki-0.3.1/src/sonatoki/Configs.py
@@ -0,0 +1,139 @@
+# STL
+from copy import deepcopy
+from typing import List, Type, TypedDict
+
+# LOCAL
+from sonatoki.Filters import (
+    Filter,
+    NimiPu,
+    Numeric,
+    OrFilter,
+    Syllabic,
+    NimiUCSUR,
+    Alphabetic,
+    ProperName,
+    Phonotactic,
+    Punctuation,
+    NimiLinkuCore,
+    NimiPuSynonyms,
+    OrMemberFilter,
+    NimiLinkuCommon,
+    NimiLinkuObscure,
+    NimiLinkuSandbox,
+    EnglishIgnorables,
+    NimiLinkuUncommon,
+)
+from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
+from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
+from sonatoki.Tokenizers import Tokenizer, WordTokenizer
+from sonatoki.Preprocessors import (
+    URLs,
+    Reference,
+    Preprocessor,
+    DiscordEmotes,
+    DiscordSpecial,
+    DiscordChannels,
+    DiscordMentions,
+    AngleBracketObject,
+)
+
+
+class IloConfig(TypedDict):
+    preprocessors: List[Type[Preprocessor]]
+    word_tokenizer: Type[Tokenizer]
+    cleaners: List[Type[Cleaner]]
+    ignoring_filters: List[Type[Filter]]
+    scoring_filters: List[Type[Filter]]
+    scorer: Type[Scorer]
+    passing_score: Number
+
+
+# TODO: branching configs?
+
+BaseConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [],
+    "scorer": PassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+PrefConfig: IloConfig = {
+    "preprocessors": [URLs, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+CorpusConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrMemberFilter(
+            NimiLinkuCore,
+            NimiLinkuCommon,
+            NimiLinkuUncommon,
+            NimiLinkuObscure,
+            NimiLinkuSandbox,
+            NimiUCSUR,
+        ),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+
+LazyConfig: IloConfig = {
+    "preprocessors": [URLs],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation],
+    "scoring_filters": [Alphabetic, NimiUCSUR, ProperName],
+    "scorer": SoftPassFail,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+DiscordConfig: IloConfig = {
+    "preprocessors": [URLs, AngleBracketObject, Reference],
+    "cleaners": [ConsecutiveDuplicates],
+    "ignoring_filters": [Numeric, Punctuation, EnglishIgnorables],
+    "scoring_filters": [
+        OrMemberFilter(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR),
+        Syllabic,
+        ProperName,
+        Alphabetic,
+    ],
+    "scorer": SoftScaling,
+    "passing_score": 0.8,
+    "word_tokenizer": WordTokenizer,
+}
+
+TelegramConfig: IloConfig = deepcopy(PrefConfig)
+ForumConfig: IloConfig = deepcopy(PrefConfig)
+
+__all__ = [
+    "BaseConfig",
+    "CorpusConfig",
+    "DiscordConfig",
+    "ForumConfig",
+    "IloConfig",
+    "LazyConfig",
+    "PrefConfig",
+    "TelegramConfig",
+]
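The new Configs.py defines `IloConfig` as a TypedDict and ships several ready-made configurations, two of which (`TelegramConfig`, `ForumConfig`) are plain deep copies of `PrefConfig`. A minimal consumption sketch follows, assuming `Ilo(**config)` accepts the TypedDict's keys as keyword arguments (as the README example suggests); the 0.9 passing score is an illustrative value, not one shipped by the package.

# Minimal sketch: consuming and tweaking one of the new configs.
# Assumes Ilo(**config) accepts IloConfig's keys, per the README's Ilo(**config) line.
from copy import deepcopy

from sonatoki.ilo import Ilo
from sonatoki.Configs import IloConfig, PrefConfig

my_config: IloConfig = deepcopy(PrefConfig)
my_config["passing_score"] = 0.9  # illustrative; the shipped configs use 0.8

ilo = Ilo(**my_config)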
--- sonatoki-0.2.2/src/sonatoki/Filters.py
+++ sonatoki-0.3.1/src/sonatoki/Filters.py
@@ -1,7 +1,7 @@
 # STL
 import re
 from abc import ABC, abstractmethod
-from typing import Set
+from typing import Set, List, Type
 from functools import lru_cache as cache  # cache comes in 3.9
 
 # PDM
@@ -9,19 +9,26 @@ import regex
 from typing_extensions import override
 
 # LOCAL
+from sonatoki.utils import prep_dictionary
 from sonatoki.constants import (
     VOWELS,
     NIMI_PU,
     ALPHABET,
+    ALL_PUNCT,
     ALLOWABLES,
     CONSONANTS,
-
-
-
-
+    IGNORABLES,
+    NIMI_UCSUR,
+    NIMI_KU_LILI,
+    NIMI_KU_SULI,
+    NIMI_LINKU_CORE,
     ALL_PUNCT_RANGES,
     NIMI_PU_SYNONYMS,
+    NIMI_LINKU_COMMON,
+    NIMI_LINKU_OBSCURE,
     NIMI_LINKU_SANDBOX,
+    UCSUR_PUNCT_RANGES,
+    NIMI_LINKU_UNCOMMON,
 )
 
 regex.DEFAULT_VERSION = regex.VERSION1
@@ -79,6 +86,10 @@ class Miscellaneous(MemberFilter):
     tokens = set(ALLOWABLES)
 
 
+class EnglishIgnorables(MemberFilter):
+    tokens = set(IGNORABLES)
+
+
 class ProperName(Filter):
     """Determines if a given token is a valid name (also called a loan word).
     When Toki Pona is written with the Latin alphabet, names are generally
@@ -99,23 +110,43 @@ class ProperName(Filter):
 
 
 class NimiPu(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_PU)
+
+
+class NimiPuSynonyms(MemberFilter):
+    tokens = prep_dictionary(NIMI_PU_SYNONYMS)
+
+
+class NimiKuSuli(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_SULI)
+
+
+class NimiKuLili(MemberFilter):
+    tokens = prep_dictionary(NIMI_KU_LILI)
+
+
+class NimiLinkuCore(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_CORE)
 
 
-class
-    tokens =
+class NimiLinkuCommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_COMMON)
 
 
-class
-    tokens =
+class NimiLinkuUncommon(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_UNCOMMON)
 
 
-class
-    tokens =
+class NimiLinkuObscure(MemberFilter):
+    tokens = prep_dictionary(NIMI_LINKU_OBSCURE)
 
 
 class NimiLinkuSandbox(MemberFilter):
-    tokens =
+    tokens = prep_dictionary(NIMI_LINKU_SANDBOX)
+
+
+class NimiUCSUR(MemberFilter):
+    tokens = prep_dictionary(NIMI_UCSUR)
 
 
 class Phonotactic(RegexFilter):
@@ -156,6 +187,11 @@ class AlphabeticRe(RegexFilter):
     pattern = re.compile(rf"[{ALPHABET}]+", flags=re.IGNORECASE)
 
 
+class TwoOrMoreAlphabetic(Filter):
+    # TODO: alphabetic implementation that ignores single characters
+    pass
+
+
 class Numeric(Filter):
     """Determine if a given token is entirely numeric.
     Covers all numeric symbols in Unicode.
@@ -175,12 +211,13 @@ class Numeric(Filter):
 class Punctuation(SubsetFilter):
     """Identify whether a token is entirely punctuation. Fastest implementation."""
 
-    tokens = set(
+    tokens = set(ALL_PUNCT)
 
 
 class PunctuationRe(RegexFilter):
     """Faster implementation of `PunctuationRe1`.
-    Goes out of date compared to the `regex` library if
+    Goes out of date compared to the `regex` library if UNICODE_PUNCT_RANGES is not updated.
+    """
 
     pattern = re.compile(rf"[{ALL_PUNCT_RANGES}]+")
 
@@ -188,17 +225,111 @@ class PunctuationRe(RegexFilter):
 class PunctuationRe1(Regex1Filter):
     """Reference implementation for identifying tokens made entirely of punctuation."""
 
-    pattern = regex.compile(
+    pattern = regex.compile(
+        rf"[\p{{Punctuation}}\p{{posix_punct}}{UCSUR_PUNCT_RANGES}]+"
+    )
+
+
+class OrFilter:
+    """Instantiate with more than one filter to compose them into one filter,
+    returning True when any individual filter matches or False otherwise.
+    Requires at least two filters.
+
+    OrFilter exists as a compromise between the need to score some filters equally,
+    while not adding custom behavior to scorers.
+    I could have allowed a position to have a list of filters instead of one filter,
+    but this would require cleaning the user's input, and nested handling of lists.
+    It also would not have been as powerful- I would need another param for the and/or switch,
+    or to not give users the choice.
+
+    Instead, the user is responsible for building an OrFilter out of their desired filters.
+    """
+
+    @staticmethod
+    def __generic_filter(*filters_: Type[Filter]) -> Type[Filter]:
+
+        class CombinedFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if f.filter(token):
+                        return True
+                return False
+
+        return CombinedFilter
+
+    def __new__(cls, *filters: Type[Filter]) -> Type[Filter]:
+        if not len(filters) >= 2:
+            raise ValueError("Provide at least two Filters to OrFilter.")
+
+        subset_filters = [f for f in filters if issubclass(f, MemberFilter)]
+        if len(subset_filters) >= 2:
+            raise Warning(
+                "Prefer OrMemberFilter for combining two or more MemberFilters."
+            )
+
+        filter = cls.__generic_filter(*filters)
+
+        return filter
+
+
+class OrMemberFilter:
+    @staticmethod
+    def __subset_filter(*filters: Type[MemberFilter]) -> Type[MemberFilter]:
+        all_token_sets: List[Set[str]] = [f.tokens for f in filters]
+        all_tokens: Set[str] = set().union(*all_token_sets)
+
+        class CombinedFilter(MemberFilter):
+            tokens = all_tokens
+
+        return CombinedFilter
+
+    def __new__(cls, *filters_: Type[MemberFilter]) -> Type[MemberFilter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Provide two or more MemberFilters to OrMemberFilter.")
+        filter = cls.__subset_filter(*filters_)
+        return filter
+
+
+class AndFilter(Filter):
+    """Instantiate with more than one filter to compose them into one filter,
+    returning False when any individual filter fails to match or True otherwise.
+    Requires at least two filters."""
+
+    def __new__(cls, *filters_: Type[Filter]) -> Type[Filter]:
+        if not len(filters_) >= 2:
+            raise ValueError("Must provide at least two Filters to AndFilter.")
+
+        class AnonymousAndFilter(Filter):
+            filters: List[Type[Filter]] = list(filters_)  # TODO: tuple better?
+
+            @classmethod
+            @override
+            @cache(maxsize=None)
+            def filter(cls, token: str) -> bool:
+                for f in cls.filters:
+                    if not f.filter(token):
+                        return False
+                return True
+
+        return AnonymousAndFilter
 
 
 __all__ = [
     "Alphabetic",
-    "
-    "
+    "AndFilter",
+    "EnglishIgnorables",
+    "NimiLinkuCore",
     "NimiLinkuSandbox",
     "NimiPu",
-    "
+    "NimiPuSynonyms",
+    "NimiUCSUR",
     "Numeric",
+    "OrFilter",
     "Phonotactic",
     "ProperName",
     "Punctuation",
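The new `OrFilter` and `OrMemberFilter` factories above return a combined filter class rather than an instance: `OrMemberFilter` unions the `tokens` sets of its `MemberFilter` arguments, while `OrFilter` chains arbitrary filters and warns when handed two or more `MemberFilter`s. A small composition sketch follows, assuming "toki" is present in the Linku core word set and that each filter exposes the `filter(token) -> bool` classmethod shown in the hunk.

# Sketch of composing the new filter factories from Filters.py.
# Assumes "toki" is in the Linku core set; filter() is a classmethod per the diff.
from sonatoki.Filters import (
    Syllabic,
    NimiUCSUR,
    ProperName,
    NimiLinkuCore,
    OrFilter,
    OrMemberFilter,
)

# One MemberFilter whose tokens are the union of both word sets.
LinkuOrUCSUR = OrMemberFilter(NimiLinkuCore, NimiUCSUR)
print(LinkuOrUCSUR.filter("toki"))  # expected True

# OrFilter accepts arbitrary filters; it returns True if any component matches.
SyllabicOrName = OrFilter(Syllabic, ProperName)
print(SyllabicOrName.filter("Sonja"))  # True if either component accepts the token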
--- sonatoki-0.2.2/src/sonatoki/Tokenizers.py
+++ sonatoki-0.3.1/src/sonatoki/Tokenizers.py
@@ -5,16 +5,12 @@ from typing import Set, List
 
 # PDM
 import regex
-from typing_extensions import override
+from typing_extensions import override, deprecated
 
 # LOCAL
 from sonatoki.utils import regex_escape
-from sonatoki.
-
-    UNICODE_PUNCT,
-    SENTENCE_PUNCT,
-    ALL_PUNCT_RANGES,
-)
+from sonatoki.Filters import NimiUCSUR  # seriously this sucks
+from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -50,7 +46,12 @@ class Regex1Tokenizer(Tokenizer):
 
 
 class WordTokenizer(SetTokenizer):
-    delimiters = set(
+    delimiters = set(ALL_PUNCT)
+
+    @classmethod
+    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
+        match = s[last_match:i].split()
+        [tokens.append(t) for t in match if t]
 
     @classmethod
     @override
@@ -60,32 +61,47 @@ class WordTokenizer(SetTokenizer):
 
         tokens: List[str] = []
 
+        i = 0  # ensure i is bound
         last_match = 0
        last_membership = s[0] in cls.delimiters
        for i, char in enumerate(s):
            mem = char in cls.delimiters
-
+            ucsur = NimiUCSUR.filter(char)  # always "changed" means
+            changed = (mem != last_membership) or ucsur
+            # this keeps contiguous words together, but splits UCSUR
+            if not changed:
+                continue
+
+            if ucsur:
+                if i > last_match:
+                    # Add the token before UCSUR character
+                    cls.__helper(s, tokens, last_match, i)
+                # Add UCSUR character itself as a token
+                tokens.append(char)
+                last_match = i + 1
+                last_membership = mem
                continue
 
-
-            # TODO: kinda sucks? what about unicode whitespace?
+            cls.__helper(s, tokens, last_match, i)
            last_match = i
            last_membership = mem
-            [tokens.append(t) for t in match if t]
-
-        match = s[last_match:].strip().split()
-        if match:
-            tokens.extend(match)
 
+        cls.__helper(s, tokens, last_match, i + 1)
        return tokens
 
 
+@deprecated(
+    "WordTokenizerRe is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe(RegexTokenizer):
     pattern = re.compile(rf"""([{ALL_PUNCT_RANGES}]+|\s+)""")
 
 
+@deprecated(
+    "WordTokenizerRe1 is a previous reference implementation. Its behavior has diverged from WordTokenizer and it may not be restored."
+)
 class WordTokenizerRe1(Regex1Tokenizer):
-    """Reference implementation for
+    """Reference implementation for WordTokenizer."""
 
     pattern = regex.compile(r"""([\p{posix_punct}\p{Punctuation}]+|\s+)""")
 
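The reworked `WordTokenizer` above splits UCSUR characters into their own tokens and routes whitespace splitting through the private `__helper`. An illustrative sketch of the intended behavior follows; the expected output is inferred from the README's description of word candidates versus non-candidates, not from an actual run.

# Illustrative sketch of WordTokenizer's intended behavior.
from sonatoki.Tokenizers import WordTokenizer

tokens = WordTokenizer.tokenize("toki! mi mumumu, 123")
print(tokens)
# Roughly expected: ["toki", "!", "mi", "mumumu", ",", "123"]
# Every token is either a word candidate ("toki", "mumumu") or a complete
# non-candidate ("!", ",", "123"); a UCSUR character would become its own token.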