sonatoki 0.5.3__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.5.3 → sonatoki-0.6.0}/PKG-INFO +1 -1
- {sonatoki-0.5.3 → sonatoki-0.6.0}/pyproject.toml +1 -1
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Configs.py +8 -6
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Tokenizers.py +64 -29
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/constants.py +13 -3
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +8 -4
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_words_tok.yml +115 -3
- {sonatoki-0.5.3 → sonatoki-0.6.0}/LICENSE +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/README.md +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Filters.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/__init__.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_cleaners.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_filters.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_ilo.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_properties.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_scorers.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_tokenize.py +0 -0
- {sonatoki-0.5.3 → sonatoki-0.6.0}/tests/test_utils.py +0 -0
```diff
--- sonatoki-0.5.3/src/sonatoki/Configs.py
+++ sonatoki-0.6.0/src/sonatoki/Configs.py
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
+    FalsePosAlphabetic,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon,
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -145,6 +146,7 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -161,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -176,7 +178,7 @@ DiscordConfig: IloConfig = {
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
```
```diff
--- sonatoki-0.5.3/src/sonatoki/Tokenizers.py
+++ sonatoki-0.6.0/src/sonatoki/Tokenizers.py
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import
+from sonatoki.constants import (
+    ALL_PUNCT,
+    SENTENCE_PUNCT,
+    INTRA_WORD_PUNCT,
+    ALL_PUNCT_RANGES_STR,
+)
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):
 
 class WordTokenizer(SetTokenizer):
     delimiters = set(ALL_PUNCT)
+    intra_word_punct = set(INTRA_WORD_PUNCT)
+
+    @classmethod
+    def is_delimiter(cls, c: str) -> bool:
+        return c in cls.delimiters or not c
 
     @classmethod
-    def
-
-
+    def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+        if i > last_match:
+            tokens.append(s[last_match:i])
+
+    @classmethod
+    def to_tokens(cls, s: str) -> List[str]:
+        tokens: List[str] = []
+
+        slen = len(s)
+        i = 0
+        did_skip = False  # ensure exists
+        while i < slen:
+
+            # contiguous punctuation chars
+            last_match = i
+            while i < slen and cls.is_delimiter(s[i]):
+                # no special case
+                i += 1
+            cls.add_token(s, tokens, last_match, i)
+
+            # contiguous writing chars (much harder)
+            last_match = i
+            while i < slen and not cls.is_delimiter(s[i]):
+                did_skip = False
+                # we skip and see another writing char, or init
+
+                if NimiUCSUR.filter(s[i]):
+                    cls.add_token(s, tokens, last_match, i)
+                    tokens.append(s[i])
+                    i += 1
+                    last_match = i
+                    continue
+
+                next_char = s[i + 1] if i + 1 < slen else ""
+                if next_char in cls.intra_word_punct:
+                    did_skip = True
+                    i += 2
+                    continue
+
+                i += 1
+
+            if did_skip:
+                # we skipped, but there wasn't another writing character
+                cls.add_token(s, tokens, last_match, i - 1)
+                last_match = i - 1
+
+            cls.add_token(s, tokens, last_match, i)
+
+        return tokens
 
     @classmethod
     @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
         return []
 
         tokens: List[str] = []
+        candidates: List[str] = s.split()
 
-
-
-
-        for i, char in enumerate(s):
-            mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
-            changed = (mem != last_membership) or ucsur
-            # this keeps contiguous words together, but splits UCSUR
-            if not changed:
-                continue
-
-            if ucsur:
-                if i > last_match:
-                    # Add the token before UCSUR character
-                    cls.__helper(s, tokens, last_match, i)
-                # Add UCSUR character itself as a token
-                tokens.append(char)
-                last_match = i + 1
-                last_membership = mem
-                continue
-
-            cls.__helper(s, tokens, last_match, i)
-            last_match = i
-            last_membership = mem
+        for candidate in candidates:
+            results = cls.to_tokens(candidate)
+            tokens.extend(results)
 
-        cls.__helper(s, tokens, last_match, i + 1)
         return tokens
 
 
```
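In short, the rewritten `WordTokenizer` splits on whitespace first, then scans each candidate once: UCSUR characters become single-character tokens, and a character from `INTRA_WORD_PUNCT` is kept inside a word only when another writing character follows it. A sketch of the resulting behavior, with expected outputs copied from the updated `tokenize_words_tok.yml` fixtures below (assuming `tokenize` is the public classmethod behind the `@override` above):

```python
# Behavior of the new WordTokenizer on fixture inputs; expected outputs
# are taken from tests/tokenize_cases/tokenize_words_tok.yml.
from sonatoki.Tokenizers import WordTokenizer

print(WordTokenizer.tokenize("isn't that strange?"))
# ["isn't", 'that', 'strange', '?']

print(WordTokenizer.tokenize("mi^en$sina-li*toki()="))
# ['mi', '^', 'en', '$', 'sina-li', '*', 'toki', '()=']

print(WordTokenizer.tokenize("whom's't'd've'n't"))
# ["whom's't'd've'n't"]
```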
```diff
--- sonatoki-0.5.3/src/sonatoki/constants.py
+++ sonatoki-0.6.0/src/sonatoki/constants.py
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;
+SENTENCE_PUNCT = """.?!:;()[-]·…"""
+# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+INTRA_WORD_PUNCT = """-'"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
-    "x",  # ala
-    "y",  # anu
+    # "x",  # ala
+    # "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "name",
     "time",
     "imo",  # "in my opinion"
+    "ime",  # "in my experience"
     "man",
     # "son",  # sona typo?
     "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
     # manual additions
     "alike",
     "amuse",
+    "animate",
     "antelope",
     "antena",
     "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    # "june",
     "lemon",
     "manipulate",
     "misuse",
     "ne",  # "no" in many other languages
+    "tape",
+    "onto",
     "wana",
+    "muse",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "os",  # some command prefix...
     "as",
     "not",
     "link",
```
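Two properties fall out of these constants: the intra-word characters remain ordinary delimiters when they do not sit between writing characters, and quotes no longer terminate sentences. A small sanity-check sketch against the values added above:

```python
# Quick checks on the new constants; a sketch against the values in the
# hunk above, nothing beyond what the diff shows.
from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, INTRA_WORD_PUNCT

# '-' and "'" stay in ALL_PUNCT (so they still delimit on their own),
# but INTRA_WORD_PUNCT lets the tokenizer keep them inside words.
assert all(c in ALL_PUNCT for c in INTRA_WORD_PUNCT)

# Quotes were dropped from SENTENCE_PUNCT and no longer end sentences.
assert '"' not in SENTENCE_PUNCT and "'" not in SENTENCE_PUNCT
```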
```diff
--- sonatoki-0.5.3/tests/tokenize_cases/tokenize_sentences_tok.yml
+++ sonatoki-0.6.0/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -46,13 +46,17 @@
   input: "mi mu tawa sina, mi wawa e sina."
   output:
     - "mi mu tawa sina, mi wawa e sina."
-- name: "
+- name: "singlequotes"
   input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output:
+  output:
     - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
     - "'"
-
-
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
 - name: "discovered case 1"
   input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
   output:
```
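Run through the sentence tokenizer, the updated fixture reflects the narrowed `SENTENCE_PUNCT`: the quote itself no longer ends a sentence, only the `!` inside it does. A sketch, assuming `SentTokenizer` in `sonatoki.Tokenizers` is the consumer of these fixtures:

```python
# Sentence splitting per the updated fixture; SentTokenizer is assumed to
# be the sentence-level counterpart of WordTokenizer in this module.
from sonatoki.Tokenizers import SentTokenizer

print(SentTokenizer.tokenize("toki li tan kulupu Kuko li ni: 'o ike ala!'"))
# ['toki li tan kulupu Kuko li ni:', "'o ike ala!", "'"]
```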
```diff
--- sonatoki-0.5.3/tests/tokenize_cases/tokenize_words_tok.yml
+++ sonatoki-0.6.0/tests/tokenize_cases/tokenize_words_tok.yml
@@ -34,7 +34,73 @@
     - "ike"
     - "ala"
     - "!'"
+- name: "english 1"
+  input: "isn't that strange?"
+  output:
+    - "isn't"
+    - "that"
+    - "strange"
+    - "?"
+- name: "english 2"
+  input: "i have self-respect..."
+  output:
+    - "i"
+    - "have"
+    - "self-respect"
+    - "..."
+- name: "english 3"
+  input: "i'm an m.d."
+  output:
+    - "i'm"
+    - "an"
+    - "m"
+    - "."
+    - "d"
+    - "."
+- name: "english 4"
+  input: "it's mind-numbing honestly"
+  output:
+    - "it's"
+    - "mind-numbing"
+    - "honestly"
+- name: "english 5"
+  input: "Here's what they said: 'single quotes are boring'"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+- name: "english 6"
+  input: "Here's what they said: 'single quotes are boring' and true"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+    - "and"
+    - "true"
+- name: "non-consecutive puncts"
+  input: ". . ."
+  output:
+    - "."
+    - "."
+    - "."
 - name: "url"
+  xfail: true  # we get rid of URLs before tokenizing, so the result isn't material
   input: "https://mun.la/sona/"
   output:
     - "https"
@@ -85,6 +151,16 @@
     - "mama"
     - "."
     - "sina"
+- name: simple bold
+  input: "**mi unpa e mama sina**"
+  output:
+    - "**"
+    - "mi"
+    - "unpa"
+    - "e"
+    - "mama"
+    - "sina"
+    - "**"
 - name: weird punctuation characters
   input: "mi^en$sina-li*toki()="
   output:
@@ -92,9 +168,7 @@
     - "^"
     - "en"
     - "$"
-    - "sina"
-    - "-"
-    - "li"
+    - "sina-li" # intended; looks like valid intrapunct
     - "*"
     - "toki"
     - "()="
@@ -225,3 +299,41 @@
     - "「"
     - "Direct"
     - "」"
+
+- name: "simple intrapunct 1"
+  input: "i'm"
+  output:
+    - "i'm"
+- name: "intrapunct and punct"
+  input: "i'm."
+  output:
+    - "i'm"
+    - "."
+- name: "simple intrapunct 2"
+  input: "isn't"
+  output:
+    - "isn't"
+- name: "quoted with intrapunct"
+  input: "'bother'"
+  output:
+    - "'"
+    - "bother"
+    - "'"
+- name: "quoted intrapunct with intrapunct 1"
+  input: "'isn't'"
+  output:
+    - "'"
+    - "isn't"
+    - "'"
+- name: "quoted intrapunct with intrapunct 2"
+  input: "'isn't it gross?'"
+  output:
+    - "'"
+    - "isn't"
+    - "it"
+    - "gross"
+    - "?'"
+- name: "multiple intrapunct"
+  input: "whom's't'd've'n't"
+  output:
+    - "whom's't'd've'n't"
```