sonatoki 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +19 -14
- sonatoki/Filters.py +79 -45
- sonatoki/Preprocessors.py +31 -0
- sonatoki/Tokenizers.py +3 -3
- sonatoki/__main__.py +176 -3
- sonatoki/alphabetic.txt +1771 -0
- sonatoki/constants.py +315 -47
- sonatoki/ilo.py +1 -1
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/syllabic.txt +297 -0
- sonatoki/utils.py +0 -56
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/METADATA +2 -1
- sonatoki-0.5.1.dist-info/RECORD +20 -0
- sonatoki-0.4.0.dist-info/RECORD +0 -18
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/WHEEL +0 -0
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/licenses/LICENSE +0 -0
sonatoki/syllabic.txt
ADDED
@@ -0,0 +1,297 @@
|
|
1
|
+
alamo
|
2
|
+
alan
|
3
|
+
alike
|
4
|
+
alone
|
5
|
+
ama
|
6
|
+
amen
|
7
|
+
ami
|
8
|
+
amin
|
9
|
+
amino
|
10
|
+
amo
|
11
|
+
amuse
|
12
|
+
ana
|
13
|
+
ane
|
14
|
+
ani
|
15
|
+
anise
|
16
|
+
anon
|
17
|
+
antelope
|
18
|
+
antena
|
19
|
+
anti
|
20
|
+
ape
|
21
|
+
apetite
|
22
|
+
apolo
|
23
|
+
asasin
|
24
|
+
asasinate
|
25
|
+
asimilate
|
26
|
+
asinine
|
27
|
+
asume
|
28
|
+
ate
|
29
|
+
awake
|
30
|
+
awaken
|
31
|
+
awe
|
32
|
+
awesome
|
33
|
+
awoke
|
34
|
+
eliminate
|
35
|
+
elite
|
36
|
+
elope
|
37
|
+
enema
|
38
|
+
eta
|
39
|
+
ewe
|
40
|
+
iluminate
|
41
|
+
imense
|
42
|
+
imitate
|
43
|
+
imune
|
44
|
+
inmate
|
45
|
+
insane
|
46
|
+
insulin
|
47
|
+
intake
|
48
|
+
intense
|
49
|
+
intimate
|
50
|
+
into
|
51
|
+
isolate
|
52
|
+
jake
|
53
|
+
jane
|
54
|
+
japan
|
55
|
+
jese
|
56
|
+
jetison
|
57
|
+
jin
|
58
|
+
joke
|
59
|
+
juke
|
60
|
+
kale
|
61
|
+
kane
|
62
|
+
kapa
|
63
|
+
keno
|
64
|
+
kilo
|
65
|
+
kimono
|
66
|
+
kite
|
67
|
+
kiten
|
68
|
+
kiwi
|
69
|
+
lake
|
70
|
+
lama
|
71
|
+
lame
|
72
|
+
lane
|
73
|
+
late
|
74
|
+
latino
|
75
|
+
lemon
|
76
|
+
leno
|
77
|
+
lese
|
78
|
+
lesen
|
79
|
+
leson
|
80
|
+
like
|
81
|
+
likewise
|
82
|
+
lima
|
83
|
+
lime
|
84
|
+
limo
|
85
|
+
lin
|
86
|
+
line
|
87
|
+
linen
|
88
|
+
lite
|
89
|
+
lone
|
90
|
+
lonesome
|
91
|
+
lose
|
92
|
+
losen
|
93
|
+
lote
|
94
|
+
loto
|
95
|
+
lowe
|
96
|
+
lulu
|
97
|
+
luna
|
98
|
+
make
|
99
|
+
male
|
100
|
+
man
|
101
|
+
mana
|
102
|
+
manate
|
103
|
+
manila
|
104
|
+
manipulate
|
105
|
+
mano
|
106
|
+
masa
|
107
|
+
mason
|
108
|
+
mate
|
109
|
+
matine
|
110
|
+
melon
|
111
|
+
memento
|
112
|
+
memo
|
113
|
+
men
|
114
|
+
mensa
|
115
|
+
menu
|
116
|
+
mesa
|
117
|
+
meta
|
118
|
+
mike
|
119
|
+
mile
|
120
|
+
milo
|
121
|
+
mime
|
122
|
+
mina
|
123
|
+
mine
|
124
|
+
mini
|
125
|
+
minute
|
126
|
+
misile
|
127
|
+
misuse
|
128
|
+
mite
|
129
|
+
miten
|
130
|
+
mojo
|
131
|
+
mola
|
132
|
+
mole
|
133
|
+
moma
|
134
|
+
momento
|
135
|
+
mon
|
136
|
+
mono
|
137
|
+
monson
|
138
|
+
monte
|
139
|
+
mope
|
140
|
+
mose
|
141
|
+
mote
|
142
|
+
moto
|
143
|
+
mule
|
144
|
+
mumu
|
145
|
+
muse
|
146
|
+
mutilate
|
147
|
+
muton
|
148
|
+
name
|
149
|
+
namesake
|
150
|
+
nan
|
151
|
+
nana
|
152
|
+
nine
|
153
|
+
nineten
|
154
|
+
ninja
|
155
|
+
nite
|
156
|
+
nome
|
157
|
+
nominate
|
158
|
+
nomine
|
159
|
+
non
|
160
|
+
none
|
161
|
+
nonsense
|
162
|
+
nope
|
163
|
+
nose
|
164
|
+
note
|
165
|
+
nuke
|
166
|
+
nun
|
167
|
+
ole
|
168
|
+
omelete
|
169
|
+
omen
|
170
|
+
one
|
171
|
+
onto
|
172
|
+
opose
|
173
|
+
oposite
|
174
|
+
ose
|
175
|
+
oto
|
176
|
+
otoman
|
177
|
+
pajama
|
178
|
+
pale
|
179
|
+
palete
|
180
|
+
palomino
|
181
|
+
panama
|
182
|
+
pane
|
183
|
+
papa
|
184
|
+
pate
|
185
|
+
paten
|
186
|
+
pele
|
187
|
+
pen
|
188
|
+
pene
|
189
|
+
peninsula
|
190
|
+
petite
|
191
|
+
pike
|
192
|
+
pile
|
193
|
+
pin
|
194
|
+
pina
|
195
|
+
pinata
|
196
|
+
pine
|
197
|
+
pinto
|
198
|
+
pipe
|
199
|
+
pipeline
|
200
|
+
poke
|
201
|
+
pole
|
202
|
+
polen
|
203
|
+
polite
|
204
|
+
polo
|
205
|
+
polute
|
206
|
+
ponton
|
207
|
+
popa
|
208
|
+
pope
|
209
|
+
pose
|
210
|
+
potato
|
211
|
+
puke
|
212
|
+
pun
|
213
|
+
sake
|
214
|
+
saki
|
215
|
+
salami
|
216
|
+
sale
|
217
|
+
salina
|
218
|
+
saline
|
219
|
+
salon
|
220
|
+
salute
|
221
|
+
same
|
222
|
+
sane
|
223
|
+
santo
|
224
|
+
satelite
|
225
|
+
satin
|
226
|
+
semen
|
227
|
+
semi
|
228
|
+
sen
|
229
|
+
senate
|
230
|
+
senile
|
231
|
+
sense
|
232
|
+
sepuku
|
233
|
+
sesame
|
234
|
+
simulate
|
235
|
+
sine
|
236
|
+
site
|
237
|
+
sole
|
238
|
+
solo
|
239
|
+
some
|
240
|
+
sometime
|
241
|
+
son
|
242
|
+
sonata
|
243
|
+
sulen
|
244
|
+
sumo
|
245
|
+
sumon
|
246
|
+
sun
|
247
|
+
sunken
|
248
|
+
suntan
|
249
|
+
supose
|
250
|
+
take
|
251
|
+
taken
|
252
|
+
takin
|
253
|
+
tale
|
254
|
+
tame
|
255
|
+
tape
|
256
|
+
tate
|
257
|
+
tato
|
258
|
+
ten
|
259
|
+
tense
|
260
|
+
tiki
|
261
|
+
tile
|
262
|
+
time
|
263
|
+
timeline
|
264
|
+
tin
|
265
|
+
titan
|
266
|
+
toke
|
267
|
+
token
|
268
|
+
tomato
|
269
|
+
tome
|
270
|
+
ton
|
271
|
+
tone
|
272
|
+
tote
|
273
|
+
tule
|
274
|
+
tuna
|
275
|
+
tune
|
276
|
+
tuti
|
277
|
+
tutu
|
278
|
+
unite
|
279
|
+
unlike
|
280
|
+
unsen
|
281
|
+
unto
|
282
|
+
unwise
|
283
|
+
upon
|
284
|
+
use
|
285
|
+
wake
|
286
|
+
waken
|
287
|
+
wala
|
288
|
+
wanton
|
289
|
+
win
|
290
|
+
wine
|
291
|
+
wipe
|
292
|
+
wise
|
293
|
+
woke
|
294
|
+
woken
|
295
|
+
woman
|
296
|
+
women
|
297
|
+
won
|
sonatoki/utils.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
# STL
|
2
|
-
import re
|
3
2
|
import itertools
|
4
3
|
from typing import Set, List, TypeVar, Iterable
|
5
4
|
|
@@ -87,58 +86,3 @@ def overlapping_ntuples(iterable: Iterable[T], n: int) -> Iterable[T]:
|
|
87
86
|
|
88
87
|
# ends when any iter is empty; all groups will be same size
|
89
88
|
return zip(*teed)
|
90
|
-
|
91
|
-
|
92
|
-
if __name__ == "__main__":
|
93
|
-
"""Helper script to fetch UNICODE_PUNCT in constants.py."""
|
94
|
-
|
95
|
-
PUNCT_CATEGORIES = {
|
96
|
-
"Pc",
|
97
|
-
"Pd",
|
98
|
-
"Pe",
|
99
|
-
"Pf",
|
100
|
-
"Pi",
|
101
|
-
"Po",
|
102
|
-
"Ps",
|
103
|
-
"Sm",
|
104
|
-
"Sk",
|
105
|
-
"Sc",
|
106
|
-
"So",
|
107
|
-
}
|
108
|
-
# Connector, Dash, Close (end), Final, Initial, Other, Open (sOpen), Math, Modifier (kModifier), Currency, Other
|
109
|
-
|
110
|
-
# NOTE: UnicodeData.txt lists character ranges if there would be many characters.
|
111
|
-
# (e.g. CJK Ideograph, First at 4E00 and CJK Ideograph, Last at 9FFF).
|
112
|
-
# This does not apply to any currently defined punctuation category.
|
113
|
-
|
114
|
-
EXCEPTION_RANGES = re.compile(r"""[Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉]+""")
|
115
|
-
# These groups are in Symbol other (So) but are not part of `\p{Punctuation}`
|
116
|
-
# NOTE: There are many characters which look like writing characters but are not. Examples:
|
117
|
-
# - kangxi radicals from ⺀ to ⿕ which are for demonstration
|
118
|
-
# - circled katakana from ㋐ to ㋾ which... shouldn't be in \p{Punctuation} but oh well
|
119
|
-
|
120
|
-
def is_punctuation(data: List[str]):
|
121
|
-
return data[2] in PUNCT_CATEGORIES
|
122
|
-
|
123
|
-
def get_character(data: List[str]):
|
124
|
-
return chr(int(data[0], 16))
|
125
|
-
|
126
|
-
def is_exception(c: str):
|
127
|
-
return not not re.fullmatch(EXCEPTION_RANGES, c)
|
128
|
-
|
129
|
-
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
|
130
|
-
unicode_punctuation = ""
|
131
|
-
with open("UnicodeData.txt", "r") as f:
|
132
|
-
for line in f:
|
133
|
-
data = line.split(";")
|
134
|
-
if not is_punctuation(data):
|
135
|
-
continue
|
136
|
-
|
137
|
-
char = get_character(data)
|
138
|
-
if is_exception(char):
|
139
|
-
continue
|
140
|
-
|
141
|
-
unicode_punctuation += char
|
142
|
-
|
143
|
-
with open("UnicodePunctuation.txt", "w") as f:
|
144
|
-
_ = f.write(unicode_punctuation)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sonatoki
|
3
|
-
Version: 0.4.0
|
3
|
+
Version: 0.5.1
|
4
4
|
Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
|
5
5
|
Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
|
6
6
|
License: AGPL-3.0-or-later
|
@@ -8,6 +8,7 @@ Requires-Python: >=3.8
|
|
8
8
|
Requires-Dist: unidecode>=1.3.6
|
9
9
|
Requires-Dist: regex>=2023.12.25
|
10
10
|
Requires-Dist: typing-extensions>=4.11.0
|
11
|
+
Requires-Dist: emoji>=2.12.1
|
11
12
|
Description-Content-Type: text/markdown
|
12
13
|
|
13
14
|
# sona toki
|
@@ -0,0 +1,20 @@
|
|
1
|
+
sonatoki-0.5.1.dist-info/METADATA,sha256=gj5B_q10R5l-w0jEuzFY2035qzp9tpmBQ-sZ0q73zXE,6370
|
2
|
+
sonatoki-0.5.1.dist-info/WHEEL,sha256=SOP-4bEE0jbVaCHQGVvF08uWxk5rcSsfEybvoQVHlD8,90
|
3
|
+
sonatoki-0.5.1.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
|
+
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
5
|
+
sonatoki/Configs.py,sha256=HHaSAA7hus7aY6Xy-3fNlbzMwk3wJO0HrjTssg8P78M,4291
|
6
|
+
sonatoki/Filters.py,sha256=nVSmw5M4sEYA_8KI1fI53rMHkd9KO6yWbKfdxxExxN8,11700
|
7
|
+
sonatoki/Preprocessors.py,sha256=zuu-6SLqFgk88vfSnYlyZjZrzoZQ56U_1SFXoxThQDQ,5628
|
8
|
+
sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
|
9
|
+
sonatoki/Tokenizers.py,sha256=qFaA1-v-wjKMihtEJMeZpi3m4cSkJQgWhGhL-w0VgPE,4236
|
10
|
+
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
|
12
|
+
sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
|
13
|
+
sonatoki/constants.py,sha256=a3OjhtH2Jp6RDot1NE-PrQfm2VzfM850b-qipFLnjS4,18868
|
14
|
+
sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
|
15
|
+
sonatoki/linku.json,sha256=FLsaESG01rQ88OU8HvwOUl_P9qtGykJ1X-1xoMVDkKA,295077
|
16
|
+
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
sonatoki/sandbox.json,sha256=3BpCEjw-kB4z7DJAJ2UrE1YuFIe3knat8qi1iYuAIq4,83555
|
18
|
+
sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
|
19
|
+
sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
|
20
|
+
sonatoki-0.5.1.dist-info/RECORD,,
|
sonatoki-0.4.0.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
sonatoki-0.4.0.dist-info/METADATA,sha256=Z89tIHyGG9RRAgcr_3E4XW2IMX9NyT9mawcCeMQfXPU,6341
|
2
|
-
sonatoki-0.4.0.dist-info/WHEEL,sha256=SOP-4bEE0jbVaCHQGVvF08uWxk5rcSsfEybvoQVHlD8,90
|
3
|
-
sonatoki-0.4.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
|
4
|
-
sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
|
5
|
-
sonatoki/Configs.py,sha256=tOeJSlYXMBHbRPBxERGWGT5AjvCxNb3ZGu8GA4BYve4,4034
|
6
|
-
sonatoki/Filters.py,sha256=mpJBl-YPMF-Yl6mKFXf0D6DwkPR6H424RlvrkSeh4Dc,10714
|
7
|
-
sonatoki/Preprocessors.py,sha256=nvAzxpWP9WwT6gOCKcuiz5F8xYDdKIt9bOVUvy9o-G0,4459
|
8
|
-
sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
|
9
|
-
sonatoki/Tokenizers.py,sha256=So5_Tu6J98MD3yVcwB_X3lw2uMG0TN6XHcTbQjFCu5Q,4254
|
10
|
-
sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
sonatoki/__main__.py,sha256=6xc-wIrrFo9wTyn4zRQNAmqwmJBtVvCMwV-CrM-hueA,82
|
12
|
-
sonatoki/constants.py,sha256=wH3iR32-Ic7vSkrMjAZIvmIysTtkJ-KBVU5zv3Oamqs,12656
|
13
|
-
sonatoki/ilo.py,sha256=7KwTZgczzU2gbhC69yZbxtpTHy_fGtg_MnG_bDpiSxM,5639
|
14
|
-
sonatoki/linku.json,sha256=fm4-dks5s9x1bs7q82GNngAedVCWilMPCQ_o-j35QL0,270950
|
15
|
-
sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
sonatoki/sandbox.json,sha256=zPtZgJ_CpJa-2Den0gTNlk52f-YEwFVcjMarQXeeu5U,77563
|
17
|
-
sonatoki/utils.py,sha256=L984aXxvzfJaZ6GSWRKs7LweOGZYTLK11CdAhpLQr0g,4067
|
18
|
-
sonatoki-0.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|