sonatoki 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +18 -14
- sonatoki/Filters.py +75 -45
- sonatoki/Preprocessors.py +31 -0
- sonatoki/Tokenizers.py +3 -3
- sonatoki/__main__.py +176 -3
- sonatoki/alphabetic.txt +1771 -0
- sonatoki/constants.py +236 -47
- sonatoki/ilo.py +1 -1
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/syllabic.txt +297 -0
- sonatoki/utils.py +0 -56
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/METADATA +2 -1
- sonatoki-0.5.0.dist-info/RECORD +20 -0
- sonatoki-0.4.0.dist-info/RECORD +0 -18
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/WHEEL +0 -0
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.0.dist-info}/licenses/LICENSE +0 -0
sonatoki/constants.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
import json
|
3
|
-
from typing import Set, Dict
|
3
|
+
from typing import Set, Dict
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
6
|
# LOCAL
|
@@ -15,9 +15,9 @@ UNICODE_PUNCT_RANGES = [
|
|
15
15
|
"\\U0000003a-\\U00000040",
|
16
16
|
"\\U0000005b-\\U00000060",
|
17
17
|
"\\U0000007b-\\U0000007e",
|
18
|
-
"\\U000000a1-\\
|
18
|
+
"\\U000000a1-\\U000000a8",
|
19
19
|
"\\U000000ab-\\U000000ac",
|
20
|
-
"\\
|
20
|
+
"\\U000000af-\\U000000b1",
|
21
21
|
"\\U000000b4",
|
22
22
|
"\\U000000b6-\\U000000b8",
|
23
23
|
"\\U000000bb",
|
@@ -118,7 +118,9 @@ UNICODE_PUNCT_RANGES = [
|
|
118
118
|
"\\U00001fed-\\U00001fef",
|
119
119
|
"\\U00001ffd-\\U00001ffe",
|
120
120
|
"\\U00002010-\\U00002027",
|
121
|
-
"\\U00002030-\\
|
121
|
+
"\\U00002030-\\U0000203b",
|
122
|
+
"\\U0000203d-\\U00002048",
|
123
|
+
"\\U0000204a-\\U0000205e",
|
122
124
|
"\\U0000207a-\\U0000207e",
|
123
125
|
"\\U0000208a-\\U0000208e",
|
124
126
|
"\\U000020a0-\\U000020c0",
|
@@ -127,7 +129,8 @@ UNICODE_PUNCT_RANGES = [
|
|
127
129
|
"\\U00002108-\\U00002109",
|
128
130
|
"\\U00002114",
|
129
131
|
"\\U00002116-\\U00002118",
|
130
|
-
"\\U0000211e-\\
|
132
|
+
"\\U0000211e-\\U00002121",
|
133
|
+
"\\U00002123",
|
131
134
|
"\\U00002125",
|
132
135
|
"\\U00002127",
|
133
136
|
"\\U00002129",
|
@@ -137,11 +140,88 @@ UNICODE_PUNCT_RANGES = [
|
|
137
140
|
"\\U0000214a-\\U0000214d",
|
138
141
|
"\\U0000214f",
|
139
142
|
"\\U0000218a-\\U0000218b",
|
140
|
-
"\\U00002190-\\
|
143
|
+
"\\U00002190-\\U00002193",
|
144
|
+
"\\U0000219a-\\U000021a8",
|
145
|
+
"\\U000021ab-\\U00002319",
|
146
|
+
"\\U0000231c-\\U00002327",
|
147
|
+
"\\U00002329-\\U000023ce",
|
148
|
+
"\\U000023d0-\\U000023e8",
|
149
|
+
"\\U000023f4-\\U000023f7",
|
150
|
+
"\\U000023fb-\\U00002426",
|
141
151
|
"\\U00002440-\\U0000244a",
|
142
|
-
"\\U0000249c-\\
|
143
|
-
"\\
|
144
|
-
"\\
|
152
|
+
"\\U0000249c-\\U000024c1",
|
153
|
+
"\\U000024c3-\\U000024e9",
|
154
|
+
"\\U00002500-\\U000025a9",
|
155
|
+
"\\U000025ac-\\U000025b5",
|
156
|
+
"\\U000025b7-\\U000025bf",
|
157
|
+
"\\U000025c1-\\U000025fa",
|
158
|
+
"\\U000025ff",
|
159
|
+
"\\U00002605-\\U0000260d",
|
160
|
+
"\\U0000260f-\\U00002610",
|
161
|
+
"\\U00002612-\\U00002613",
|
162
|
+
"\\U00002616-\\U00002617",
|
163
|
+
"\\U00002619-\\U0000261c",
|
164
|
+
"\\U0000261e-\\U0000261f",
|
165
|
+
"\\U00002621",
|
166
|
+
"\\U00002624-\\U00002625",
|
167
|
+
"\\U00002627-\\U00002629",
|
168
|
+
"\\U0000262b-\\U0000262d",
|
169
|
+
"\\U00002630-\\U00002637",
|
170
|
+
"\\U0000263b-\\U0000263f",
|
171
|
+
"\\U00002641",
|
172
|
+
"\\U00002643-\\U00002647",
|
173
|
+
"\\U00002654-\\U0000265e",
|
174
|
+
"\\U00002661-\\U00002662",
|
175
|
+
"\\U00002664",
|
176
|
+
"\\U00002667",
|
177
|
+
"\\U00002669-\\U0000267a",
|
178
|
+
"\\U0000267c-\\U0000267d",
|
179
|
+
"\\U00002680-\\U00002691",
|
180
|
+
"\\U00002698",
|
181
|
+
"\\U0000269a",
|
182
|
+
"\\U0000269d-\\U0000269f",
|
183
|
+
"\\U000026a2-\\U000026a6",
|
184
|
+
"\\U000026a8-\\U000026a9",
|
185
|
+
"\\U000026ac-\\U000026af",
|
186
|
+
"\\U000026b2-\\U000026bc",
|
187
|
+
"\\U000026bf-\\U000026c3",
|
188
|
+
"\\U000026c6-\\U000026c7",
|
189
|
+
"\\U000026c9-\\U000026cd",
|
190
|
+
"\\U000026d0",
|
191
|
+
"\\U000026d2",
|
192
|
+
"\\U000026d5-\\U000026e8",
|
193
|
+
"\\U000026eb-\\U000026ef",
|
194
|
+
"\\U000026f6",
|
195
|
+
"\\U000026fb-\\U000026fc",
|
196
|
+
"\\U000026fe-\\U00002701",
|
197
|
+
"\\U00002703-\\U00002704",
|
198
|
+
"\\U00002706-\\U00002707",
|
199
|
+
"\\U0000270e",
|
200
|
+
"\\U00002710-\\U00002711",
|
201
|
+
"\\U00002713",
|
202
|
+
"\\U00002715",
|
203
|
+
"\\U00002717-\\U0000271c",
|
204
|
+
"\\U0000271e-\\U00002720",
|
205
|
+
"\\U00002722-\\U00002727",
|
206
|
+
"\\U00002729-\\U00002732",
|
207
|
+
"\\U00002735-\\U00002743",
|
208
|
+
"\\U00002745-\\U00002746",
|
209
|
+
"\\U00002748-\\U0000274b",
|
210
|
+
"\\U0000274d",
|
211
|
+
"\\U0000274f-\\U00002752",
|
212
|
+
"\\U00002756",
|
213
|
+
"\\U00002758-\\U00002762",
|
214
|
+
"\\U00002765-\\U00002775",
|
215
|
+
"\\U00002794",
|
216
|
+
"\\U00002798-\\U000027a0",
|
217
|
+
"\\U000027a2-\\U000027af",
|
218
|
+
"\\U000027b1-\\U000027be",
|
219
|
+
"\\U000027c0-\\U00002933",
|
220
|
+
"\\U00002936-\\U00002b04",
|
221
|
+
"\\U00002b08-\\U00002b1a",
|
222
|
+
"\\U00002b1d-\\U00002b4f",
|
223
|
+
"\\U00002b51-\\U00002b54",
|
224
|
+
"\\U00002b56-\\U00002b73",
|
145
225
|
"\\U00002b76-\\U00002b95",
|
146
226
|
"\\U00002b97-\\U00002bff",
|
147
227
|
"\\U00002ce5-\\U00002cea",
|
@@ -156,9 +236,8 @@ UNICODE_PUNCT_RANGES = [
|
|
156
236
|
"\\U00002ff0-\\U00002fff",
|
157
237
|
"\\U00003001-\\U00003004",
|
158
238
|
"\\U00003008-\\U00003020",
|
159
|
-
"\\U00003030",
|
160
239
|
"\\U00003036-\\U00003037",
|
161
|
-
"\\
|
240
|
+
"\\U0000303e-\\U0000303f",
|
162
241
|
"\\U0000309b-\\U0000309c",
|
163
242
|
"\\U000030a0",
|
164
243
|
"\\U000030fb",
|
@@ -170,7 +249,9 @@ UNICODE_PUNCT_RANGES = [
|
|
170
249
|
"\\U0000322a-\\U00003247",
|
171
250
|
"\\U00003250",
|
172
251
|
"\\U00003260-\\U0000327f",
|
173
|
-
"\\U0000328a-\\
|
252
|
+
"\\U0000328a-\\U00003296",
|
253
|
+
"\\U00003298",
|
254
|
+
"\\U0000329a-\\U000032b0",
|
174
255
|
"\\U000032c0-\\U000033ff",
|
175
256
|
"\\U00004dc0-\\U00004dff",
|
176
257
|
"\\U0000a490-\\U0000a4c6",
|
@@ -314,49 +395,97 @@ UNICODE_PUNCT_RANGES = [
|
|
314
395
|
"\\U0001ecb0",
|
315
396
|
"\\U0001ed2e",
|
316
397
|
"\\U0001eef0-\\U0001eef1",
|
317
|
-
"\\U0001f000-\\
|
398
|
+
"\\U0001f000-\\U0001f003",
|
399
|
+
"\\U0001f005-\\U0001f02b",
|
318
400
|
"\\U0001f030-\\U0001f093",
|
319
401
|
"\\U0001f0a0-\\U0001f0ae",
|
320
402
|
"\\U0001f0b1-\\U0001f0bf",
|
321
|
-
"\\U0001f0c1-\\
|
403
|
+
"\\U0001f0c1-\\U0001f0ce",
|
322
404
|
"\\U0001f0d1-\\U0001f0f5",
|
323
|
-
"\\U0001f10d-\\
|
324
|
-
"\\
|
325
|
-
"\\
|
326
|
-
"\\
|
327
|
-
"\\
|
328
|
-
"\\
|
405
|
+
"\\U0001f10d-\\U0001f16f",
|
406
|
+
"\\U0001f172-\\U0001f17d",
|
407
|
+
"\\U0001f180-\\U0001f18d",
|
408
|
+
"\\U0001f18f-\\U0001f190",
|
409
|
+
"\\U0001f19b-\\U0001f1ad",
|
410
|
+
"\\U0001f1e6-\\U0001f1e7",
|
411
|
+
"\\U0001f1ea-\\U0001f1eb",
|
412
|
+
"\\U0001f1ee-\\U0001f1f1",
|
413
|
+
"\\U0001f1f4-\\U0001f1f6",
|
414
|
+
"\\U0001f1f9-\\U0001f200",
|
415
|
+
"\\U0001f210-\\U0001f219",
|
416
|
+
"\\U0001f21b-\\U0001f22e",
|
417
|
+
"\\U0001f230-\\U0001f231",
|
418
|
+
"\\U0001f23b",
|
329
419
|
"\\U0001f240-\\U0001f248",
|
330
|
-
"\\U0001f250-\\U0001f251",
|
331
420
|
"\\U0001f260-\\U0001f265",
|
332
|
-
"\\
|
333
|
-
"\\
|
334
|
-
"\\
|
421
|
+
"\\U0001f322-\\U0001f323",
|
422
|
+
"\\U0001f394-\\U0001f395",
|
423
|
+
"\\U0001f398",
|
424
|
+
"\\U0001f39c-\\U0001f39d",
|
425
|
+
"\\U0001f3f1-\\U0001f3f2",
|
426
|
+
"\\U0001f3f6",
|
427
|
+
"\\U0001f4fe",
|
428
|
+
"\\U0001f53e-\\U0001f548",
|
429
|
+
"\\U0001f54f",
|
430
|
+
"\\U0001f568-\\U0001f56e",
|
431
|
+
"\\U0001f571-\\U0001f572",
|
432
|
+
"\\U0001f57b-\\U0001f586",
|
433
|
+
"\\U0001f588-\\U0001f589",
|
434
|
+
"\\U0001f58e-\\U0001f58f",
|
435
|
+
"\\U0001f591-\\U0001f594",
|
436
|
+
"\\U0001f597-\\U0001f5a3",
|
437
|
+
"\\U0001f5a6-\\U0001f5a7",
|
438
|
+
"\\U0001f5a9-\\U0001f5b0",
|
439
|
+
"\\U0001f5b3-\\U0001f5bb",
|
440
|
+
"\\U0001f5bd-\\U0001f5c1",
|
441
|
+
"\\U0001f5c5-\\U0001f5d0",
|
442
|
+
"\\U0001f5d4-\\U0001f5db",
|
443
|
+
"\\U0001f5df-\\U0001f5e0",
|
444
|
+
"\\U0001f5e2",
|
445
|
+
"\\U0001f5e4-\\U0001f5e7",
|
446
|
+
"\\U0001f5e9-\\U0001f5ee",
|
447
|
+
"\\U0001f5f0-\\U0001f5f2",
|
448
|
+
"\\U0001f5f4-\\U0001f5f9",
|
449
|
+
"\\U0001f650-\\U0001f67f",
|
450
|
+
"\\U0001f6c6-\\U0001f6ca",
|
451
|
+
"\\U0001f6d3-\\U0001f6d4",
|
452
|
+
"\\U0001f6e6-\\U0001f6e8",
|
453
|
+
"\\U0001f6ea",
|
454
|
+
"\\U0001f6f1-\\U0001f6f2",
|
335
455
|
"\\U0001f700-\\U0001f776",
|
336
456
|
"\\U0001f77b-\\U0001f7d9",
|
337
|
-
"\\U0001f7e0-\\U0001f7eb",
|
338
|
-
"\\U0001f7f0",
|
339
457
|
"\\U0001f800-\\U0001f80b",
|
340
458
|
"\\U0001f810-\\U0001f847",
|
341
459
|
"\\U0001f850-\\U0001f859",
|
342
460
|
"\\U0001f860-\\U0001f887",
|
343
461
|
"\\U0001f890-\\U0001f8ad",
|
344
462
|
"\\U0001f8b0-\\U0001f8b1",
|
345
|
-
"\\U0001f900-\\
|
463
|
+
"\\U0001f900-\\U0001f90b",
|
464
|
+
"\\U0001f93b",
|
465
|
+
"\\U0001f946",
|
466
|
+
"\\U0001fa00-\\U0001fa53",
|
346
467
|
"\\U0001fa60-\\U0001fa6d",
|
347
|
-
"\\U0001fa70-\\U0001fa7c",
|
348
|
-
"\\U0001fa80-\\U0001fa88",
|
349
|
-
"\\U0001fa90-\\U0001fabd",
|
350
|
-
"\\U0001fabf-\\U0001fac5",
|
351
|
-
"\\U0001face-\\U0001fadb",
|
352
|
-
"\\U0001fae0-\\U0001fae8",
|
353
|
-
"\\U0001faf0-\\U0001faf8",
|
354
468
|
"\\U0001fb00-\\U0001fb92",
|
355
469
|
"\\U0001fb94-\\U0001fbca",
|
356
|
-
"\\U000f1990-\\U000f199d",
|
470
|
+
"\\U000f1990-\\U000f199d",
|
357
471
|
]
|
358
472
|
|
359
|
-
|
473
|
+
|
474
|
+
NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
|
475
|
+
ALL_VARIATION_SELECTOR_RANGES = ["\\U0000fe00-\\U0000fe0f", "\\U000e0100-\\U000e01ef"]
|
476
|
+
EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
|
477
|
+
EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
|
478
|
+
"""All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
|
479
|
+
mark these two as punctuation, since they are used exclusively for rendering
|
480
|
+
emoji.
|
481
|
+
|
482
|
+
But it's even better to use the Emoji filter.
|
483
|
+
"""
|
484
|
+
|
485
|
+
UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
|
486
|
+
UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
|
487
|
+
"""Private Use Area glyphs are given the apt but unhelpful 'Private Use'
|
488
|
+
class."""
|
360
489
|
|
361
490
|
UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
|
362
491
|
# this is a large string.
|
@@ -366,7 +495,7 @@ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
|
|
366
495
|
POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
|
367
496
|
|
368
497
|
ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
|
369
|
-
|
498
|
+
ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
|
370
499
|
# combined bc the result could be simpler
|
371
500
|
|
372
501
|
SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
|
@@ -374,6 +503,8 @@ SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
|
|
374
503
|
|
375
504
|
LINKU = Path(__file__).resolve().parent / Path("linku.json")
|
376
505
|
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
|
506
|
+
SYLLABICS = Path(__file__).resolve().parent / Path("syllabic.txt")
|
507
|
+
ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
|
377
508
|
|
378
509
|
VOWELS = "aeiou"
|
379
510
|
CONSONANTS = "jklmnpstw"
|
@@ -390,21 +521,69 @@ ALLOWABLES = {
|
|
390
521
|
"msa",
|
391
522
|
}
|
392
523
|
|
393
|
-
|
394
|
-
|
395
|
-
"nope",
|
396
|
-
"some",
|
524
|
+
FALSE_POS_SYLLABIC = {
|
525
|
+
# ordered by frequency in previous TPT data
|
397
526
|
"like",
|
527
|
+
"same",
|
528
|
+
"nope",
|
529
|
+
"uwu", # TODO: emoticon?? uhh?
|
530
|
+
"non",
|
531
|
+
"owo", # TODO: emoticon??
|
532
|
+
"one",
|
533
|
+
"to",
|
534
|
+
"i",
|
535
|
+
"awesome",
|
398
536
|
"use",
|
399
|
-
"
|
537
|
+
"name",
|
400
538
|
"time",
|
539
|
+
"imo", # "in my opinion"
|
401
540
|
"man",
|
402
|
-
"
|
541
|
+
# "son", # sona typo?
|
542
|
+
"joke",
|
543
|
+
"so",
|
544
|
+
"ten",
|
545
|
+
"make",
|
546
|
+
"pin",
|
547
|
+
"note",
|
548
|
+
# "aka" # in sandbox
|
549
|
+
"into",
|
550
|
+
"in",
|
551
|
+
"some",
|
552
|
+
"on",
|
553
|
+
"me",
|
554
|
+
"ipa",
|
555
|
+
"sun",
|
556
|
+
"sense",
|
557
|
+
"none",
|
558
|
+
"meme",
|
559
|
+
"wise",
|
560
|
+
# "ono", # TODO: what is this
|
561
|
+
"mon",
|
562
|
+
"take",
|
563
|
+
"luna",
|
564
|
+
"anti",
|
565
|
+
"elo",
|
566
|
+
"an",
|
567
|
+
"win",
|
568
|
+
"won",
|
569
|
+
"we",
|
570
|
+
"men",
|
571
|
+
"ton",
|
572
|
+
"woke",
|
573
|
+
"semi",
|
574
|
+
"male",
|
403
575
|
}
|
404
576
|
|
405
|
-
|
406
|
-
|
407
|
-
|
577
|
+
FALSE_POS_ALPHABETIC: Set[str] = {
|
578
|
+
"t",
|
579
|
+
"is",
|
580
|
+
"not",
|
581
|
+
"lol",
|
582
|
+
"also",
|
583
|
+
"isn", # TODO: tokenizer....
|
584
|
+
"mean",
|
585
|
+
"means",
|
586
|
+
}
|
408
587
|
|
409
588
|
UCSUR_RANGES = [
|
410
589
|
"\\U000F1900-\\U000F1977", # pu
|
@@ -439,15 +618,23 @@ with open(SANDBOX) as f:
|
|
439
618
|
sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
|
440
619
|
NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
|
441
620
|
|
621
|
+
# with open(SYLLABICS) as f:
|
622
|
+
# FALSE_POS_SYLLABIC = {line.strip() for line in f}
|
623
|
+
#
|
624
|
+
# with open(ALPHABETICS) as f:
|
625
|
+
# FALSE_POS_ALPHABETIC = {line.strip() for line in f}
|
626
|
+
|
442
627
|
del linku
|
443
628
|
del sandbox
|
444
629
|
|
445
630
|
__all__ = [
|
446
631
|
"ALLOWABLES",
|
447
632
|
"ALL_PUNCT",
|
448
|
-
"
|
633
|
+
"ALL_PUNCT_RANGES_STR",
|
449
634
|
"ALPHABET",
|
450
635
|
"CONSONANTS",
|
636
|
+
"EMOJI_VARIATION_SELECTOR_RANGES",
|
637
|
+
"EMOJI_VARIATION_SELECTOR_RANGES_STR",
|
451
638
|
"NIMI_KU_LILI",
|
452
639
|
"NIMI_KU_SULI",
|
453
640
|
"NIMI_LINKU_COMMON",
|
@@ -459,6 +646,8 @@ __all__ = [
|
|
459
646
|
"NIMI_PU_SYNONYMS",
|
460
647
|
"POSIX_PUNCT",
|
461
648
|
"POSIX_PUNCT_RANGES",
|
649
|
+
"UCSUR_PUNCT_RANGES",
|
650
|
+
"UCSUR_PUNCT_RANGES_STR",
|
462
651
|
"UNICODE_PUNCT",
|
463
652
|
"UNICODE_PUNCT_RANGES",
|
464
653
|
"VOWELS",
|
sonatoki/ilo.py
CHANGED
@@ -119,7 +119,7 @@ class Ilo:
|
|
119
119
|
*_, result = self._is_toki_pona(message)
|
120
120
|
return result
|
121
121
|
|
122
|
-
def _are_toki_pona(self, message: str):
|
122
|
+
def _are_toki_pona(self, message: str) -> List[Scorecard]:
|
123
123
|
"""Split a message into sentences, then return a list of each sentence's
|
124
124
|
results via `self._is_toki_pona()`.
|
125
125
|
|