sonatoki 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonatoki/Configs.py +19 -14
- sonatoki/Filters.py +79 -45
- sonatoki/Preprocessors.py +31 -0
- sonatoki/Tokenizers.py +3 -3
- sonatoki/__main__.py +176 -3
- sonatoki/alphabetic.txt +1771 -0
- sonatoki/constants.py +315 -47
- sonatoki/ilo.py +1 -1
- sonatoki/linku.json +1 -1
- sonatoki/sandbox.json +1 -1
- sonatoki/syllabic.txt +297 -0
- sonatoki/utils.py +0 -56
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/METADATA +2 -1
- sonatoki-0.5.1.dist-info/RECORD +20 -0
- sonatoki-0.4.0.dist-info/RECORD +0 -18
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/WHEEL +0 -0
- {sonatoki-0.4.0.dist-info → sonatoki-0.5.1.dist-info}/licenses/LICENSE +0 -0
sonatoki/constants.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# STL
|
2
2
|
import json
|
3
|
-
from typing import Set, Dict
|
3
|
+
from typing import Set, Dict
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
6
|
# LOCAL
|
@@ -15,9 +15,9 @@ UNICODE_PUNCT_RANGES = [
|
|
15
15
|
"\\U0000003a-\\U00000040",
|
16
16
|
"\\U0000005b-\\U00000060",
|
17
17
|
"\\U0000007b-\\U0000007e",
|
18
|
-
"\\U000000a1-\\
|
18
|
+
"\\U000000a1-\\U000000a8",
|
19
19
|
"\\U000000ab-\\U000000ac",
|
20
|
-
"\\
|
20
|
+
"\\U000000af-\\U000000b1",
|
21
21
|
"\\U000000b4",
|
22
22
|
"\\U000000b6-\\U000000b8",
|
23
23
|
"\\U000000bb",
|
@@ -118,7 +118,9 @@ UNICODE_PUNCT_RANGES = [
|
|
118
118
|
"\\U00001fed-\\U00001fef",
|
119
119
|
"\\U00001ffd-\\U00001ffe",
|
120
120
|
"\\U00002010-\\U00002027",
|
121
|
-
"\\U00002030-\\
|
121
|
+
"\\U00002030-\\U0000203b",
|
122
|
+
"\\U0000203d-\\U00002048",
|
123
|
+
"\\U0000204a-\\U0000205e",
|
122
124
|
"\\U0000207a-\\U0000207e",
|
123
125
|
"\\U0000208a-\\U0000208e",
|
124
126
|
"\\U000020a0-\\U000020c0",
|
@@ -127,7 +129,8 @@ UNICODE_PUNCT_RANGES = [
|
|
127
129
|
"\\U00002108-\\U00002109",
|
128
130
|
"\\U00002114",
|
129
131
|
"\\U00002116-\\U00002118",
|
130
|
-
"\\U0000211e-\\
|
132
|
+
"\\U0000211e-\\U00002121",
|
133
|
+
"\\U00002123",
|
131
134
|
"\\U00002125",
|
132
135
|
"\\U00002127",
|
133
136
|
"\\U00002129",
|
@@ -137,11 +140,88 @@ UNICODE_PUNCT_RANGES = [
|
|
137
140
|
"\\U0000214a-\\U0000214d",
|
138
141
|
"\\U0000214f",
|
139
142
|
"\\U0000218a-\\U0000218b",
|
140
|
-
"\\U00002190-\\
|
143
|
+
"\\U00002190-\\U00002193",
|
144
|
+
"\\U0000219a-\\U000021a8",
|
145
|
+
"\\U000021ab-\\U00002319",
|
146
|
+
"\\U0000231c-\\U00002327",
|
147
|
+
"\\U00002329-\\U000023ce",
|
148
|
+
"\\U000023d0-\\U000023e8",
|
149
|
+
"\\U000023f4-\\U000023f7",
|
150
|
+
"\\U000023fb-\\U00002426",
|
141
151
|
"\\U00002440-\\U0000244a",
|
142
|
-
"\\U0000249c-\\
|
143
|
-
"\\
|
144
|
-
"\\
|
152
|
+
"\\U0000249c-\\U000024c1",
|
153
|
+
"\\U000024c3-\\U000024e9",
|
154
|
+
"\\U00002500-\\U000025a9",
|
155
|
+
"\\U000025ac-\\U000025b5",
|
156
|
+
"\\U000025b7-\\U000025bf",
|
157
|
+
"\\U000025c1-\\U000025fa",
|
158
|
+
"\\U000025ff",
|
159
|
+
"\\U00002605-\\U0000260d",
|
160
|
+
"\\U0000260f-\\U00002610",
|
161
|
+
"\\U00002612-\\U00002613",
|
162
|
+
"\\U00002616-\\U00002617",
|
163
|
+
"\\U00002619-\\U0000261c",
|
164
|
+
"\\U0000261e-\\U0000261f",
|
165
|
+
"\\U00002621",
|
166
|
+
"\\U00002624-\\U00002625",
|
167
|
+
"\\U00002627-\\U00002629",
|
168
|
+
"\\U0000262b-\\U0000262d",
|
169
|
+
"\\U00002630-\\U00002637",
|
170
|
+
"\\U0000263b-\\U0000263f",
|
171
|
+
"\\U00002641",
|
172
|
+
"\\U00002643-\\U00002647",
|
173
|
+
"\\U00002654-\\U0000265e",
|
174
|
+
"\\U00002661-\\U00002662",
|
175
|
+
"\\U00002664",
|
176
|
+
"\\U00002667",
|
177
|
+
"\\U00002669-\\U0000267a",
|
178
|
+
"\\U0000267c-\\U0000267d",
|
179
|
+
"\\U00002680-\\U00002691",
|
180
|
+
"\\U00002698",
|
181
|
+
"\\U0000269a",
|
182
|
+
"\\U0000269d-\\U0000269f",
|
183
|
+
"\\U000026a2-\\U000026a6",
|
184
|
+
"\\U000026a8-\\U000026a9",
|
185
|
+
"\\U000026ac-\\U000026af",
|
186
|
+
"\\U000026b2-\\U000026bc",
|
187
|
+
"\\U000026bf-\\U000026c3",
|
188
|
+
"\\U000026c6-\\U000026c7",
|
189
|
+
"\\U000026c9-\\U000026cd",
|
190
|
+
"\\U000026d0",
|
191
|
+
"\\U000026d2",
|
192
|
+
"\\U000026d5-\\U000026e8",
|
193
|
+
"\\U000026eb-\\U000026ef",
|
194
|
+
"\\U000026f6",
|
195
|
+
"\\U000026fb-\\U000026fc",
|
196
|
+
"\\U000026fe-\\U00002701",
|
197
|
+
"\\U00002703-\\U00002704",
|
198
|
+
"\\U00002706-\\U00002707",
|
199
|
+
"\\U0000270e",
|
200
|
+
"\\U00002710-\\U00002711",
|
201
|
+
"\\U00002713",
|
202
|
+
"\\U00002715",
|
203
|
+
"\\U00002717-\\U0000271c",
|
204
|
+
"\\U0000271e-\\U00002720",
|
205
|
+
"\\U00002722-\\U00002727",
|
206
|
+
"\\U00002729-\\U00002732",
|
207
|
+
"\\U00002735-\\U00002743",
|
208
|
+
"\\U00002745-\\U00002746",
|
209
|
+
"\\U00002748-\\U0000274b",
|
210
|
+
"\\U0000274d",
|
211
|
+
"\\U0000274f-\\U00002752",
|
212
|
+
"\\U00002756",
|
213
|
+
"\\U00002758-\\U00002762",
|
214
|
+
"\\U00002765-\\U00002775",
|
215
|
+
"\\U00002794",
|
216
|
+
"\\U00002798-\\U000027a0",
|
217
|
+
"\\U000027a2-\\U000027af",
|
218
|
+
"\\U000027b1-\\U000027be",
|
219
|
+
"\\U000027c0-\\U00002933",
|
220
|
+
"\\U00002936-\\U00002b04",
|
221
|
+
"\\U00002b08-\\U00002b1a",
|
222
|
+
"\\U00002b1d-\\U00002b4f",
|
223
|
+
"\\U00002b51-\\U00002b54",
|
224
|
+
"\\U00002b56-\\U00002b73",
|
145
225
|
"\\U00002b76-\\U00002b95",
|
146
226
|
"\\U00002b97-\\U00002bff",
|
147
227
|
"\\U00002ce5-\\U00002cea",
|
@@ -156,9 +236,8 @@ UNICODE_PUNCT_RANGES = [
|
|
156
236
|
"\\U00002ff0-\\U00002fff",
|
157
237
|
"\\U00003001-\\U00003004",
|
158
238
|
"\\U00003008-\\U00003020",
|
159
|
-
"\\U00003030",
|
160
239
|
"\\U00003036-\\U00003037",
|
161
|
-
"\\
|
240
|
+
"\\U0000303e-\\U0000303f",
|
162
241
|
"\\U0000309b-\\U0000309c",
|
163
242
|
"\\U000030a0",
|
164
243
|
"\\U000030fb",
|
@@ -170,7 +249,9 @@ UNICODE_PUNCT_RANGES = [
|
|
170
249
|
"\\U0000322a-\\U00003247",
|
171
250
|
"\\U00003250",
|
172
251
|
"\\U00003260-\\U0000327f",
|
173
|
-
"\\U0000328a-\\
|
252
|
+
"\\U0000328a-\\U00003296",
|
253
|
+
"\\U00003298",
|
254
|
+
"\\U0000329a-\\U000032b0",
|
174
255
|
"\\U000032c0-\\U000033ff",
|
175
256
|
"\\U00004dc0-\\U00004dff",
|
176
257
|
"\\U0000a490-\\U0000a4c6",
|
@@ -314,49 +395,97 @@ UNICODE_PUNCT_RANGES = [
|
|
314
395
|
"\\U0001ecb0",
|
315
396
|
"\\U0001ed2e",
|
316
397
|
"\\U0001eef0-\\U0001eef1",
|
317
|
-
"\\U0001f000-\\
|
398
|
+
"\\U0001f000-\\U0001f003",
|
399
|
+
"\\U0001f005-\\U0001f02b",
|
318
400
|
"\\U0001f030-\\U0001f093",
|
319
401
|
"\\U0001f0a0-\\U0001f0ae",
|
320
402
|
"\\U0001f0b1-\\U0001f0bf",
|
321
|
-
"\\U0001f0c1-\\
|
403
|
+
"\\U0001f0c1-\\U0001f0ce",
|
322
404
|
"\\U0001f0d1-\\U0001f0f5",
|
323
|
-
"\\U0001f10d-\\
|
324
|
-
"\\
|
325
|
-
"\\
|
326
|
-
"\\
|
327
|
-
"\\
|
328
|
-
"\\
|
405
|
+
"\\U0001f10d-\\U0001f16f",
|
406
|
+
"\\U0001f172-\\U0001f17d",
|
407
|
+
"\\U0001f180-\\U0001f18d",
|
408
|
+
"\\U0001f18f-\\U0001f190",
|
409
|
+
"\\U0001f19b-\\U0001f1ad",
|
410
|
+
"\\U0001f1e6-\\U0001f1e7",
|
411
|
+
"\\U0001f1ea-\\U0001f1eb",
|
412
|
+
"\\U0001f1ee-\\U0001f1f1",
|
413
|
+
"\\U0001f1f4-\\U0001f1f6",
|
414
|
+
"\\U0001f1f9-\\U0001f200",
|
415
|
+
"\\U0001f210-\\U0001f219",
|
416
|
+
"\\U0001f21b-\\U0001f22e",
|
417
|
+
"\\U0001f230-\\U0001f231",
|
418
|
+
"\\U0001f23b",
|
329
419
|
"\\U0001f240-\\U0001f248",
|
330
|
-
"\\U0001f250-\\U0001f251",
|
331
420
|
"\\U0001f260-\\U0001f265",
|
332
|
-
"\\
|
333
|
-
"\\
|
334
|
-
"\\
|
421
|
+
"\\U0001f322-\\U0001f323",
|
422
|
+
"\\U0001f394-\\U0001f395",
|
423
|
+
"\\U0001f398",
|
424
|
+
"\\U0001f39c-\\U0001f39d",
|
425
|
+
"\\U0001f3f1-\\U0001f3f2",
|
426
|
+
"\\U0001f3f6",
|
427
|
+
"\\U0001f4fe",
|
428
|
+
"\\U0001f53e-\\U0001f548",
|
429
|
+
"\\U0001f54f",
|
430
|
+
"\\U0001f568-\\U0001f56e",
|
431
|
+
"\\U0001f571-\\U0001f572",
|
432
|
+
"\\U0001f57b-\\U0001f586",
|
433
|
+
"\\U0001f588-\\U0001f589",
|
434
|
+
"\\U0001f58e-\\U0001f58f",
|
435
|
+
"\\U0001f591-\\U0001f594",
|
436
|
+
"\\U0001f597-\\U0001f5a3",
|
437
|
+
"\\U0001f5a6-\\U0001f5a7",
|
438
|
+
"\\U0001f5a9-\\U0001f5b0",
|
439
|
+
"\\U0001f5b3-\\U0001f5bb",
|
440
|
+
"\\U0001f5bd-\\U0001f5c1",
|
441
|
+
"\\U0001f5c5-\\U0001f5d0",
|
442
|
+
"\\U0001f5d4-\\U0001f5db",
|
443
|
+
"\\U0001f5df-\\U0001f5e0",
|
444
|
+
"\\U0001f5e2",
|
445
|
+
"\\U0001f5e4-\\U0001f5e7",
|
446
|
+
"\\U0001f5e9-\\U0001f5ee",
|
447
|
+
"\\U0001f5f0-\\U0001f5f2",
|
448
|
+
"\\U0001f5f4-\\U0001f5f9",
|
449
|
+
"\\U0001f650-\\U0001f67f",
|
450
|
+
"\\U0001f6c6-\\U0001f6ca",
|
451
|
+
"\\U0001f6d3-\\U0001f6d4",
|
452
|
+
"\\U0001f6e6-\\U0001f6e8",
|
453
|
+
"\\U0001f6ea",
|
454
|
+
"\\U0001f6f1-\\U0001f6f2",
|
335
455
|
"\\U0001f700-\\U0001f776",
|
336
456
|
"\\U0001f77b-\\U0001f7d9",
|
337
|
-
"\\U0001f7e0-\\U0001f7eb",
|
338
|
-
"\\U0001f7f0",
|
339
457
|
"\\U0001f800-\\U0001f80b",
|
340
458
|
"\\U0001f810-\\U0001f847",
|
341
459
|
"\\U0001f850-\\U0001f859",
|
342
460
|
"\\U0001f860-\\U0001f887",
|
343
461
|
"\\U0001f890-\\U0001f8ad",
|
344
462
|
"\\U0001f8b0-\\U0001f8b1",
|
345
|
-
"\\U0001f900-\\
|
463
|
+
"\\U0001f900-\\U0001f90b",
|
464
|
+
"\\U0001f93b",
|
465
|
+
"\\U0001f946",
|
466
|
+
"\\U0001fa00-\\U0001fa53",
|
346
467
|
"\\U0001fa60-\\U0001fa6d",
|
347
|
-
"\\U0001fa70-\\U0001fa7c",
|
348
|
-
"\\U0001fa80-\\U0001fa88",
|
349
|
-
"\\U0001fa90-\\U0001fabd",
|
350
|
-
"\\U0001fabf-\\U0001fac5",
|
351
|
-
"\\U0001face-\\U0001fadb",
|
352
|
-
"\\U0001fae0-\\U0001fae8",
|
353
|
-
"\\U0001faf0-\\U0001faf8",
|
354
468
|
"\\U0001fb00-\\U0001fb92",
|
355
469
|
"\\U0001fb94-\\U0001fbca",
|
356
|
-
"\\U000f1990-\\U000f199d",
|
470
|
+
"\\U000f1990-\\U000f199d",
|
357
471
|
]
|
358
472
|
|
359
|
-
|
473
|
+
|
474
|
+
NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
|
475
|
+
ALL_VARIATION_SELECTOR_RANGES = ["\\U0000fe00-\\U0000fe0f", "\\U000e0100-\\U000e01ef"]
|
476
|
+
EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
|
477
|
+
EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
|
478
|
+
"""All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
|
479
|
+
mark these two as punctuation, since they are used exclusively for rendering
|
480
|
+
emoji.
|
481
|
+
|
482
|
+
But it's even better to use the Emoji filter.
|
483
|
+
"""
|
484
|
+
|
485
|
+
UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
|
486
|
+
UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
|
487
|
+
"""Private Use Area glyphs are given the apt but unhelpful 'Private Use'
|
488
|
+
class."""
|
360
489
|
|
361
490
|
UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
|
362
491
|
# this is a large string.
|
@@ -366,7 +495,7 @@ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
|
|
366
495
|
POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
|
367
496
|
|
368
497
|
ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
|
369
|
-
|
498
|
+
ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
|
370
499
|
# combined bc the result could be simpler
|
371
500
|
|
372
501
|
SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
|
@@ -374,6 +503,8 @@ SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
|
|
374
503
|
|
375
504
|
LINKU = Path(__file__).resolve().parent / Path("linku.json")
|
376
505
|
SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
|
506
|
+
SYLLABICS = Path(__file__).resolve().parent / Path("syllabic.txt")
|
507
|
+
ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
|
377
508
|
|
378
509
|
VOWELS = "aeiou"
|
379
510
|
CONSONANTS = "jklmnpstw"
|
@@ -388,23 +519,150 @@ ALLOWABLES = {
|
|
388
519
|
"kxk", # ken ala ken
|
389
520
|
"wxw", # wile ala wile
|
390
521
|
"msa",
|
522
|
+
"anusem",
|
391
523
|
}
|
392
524
|
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
"some",
|
525
|
+
# NOTE: This is being tracked manually rather than fetched from syllabic.txt until I am convinced that solution is appropriate
|
526
|
+
FALSE_POS_SYLLABIC = {
|
527
|
+
# ordered by frequency in previous TPT data
|
397
528
|
"like",
|
529
|
+
"same",
|
530
|
+
"nope",
|
531
|
+
"uwu", # TODO: emoticon?? uhh?
|
532
|
+
"non",
|
533
|
+
"owo", # TODO: emoticon??
|
534
|
+
"one",
|
535
|
+
"to",
|
536
|
+
"i",
|
537
|
+
"awesome",
|
398
538
|
"use",
|
399
|
-
"
|
539
|
+
"name",
|
400
540
|
"time",
|
541
|
+
"imo", # "in my opinion"
|
401
542
|
"man",
|
402
|
-
"
|
543
|
+
# "son", # sona typo?
|
544
|
+
"joke",
|
545
|
+
# pon would go here
|
546
|
+
"so",
|
547
|
+
"ten",
|
548
|
+
"make",
|
549
|
+
"pin",
|
550
|
+
"note",
|
551
|
+
# "aka" # in sandbox
|
552
|
+
"into",
|
553
|
+
"in",
|
554
|
+
"no",
|
555
|
+
"some",
|
556
|
+
# "papa",
|
557
|
+
"on",
|
558
|
+
"me",
|
559
|
+
"ipa",
|
560
|
+
"sun",
|
561
|
+
"mine",
|
562
|
+
"sense",
|
563
|
+
"none",
|
564
|
+
"meme",
|
565
|
+
"wise",
|
566
|
+
# "ono", # TODO: what is this
|
567
|
+
"mon",
|
568
|
+
"take",
|
569
|
+
"luna",
|
570
|
+
"elo",
|
571
|
+
"japanese",
|
572
|
+
"an",
|
573
|
+
"anti",
|
574
|
+
"win",
|
575
|
+
"won",
|
576
|
+
"we", # word in sandbox
|
577
|
+
"men",
|
578
|
+
"ton",
|
579
|
+
"woke",
|
580
|
+
"sen", # seen
|
581
|
+
"se", # see
|
582
|
+
"semi",
|
583
|
+
"male",
|
584
|
+
# "pen", # borderline
|
585
|
+
"woman",
|
586
|
+
"line",
|
587
|
+
"meta",
|
588
|
+
"mini",
|
589
|
+
"sine",
|
590
|
+
# "min", # borderline
|
591
|
+
"oposite",
|
592
|
+
"anime",
|
593
|
+
"potato",
|
594
|
+
# "japan",
|
595
|
+
"nose",
|
596
|
+
"kilo",
|
597
|
+
"alone",
|
598
|
+
"minute",
|
599
|
+
"late",
|
600
|
+
"women",
|
601
|
+
"leson",
|
602
|
+
"amen",
|
603
|
+
"tote",
|
604
|
+
"lame",
|
605
|
+
"online",
|
606
|
+
"tone",
|
607
|
+
"ate",
|
608
|
+
"mile",
|
609
|
+
"melon",
|
610
|
+
"tense",
|
611
|
+
"nonsense",
|
612
|
+
"nine",
|
613
|
+
"emo",
|
614
|
+
"unlike",
|
615
|
+
"lone",
|
616
|
+
# manual additions
|
617
|
+
"alike",
|
618
|
+
"amuse",
|
619
|
+
"antelope",
|
620
|
+
"antena",
|
621
|
+
"apetite",
|
622
|
+
"asasin",
|
623
|
+
"asasinate",
|
624
|
+
"asinine",
|
625
|
+
"asinine",
|
626
|
+
"asume",
|
627
|
+
"atone",
|
628
|
+
"awake",
|
629
|
+
"awaken",
|
630
|
+
"eliminate",
|
631
|
+
"elite",
|
632
|
+
"misuse",
|
633
|
+
"emanate",
|
634
|
+
"iluminate",
|
635
|
+
"imense",
|
636
|
+
"imitate",
|
637
|
+
"insane",
|
638
|
+
"insolate",
|
639
|
+
"insulate",
|
640
|
+
"intense",
|
641
|
+
"lemon",
|
642
|
+
"manipulate",
|
403
643
|
}
|
404
644
|
|
405
|
-
|
406
|
-
|
407
|
-
|
645
|
+
FALSE_POS_ALPHABETIC: Set[str] = {
|
646
|
+
"t",
|
647
|
+
"is",
|
648
|
+
"as",
|
649
|
+
"not",
|
650
|
+
"link",
|
651
|
+
"wait",
|
652
|
+
"lol",
|
653
|
+
"new",
|
654
|
+
"also",
|
655
|
+
"isn", # TODO: tokenizer....
|
656
|
+
"mean",
|
657
|
+
"means",
|
658
|
+
"it",
|
659
|
+
"moment",
|
660
|
+
"its",
|
661
|
+
"lmao",
|
662
|
+
"new",
|
663
|
+
"wel",
|
664
|
+
"makes",
|
665
|
+
}
|
408
666
|
|
409
667
|
UCSUR_RANGES = [
|
410
668
|
"\\U000F1900-\\U000F1977", # pu
|
@@ -439,15 +697,23 @@ with open(SANDBOX) as f:
|
|
439
697
|
sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
|
440
698
|
NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
|
441
699
|
|
700
|
+
# with open(SYLLABICS) as f:
|
701
|
+
# FALSE_POS_SYLLABIC = {line.strip() for line in f}
|
702
|
+
#
|
703
|
+
# with open(ALPHABETICS) as f:
|
704
|
+
# FALSE_POS_ALPHABETIC = {line.strip() for line in f}
|
705
|
+
|
442
706
|
del linku
|
443
707
|
del sandbox
|
444
708
|
|
445
709
|
__all__ = [
|
446
710
|
"ALLOWABLES",
|
447
711
|
"ALL_PUNCT",
|
448
|
-
"
|
712
|
+
"ALL_PUNCT_RANGES_STR",
|
449
713
|
"ALPHABET",
|
450
714
|
"CONSONANTS",
|
715
|
+
"EMOJI_VARIATION_SELECTOR_RANGES",
|
716
|
+
"EMOJI_VARIATION_SELECTOR_RANGES_STR",
|
451
717
|
"NIMI_KU_LILI",
|
452
718
|
"NIMI_KU_SULI",
|
453
719
|
"NIMI_LINKU_COMMON",
|
@@ -459,6 +725,8 @@ __all__ = [
|
|
459
725
|
"NIMI_PU_SYNONYMS",
|
460
726
|
"POSIX_PUNCT",
|
461
727
|
"POSIX_PUNCT_RANGES",
|
728
|
+
"UCSUR_PUNCT_RANGES",
|
729
|
+
"UCSUR_PUNCT_RANGES_STR",
|
462
730
|
"UNICODE_PUNCT",
|
463
731
|
"UNICODE_PUNCT_RANGES",
|
464
732
|
"VOWELS",
|
sonatoki/ilo.py
CHANGED
@@ -119,7 +119,7 @@ class Ilo:
|
|
119
119
|
*_, result = self._is_toki_pona(message)
|
120
120
|
return result
|
121
121
|
|
122
|
-
def _are_toki_pona(self, message: str):
|
122
|
+
def _are_toki_pona(self, message: str) -> List[Scorecard]:
|
123
123
|
"""Split a message into sentences, then return a list of each sentence's
|
124
124
|
results via `self._is_toki_pona()`.
|
125
125
|
|