sonatoki 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/constants.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # STL
2
2
  import json
3
- from typing import Set, Dict, List
3
+ from typing import Set, Dict
4
4
  from pathlib import Path
5
5
 
6
6
  # LOCAL
@@ -15,9 +15,9 @@ UNICODE_PUNCT_RANGES = [
15
15
  "\\U0000003a-\\U00000040",
16
16
  "\\U0000005b-\\U00000060",
17
17
  "\\U0000007b-\\U0000007e",
18
- "\\U000000a1-\\U000000a9",
18
+ "\\U000000a1-\\U000000a8",
19
19
  "\\U000000ab-\\U000000ac",
20
- "\\U000000ae-\\U000000b1",
20
+ "\\U000000af-\\U000000b1",
21
21
  "\\U000000b4",
22
22
  "\\U000000b6-\\U000000b8",
23
23
  "\\U000000bb",
@@ -118,7 +118,9 @@ UNICODE_PUNCT_RANGES = [
118
118
  "\\U00001fed-\\U00001fef",
119
119
  "\\U00001ffd-\\U00001ffe",
120
120
  "\\U00002010-\\U00002027",
121
- "\\U00002030-\\U0000205e",
121
+ "\\U00002030-\\U0000203b",
122
+ "\\U0000203d-\\U00002048",
123
+ "\\U0000204a-\\U0000205e",
122
124
  "\\U0000207a-\\U0000207e",
123
125
  "\\U0000208a-\\U0000208e",
124
126
  "\\U000020a0-\\U000020c0",
@@ -127,7 +129,8 @@ UNICODE_PUNCT_RANGES = [
127
129
  "\\U00002108-\\U00002109",
128
130
  "\\U00002114",
129
131
  "\\U00002116-\\U00002118",
130
- "\\U0000211e-\\U00002123",
132
+ "\\U0000211e-\\U00002121",
133
+ "\\U00002123",
131
134
  "\\U00002125",
132
135
  "\\U00002127",
133
136
  "\\U00002129",
@@ -137,11 +140,88 @@ UNICODE_PUNCT_RANGES = [
137
140
  "\\U0000214a-\\U0000214d",
138
141
  "\\U0000214f",
139
142
  "\\U0000218a-\\U0000218b",
140
- "\\U00002190-\\U00002426",
143
+ "\\U00002190-\\U00002193",
144
+ "\\U0000219a-\\U000021a8",
145
+ "\\U000021ab-\\U00002319",
146
+ "\\U0000231c-\\U00002327",
147
+ "\\U00002329-\\U000023ce",
148
+ "\\U000023d0-\\U000023e8",
149
+ "\\U000023f4-\\U000023f7",
150
+ "\\U000023fb-\\U00002426",
141
151
  "\\U00002440-\\U0000244a",
142
- "\\U0000249c-\\U000024b5",
143
- "\\U00002500-\\U00002775",
144
- "\\U00002794-\\U00002b73",
152
+ "\\U0000249c-\\U000024c1",
153
+ "\\U000024c3-\\U000024e9",
154
+ "\\U00002500-\\U000025a9",
155
+ "\\U000025ac-\\U000025b5",
156
+ "\\U000025b7-\\U000025bf",
157
+ "\\U000025c1-\\U000025fa",
158
+ "\\U000025ff",
159
+ "\\U00002605-\\U0000260d",
160
+ "\\U0000260f-\\U00002610",
161
+ "\\U00002612-\\U00002613",
162
+ "\\U00002616-\\U00002617",
163
+ "\\U00002619-\\U0000261c",
164
+ "\\U0000261e-\\U0000261f",
165
+ "\\U00002621",
166
+ "\\U00002624-\\U00002625",
167
+ "\\U00002627-\\U00002629",
168
+ "\\U0000262b-\\U0000262d",
169
+ "\\U00002630-\\U00002637",
170
+ "\\U0000263b-\\U0000263f",
171
+ "\\U00002641",
172
+ "\\U00002643-\\U00002647",
173
+ "\\U00002654-\\U0000265e",
174
+ "\\U00002661-\\U00002662",
175
+ "\\U00002664",
176
+ "\\U00002667",
177
+ "\\U00002669-\\U0000267a",
178
+ "\\U0000267c-\\U0000267d",
179
+ "\\U00002680-\\U00002691",
180
+ "\\U00002698",
181
+ "\\U0000269a",
182
+ "\\U0000269d-\\U0000269f",
183
+ "\\U000026a2-\\U000026a6",
184
+ "\\U000026a8-\\U000026a9",
185
+ "\\U000026ac-\\U000026af",
186
+ "\\U000026b2-\\U000026bc",
187
+ "\\U000026bf-\\U000026c3",
188
+ "\\U000026c6-\\U000026c7",
189
+ "\\U000026c9-\\U000026cd",
190
+ "\\U000026d0",
191
+ "\\U000026d2",
192
+ "\\U000026d5-\\U000026e8",
193
+ "\\U000026eb-\\U000026ef",
194
+ "\\U000026f6",
195
+ "\\U000026fb-\\U000026fc",
196
+ "\\U000026fe-\\U00002701",
197
+ "\\U00002703-\\U00002704",
198
+ "\\U00002706-\\U00002707",
199
+ "\\U0000270e",
200
+ "\\U00002710-\\U00002711",
201
+ "\\U00002713",
202
+ "\\U00002715",
203
+ "\\U00002717-\\U0000271c",
204
+ "\\U0000271e-\\U00002720",
205
+ "\\U00002722-\\U00002727",
206
+ "\\U00002729-\\U00002732",
207
+ "\\U00002735-\\U00002743",
208
+ "\\U00002745-\\U00002746",
209
+ "\\U00002748-\\U0000274b",
210
+ "\\U0000274d",
211
+ "\\U0000274f-\\U00002752",
212
+ "\\U00002756",
213
+ "\\U00002758-\\U00002762",
214
+ "\\U00002765-\\U00002775",
215
+ "\\U00002794",
216
+ "\\U00002798-\\U000027a0",
217
+ "\\U000027a2-\\U000027af",
218
+ "\\U000027b1-\\U000027be",
219
+ "\\U000027c0-\\U00002933",
220
+ "\\U00002936-\\U00002b04",
221
+ "\\U00002b08-\\U00002b1a",
222
+ "\\U00002b1d-\\U00002b4f",
223
+ "\\U00002b51-\\U00002b54",
224
+ "\\U00002b56-\\U00002b73",
145
225
  "\\U00002b76-\\U00002b95",
146
226
  "\\U00002b97-\\U00002bff",
147
227
  "\\U00002ce5-\\U00002cea",
@@ -156,9 +236,8 @@ UNICODE_PUNCT_RANGES = [
156
236
  "\\U00002ff0-\\U00002fff",
157
237
  "\\U00003001-\\U00003004",
158
238
  "\\U00003008-\\U00003020",
159
- "\\U00003030",
160
239
  "\\U00003036-\\U00003037",
161
- "\\U0000303d-\\U0000303f",
240
+ "\\U0000303e-\\U0000303f",
162
241
  "\\U0000309b-\\U0000309c",
163
242
  "\\U000030a0",
164
243
  "\\U000030fb",
@@ -170,7 +249,9 @@ UNICODE_PUNCT_RANGES = [
170
249
  "\\U0000322a-\\U00003247",
171
250
  "\\U00003250",
172
251
  "\\U00003260-\\U0000327f",
173
- "\\U0000328a-\\U000032b0",
252
+ "\\U0000328a-\\U00003296",
253
+ "\\U00003298",
254
+ "\\U0000329a-\\U000032b0",
174
255
  "\\U000032c0-\\U000033ff",
175
256
  "\\U00004dc0-\\U00004dff",
176
257
  "\\U0000a490-\\U0000a4c6",
@@ -314,49 +395,97 @@ UNICODE_PUNCT_RANGES = [
314
395
  "\\U0001ecb0",
315
396
  "\\U0001ed2e",
316
397
  "\\U0001eef0-\\U0001eef1",
317
- "\\U0001f000-\\U0001f02b",
398
+ "\\U0001f000-\\U0001f003",
399
+ "\\U0001f005-\\U0001f02b",
318
400
  "\\U0001f030-\\U0001f093",
319
401
  "\\U0001f0a0-\\U0001f0ae",
320
402
  "\\U0001f0b1-\\U0001f0bf",
321
- "\\U0001f0c1-\\U0001f0cf",
403
+ "\\U0001f0c1-\\U0001f0ce",
322
404
  "\\U0001f0d1-\\U0001f0f5",
323
- "\\U0001f10d-\\U0001f12f",
324
- "\\U0001f14a-\\U0001f14f",
325
- "\\U0001f16a-\\U0001f16f",
326
- "\\U0001f18a-\\U0001f1ad",
327
- "\\U0001f1e6-\\U0001f202",
328
- "\\U0001f210-\\U0001f23b",
405
+ "\\U0001f10d-\\U0001f16f",
406
+ "\\U0001f172-\\U0001f17d",
407
+ "\\U0001f180-\\U0001f18d",
408
+ "\\U0001f18f-\\U0001f190",
409
+ "\\U0001f19b-\\U0001f1ad",
410
+ "\\U0001f1e6-\\U0001f1e7",
411
+ "\\U0001f1ea-\\U0001f1eb",
412
+ "\\U0001f1ee-\\U0001f1f1",
413
+ "\\U0001f1f4-\\U0001f1f6",
414
+ "\\U0001f1f9-\\U0001f200",
415
+ "\\U0001f210-\\U0001f219",
416
+ "\\U0001f21b-\\U0001f22e",
417
+ "\\U0001f230-\\U0001f231",
418
+ "\\U0001f23b",
329
419
  "\\U0001f240-\\U0001f248",
330
- "\\U0001f250-\\U0001f251",
331
420
  "\\U0001f260-\\U0001f265",
332
- "\\U0001f300-\\U0001f6d7",
333
- "\\U0001f6dc-\\U0001f6ec",
334
- "\\U0001f6f0-\\U0001f6fc",
421
+ "\\U0001f322-\\U0001f323",
422
+ "\\U0001f394-\\U0001f395",
423
+ "\\U0001f398",
424
+ "\\U0001f39c-\\U0001f39d",
425
+ "\\U0001f3f1-\\U0001f3f2",
426
+ "\\U0001f3f6",
427
+ "\\U0001f4fe",
428
+ "\\U0001f53e-\\U0001f548",
429
+ "\\U0001f54f",
430
+ "\\U0001f568-\\U0001f56e",
431
+ "\\U0001f571-\\U0001f572",
432
+ "\\U0001f57b-\\U0001f586",
433
+ "\\U0001f588-\\U0001f589",
434
+ "\\U0001f58e-\\U0001f58f",
435
+ "\\U0001f591-\\U0001f594",
436
+ "\\U0001f597-\\U0001f5a3",
437
+ "\\U0001f5a6-\\U0001f5a7",
438
+ "\\U0001f5a9-\\U0001f5b0",
439
+ "\\U0001f5b3-\\U0001f5bb",
440
+ "\\U0001f5bd-\\U0001f5c1",
441
+ "\\U0001f5c5-\\U0001f5d0",
442
+ "\\U0001f5d4-\\U0001f5db",
443
+ "\\U0001f5df-\\U0001f5e0",
444
+ "\\U0001f5e2",
445
+ "\\U0001f5e4-\\U0001f5e7",
446
+ "\\U0001f5e9-\\U0001f5ee",
447
+ "\\U0001f5f0-\\U0001f5f2",
448
+ "\\U0001f5f4-\\U0001f5f9",
449
+ "\\U0001f650-\\U0001f67f",
450
+ "\\U0001f6c6-\\U0001f6ca",
451
+ "\\U0001f6d3-\\U0001f6d4",
452
+ "\\U0001f6e6-\\U0001f6e8",
453
+ "\\U0001f6ea",
454
+ "\\U0001f6f1-\\U0001f6f2",
335
455
  "\\U0001f700-\\U0001f776",
336
456
  "\\U0001f77b-\\U0001f7d9",
337
- "\\U0001f7e0-\\U0001f7eb",
338
- "\\U0001f7f0",
339
457
  "\\U0001f800-\\U0001f80b",
340
458
  "\\U0001f810-\\U0001f847",
341
459
  "\\U0001f850-\\U0001f859",
342
460
  "\\U0001f860-\\U0001f887",
343
461
  "\\U0001f890-\\U0001f8ad",
344
462
  "\\U0001f8b0-\\U0001f8b1",
345
- "\\U0001f900-\\U0001fa53",
463
+ "\\U0001f900-\\U0001f90b",
464
+ "\\U0001f93b",
465
+ "\\U0001f946",
466
+ "\\U0001fa00-\\U0001fa53",
346
467
  "\\U0001fa60-\\U0001fa6d",
347
- "\\U0001fa70-\\U0001fa7c",
348
- "\\U0001fa80-\\U0001fa88",
349
- "\\U0001fa90-\\U0001fabd",
350
- "\\U0001fabf-\\U0001fac5",
351
- "\\U0001face-\\U0001fadb",
352
- "\\U0001fae0-\\U0001fae8",
353
- "\\U0001faf0-\\U0001faf8",
354
468
  "\\U0001fb00-\\U0001fb92",
355
469
  "\\U0001fb94-\\U0001fbca",
356
- "\\U000f1990-\\U000f199d", # UCSUR punctuation
470
+ "\\U000f1990-\\U000f199d",
357
471
  ]
358
472
 
359
- UCSUR_PUNCT_RANGES = UNICODE_PUNCT_RANGES[-1] # NOTE: THIS CAN CHANGE
473
+
474
+ NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
475
+ ALL_VARIATION_SELECTOR_RANGES = ["\\U0000fe00-\\U0000fe0f", "\\U000e0100-\\U000e01ef"]
476
+ EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
477
+ EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
478
+ """All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
479
+ mark these two as punctuation, since they are used exclusively for rendering
480
+ emoji.
481
+
482
+ But it's even better to use the Emoji filter.
483
+ """
484
+
485
+ UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
486
+ UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
487
+ """Private Use Area glyphs are given the apt but unhelpful 'Private Use'
488
+ class."""
360
489
 
361
490
  UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
362
491
  # this is a large string.
@@ -366,7 +495,7 @@ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
366
495
  POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
367
496
 
368
497
  ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
369
- ALL_PUNCT_RANGES = "".join(find_unicode_ranges(ALL_PUNCT))
498
+ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
370
499
  # combined bc the result could be simpler
371
500
 
372
501
  SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
@@ -374,6 +503,8 @@ SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
374
503
 
375
504
  LINKU = Path(__file__).resolve().parent / Path("linku.json")
376
505
  SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
506
+ SYLLABICS = Path(__file__).resolve().parent / Path("syllabic.txt")
507
+ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
377
508
 
378
509
  VOWELS = "aeiou"
379
510
  CONSONANTS = "jklmnpstw"
@@ -388,23 +519,150 @@ ALLOWABLES = {
388
519
  "kxk", # ken ala ken
389
520
  "wxw", # wile ala wile
390
521
  "msa",
522
+ "anusem",
391
523
  }
392
524
 
393
- PHONOMATCHES = {
394
- "non",
395
- "nope",
396
- "some",
525
+ # NOTE: This is being tracked manually rather than fetched from syllabics.txt until I am convinced that solution is appropriate
526
+ FALSE_POS_SYLLABIC = {
527
+ # ordered by frequency in previous TPT data
397
528
  "like",
529
+ "same",
530
+ "nope",
531
+ "uwu", # TODO: emoticon?? uhh?
532
+ "non",
533
+ "owo", # TODO: emoticon??
534
+ "one",
535
+ "to",
536
+ "i",
537
+ "awesome",
398
538
  "use",
399
- "imo",
539
+ "name",
400
540
  "time",
541
+ "imo", # "in my opinion"
401
542
  "man",
402
- "also",
543
+ # "son", # sona typo?
544
+ "joke",
545
+ # pon would go here
546
+ "so",
547
+ "ten",
548
+ "make",
549
+ "pin",
550
+ "note",
551
+ # "aka" # in sandbox
552
+ "into",
553
+ "in",
554
+ "no",
555
+ "some",
556
+ # "papa",
557
+ "on",
558
+ "me",
559
+ "ipa",
560
+ "sun",
561
+ "mine",
562
+ "sense",
563
+ "none",
564
+ "meme",
565
+ "wise",
566
+ # "ono", # TODO: what is this
567
+ "mon",
568
+ "take",
569
+ "luna",
570
+ "elo",
571
+ "japanese",
572
+ "an",
573
+ "anti",
574
+ "win",
575
+ "won",
576
+ "we", # word in sandbox
577
+ "men",
578
+ "ton",
579
+ "woke",
580
+ "sen", # seen
581
+ "se", # see
582
+ "semi",
583
+ "male",
584
+ # "pen", # borderline
585
+ "woman",
586
+ "line",
587
+ "meta",
588
+ "mini",
589
+ "sine",
590
+ # "min", # borderline
591
+ "oposite",
592
+ "anime",
593
+ "potato",
594
+ # "japan",
595
+ "nose",
596
+ "kilo",
597
+ "alone",
598
+ "minute",
599
+ "late",
600
+ "women",
601
+ "leson",
602
+ "amen",
603
+ "tote",
604
+ "lame",
605
+ "online",
606
+ "tone",
607
+ "ate",
608
+ "mile",
609
+ "melon",
610
+ "tense",
611
+ "nonsense",
612
+ "nine",
613
+ "emo",
614
+ "unlike",
615
+ "lone",
616
+ # manual additions
617
+ "alike",
618
+ "amuse",
619
+ "antelope",
620
+ "antena",
621
+ "apetite",
622
+ "asasin",
623
+ "asasinate",
624
+ "asinine",
625
+ "asinine",
626
+ "asume",
627
+ "atone",
628
+ "awake",
629
+ "awaken",
630
+ "eliminate",
631
+ "elite",
632
+ "misuse",
633
+ "emanate",
634
+ "iluminate",
635
+ "imense",
636
+ "imitate",
637
+ "insane",
638
+ "insolate",
639
+ "insulate",
640
+ "intense",
641
+ "lemon",
642
+ "manipulate",
403
643
  }
404
644
 
405
- ALPHABETIC_MATCHES: Set[str] = set()
406
-
407
- IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
645
+ FALSE_POS_ALPHABETIC: Set[str] = {
646
+ "t",
647
+ "is",
648
+ "as",
649
+ "not",
650
+ "link",
651
+ "wait",
652
+ "lol",
653
+ "new",
654
+ "also",
655
+ "isn", # TODO: tokenizer....
656
+ "mean",
657
+ "means",
658
+ "it",
659
+ "moment",
660
+ "its",
661
+ "lmao",
662
+ "new",
663
+ "wel",
664
+ "makes",
665
+ }
408
666
 
409
667
  UCSUR_RANGES = [
410
668
  "\\U000F1900-\\U000F1977", # pu
@@ -439,15 +697,23 @@ with open(SANDBOX) as f:
439
697
  sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
440
698
  NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
441
699
 
700
+ # with open(SYLLABICS) as f:
701
+ # FALSE_POS_SYLLABIC = {line.strip() for line in f}
702
+ #
703
+ # with open(ALPHABETICS) as f:
704
+ # FALSE_POS_ALPHABETIC = {line.strip() for line in f}
705
+
442
706
  del linku
443
707
  del sandbox
444
708
 
445
709
  __all__ = [
446
710
  "ALLOWABLES",
447
711
  "ALL_PUNCT",
448
- "ALL_PUNCT_RANGES",
712
+ "ALL_PUNCT_RANGES_STR",
449
713
  "ALPHABET",
450
714
  "CONSONANTS",
715
+ "EMOJI_VARIATION_SELECTOR_RANGES",
716
+ "EMOJI_VARIATION_SELECTOR_RANGES_STR",
451
717
  "NIMI_KU_LILI",
452
718
  "NIMI_KU_SULI",
453
719
  "NIMI_LINKU_COMMON",
@@ -459,6 +725,8 @@ __all__ = [
459
725
  "NIMI_PU_SYNONYMS",
460
726
  "POSIX_PUNCT",
461
727
  "POSIX_PUNCT_RANGES",
728
+ "UCSUR_PUNCT_RANGES",
729
+ "UCSUR_PUNCT_RANGES_STR",
462
730
  "UNICODE_PUNCT",
463
731
  "UNICODE_PUNCT_RANGES",
464
732
  "VOWELS",
sonatoki/ilo.py CHANGED
@@ -119,7 +119,7 @@ class Ilo:
119
119
  *_, result = self._is_toki_pona(message)
120
120
  return result
121
121
 
122
- def _are_toki_pona(self, message: str):
122
+ def _are_toki_pona(self, message: str) -> List[Scorecard]:
123
123
  """Split a message into sentences, then return a list each sentence's
124
124
  results via `self._is_toki_pona()`.
125
125