sonatoki 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
sonatoki/constants.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # STL
2
2
  import json
3
- from typing import Set, Dict, List
3
+ from typing import Set, Dict
4
4
  from pathlib import Path
5
5
 
6
6
  # LOCAL
@@ -15,9 +15,9 @@ UNICODE_PUNCT_RANGES = [
15
15
  "\\U0000003a-\\U00000040",
16
16
  "\\U0000005b-\\U00000060",
17
17
  "\\U0000007b-\\U0000007e",
18
- "\\U000000a1-\\U000000a9",
18
+ "\\U000000a1-\\U000000a8",
19
19
  "\\U000000ab-\\U000000ac",
20
- "\\U000000ae-\\U000000b1",
20
+ "\\U000000af-\\U000000b1",
21
21
  "\\U000000b4",
22
22
  "\\U000000b6-\\U000000b8",
23
23
  "\\U000000bb",
@@ -118,7 +118,9 @@ UNICODE_PUNCT_RANGES = [
118
118
  "\\U00001fed-\\U00001fef",
119
119
  "\\U00001ffd-\\U00001ffe",
120
120
  "\\U00002010-\\U00002027",
121
- "\\U00002030-\\U0000205e",
121
+ "\\U00002030-\\U0000203b",
122
+ "\\U0000203d-\\U00002048",
123
+ "\\U0000204a-\\U0000205e",
122
124
  "\\U0000207a-\\U0000207e",
123
125
  "\\U0000208a-\\U0000208e",
124
126
  "\\U000020a0-\\U000020c0",
@@ -127,7 +129,8 @@ UNICODE_PUNCT_RANGES = [
127
129
  "\\U00002108-\\U00002109",
128
130
  "\\U00002114",
129
131
  "\\U00002116-\\U00002118",
130
- "\\U0000211e-\\U00002123",
132
+ "\\U0000211e-\\U00002121",
133
+ "\\U00002123",
131
134
  "\\U00002125",
132
135
  "\\U00002127",
133
136
  "\\U00002129",
@@ -137,11 +140,88 @@ UNICODE_PUNCT_RANGES = [
137
140
  "\\U0000214a-\\U0000214d",
138
141
  "\\U0000214f",
139
142
  "\\U0000218a-\\U0000218b",
140
- "\\U00002190-\\U00002426",
143
+ "\\U00002190-\\U00002193",
144
+ "\\U0000219a-\\U000021a8",
145
+ "\\U000021ab-\\U00002319",
146
+ "\\U0000231c-\\U00002327",
147
+ "\\U00002329-\\U000023ce",
148
+ "\\U000023d0-\\U000023e8",
149
+ "\\U000023f4-\\U000023f7",
150
+ "\\U000023fb-\\U00002426",
141
151
  "\\U00002440-\\U0000244a",
142
- "\\U0000249c-\\U000024b5",
143
- "\\U00002500-\\U00002775",
144
- "\\U00002794-\\U00002b73",
152
+ "\\U0000249c-\\U000024c1",
153
+ "\\U000024c3-\\U000024e9",
154
+ "\\U00002500-\\U000025a9",
155
+ "\\U000025ac-\\U000025b5",
156
+ "\\U000025b7-\\U000025bf",
157
+ "\\U000025c1-\\U000025fa",
158
+ "\\U000025ff",
159
+ "\\U00002605-\\U0000260d",
160
+ "\\U0000260f-\\U00002610",
161
+ "\\U00002612-\\U00002613",
162
+ "\\U00002616-\\U00002617",
163
+ "\\U00002619-\\U0000261c",
164
+ "\\U0000261e-\\U0000261f",
165
+ "\\U00002621",
166
+ "\\U00002624-\\U00002625",
167
+ "\\U00002627-\\U00002629",
168
+ "\\U0000262b-\\U0000262d",
169
+ "\\U00002630-\\U00002637",
170
+ "\\U0000263b-\\U0000263f",
171
+ "\\U00002641",
172
+ "\\U00002643-\\U00002647",
173
+ "\\U00002654-\\U0000265e",
174
+ "\\U00002661-\\U00002662",
175
+ "\\U00002664",
176
+ "\\U00002667",
177
+ "\\U00002669-\\U0000267a",
178
+ "\\U0000267c-\\U0000267d",
179
+ "\\U00002680-\\U00002691",
180
+ "\\U00002698",
181
+ "\\U0000269a",
182
+ "\\U0000269d-\\U0000269f",
183
+ "\\U000026a2-\\U000026a6",
184
+ "\\U000026a8-\\U000026a9",
185
+ "\\U000026ac-\\U000026af",
186
+ "\\U000026b2-\\U000026bc",
187
+ "\\U000026bf-\\U000026c3",
188
+ "\\U000026c6-\\U000026c7",
189
+ "\\U000026c9-\\U000026cd",
190
+ "\\U000026d0",
191
+ "\\U000026d2",
192
+ "\\U000026d5-\\U000026e8",
193
+ "\\U000026eb-\\U000026ef",
194
+ "\\U000026f6",
195
+ "\\U000026fb-\\U000026fc",
196
+ "\\U000026fe-\\U00002701",
197
+ "\\U00002703-\\U00002704",
198
+ "\\U00002706-\\U00002707",
199
+ "\\U0000270e",
200
+ "\\U00002710-\\U00002711",
201
+ "\\U00002713",
202
+ "\\U00002715",
203
+ "\\U00002717-\\U0000271c",
204
+ "\\U0000271e-\\U00002720",
205
+ "\\U00002722-\\U00002727",
206
+ "\\U00002729-\\U00002732",
207
+ "\\U00002735-\\U00002743",
208
+ "\\U00002745-\\U00002746",
209
+ "\\U00002748-\\U0000274b",
210
+ "\\U0000274d",
211
+ "\\U0000274f-\\U00002752",
212
+ "\\U00002756",
213
+ "\\U00002758-\\U00002762",
214
+ "\\U00002765-\\U00002775",
215
+ "\\U00002794",
216
+ "\\U00002798-\\U000027a0",
217
+ "\\U000027a2-\\U000027af",
218
+ "\\U000027b1-\\U000027be",
219
+ "\\U000027c0-\\U00002933",
220
+ "\\U00002936-\\U00002b04",
221
+ "\\U00002b08-\\U00002b1a",
222
+ "\\U00002b1d-\\U00002b4f",
223
+ "\\U00002b51-\\U00002b54",
224
+ "\\U00002b56-\\U00002b73",
145
225
  "\\U00002b76-\\U00002b95",
146
226
  "\\U00002b97-\\U00002bff",
147
227
  "\\U00002ce5-\\U00002cea",
@@ -156,9 +236,8 @@ UNICODE_PUNCT_RANGES = [
156
236
  "\\U00002ff0-\\U00002fff",
157
237
  "\\U00003001-\\U00003004",
158
238
  "\\U00003008-\\U00003020",
159
- "\\U00003030",
160
239
  "\\U00003036-\\U00003037",
161
- "\\U0000303d-\\U0000303f",
240
+ "\\U0000303e-\\U0000303f",
162
241
  "\\U0000309b-\\U0000309c",
163
242
  "\\U000030a0",
164
243
  "\\U000030fb",
@@ -170,7 +249,9 @@ UNICODE_PUNCT_RANGES = [
170
249
  "\\U0000322a-\\U00003247",
171
250
  "\\U00003250",
172
251
  "\\U00003260-\\U0000327f",
173
- "\\U0000328a-\\U000032b0",
252
+ "\\U0000328a-\\U00003296",
253
+ "\\U00003298",
254
+ "\\U0000329a-\\U000032b0",
174
255
  "\\U000032c0-\\U000033ff",
175
256
  "\\U00004dc0-\\U00004dff",
176
257
  "\\U0000a490-\\U0000a4c6",
@@ -314,49 +395,97 @@ UNICODE_PUNCT_RANGES = [
314
395
  "\\U0001ecb0",
315
396
  "\\U0001ed2e",
316
397
  "\\U0001eef0-\\U0001eef1",
317
- "\\U0001f000-\\U0001f02b",
398
+ "\\U0001f000-\\U0001f003",
399
+ "\\U0001f005-\\U0001f02b",
318
400
  "\\U0001f030-\\U0001f093",
319
401
  "\\U0001f0a0-\\U0001f0ae",
320
402
  "\\U0001f0b1-\\U0001f0bf",
321
- "\\U0001f0c1-\\U0001f0cf",
403
+ "\\U0001f0c1-\\U0001f0ce",
322
404
  "\\U0001f0d1-\\U0001f0f5",
323
- "\\U0001f10d-\\U0001f12f",
324
- "\\U0001f14a-\\U0001f14f",
325
- "\\U0001f16a-\\U0001f16f",
326
- "\\U0001f18a-\\U0001f1ad",
327
- "\\U0001f1e6-\\U0001f202",
328
- "\\U0001f210-\\U0001f23b",
405
+ "\\U0001f10d-\\U0001f16f",
406
+ "\\U0001f172-\\U0001f17d",
407
+ "\\U0001f180-\\U0001f18d",
408
+ "\\U0001f18f-\\U0001f190",
409
+ "\\U0001f19b-\\U0001f1ad",
410
+ "\\U0001f1e6-\\U0001f1e7",
411
+ "\\U0001f1ea-\\U0001f1eb",
412
+ "\\U0001f1ee-\\U0001f1f1",
413
+ "\\U0001f1f4-\\U0001f1f6",
414
+ "\\U0001f1f9-\\U0001f200",
415
+ "\\U0001f210-\\U0001f219",
416
+ "\\U0001f21b-\\U0001f22e",
417
+ "\\U0001f230-\\U0001f231",
418
+ "\\U0001f23b",
329
419
  "\\U0001f240-\\U0001f248",
330
- "\\U0001f250-\\U0001f251",
331
420
  "\\U0001f260-\\U0001f265",
332
- "\\U0001f300-\\U0001f6d7",
333
- "\\U0001f6dc-\\U0001f6ec",
334
- "\\U0001f6f0-\\U0001f6fc",
421
+ "\\U0001f322-\\U0001f323",
422
+ "\\U0001f394-\\U0001f395",
423
+ "\\U0001f398",
424
+ "\\U0001f39c-\\U0001f39d",
425
+ "\\U0001f3f1-\\U0001f3f2",
426
+ "\\U0001f3f6",
427
+ "\\U0001f4fe",
428
+ "\\U0001f53e-\\U0001f548",
429
+ "\\U0001f54f",
430
+ "\\U0001f568-\\U0001f56e",
431
+ "\\U0001f571-\\U0001f572",
432
+ "\\U0001f57b-\\U0001f586",
433
+ "\\U0001f588-\\U0001f589",
434
+ "\\U0001f58e-\\U0001f58f",
435
+ "\\U0001f591-\\U0001f594",
436
+ "\\U0001f597-\\U0001f5a3",
437
+ "\\U0001f5a6-\\U0001f5a7",
438
+ "\\U0001f5a9-\\U0001f5b0",
439
+ "\\U0001f5b3-\\U0001f5bb",
440
+ "\\U0001f5bd-\\U0001f5c1",
441
+ "\\U0001f5c5-\\U0001f5d0",
442
+ "\\U0001f5d4-\\U0001f5db",
443
+ "\\U0001f5df-\\U0001f5e0",
444
+ "\\U0001f5e2",
445
+ "\\U0001f5e4-\\U0001f5e7",
446
+ "\\U0001f5e9-\\U0001f5ee",
447
+ "\\U0001f5f0-\\U0001f5f2",
448
+ "\\U0001f5f4-\\U0001f5f9",
449
+ "\\U0001f650-\\U0001f67f",
450
+ "\\U0001f6c6-\\U0001f6ca",
451
+ "\\U0001f6d3-\\U0001f6d4",
452
+ "\\U0001f6e6-\\U0001f6e8",
453
+ "\\U0001f6ea",
454
+ "\\U0001f6f1-\\U0001f6f2",
335
455
  "\\U0001f700-\\U0001f776",
336
456
  "\\U0001f77b-\\U0001f7d9",
337
- "\\U0001f7e0-\\U0001f7eb",
338
- "\\U0001f7f0",
339
457
  "\\U0001f800-\\U0001f80b",
340
458
  "\\U0001f810-\\U0001f847",
341
459
  "\\U0001f850-\\U0001f859",
342
460
  "\\U0001f860-\\U0001f887",
343
461
  "\\U0001f890-\\U0001f8ad",
344
462
  "\\U0001f8b0-\\U0001f8b1",
345
- "\\U0001f900-\\U0001fa53",
463
+ "\\U0001f900-\\U0001f90b",
464
+ "\\U0001f93b",
465
+ "\\U0001f946",
466
+ "\\U0001fa00-\\U0001fa53",
346
467
  "\\U0001fa60-\\U0001fa6d",
347
- "\\U0001fa70-\\U0001fa7c",
348
- "\\U0001fa80-\\U0001fa88",
349
- "\\U0001fa90-\\U0001fabd",
350
- "\\U0001fabf-\\U0001fac5",
351
- "\\U0001face-\\U0001fadb",
352
- "\\U0001fae0-\\U0001fae8",
353
- "\\U0001faf0-\\U0001faf8",
354
468
  "\\U0001fb00-\\U0001fb92",
355
469
  "\\U0001fb94-\\U0001fbca",
356
- "\\U000f1990-\\U000f199d", # UCSUR punctuation
470
+ "\\U000f1990-\\U000f199d",
357
471
  ]
358
472
 
359
- UCSUR_PUNCT_RANGES = UNICODE_PUNCT_RANGES[-1] # NOTE: THIS CAN CHANGE
473
+
474
+ NOT_IN_PUNCT_CLASS = r"Ⓐ-ⓩ🄰-🅉🅐-🅩🅰-🆉"
475
+ ALL_VARIATION_SELECTOR_RANGES = ["\\U0000fe00-\\U0000fe0f", "\\U000e0100-\\U000e01ef"]
476
+ EMOJI_VARIATION_SELECTOR_RANGES = ["\\U0000fe0e-\\U0000fe0f"]
477
+ EMOJI_VARIATION_SELECTOR_RANGES_STR = "".join(EMOJI_VARIATION_SELECTOR_RANGES)
478
+ """All variation selectors are in Nonspacing Mark (Mn), but it is more apt to
479
+ mark these two as punctuation, since they are used exclusively for rendering
480
+ emoji.
481
+
482
+ But it's even better to use the Emoji filter.
483
+ """
484
+
485
+ UCSUR_PUNCT_RANGES = ["\\U000f1990-\\U000f199d"]
486
+ UCSUR_PUNCT_RANGES_STR = "".join(UCSUR_PUNCT_RANGES)
487
+ """Private Use Area glyphs are given the apt but unhelpful 'Private Use'
488
+ class."""
360
489
 
361
490
  UNICODE_PUNCT = find_unicode_chars(UNICODE_PUNCT_RANGES)
362
491
  # this is a large string.
@@ -366,7 +495,7 @@ POSIX_PUNCT = r"""-!"#$%&'()*+,./:;<=>?@[\]^_`{|}~"""
366
495
  POSIX_PUNCT_RANGES = find_unicode_ranges(POSIX_PUNCT)
367
496
 
368
497
  ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
369
- ALL_PUNCT_RANGES = "".join(find_unicode_ranges(ALL_PUNCT))
498
+ ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
370
499
  # combined because the result could be simpler
371
500
 
372
501
  SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
@@ -374,6 +503,8 @@ SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
374
503
 
375
504
  LINKU = Path(__file__).resolve().parent / Path("linku.json")
376
505
  SANDBOX = Path(__file__).resolve().parent / Path("sandbox.json")
506
+ SYLLABICS = Path(__file__).resolve().parent / Path("syllabic.txt")
507
+ ALPHABETICS = Path(__file__).resolve().parent / Path("alphabetic.txt")
377
508
 
378
509
  VOWELS = "aeiou"
379
510
  CONSONANTS = "jklmnpstw"
@@ -390,21 +521,69 @@ ALLOWABLES = {
390
521
  "msa",
391
522
  }
392
523
 
393
- PHONOMATCHES = {
394
- "non",
395
- "nope",
396
- "some",
524
+ FALSE_POS_SYLLABIC = {
525
+ # ordered by frequency in previous TPT data
397
526
  "like",
527
+ "same",
528
+ "nope",
529
+ "uwu", # TODO: emoticon?? uhh?
530
+ "non",
531
+ "owo", # TODO: emoticon??
532
+ "one",
533
+ "to",
534
+ "i",
535
+ "awesome",
398
536
  "use",
399
- "imo",
537
+ "name",
400
538
  "time",
539
+ "imo", # "in my opinion"
401
540
  "man",
402
- "also",
541
+ # "son", # sona typo?
542
+ "joke",
543
+ "so",
544
+ "ten",
545
+ "make",
546
+ "pin",
547
+ "note",
548
+ # "aka" # in sandbox
549
+ "into",
550
+ "in",
551
+ "some",
552
+ "on",
553
+ "me",
554
+ "ipa",
555
+ "sun",
556
+ "sense",
557
+ "none",
558
+ "meme",
559
+ "wise",
560
+ # "ono", # TODO: what is this
561
+ "mon",
562
+ "take",
563
+ "luna",
564
+ "anti",
565
+ "elo",
566
+ "an",
567
+ "win",
568
+ "won",
569
+ "we",
570
+ "men",
571
+ "ton",
572
+ "woke",
573
+ "semi",
574
+ "male",
403
575
  }
404
576
 
405
- ALPHABETIC_MATCHES: Set[str] = set()
406
-
407
- IGNORABLES = PHONOMATCHES | ALPHABETIC_MATCHES
577
+ FALSE_POS_ALPHABETIC: Set[str] = {
578
+ "t",
579
+ "is",
580
+ "not",
581
+ "lol",
582
+ "also",
583
+ "isn", # TODO: tokenizer....
584
+ "mean",
585
+ "means",
586
+ }
408
587
 
409
588
  UCSUR_RANGES = [
410
589
  "\\U000F1900-\\U000F1977", # pu
@@ -439,15 +618,23 @@ with open(SANDBOX) as f:
439
618
  sandbox: Dict[str, Dict[str, str]] = json.loads(f.read())
440
619
  NIMI_LINKU_SANDBOX = {d["word"] for d in sandbox.values()}
441
620
 
621
+ # with open(SYLLABICS) as f:
622
+ # FALSE_POS_SYLLABIC = {line.strip() for line in f}
623
+ #
624
+ # with open(ALPHABETICS) as f:
625
+ # FALSE_POS_ALPHABETIC = {line.strip() for line in f}
626
+
442
627
  del linku
443
628
  del sandbox
444
629
 
445
630
  __all__ = [
446
631
  "ALLOWABLES",
447
632
  "ALL_PUNCT",
448
- "ALL_PUNCT_RANGES",
633
+ "ALL_PUNCT_RANGES_STR",
449
634
  "ALPHABET",
450
635
  "CONSONANTS",
636
+ "EMOJI_VARIATION_SELECTOR_RANGES",
637
+ "EMOJI_VARIATION_SELECTOR_RANGES_STR",
451
638
  "NIMI_KU_LILI",
452
639
  "NIMI_KU_SULI",
453
640
  "NIMI_LINKU_COMMON",
@@ -459,6 +646,8 @@ __all__ = [
459
646
  "NIMI_PU_SYNONYMS",
460
647
  "POSIX_PUNCT",
461
648
  "POSIX_PUNCT_RANGES",
649
+ "UCSUR_PUNCT_RANGES",
650
+ "UCSUR_PUNCT_RANGES_STR",
462
651
  "UNICODE_PUNCT",
463
652
  "UNICODE_PUNCT_RANGES",
464
653
  "VOWELS",
sonatoki/ilo.py CHANGED
@@ -119,7 +119,7 @@ class Ilo:
119
119
  *_, result = self._is_toki_pona(message)
120
120
  return result
121
121
 
122
- def _are_toki_pona(self, message: str):
122
+ def _are_toki_pona(self, message: str) -> List[Scorecard]:
123
123
  """Split a message into sentences, then return a list of each sentence's
124
124
  results via `self._is_toki_pona()`.
125
125