sonatoki 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

sonatoki/Configs.py CHANGED
@@ -1,6 +1,6 @@
 # STL
 from copy import deepcopy
-from typing import List, Type, TypedDict
+from typing import Set, List, Type, TypedDict, cast

 # PDM
 from typing_extensions import NotRequired
@@ -18,6 +18,7 @@ from sonatoki.Filters import (
     NimiKuLili,
     NimiKuSuli,
     ProperName,
+    Phonotactic,
     Punctuation,
     LongSyllabic,
     Miscellaneous,
@@ -73,6 +74,7 @@ PrefConfig: IloConfig = {
     "scoring_filters": [
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
+        # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
         LongAlphabetic,
     ],
@@ -101,6 +103,26 @@ CorpusConfig: IloConfig = {
     "scorer": SoftScaling,
     "passing_score": 0.8,
 }
+
+# TODO: create a mechanism to omit tokens from a filter with more granularity
+__corpus_tokens_dict: Set[str] = cast(
+    Set[str],
+    CorpusConfig["scoring_filters"][
+        0
+    ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
+)
+__corpus_tokens_dict -= {
+    "an",
+    "i",
+    "me",
+    "ne",
+    "se",
+    "take",
+    "ten",
+    "to",
+    "u",
+    "we",
+}
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
     "preprocessors": [Emoji, Backticks, URLs, AngleBracketObject, Reference],
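
The CorpusConfig change reaches into the first scoring filter's class-level `tokens` set and subtracts a handful of English-lookalike words. A minimal sketch of why mutating that set works, using a hypothetical TokenSetFilter rather than sonatoki's real filter classes:

# Hypothetical stand-in for a member filter whose class-level
# `tokens` set drives membership checks; not sonatoki's real class.
class TokenSetFilter:
    tokens = {"an", "ilo", "pona", "we"}

    @classmethod
    def filter(cls, token: str) -> bool:
        return token.lower() in cls.tokens

# Subtracting from the shared set immediately changes what the filter
# accepts, which is why the TODO above asks for a more granular mechanism.
TokenSetFilter.tokens -= {"an", "we"}
assert not TokenSetFilter.filter("an")
assert TokenSetFilter.filter("pona")
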
sonatoki/Filters.py CHANGED
@@ -351,6 +351,10 @@ class Or:
         else:
             other_filters.extend(member_filters)

+        if len(other_filters) == 1:  # we only had member filters
+            # TODO: this sucks?
+            return other_filters[0]
+
         filter = cls.__generic_filter(*other_filters)
         return filter

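
The early return added to Or covers the case where merging member filters leaves exactly one filter, which then no longer needs the generic combinator. A rough sketch of that behavior, with plain callables standing in for sonatoki's filter classes (make_or is illustrative, not the library's API):

from typing import Callable

Filter = Callable[[str], bool]

def make_or(*filters: Filter) -> Filter:
    merged = list(filters)  # stand-in for the member-filter merging step
    if len(merged) == 1:  # only one filter left after merging
        return merged[0]  # return it directly, no wrapper needed
    return lambda token: any(f(token) for f in merged)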
 
sonatoki/Preprocessors.py CHANGED
@@ -90,6 +90,12 @@ class DiscordEmotes(RegexPreprocessor):
     pattern = re.compile(r"<a?:[a-zA-Z0-9_]{2,}:[0-9]{2,}>")


+class ColonEmotes(RegexPreprocessor):
+    """Remove colon-marked emotes `:name:`"""
+
+    pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
+
+
 class DiscordMentions(RegexPreprocessor):
     pattern = re.compile(r"<@[\!\&]?[0-9]{2,}>")

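For reference, the new ColonEmotes pattern in isolation, outside the RegexPreprocessor machinery (which, per the docstring, removes matches):

import re

pattern = re.compile(r":[a-zA-Z0-9_]{2,}:")
text = "toki! :wave: sina :toki_pona: ala?"
print(pattern.sub("", text))  # -> "toki!  sina  ala?"

Because digits are allowed in the name, time-like text such as "12:30:45" also loses its middle ":30:" segment, while the two-character minimum keeps short ratios like "1:2:3" intact.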
 
sonatoki/constants.py CHANGED
@@ -519,8 +519,10 @@ ALLOWABLES = {
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
+    "anusem",
 }

+# NOTE: This is being tracked manually rather than fetched from syllabics.txt until I am convinced that solution is appropriate
 FALSE_POS_SYLLABIC = {
     # ordered by frequency in previous TPT data
     "like",
@@ -540,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "man",
     # "son", # sona typo?
     "joke",
+    # pon would go here
     "so",
     "ten",
     "make",
@@ -548,11 +551,14 @@ FALSE_POS_SYLLABIC = {
     # "aka" # in sandbox
     "into",
     "in",
+    "no",
     "some",
+    # "papa", # now in sandbox
     "on",
     "me",
     "ipa",
     "sun",
+    "mine",
     "sense",
     "none",
     "meme",
@@ -561,28 +567,104 @@ FALSE_POS_SYLLABIC = {
     "mon",
     "take",
     "luna",
-    "anti",
     "elo",
+    "japanese",
     "an",
+    "anti",
     "win",
     "won",
-    "we",
+    "we",  # word in sandbox
     "men",
     "ton",
     "woke",
+    "sen",  # seen
+    "se",  # see
     "semi",
     "male",
+    # "pen", # borderline
+    "woman",
+    "line",
+    "meta",
+    "mini",
+    "sine",
+    # "min", # borderline
+    "oposite",
+    "anime",
+    "potato",
+    "japan",
+    "nose",
+    "kilo",
+    "alone",
+    "minute",
+    "late",
+    "women",
+    "leson",
+    "amen",
+    "tote",
+    "lame",
+    "online",
+    "tone",
+    "ate",
+    "mile",
+    "melon",
+    "tense",
+    "nonsense",
+    "nine",
+    "emo",
+    "unlike",
+    "lone",
+    # manual additions
+    "alike",
+    "amuse",
+    "antelope",
+    "antena",
+    "apetite",
+    "asasin",
+    "asasinate",
+    "asinine",
+    "asinine",
+    "asume",
+    "atone",
+    "awake",
+    "awaken",
+    "eliminate",
+    "elite",
+    "emanate",
+    "iluminate",
+    "imense",
+    "imitate",
+    "injoke",
+    "insane",
+    "insolate",
+    "insulate",
+    "intense",
+    "lemon",
+    "manipulate",
+    "misuse",
+    "ne",  # "no" in many other languages
+    "wana",
 }

 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "as",
     "not",
+    "link",
+    "wait",
     "lol",
+    "new",
     "also",
     "isn",  # TODO: tokenizer....
     "mean",
     "means",
+    "it",
+    "moment",
+    "its",
+    "lmao",
+    "new",
+    "wel",
+    "makes",
 }

 UCSUR_RANGES = [
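
The growing FALSE_POS_SYLLABIC set vetoes English words that happen to scan as toki pona syllable sequences; Configs.py consumes it via And(LongSyllabic, Not(FalsePosSyllabic)). A toy illustration of that combination, with a crude syllable regex that is not sonatoki's actual Syllabic logic:

import re

FALSE_POS_SYLLABIC = {"like", "mine", "no", "japanese"}  # excerpt

# Rough approximation of a toki pona syllable: (C)V(n), repeated.
SYLLABIC_RE = re.compile(r"(?:[jklmnpstw]?[aeiou]n?)+")

def passes_syllabic(token: str) -> bool:
    t = token.lower()
    return SYLLABIC_RE.fullmatch(t) is not None and t not in FALSE_POS_SYLLABIC

print(passes_syllabic("kijetesantakalu"))  # True: scans, not listed
print(passes_syllabic("mine"))             # False: scans, but is a listed false positive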