sonatoki 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sonatoki-0.5.0 → sonatoki-0.5.1}/PKG-INFO +1 -1
- {sonatoki-0.5.0 → sonatoki-0.5.1}/pyproject.toml +1 -1
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Configs.py +1 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Filters.py +4 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/constants.py +81 -2
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_filters.py +3 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_ilo.py +2 -2
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_properties.py +9 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/LICENSE +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/README.md +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Cleaners.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Preprocessors.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Scorers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Tokenizers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/__init__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/__main__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/alphabetic.txt +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/ilo.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/linku.json +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/py.typed +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/sandbox.json +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/syllabic.txt +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/utils.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/__init__.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_cleaners.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_preprocessors.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_scorers.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_tokenize.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_utils.py +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
- {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -73,6 +73,7 @@ PrefConfig: IloConfig = {
|
|
73
73
|
"scoring_filters": [
|
74
74
|
Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
|
75
75
|
And(LongSyllabic, Not(FalsePosSyllabic)),
|
76
|
+
# NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
|
76
77
|
LongProperName,
|
77
78
|
LongAlphabetic,
|
78
79
|
],
|
@@ -351,6 +351,10 @@ class Or:
|
|
351
351
|
else:
|
352
352
|
other_filters.extend(member_filters)
|
353
353
|
|
354
|
+
if len(other_filters) == 1: # we only had member filters
|
355
|
+
# TODO: this sucks?
|
356
|
+
return other_filters[0]
|
357
|
+
|
354
358
|
filter = cls.__generic_filter(*other_filters)
|
355
359
|
return filter
|
356
360
|
|
@@ -519,8 +519,10 @@ ALLOWABLES = {
|
|
519
519
|
"kxk", # ken ala ken
|
520
520
|
"wxw", # wile ala wile
|
521
521
|
"msa",
|
522
|
+
"anusem",
|
522
523
|
}
|
523
524
|
|
525
|
+
# NOTE: This is being tracked manually rather than fetched from syllabic.txt until I am convinced that solution is appropriate
|
524
526
|
FALSE_POS_SYLLABIC = {
|
525
527
|
# ordered by frequency in previous TPT data
|
526
528
|
"like",
|
@@ -540,6 +542,7 @@ FALSE_POS_SYLLABIC = {
|
|
540
542
|
"man",
|
541
543
|
# "son", # sona typo?
|
542
544
|
"joke",
|
545
|
+
# pon would go here
|
543
546
|
"so",
|
544
547
|
"ten",
|
545
548
|
"make",
|
@@ -548,11 +551,14 @@ FALSE_POS_SYLLABIC = {
|
|
548
551
|
# "aka" # in sandbox
|
549
552
|
"into",
|
550
553
|
"in",
|
554
|
+
"no",
|
551
555
|
"some",
|
556
|
+
# "papa",
|
552
557
|
"on",
|
553
558
|
"me",
|
554
559
|
"ipa",
|
555
560
|
"sun",
|
561
|
+
"mine",
|
556
562
|
"sense",
|
557
563
|
"none",
|
558
564
|
"meme",
|
@@ -561,28 +567,101 @@ FALSE_POS_SYLLABIC = {
|
|
561
567
|
"mon",
|
562
568
|
"take",
|
563
569
|
"luna",
|
564
|
-
"anti",
|
565
570
|
"elo",
|
571
|
+
"japanese",
|
566
572
|
"an",
|
573
|
+
"anti",
|
567
574
|
"win",
|
568
575
|
"won",
|
569
|
-
"we",
|
576
|
+
"we", # word in sandbox
|
570
577
|
"men",
|
571
578
|
"ton",
|
572
579
|
"woke",
|
580
|
+
"sen", # seen
|
581
|
+
"se", # see
|
573
582
|
"semi",
|
574
583
|
"male",
|
584
|
+
# "pen", # borderline
|
585
|
+
"woman",
|
586
|
+
"line",
|
587
|
+
"meta",
|
588
|
+
"mini",
|
589
|
+
"sine",
|
590
|
+
# "min", # borderline
|
591
|
+
"oposite",
|
592
|
+
"anime",
|
593
|
+
"potato",
|
594
|
+
# "japan",
|
595
|
+
"nose",
|
596
|
+
"kilo",
|
597
|
+
"alone",
|
598
|
+
"minute",
|
599
|
+
"late",
|
600
|
+
"women",
|
601
|
+
"leson",
|
602
|
+
"amen",
|
603
|
+
"tote",
|
604
|
+
"lame",
|
605
|
+
"online",
|
606
|
+
"tone",
|
607
|
+
"ate",
|
608
|
+
"mile",
|
609
|
+
"melon",
|
610
|
+
"tense",
|
611
|
+
"nonsense",
|
612
|
+
"nine",
|
613
|
+
"emo",
|
614
|
+
"unlike",
|
615
|
+
"lone",
|
616
|
+
# manual additions
|
617
|
+
"alike",
|
618
|
+
"amuse",
|
619
|
+
"antelope",
|
620
|
+
"antena",
|
621
|
+
"apetite",
|
622
|
+
"asasin",
|
623
|
+
"asasinate",
|
624
|
+
"asinine",
|
625
|
+
"asinine",
|
626
|
+
"asume",
|
627
|
+
"atone",
|
628
|
+
"awake",
|
629
|
+
"awaken",
|
630
|
+
"eliminate",
|
631
|
+
"elite",
|
632
|
+
"misuse",
|
633
|
+
"emanate",
|
634
|
+
"iluminate",
|
635
|
+
"imense",
|
636
|
+
"imitate",
|
637
|
+
"insane",
|
638
|
+
"insolate",
|
639
|
+
"insulate",
|
640
|
+
"intense",
|
641
|
+
"lemon",
|
642
|
+
"manipulate",
|
575
643
|
}
|
576
644
|
|
577
645
|
FALSE_POS_ALPHABETIC: Set[str] = {
|
578
646
|
"t",
|
579
647
|
"is",
|
648
|
+
"as",
|
580
649
|
"not",
|
650
|
+
"link",
|
651
|
+
"wait",
|
581
652
|
"lol",
|
653
|
+
"new",
|
582
654
|
"also",
|
583
655
|
"isn", # TODO: tokenizer....
|
584
656
|
"mean",
|
585
657
|
"means",
|
658
|
+
"it",
|
659
|
+
"moment",
|
660
|
+
"its",
|
661
|
+
"lmao",
|
662
|
+
"new",
|
663
|
+
"wel",
|
664
|
+
"makes",
|
586
665
|
}
|
587
666
|
|
588
667
|
UCSUR_RANGES = [
|
@@ -21,6 +21,7 @@ from sonatoki.Filters import (
|
|
21
21
|
Punctuation,
|
22
22
|
AlphabeticRe,
|
23
23
|
LongSyllabic,
|
24
|
+
MemberFilter,
|
24
25
|
NimiLinkuCore,
|
25
26
|
PunctuationRe,
|
26
27
|
LongAlphabetic,
|
@@ -207,6 +208,8 @@ def test_OrFilter(s: str):
|
|
207
208
|
@given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
|
208
209
|
def test_MemberFilters_OrFilter(s: str):
|
209
210
|
filter = Or(NimiPu, NimiLinkuObscure)
|
211
|
+
assert issubclass(filter, MemberFilter)
|
212
|
+
|
210
213
|
res = filter.filter(s)
|
211
214
|
res_pu = NimiPu.filter(s)
|
212
215
|
res_obscure = NimiLinkuObscure.filter(s)
|
@@ -73,6 +73,7 @@ SYLLABIC_MATCHES = [
|
|
73
73
|
|
74
74
|
ALPHABETIC_MATCHES = [
|
75
75
|
"mi mtue o kama sona",
|
76
|
+
"mi mute o kma son", # this one is odd because `son` is an unintended phonetic match
|
76
77
|
"mi mute o kama kne snoa a",
|
77
78
|
"ni li tptpt",
|
78
79
|
"mi wile pana lon sptp",
|
@@ -120,6 +121,7 @@ EXCESSIVE_SYLLABICS = [
|
|
120
121
|
"a ton of insolate puke. make no amen, no joke.",
|
121
122
|
"I elope so, to an elite untaken tune, some unwise tone",
|
122
123
|
"insane asinine lemon awesome atone joke",
|
124
|
+
"insane asinine lemon awesome atone", # i got more clever
|
123
125
|
]
|
124
126
|
|
125
127
|
EXCESSIVE_ALPHABETICS = [
|
@@ -177,7 +179,6 @@ KNOWN_BAD = (
|
|
177
179
|
FALSE_NEGATIVES = [
|
178
180
|
# emoticon should not be a problem
|
179
181
|
# a token that is one edit off a known word should be allowed
|
180
|
-
"mi mute o kma son", # this one is obnoxious because `son` did match phonetically before
|
181
182
|
"mi pnoa",
|
182
183
|
"tok",
|
183
184
|
"mut",
|
@@ -187,7 +188,6 @@ FALSE_NEGATIVES = [
|
|
187
188
|
]
|
188
189
|
|
189
190
|
FALSE_POSITIVES = [
|
190
|
-
"insane asinine lemon awesome atone",
|
191
191
|
"lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
|
192
192
|
]
|
193
193
|
|
@@ -25,9 +25,11 @@ from sonatoki.constants import (
|
|
25
25
|
NIMI_LINKU_CORE,
|
26
26
|
NIMI_PU_SYNONYMS,
|
27
27
|
NIMI_LINKU_COMMON,
|
28
|
+
FALSE_POS_SYLLABIC,
|
28
29
|
NIMI_LINKU_OBSCURE,
|
29
30
|
NIMI_LINKU_SANDBOX,
|
30
31
|
NIMI_LINKU_UNCOMMON,
|
32
|
+
FALSE_POS_ALPHABETIC,
|
31
33
|
)
|
32
34
|
|
33
35
|
|
@@ -76,3 +78,10 @@ def test_nimi_linku_properties(s: str):
|
|
76
78
|
assert Syllabic.filter(s), repr(s)
|
77
79
|
assert Phonotactic.filter(s), repr(s)
|
78
80
|
# Passing phonotactic implies all of the above
|
81
|
+
|
82
|
+
|
83
|
+
@given(st.sampled_from(list(FALSE_POS_ALPHABETIC)))
|
84
|
+
def test_false_pos_properties(s: str):
|
85
|
+
res_syllabic = Syllabic.filter(s)
|
86
|
+
res_alphabetic = Alphabetic.filter(s)
|
87
|
+
assert res_alphabetic and not res_syllabic
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|