sonatoki 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {sonatoki-0.5.0 → sonatoki-0.5.1}/PKG-INFO +1 -1
  2. {sonatoki-0.5.0 → sonatoki-0.5.1}/pyproject.toml +1 -1
  3. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Configs.py +1 -0
  4. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Filters.py +4 -0
  5. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/constants.py +81 -2
  6. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_filters.py +3 -0
  7. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_ilo.py +2 -2
  8. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_properties.py +9 -0
  9. {sonatoki-0.5.0 → sonatoki-0.5.1}/LICENSE +0 -0
  10. {sonatoki-0.5.0 → sonatoki-0.5.1}/README.md +0 -0
  11. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Cleaners.py +0 -0
  12. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Preprocessors.py +0 -0
  13. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Scorers.py +0 -0
  14. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/Tokenizers.py +0 -0
  15. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/__init__.py +0 -0
  16. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/__main__.py +0 -0
  17. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/alphabetic.txt +0 -0
  18. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/syllabic.txt +0 -0
  23. {sonatoki-0.5.0 → sonatoki-0.5.1}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/__init__.py +0 -0
  25. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_preprocessors.py +0 -0
  27. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_scorers.py +0 -0
  28. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_tokenize.py +0 -0
  29. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/test_utils.py +0 -0
  30. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +0 -0
  31. {sonatoki-0.5.0 → sonatoki-0.5.1}/tests/tokenize_cases/tokenize_words_tok.yml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sonatoki
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
5
5
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
6
6
  License: AGPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sonatoki"
3
- version = "0.5.0"
3
+ version = "0.5.1"
4
4
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
5
5
  authors = [
6
6
  { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
@@ -73,6 +73,7 @@ PrefConfig: IloConfig = {
73
73
  "scoring_filters": [
74
74
  Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
75
75
  And(LongSyllabic, Not(FalsePosSyllabic)),
76
+ # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
76
77
  LongProperName,
77
78
  LongAlphabetic,
78
79
  ],
@@ -351,6 +351,10 @@ class Or:
351
351
  else:
352
352
  other_filters.extend(member_filters)
353
353
 
354
+ if len(other_filters) == 1: # we only had member filters
355
+ # TODO: reconsider this special case; returning the lone merged member filter directly bypasses __generic_filter
356
+ return other_filters[0]
357
+
354
358
  filter = cls.__generic_filter(*other_filters)
355
359
  return filter
356
360
 
@@ -519,8 +519,10 @@ ALLOWABLES = {
519
519
  "kxk", # ken ala ken
520
520
  "wxw", # wile ala wile
521
521
  "msa",
522
+ "anusem",
522
523
  }
523
524
 
525
+ # NOTE: This is being tracked manually rather than fetched from syllabic.txt until I am convinced that solution is appropriate
524
526
  FALSE_POS_SYLLABIC = {
525
527
  # ordered by frequency in previous TPT data
526
528
  "like",
@@ -540,6 +542,7 @@ FALSE_POS_SYLLABIC = {
540
542
  "man",
541
543
  # "son", # sona typo?
542
544
  "joke",
545
+ # pon would go here
543
546
  "so",
544
547
  "ten",
545
548
  "make",
@@ -548,11 +551,14 @@ FALSE_POS_SYLLABIC = {
548
551
  # "aka" # in sandbox
549
552
  "into",
550
553
  "in",
554
+ "no",
551
555
  "some",
556
+ # "papa",
552
557
  "on",
553
558
  "me",
554
559
  "ipa",
555
560
  "sun",
561
+ "mine",
556
562
  "sense",
557
563
  "none",
558
564
  "meme",
@@ -561,28 +567,101 @@ FALSE_POS_SYLLABIC = {
561
567
  "mon",
562
568
  "take",
563
569
  "luna",
564
- "anti",
565
570
  "elo",
571
+ "japanese",
566
572
  "an",
573
+ "anti",
567
574
  "win",
568
575
  "won",
569
- "we",
576
+ "we", # word in sandbox
570
577
  "men",
571
578
  "ton",
572
579
  "woke",
580
+ "sen", # seen
581
+ "se", # see
573
582
  "semi",
574
583
  "male",
584
+ # "pen", # borderline
585
+ "woman",
586
+ "line",
587
+ "meta",
588
+ "mini",
589
+ "sine",
590
+ # "min", # borderline
591
+ "oposite",
592
+ "anime",
593
+ "potato",
594
+ # "japan",
595
+ "nose",
596
+ "kilo",
597
+ "alone",
598
+ "minute",
599
+ "late",
600
+ "women",
601
+ "leson",
602
+ "amen",
603
+ "tote",
604
+ "lame",
605
+ "online",
606
+ "tone",
607
+ "ate",
608
+ "mile",
609
+ "melon",
610
+ "tense",
611
+ "nonsense",
612
+ "nine",
613
+ "emo",
614
+ "unlike",
615
+ "lone",
616
+ # manual additions
617
+ "alike",
618
+ "amuse",
619
+ "antelope",
620
+ "antena",
621
+ "apetite",
622
+ "asasin",
623
+ "asasinate",
624
+ "asinine",
625
+ "asinine",
626
+ "asume",
627
+ "atone",
628
+ "awake",
629
+ "awaken",
630
+ "eliminate",
631
+ "elite",
632
+ "misuse",
633
+ "emanate",
634
+ "iluminate",
635
+ "imense",
636
+ "imitate",
637
+ "insane",
638
+ "insolate",
639
+ "insulate",
640
+ "intense",
641
+ "lemon",
642
+ "manipulate",
575
643
  }
576
644
 
577
645
  FALSE_POS_ALPHABETIC: Set[str] = {
578
646
  "t",
579
647
  "is",
648
+ "as",
580
649
  "not",
650
+ "link",
651
+ "wait",
581
652
  "lol",
653
+ "new",
582
654
  "also",
583
655
  "isn", # TODO: tokenizer....
584
656
  "mean",
585
657
  "means",
658
+ "it",
659
+ "moment",
660
+ "its",
661
+ "lmao",
662
+ "new",
663
+ "wel",
664
+ "makes",
586
665
  }
587
666
 
588
667
  UCSUR_RANGES = [
@@ -21,6 +21,7 @@ from sonatoki.Filters import (
21
21
  Punctuation,
22
22
  AlphabeticRe,
23
23
  LongSyllabic,
24
+ MemberFilter,
24
25
  NimiLinkuCore,
25
26
  PunctuationRe,
26
27
  LongAlphabetic,
@@ -207,6 +208,8 @@ def test_OrFilter(s: str):
207
208
  @given(st.sampled_from(list(NIMI_PU | NIMI_LINKU_OBSCURE)))
208
209
  def test_MemberFilters_OrFilter(s: str):
209
210
  filter = Or(NimiPu, NimiLinkuObscure)
211
+ assert issubclass(filter, MemberFilter)
212
+
210
213
  res = filter.filter(s)
211
214
  res_pu = NimiPu.filter(s)
212
215
  res_obscure = NimiLinkuObscure.filter(s)
@@ -73,6 +73,7 @@ SYLLABIC_MATCHES = [
73
73
 
74
74
  ALPHABETIC_MATCHES = [
75
75
  "mi mtue o kama sona",
76
+ "mi mute o kma son", # this one is odd because `son` is an unintended phonetic match
76
77
  "mi mute o kama kne snoa a",
77
78
  "ni li tptpt",
78
79
  "mi wile pana lon sptp",
@@ -120,6 +121,7 @@ EXCESSIVE_SYLLABICS = [
120
121
  "a ton of insolate puke. make no amen, no joke.",
121
122
  "I elope so, to an elite untaken tune, some unwise tone",
122
123
  "insane asinine lemon awesome atone joke",
124
+ "insane asinine lemon awesome atone", # i got more clever
123
125
  ]
124
126
 
125
127
  EXCESSIVE_ALPHABETICS = [
@@ -177,7 +179,6 @@ KNOWN_BAD = (
177
179
  FALSE_NEGATIVES = [
178
180
  # emoticon should not be a problem
179
181
  # a token that is one edit off a known word should be allowed
180
- "mi mute o kma son", # this one is obnoxious because `son` did match phonetically before
181
182
  "mi pnoa",
182
183
  "tok",
183
184
  "mut",
@@ -187,7 +188,6 @@ FALSE_NEGATIVES = [
187
188
  ]
188
189
 
189
190
  FALSE_POSITIVES = [
190
- "insane asinine lemon awesome atone",
191
191
  "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
192
192
  ]
193
193
 
@@ -25,9 +25,11 @@ from sonatoki.constants import (
25
25
  NIMI_LINKU_CORE,
26
26
  NIMI_PU_SYNONYMS,
27
27
  NIMI_LINKU_COMMON,
28
+ FALSE_POS_SYLLABIC,
28
29
  NIMI_LINKU_OBSCURE,
29
30
  NIMI_LINKU_SANDBOX,
30
31
  NIMI_LINKU_UNCOMMON,
32
+ FALSE_POS_ALPHABETIC,
31
33
  )
32
34
 
33
35
 
@@ -76,3 +78,10 @@ def test_nimi_linku_properties(s: str):
76
78
  assert Syllabic.filter(s), repr(s)
77
79
  assert Phonotactic.filter(s), repr(s)
78
80
  # Passing phonotactic implies all of the above
81
+
82
+
83
+ @given(st.sampled_from(list(FALSE_POS_ALPHABETIC)))
84
+ def test_false_pos_properties(s: str):
85
+ res_syllabic = Syllabic.filter(s)
86
+ res_alphabetic = Alphabetic.filter(s)
87
+ assert res_alphabetic and not res_syllabic
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes