sonatoki 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {sonatoki-0.5.2 → sonatoki-0.6.0}/PKG-INFO +1 -1
  2. {sonatoki-0.5.2 → sonatoki-0.6.0}/pyproject.toml +1 -1
  3. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Configs.py +32 -16
  4. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Tokenizers.py +64 -29
  5. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/constants.py +13 -3
  6. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_properties.py +4 -2
  7. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_sentences_tok.yml +8 -4
  8. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_words_tok.yml +115 -3
  9. {sonatoki-0.5.2 → sonatoki-0.6.0}/LICENSE +0 -0
  10. {sonatoki-0.5.2 → sonatoki-0.6.0}/README.md +0 -0
  11. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Cleaners.py +0 -0
  12. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Filters.py +0 -0
  13. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Preprocessors.py +0 -0
  14. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Scorers.py +0 -0
  15. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/__init__.py +0 -0
  16. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/__main__.py +0 -0
  17. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/alphabetic.txt +0 -0
  18. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/syllabic.txt +0 -0
  23. {sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/__init__.py +0 -0
  25. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_filters.py +0 -0
  27. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_ilo.py +0 -0
  28. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_preprocessors.py +0 -0
  29. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_utils.py +0 -0
{sonatoki-0.5.2 → sonatoki-0.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.5.2
+Version: 0.6.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later

{sonatoki-0.5.2 → sonatoki-0.6.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sonatoki"
-version = "0.5.2"
+version = "0.6.0"
 description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
 authors = [
     { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },

{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Configs.py
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
+    FalsePosAlphabetic,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -112,16 +113,30 @@ __corpus_tokens_dict: Set[str] = cast(
     ].tokens,  # pyright: ignore[reportAttributeAccessIssue]
 )
 __corpus_tokens_dict -= {
-    "an",
-    "i",
-    "me",
-    "ne",
-    "se",
-    "take",
-    "ten",
-    "to",
-    "u",
-    "we",
+    # Sandbox words are removed from the CorpusConfig if they appear more frequently in English than Toki Pona by a factor of at least 3.
+    # In this case, all of these appear more often in English by a factor of at least 10.
+    "aka",  # also known as
+    "an",  # article
+    "api",  # API
+    "i",  # 1st person
+    "kana",  # japanese script
+    "me",  # 1st person
+    "ne",  # "no" in several languages
+    "nu",  # "new", now in dutch
+    "se",  # spanish particle, "see"
+    "take",  # acquire, perhaps forcefully or without permission
+    "ten",  # 10
+    "to",  # to, too
+    "u",  # no u
+    "we",  # 1st person plural
+    "wi",  # wii and discussions of syllables
+    "sole",  # singular, of shoe
+    # unexplored candidates for removal
+    # "omen",  # ominous
+    # "papa",  # father
+    # "lo",  # "lo" and "loo"
+    # "ewe",  # sheep
+    # "pa",  # father- eh?
 }
 """Mimics the previous implementation of ilo pi toki pona taso."""
 LazyConfig: IloConfig = {
@@ -131,6 +146,7 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -147,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -162,7 +178,7 @@ DiscordConfig: IloConfig = {
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,

{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/Tokenizers.py
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
+from sonatoki.constants import (
+    ALL_PUNCT,
+    SENTENCE_PUNCT,
+    INTRA_WORD_PUNCT,
+    ALL_PUNCT_RANGES_STR,
+)
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):
 
 class WordTokenizer(SetTokenizer):
     delimiters = set(ALL_PUNCT)
+    intra_word_punct = set(INTRA_WORD_PUNCT)
+
+    @classmethod
+    def is_delimiter(cls, c: str) -> bool:
+        return c in cls.delimiters or not c
 
     @classmethod
-    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
-        match = s[last_match:i].split()
-        [tokens.append(t) for t in match if t]
+    def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+        if i > last_match:
+            tokens.append(s[last_match:i])
+
+    @classmethod
+    def to_tokens(cls, s: str) -> List[str]:
+        tokens: List[str] = []
+
+        slen = len(s)
+        i = 0
+        did_skip = False  # ensure exists
+        while i < slen:
+
+            # contiguous punctuation chars
+            last_match = i
+            while i < slen and cls.is_delimiter(s[i]):
+                # no special case
+                i += 1
+            cls.add_token(s, tokens, last_match, i)
+
+            # contiguous writing chars (much harder)
+            last_match = i
+            while i < slen and not cls.is_delimiter(s[i]):
+                did_skip = False
+                # we skip and see another writing char, or init
+
+                if NimiUCSUR.filter(s[i]):
+                    cls.add_token(s, tokens, last_match, i)
+                    tokens.append(s[i])
+                    i += 1
+                    last_match = i
+                    continue
+
+                next_char = s[i + 1] if i + 1 < slen else ""
+                if next_char in cls.intra_word_punct:
+                    did_skip = True
+                    i += 2
+                    continue
+
+                i += 1
+
+            if did_skip:
+                # we skipped, but there wasn't another writing character
+                cls.add_token(s, tokens, last_match, i - 1)
+                last_match = i - 1
+
+            cls.add_token(s, tokens, last_match, i)
+
+        return tokens
 
     @classmethod
     @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
             return []
 
         tokens: List[str] = []
+        candidates: List[str] = s.split()
 
-        i = 0  # ensure i is bound
-        last_match = 0
-        last_membership = s[0] in cls.delimiters
-        for i, char in enumerate(s):
-            mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
-            changed = (mem != last_membership) or ucsur
-            # this keeps contiguous words together, but splits UCSUR
-            if not changed:
-                continue
-
-            if ucsur:
-                if i > last_match:
-                    # Add the token before UCSUR character
-                    cls.__helper(s, tokens, last_match, i)
-                # Add UCSUR character itself as a token
-                tokens.append(char)
-                last_match = i + 1
-                last_membership = mem
-                continue
-
-            cls.__helper(s, tokens, last_match, i)
-            last_match = i
-            last_membership = mem
+        for candidate in candidates:
+            results = cls.to_tokens(candidate)
+            tokens.extend(results)
 
-        cls.__helper(s, tokens, last_match, i + 1)
         return tokens
 
 

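The rewritten WordTokenizer first splits the input on whitespace, then scans each candidate: runs of delimiter characters become their own tokens, UCSUR characters are emitted individually, and a character from INTRA_WORD_PUNCT (apostrophe or hyphen) stays inside a word only when another writing character follows it. A usage sketch follows; the expected outputs are taken from the test cases added to tokenize_words_tok.yml later in this diff, assuming the classmethod call style shown above.

from sonatoki.Tokenizers import WordTokenizer

# apostrophe followed by a writing character stays inside the token
print(WordTokenizer.tokenize("isn't that strange?"))
# ["isn't", "that", "strange", "?"]

# a quote with no writing character after it is still split off
print(WordTokenizer.tokenize("'bother'"))
# ["'", "bother", "'"]

# a hyphen between writing characters is treated as intra-word punctuation
print(WordTokenizer.tokenize("mi^en$sina-li*toki()="))
# ["mi", "^", "en", "$", "sina-li", "*", "toki", "()="]
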
{sonatoki-0.5.2 → sonatoki-0.6.0}/src/sonatoki/constants.py
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
+SENTENCE_PUNCT = """.?!:;()[-]·…"""
+# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+INTRA_WORD_PUNCT = """-'"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
-    "x",  # ala
-    "y",  # anu
+    # "x",  # ala
+    # "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "name",
     "time",
     "imo",  # "in my opinion"
+    "ime",  # "in my experience"
     "man",
     # "son",  # sona typo?
     "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
     # manual additions
     "alike",
     "amuse",
+    "animate",
     "antelope",
    "antena",
     "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    # "june",
     "lemon",
     "manipulate",
     "misuse",
     "ne",  # "no" in many other languages
+    "tape",
+    "onto",
     "wana",
+    "muse",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "os",  # some command prefix...
     "as",
     "not",
     "link",

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/test_properties.py
@@ -1,6 +1,6 @@
 # PDM
 import hypothesis.strategies as st
-from hypothesis import given
+from hypothesis import given, assume
 
 # LOCAL
 from sonatoki.Filters import (
@@ -54,11 +54,13 @@ def test_ku_filters_non_overlap(s: str):
             | NIMI_LINKU_COMMON
             | NIMI_LINKU_UNCOMMON
             | NIMI_LINKU_OBSCURE
-            | NIMI_LINKU_SANDBOX - {"su"}
+            | NIMI_LINKU_SANDBOX
         )
     )
 )
 def test_linku_filters_non_overlap(s: str):
+    _ = assume(s != "su")
+
     s = Lowercase.clean(s)
     s = ConsecutiveDuplicates.clean(s)
 

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -46,13 +46,17 @@
   input: "mi mu tawa sina, mi wawa e sina."
   output:
     - "mi mu tawa sina, mi wawa e sina."
-- name: "quotes"
+- name: "singlequotes"
   input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-  output: # expected; we split on right of all sentence-ending puncts
+  output:
     - "toki li tan kulupu Kuko li ni:"
+    - "'o ike ala!"
     - "'"
-    - "o ike ala!"
-    - "'"
+- name: "doublequotes"
+  input: 'ona li toki e ni: "mama sina"'
+  output:
+    - "ona li toki e ni:"
+    - '"mama sina"'
 - name: "discovered case 1"
   input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
   output:

{sonatoki-0.5.2 → sonatoki-0.6.0}/tests/tokenize_cases/tokenize_words_tok.yml
@@ -34,7 +34,73 @@
     - "ike"
     - "ala"
     - "!'"
+- name: "english 1"
+  input: "isn't that strange?"
+  output:
+    - "isn't"
+    - "that"
+    - "strange"
+    - "?"
+- name: "english 2"
+  input: "i have self-respect..."
+  output:
+    - "i"
+    - "have"
+    - "self-respect"
+    - "..."
+- name: "english 3"
+  input: "i'm an m.d."
+  output:
+    - "i'm"
+    - "an"
+    - "m"
+    - "."
+    - "d"
+    - "."
+- name: "english 4"
+  input: "it's mind-numbing honestly"
+  output:
+    - "it's"
+    - "mind-numbing"
+    - "honestly"
+- name: "english 5"
+  input: "Here's what they said: 'single quotes are boring'"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+- name: "english 6"
+  input: "Here's what they said: 'single quotes are boring' and true"
+  output:
+    - "Here's"
+    - "what"
+    - "they"
+    - "said"
+    - ":"
+    - "'"
+    - "single"
+    - "quotes"
+    - "are"
+    - "boring"
+    - "'"
+    - "and"
+    - "true"
+- name: "non-consecutive puncts"
+  input: ". . ."
+  output:
+    - "."
+    - "."
+    - "."
 - name: "url"
+  xfail: true # we get rid of URLs before tokenizing, so the result isn't material
   input: "https://mun.la/sona/"
   output:
     - "https"
@@ -85,6 +151,16 @@
     - "mama"
     - "."
     - "sina"
+- name: simple bold
+  input: "**mi unpa e mama sina**"
+  output:
+    - "**"
+    - "mi"
+    - "unpa"
+    - "e"
+    - "mama"
+    - "sina"
+    - "**"
 - name: weird punctuation characters
   input: "mi^en$sina-li*toki()="
   output:
@@ -92,9 +168,7 @@
     - "^"
     - "en"
     - "$"
-    - "sina"
-    - "-"
-    - "li"
+    - "sina-li" # intended; looks like valid intrapunct
     - "*"
     - "toki"
     - "()="
@@ -225,3 +299,41 @@
     - "「"
     - "Direct"
     - "」"
+
+- name: "simple intrapunct 1"
+  input: "i'm"
+  output:
+    - "i'm"
+- name: "intrapunct and punct"
+  input: "i'm."
+  output:
+    - "i'm"
+    - "."
+- name: "simple intrapunct 2"
+  input: "isn't"
+  output:
+    - "isn't"
+- name: "quoted with intrapunct"
+  input: "'bother'"
+  output:
+    - "'"
+    - "bother"
+    - "'"
+- name: "quoted intrapunct with intrapunct 1"
+  input: "'isn't'"
+  output:
+    - "'"
+    - "isn't"
+    - "'"
+- name: "quoted intrapunct with intrapunct 2"
+  input: "'isn't it gross?'"
+  output:
+    - "'"
+    - "isn't"
+    - "it"
+    - "gross"
+    - "?'"
+- name: "multiple intrapunct"
+  input: "whom's't'd've'n't"
+  output:
+    - "whom's't'd've'n't"

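Taken together, the config, tokenizer, and constant changes surface through the Ilo class. Below is a minimal end-to-end sketch, assuming the Ilo(**PrefConfig) construction and is_toki_pona method documented in the project README; the sample sentences come from the test data above, and the comments are expectations rather than recorded output.

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig

ilo = Ilo(**PrefConfig)

# every word is in the Linku core set, so this should clear the 0.8 passing_score
print(ilo.is_toki_pona("mi mu tawa sina, mi wawa e sina."))

# "isn't" now survives tokenization as one token and is not valid Toki Pona,
# so this sentence should fall below the threshold
print(ilo.is_toki_pona("isn't that strange?"))
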