sonatoki 0.5.3__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {sonatoki-0.5.3 → sonatoki-0.6.1}/PKG-INFO +1 -1
  2. {sonatoki-0.5.3 → sonatoki-0.6.1}/pyproject.toml +1 -1
  3. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Configs.py +8 -6
  4. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Tokenizers.py +64 -29
  5. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/constants.py +13 -3
  6. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_ilo.py +2 -1
  7. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/tokenize_cases/tokenize_sentences_tok.yml +8 -4
  8. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/tokenize_cases/tokenize_words_tok.yml +115 -3
  9. {sonatoki-0.5.3 → sonatoki-0.6.1}/LICENSE +0 -0
  10. {sonatoki-0.5.3 → sonatoki-0.6.1}/README.md +0 -0
  11. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Cleaners.py +0 -0
  12. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Filters.py +0 -0
  13. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Preprocessors.py +0 -0
  14. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/Scorers.py +0 -0
  15. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/__init__.py +0 -0
  16. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/__main__.py +0 -0
  17. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/alphabetic.txt +0 -0
  18. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/ilo.py +0 -0
  19. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/linku.json +0 -0
  20. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/py.typed +0 -0
  21. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/sandbox.json +0 -0
  22. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/syllabic.txt +0 -0
  23. {sonatoki-0.5.3 → sonatoki-0.6.1}/src/sonatoki/utils.py +0 -0
  24. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/__init__.py +0 -0
  25. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_cleaners.py +0 -0
  26. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_filters.py +0 -0
  27. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_preprocessors.py +0 -0
  28. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_properties.py +0 -0
  29. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_scorers.py +0 -0
  30. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_tokenize.py +0 -0
  31. {sonatoki-0.5.3 → sonatoki-0.6.1}/tests/test_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sonatoki
- Version: 0.5.3
+ Version: 0.6.1
  Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
  Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
  License: AGPL-3.0-or-later
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "sonatoki"
- version = "0.5.3"
+ version = "0.6.1"
  description = "ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?"
  authors = [
      { name = "jan Kekan San (@gregdan3)", email = "gregory.danielson3@gmail.com" },
src/sonatoki/Configs.py
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
      NimiLinkuObscure,
      NimiLinkuSandbox,
      NimiLinkuUncommon,
+     FalsePosAlphabetic,
  )
  from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
  from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
- from sonatoki.Tokenizers import Tokenizer
+ from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
  from sonatoki.Preprocessors import (
      URLs,
      Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
      "cleaners": [ConsecutiveDuplicates],
      "ignoring_filters": [Numeric, Punctuation],
      "scoring_filters": [
-         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+         Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
          And(LongSyllabic, Not(FalsePosSyllabic)),
          # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
          LongProperName,
-         LongAlphabetic,
+         And(LongAlphabetic, Not(FalsePosAlphabetic)),
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
          ),
          And(LongSyllabic, Not(FalsePosSyllabic)),
          LongProperName,
-         LongAlphabetic,
+         And(LongAlphabetic, Not(FalsePosAlphabetic)),
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -145,6 +146,7 @@ LazyConfig: IloConfig = {
      "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
      "scorer": SoftPassFail,
      "passing_score": 0.8,
+     "word_tokenizer": WordTokenizerRe, # mimics old tokenizer
  }
  """This is extremely silly."""
  IsipinEpikuConfig: IloConfig = {
@@ -161,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
          ),
          And(LongSyllabic, Not(FalsePosSyllabic)),
          LongProperName,
-         LongAlphabetic,
+         And(LongAlphabetic, Not(FalsePosAlphabetic)),
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
@@ -176,7 +178,7 @@ DiscordConfig: IloConfig = {
          Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
          And(LongSyllabic, Not(FalsePosSyllabic)),
          LongProperName,
-         LongAlphabetic,
+         And(LongAlphabetic, Not(FalsePosAlphabetic)),
      ],
      "scorer": SoftScaling,
      "passing_score": 0.8,
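The config changes above all feed the same entry point as before: FalsePosAlphabetic now guards every LongAlphabetic filter, PrefConfig counts NimiLinkuUncommon vocabulary in its dictionary check, and LazyConfig pins the old regex word tokenizer. A minimal usage sketch, assuming the Ilo class in sonatoki.ilo still accepts an IloConfig's entries as keyword arguments and exposes is_toki_pona (neither detail is shown in this diff):

from sonatoki.ilo import Ilo
from sonatoki.Configs import PrefConfig, LazyConfig

# PrefConfig now also accepts NimiLinkuUncommon words and rejects long
# alphabetic matches listed in FALSE_POS_ALPHABETIC (e.g. "not", "link").
ilo = Ilo(**PrefConfig)
ilo.is_toki_pona("mi olin e sina")  # presumably True: every word is core vocabulary

# LazyConfig sets "word_tokenizer" to WordTokenizerRe to mimic the pre-0.6 tokenizer.
lazy_ilo = Ilo(**LazyConfig)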
src/sonatoki/Tokenizers.py
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
  # LOCAL
  from sonatoki.utils import regex_escape
  from sonatoki.Filters import NimiUCSUR # seriously this sucks
- from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
+ from sonatoki.constants import (
+     ALL_PUNCT,
+     SENTENCE_PUNCT,
+     INTRA_WORD_PUNCT,
+     ALL_PUNCT_RANGES_STR,
+ )

  regex.DEFAULT_VERSION = regex.VERSION1

@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):

  class WordTokenizer(SetTokenizer):
      delimiters = set(ALL_PUNCT)
+     intra_word_punct = set(INTRA_WORD_PUNCT)
+
+     @classmethod
+     def is_delimiter(cls, c: str) -> bool:
+         return c in cls.delimiters or not c

      @classmethod
-     def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
-         match = s[last_match:i].split()
-         [tokens.append(t) for t in match if t]
+     def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+         if i > last_match:
+             tokens.append(s[last_match:i])
+
+     @classmethod
+     def to_tokens(cls, s: str) -> List[str]:
+         tokens: List[str] = []
+
+         slen = len(s)
+         i = 0
+         did_skip = False # ensure exists
+         while i < slen:
+
+             # contiguous punctuation chars
+             last_match = i
+             while i < slen and cls.is_delimiter(s[i]):
+                 # no special case
+                 i += 1
+             cls.add_token(s, tokens, last_match, i)
+
+             # contiguous writing chars (much harder)
+             last_match = i
+             while i < slen and not cls.is_delimiter(s[i]):
+                 did_skip = False
+                 # we skip and see another writing char, or init
+
+                 if NimiUCSUR.filter(s[i]):
+                     cls.add_token(s, tokens, last_match, i)
+                     tokens.append(s[i])
+                     i += 1
+                     last_match = i
+                     continue
+
+                 next_char = s[i + 1] if i + 1 < slen else ""
+                 if next_char in cls.intra_word_punct:
+                     did_skip = True
+                     i += 2
+                     continue
+
+                 i += 1
+
+             if did_skip:
+                 # we skipped, but there wasn't another writing character
+                 cls.add_token(s, tokens, last_match, i - 1)
+                 last_match = i - 1
+
+             cls.add_token(s, tokens, last_match, i)
+
+         return tokens

      @classmethod
      @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
              return []

          tokens: List[str] = []
+         candidates: List[str] = s.split()

-         i = 0 # ensure i is bound
-         last_match = 0
-         last_membership = s[0] in cls.delimiters
-         for i, char in enumerate(s):
-             mem = char in cls.delimiters
-             ucsur = NimiUCSUR.filter(char)
-             changed = (mem != last_membership) or ucsur
-             # this keeps contiguous words together, but splits UCSUR
-             if not changed:
-                 continue
-
-             if ucsur:
-                 if i > last_match:
-                     # Add the token before UCSUR character
-                     cls.__helper(s, tokens, last_match, i)
-                 # Add UCSUR character itself as a token
-                 tokens.append(char)
-                 last_match = i + 1
-                 last_membership = mem
-                 continue
-
-             cls.__helper(s, tokens, last_match, i)
-             last_match = i
-             last_membership = mem
+         for candidate in candidates:
+             results = cls.to_tokens(candidate)
+             tokens.extend(results)

-         cls.__helper(s, tokens, last_match, i + 1)
          return tokens
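The practical effect of the rewrite: punctuation listed in INTRA_WORD_PUNCT (apostrophe and hyphen) stays inside a token when writing characters continue on the other side, and whitespace splitting now happens up front via s.split(). A short sketch of the expected behaviour, mirroring the word-tokenizer test cases added further down (assuming the classmethod tokenize entry point, as in earlier releases):

from sonatoki.Tokenizers import WordTokenizer

# Apostrophes and hyphens between letters no longer split the word.
WordTokenizer.tokenize("isn't that strange?")
# expected: ["isn't", "that", "strange", "?"]
WordTokenizer.tokenize("i have self-respect...")
# expected: ["i", "have", "self-respect", "..."]

# An intra-word character with no writing character after it is still split off.
WordTokenizer.tokenize("i'm.")
# expected: ["i'm", "."]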
 
src/sonatoki/constants.py
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
  ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
  # combined bc the result could be simpler

- SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
+ SENTENCE_PUNCT = """.?!:;()[-]·•…"""
+ # NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+ INTRA_WORD_PUNCT = """-'"""


  LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english" # for NLTK
  """Commonly occurring strings which are some kind of valid Toki Pona or
  external token."""
  ALLOWABLES = {
-     "x", # ala
-     "y", # anu
+     # "x", # ala
+     # "y", # anu
      "kxk", # ken ala ken
      "wxw", # wile ala wile
      "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
      "name",
      "time",
      "imo", # "in my opinion"
+     "ime", # "in my experience"
      "man",
      # "son", # sona typo?
      "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
      # manual additions
      "alike",
      "amuse",
+     "animate",
      "antelope",
      "antena",
      "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
      "insolate",
      "insulate",
      "intense",
+     # "june",
      "lemon",
      "manipulate",
      "misuse",
      "ne", # "no" in many other languages
+     "tape",
+     "onto",
      "wana",
+     "muse",
  }

  FALSE_POS_ALPHABETIC: Set[str] = {
      "t",
      "is",
+     "os", # some command prefix...
      "as",
      "not",
      "link",
tests/test_ilo.py
@@ -192,10 +192,11 @@ FALSE_NEGATIVES = [
      "poan",
      "mtue",
      "mi nasa B^)", # emoticon
+     "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
  ]

  FALSE_POSITIVES = [
-     "lete li ike x.x", # this is an emoticon but passes because 'x' is in Filters.Miscellaneous
+     "Knowing a little toki pona",
  ]

tests/tokenize_cases/tokenize_sentences_tok.yml
@@ -46,13 +46,17 @@
    input: "mi mu tawa sina, mi wawa e sina."
    output:
      - "mi mu tawa sina, mi wawa e sina."
- - name: "quotes"
+ - name: "singlequotes"
    input: "toki li tan kulupu Kuko li ni: 'o ike ala!'"
-   output: # expected; we split on right of all sentence-ending puncts
+   output:
      - "toki li tan kulupu Kuko li ni:"
+     - "'o ike ala!"
      - "'"
-     - "o ike ala!"
-     - "'"
+ - name: "doublequotes"
+   input: 'ona li toki e ni: "mama sina"'
+   output:
+     - "ona li toki e ni:"
+     - '"mama sina"'
  - name: "discovered case 1"
    input: "ona li ken lukin e sitelen [_ike_nanpa_lete_ike]. ni li pona kin."
    output:
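The new sentence cases follow from dropping quotation marks out of SENTENCE_PUNCT: quoted spans now stay attached to their sentence instead of being split out. A minimal sketch of the intended behaviour; SentTokenizer is an assumed name for the sentence splitter and is not shown in this diff:

from sonatoki.Tokenizers import SentTokenizer  # assumed class name

# Mirrors the new "doublequotes" case above: '"' is no longer a sentence boundary.
SentTokenizer.tokenize('ona li toki e ni: "mama sina"')
# expected: ['ona li toki e ni:', '"mama sina"']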
tests/tokenize_cases/tokenize_words_tok.yml
@@ -34,7 +34,73 @@
      - "ike"
      - "ala"
      - "!'"
+ - name: "english 1"
+   input: "isn't that strange?"
+   output:
+     - "isn't"
+     - "that"
+     - "strange"
+     - "?"
+ - name: "english 2"
+   input: "i have self-respect..."
+   output:
+     - "i"
+     - "have"
+     - "self-respect"
+     - "..."
+ - name: "english 3"
+   input: "i'm an m.d."
+   output:
+     - "i'm"
+     - "an"
+     - "m"
+     - "."
+     - "d"
+     - "."
+ - name: "english 4"
+   input: "it's mind-numbing honestly"
+   output:
+     - "it's"
+     - "mind-numbing"
+     - "honestly"
+ - name: "english 5"
+   input: "Here's what they said: 'single quotes are boring'"
+   output:
+     - "Here's"
+     - "what"
+     - "they"
+     - "said"
+     - ":"
+     - "'"
+     - "single"
+     - "quotes"
+     - "are"
+     - "boring"
+     - "'"
+ - name: "english 6"
+   input: "Here's what they said: 'single quotes are boring' and true"
+   output:
+     - "Here's"
+     - "what"
+     - "they"
+     - "said"
+     - ":"
+     - "'"
+     - "single"
+     - "quotes"
+     - "are"
+     - "boring"
+     - "'"
+     - "and"
+     - "true"
+ - name: "non-consecutive puncts"
+   input: ". . ."
+   output:
+     - "."
+     - "."
+     - "."
  - name: "url"
+   xfail: true # we get rid of URLs before tokenizing, so the result isn't material
    input: "https://mun.la/sona/"
    output:
      - "https"
@@ -85,6 +151,16 @@
      - "mama"
      - "."
      - "sina"
+ - name: simple bold
+   input: "**mi unpa e mama sina**"
+   output:
+     - "**"
+     - "mi"
+     - "unpa"
+     - "e"
+     - "mama"
+     - "sina"
+     - "**"
  - name: weird punctuation characters
    input: "mi^en$sina-li*toki()="
    output:
@@ -92,9 +168,7 @@
      - "^"
      - "en"
      - "$"
-     - "sina"
-     - "-"
-     - "li"
+     - "sina-li" # intended; looks like valid intrapunct
      - "*"
      - "toki"
      - "()="
@@ -225,3 +299,41 @@
      - "「"
      - "Direct"
      - "」"
+
+ - name: "simple intrapunct 1"
+   input: "i'm"
+   output:
+     - "i'm"
+ - name: "intrapunct and punct"
+   input: "i'm."
+   output:
+     - "i'm"
+     - "."
+ - name: "simple intrapunct 2"
+   input: "isn't"
+   output:
+     - "isn't"
+ - name: "quoted with intrapunct"
+   input: "'bother'"
+   output:
+     - "'"
+     - "bother"
+     - "'"
+ - name: "quoted intrapunct with intrapunct 1"
+   input: "'isn't'"
+   output:
+     - "'"
+     - "isn't"
+     - "'"
+ - name: "quoted intrapunct with intrapunct 2"
+   input: "'isn't it gross?'"
+   output:
+     - "'"
+     - "isn't"
+     - "it"
+     - "gross"
+     - "?'"
+ - name: "multiple intrapunct"
+   input: "whom's't'd've'n't"
+   output:
+     - "whom's't'd've'n't"