sonatoki 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sonatoki/Configs.py CHANGED
```diff
@@ -30,10 +30,11 @@ from sonatoki.Filters import (
     NimiLinkuObscure,
     NimiLinkuSandbox,
     NimiLinkuUncommon,
+    FalsePosAlphabetic,
 )
 from sonatoki.Scorers import Number, Scorer, PassFail, SoftScaling, SoftPassFail
 from sonatoki.Cleaners import Cleaner, ConsecutiveDuplicates
-from sonatoki.Tokenizers import Tokenizer
+from sonatoki.Tokenizers import Tokenizer, WordTokenizerRe
 from sonatoki.Preprocessors import (
     URLs,
     Emoji,
@@ -72,11 +73,11 @@ PrefConfig: IloConfig = {
     "cleaners": [ConsecutiveDuplicates],
     "ignoring_filters": [Numeric, Punctuation],
     "scoring_filters": [
-        Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
+        Or(NimiLinkuCore, NimiLinkuCommon, NimiLinkuUncommon, NimiUCSUR),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         # NOTE: These are allowed to pass name and alphabetic below, because they *could* be wrong
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -98,7 +99,7 @@ CorpusConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -145,6 +146,7 @@ LazyConfig: IloConfig = {
     "scoring_filters": [Alphabetic, NimiUCSUR, ProperName, Miscellaneous],
     "scorer": SoftPassFail,
     "passing_score": 0.8,
+    "word_tokenizer": WordTokenizerRe,  # mimics old tokenizer
 }
 """This is extremely silly."""
 IsipinEpikuConfig: IloConfig = {
@@ -161,7 +163,7 @@ IsipinEpikuConfig: IloConfig = {
         ),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
@@ -176,7 +178,7 @@ DiscordConfig: IloConfig = {
         Or(NimiLinkuCore, NimiLinkuCommon, NimiUCSUR, Miscellaneous),
         And(LongSyllabic, Not(FalsePosSyllabic)),
         LongProperName,
-        LongAlphabetic,
+        And(LongAlphabetic, Not(FalsePosAlphabetic)),
     ],
     "scorer": SoftScaling,
     "passing_score": 0.8,
```
sonatoki/Tokenizers.py CHANGED
```diff
@@ -10,7 +10,12 @@ from typing_extensions import override, deprecated
 # LOCAL
 from sonatoki.utils import regex_escape
 from sonatoki.Filters import NimiUCSUR  # seriously this sucks
-from sonatoki.constants import ALL_PUNCT, SENTENCE_PUNCT, ALL_PUNCT_RANGES_STR
+from sonatoki.constants import (
+    ALL_PUNCT,
+    SENTENCE_PUNCT,
+    INTRA_WORD_PUNCT,
+    ALL_PUNCT_RANGES_STR,
+)
 
 regex.DEFAULT_VERSION = regex.VERSION1
 
@@ -47,11 +52,62 @@ class Regex1Tokenizer(Tokenizer):
 
 class WordTokenizer(SetTokenizer):
     delimiters = set(ALL_PUNCT)
+    intra_word_punct = set(INTRA_WORD_PUNCT)
+
+    @classmethod
+    def is_delimiter(cls, c: str) -> bool:
+        return c in cls.delimiters or not c
 
     @classmethod
-    def __helper(cls, s: str, tokens: List[str], last_match: int, i: int):
-        match = s[last_match:i].split()
-        [tokens.append(t) for t in match if t]
+    def add_token(cls, s: str, tokens: List[str], last_match: int, i: int):
+        if i > last_match:
+            tokens.append(s[last_match:i])
+
+    @classmethod
+    def to_tokens(cls, s: str) -> List[str]:
+        tokens: List[str] = []
+
+        slen = len(s)
+        i = 0
+        did_skip = False  # ensure exists
+        while i < slen:
+
+            # contiguous punctuation chars
+            last_match = i
+            while i < slen and cls.is_delimiter(s[i]):
+                # no special case
+                i += 1
+            cls.add_token(s, tokens, last_match, i)
+
+            # contiguous writing chars (much harder)
+            last_match = i
+            while i < slen and not cls.is_delimiter(s[i]):
+                did_skip = False
+                # we skip and see another writing char, or init
+
+                if NimiUCSUR.filter(s[i]):
+                    cls.add_token(s, tokens, last_match, i)
+                    tokens.append(s[i])
+                    i += 1
+                    last_match = i
+                    continue
+
+                next_char = s[i + 1] if i + 1 < slen else ""
+                if next_char in cls.intra_word_punct:
+                    did_skip = True
+                    i += 2
+                    continue
+
+                i += 1
+
+            if did_skip:
+                # we skipped, but there wasn't another writing character
+                cls.add_token(s, tokens, last_match, i - 1)
+                last_match = i - 1
+
+            cls.add_token(s, tokens, last_match, i)
+
+        return tokens
 
     @classmethod
     @override
@@ -60,33 +116,12 @@ class WordTokenizer(SetTokenizer):
             return []
 
         tokens: List[str] = []
+        candidates: List[str] = s.split()
 
-        i = 0  # ensure i is bound
-        last_match = 0
-        last_membership = s[0] in cls.delimiters
-        for i, char in enumerate(s):
-            mem = char in cls.delimiters
-            ucsur = NimiUCSUR.filter(char)
-            changed = (mem != last_membership) or ucsur
-            # this keeps contiguous words together, but splits UCSUR
-            if not changed:
-                continue
-
-            if ucsur:
-                if i > last_match:
-                    # Add the token before UCSUR character
-                    cls.__helper(s, tokens, last_match, i)
-                # Add UCSUR character itself as a token
-                tokens.append(char)
-                last_match = i + 1
-                last_membership = mem
-                continue
-
-            cls.__helper(s, tokens, last_match, i)
-            last_match = i
-            last_membership = mem
+        for candidate in candidates:
+            results = cls.to_tokens(candidate)
+            tokens.extend(results)
 
-        cls.__helper(s, tokens, last_match, i + 1)
         return tokens
 
 
```
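The rewrite replaces the old single-pass membership-transition scan with whitespace pre-splitting plus an explicit per-candidate scanner that keeps intra-word punctuation (`-`, `'`) inside tokens and still isolates each UCSUR character. A hand-traced sketch of the expected behavior, derived by stepping through `to_tokens` above; the outputs are illustrative, not taken from the package's test suite:

```python
from sonatoki.Tokenizers import WordTokenizer

# Whitespace splitting happens first, then each candidate is scanned.
# An apostrophe or hyphen (INTRA_WORD_PUNCT) followed by another writing
# character stays inside the token instead of splitting it:
print(WordTokenizer.tokenize("isn't that nice?"))
# expected: ["isn't", 'that', 'nice', '?']

# If the skipped punctuation turns out to be word-final, the did_skip
# backtrack splits it back off:
print(WordTokenizer.tokenize("nice-"))
# expected: ['nice', '-']
```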
sonatoki/constants.py CHANGED
```diff
@@ -498,7 +498,10 @@ ALL_PUNCT = "".join(sorted(list(set(POSIX_PUNCT + UNICODE_PUNCT))))
 ALL_PUNCT_RANGES_STR = "".join(find_unicode_ranges(ALL_PUNCT))
 # combined bc the result could be simpler
 
-SENTENCE_PUNCT = """.?!:;'"()[-]“”·…"""
+SENTENCE_PUNCT = """.?!:;()[-]·…"""
+# NOTE: quotes were previously included, but in TP they are *not* reliably sentence boundaries
+
+INTRA_WORD_PUNCT = """-'"""
 
 
 LINKU = Path(__file__).resolve().parent / Path("linku.json")
@@ -514,8 +517,8 @@ LANGUAGE = "english"  # for NLTK
 """Commonly occurring strings which are some kind of valid Toki Pona or
 external token."""
 ALLOWABLES = {
-    "x",  # ala
-    "y",  # anu
+    # "x",  # ala
+    # "y",  # anu
     "kxk",  # ken ala ken
     "wxw",  # wile ala wile
     "msa",
@@ -539,6 +542,7 @@ FALSE_POS_SYLLABIC = {
     "name",
     "time",
     "imo",  # "in my opinion"
+    "ime",  # "in my experience"
     "man",
     # "son",  # sona typo?
     "joke",
@@ -616,6 +620,7 @@ FALSE_POS_SYLLABIC = {
     # manual additions
     "alike",
     "amuse",
+    "animate",
     "antelope",
     "antena",
     "apetite",
@@ -638,16 +643,21 @@ FALSE_POS_SYLLABIC = {
     "insolate",
     "insulate",
     "intense",
+    # "june",
     "lemon",
     "manipulate",
     "misuse",
     "ne",  # "no" in many other languages
+    "tape",
+    "onto",
     "wana",
+    "muse",
 }
 
 FALSE_POS_ALPHABETIC: Set[str] = {
     "t",
     "is",
+    "os",  # some command prefix...
     "as",
     "not",
     "link",
```
sonatoki-0.5.3.dist-info/METADATA → sonatoki-0.6.0.dist-info/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sonatoki
-Version: 0.5.3
+Version: 0.6.0
 Summary: ilo li moku e toki li pana e sona ni: ni li toki ala toki pona?
 Author-Email: "jan Kekan San (@gregdan3)" <gregory.danielson3@gmail.com>
 License: AGPL-3.0-or-later
```
sonatoki-0.5.3.dist-info/RECORD → sonatoki-0.6.0.dist-info/RECORD CHANGED
```diff
@@ -1,20 +1,20 @@
-sonatoki-0.5.3.dist-info/METADATA,sha256=mC-i9FszUcyFA8peFVjRvj5QxCoVFjfHf60UWZNxquA,6517
-sonatoki-0.5.3.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
-sonatoki-0.5.3.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
+sonatoki-0.6.0.dist-info/METADATA,sha256=JuR9XrtjbWWZwtYz2rzqwMEIzR_ddQ2te2mskmc-evs,6517
+sonatoki-0.6.0.dist-info/WHEEL,sha256=rSwsxJWe3vzyR5HCwjWXQruDgschpei4h_giTm0dJVE,90
+sonatoki-0.6.0.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 sonatoki/Cleaners.py,sha256=x2dT3MpDUfbrHA0EP2D3n1sTiKFFi5jw9ha-1dX973o,1958
-sonatoki/Configs.py,sha256=yprG3LEMyy6KKJWEEeJ7nEIC3-qtqA7p4CTHYv4a4vU,5469
+sonatoki/Configs.py,sha256=RD6YUYW45pwIFx8ebJgGs5PhIhL9sjn_VqIg4zf3VUE,5697
 sonatoki/Filters.py,sha256=nVSmw5M4sEYA_8KI1fI53rMHkd9KO6yWbKfdxxExxN8,11700
 sonatoki/Preprocessors.py,sha256=nN6xL6mvVAnWZjSNW8CaeLm8x4kK3dCoB-1WYqi0ANU,5763
 sonatoki/Scorers.py,sha256=LRQLgXKTU2VqhkMHFPVxyVt83DXf85_zrpDGk4ThU24,3811
-sonatoki/Tokenizers.py,sha256=qFaA1-v-wjKMihtEJMeZpi3m4cSkJQgWhGhL-w0VgPE,4236
+sonatoki/Tokenizers.py,sha256=8lpC70bzXOpHyhVr5bmqpYKmdmQvJdf7X5-Icc9RRCw,5040
 sonatoki/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/__main__.py,sha256=6n4kUF80APl6a0jV46h_ncHNuQbrLpZ_nAmiNAakiag,5673
 sonatoki/alphabetic.txt,sha256=duyqAKilD2vLIr75RShCIAnktNJcGeEoQIk18V6czmg,11702
-sonatoki/constants.py,sha256=BYML7p9oUELgUDO0xdgmP74idcwjiFSw_NfuDLpsp8k,18952
+sonatoki/constants.py,sha256=qKjWqVcsvfScDPW4lUvRh_Qhwxv6AYkGrGQyhxEbX8w,19206
 sonatoki/ilo.py,sha256=PWZa202Q4h7IjnLxmfgT93iAPJL7dqJbA97L9kQDPiA,5658
 sonatoki/linku.json,sha256=d72Dvht-a4gBmdqLLI8mElvo83zSpbxDmxJj05hOudM,295413
 sonatoki/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sonatoki/sandbox.json,sha256=44csrQDaVtV-n8OyewabX1J9MmUFCsPct5C8E5Xuc58,140197
 sonatoki/syllabic.txt,sha256=HnqY4TrZ3tPcHah3TsvG9F9gjMrnAGdJ8hHJNHyyUPc,1712
 sonatoki/utils.py,sha256=sT5xLMEj0aLpy8GP92HKblJU1Wt1m8NUlMgCFWB32xQ,2265
-sonatoki-0.5.3.dist-info/RECORD,,
+sonatoki-0.6.0.dist-info/RECORD,,
```