tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/token_quality.py
ADDED
@@ -0,0 +1,1185 @@
"""
Token quality postprocessing (OSS core)
======================================

Goal:
- Neutral token quality fixes only (NOT NER policy).
- Applied uniformly across all tokenizers via TokenizerResult.__post_init__.

Extension model (what you asked for):
- Add **global rules** (safe, language-agnostic)
- Add **script/family rules** (e.g., cjk/brahmic/rtl/indic/space)
- Add **language-specific mini rules** (zh/ja/...)

All rules are automatically applied to every tokenizer output.

Safety guards:
- adjacency checks (offset continuity) when merging
- bounded lookahead
- hard length limits to avoid aggressive merges
"""

from __future__ import annotations

import re
from typing import Callable, Dict, List, Optional


RuleFn = Callable[[List[object], str, str], List[object]]

_GLOBAL_RULES: List[RuleFn] = []
_FAMILY_RULES: Dict[str, List[RuleFn]] = {}
_LANG_RULES: Dict[str, List[RuleFn]] = {}


def register_global_rule(fn: RuleFn) -> None:
    _GLOBAL_RULES.append(fn)


def register_family_rule(family: str, fn: RuleFn) -> None:
    family = (family or "").strip().lower()
    if not family:
        return
    _FAMILY_RULES.setdefault(family, []).append(fn)


def register_lang_rule(lang: str, fn: RuleFn) -> None:
    lang = (lang or "").strip().lower().replace("_", "-")
    if not lang:
        return
    _LANG_RULES.setdefault(lang, []).append(fn)


def _family_for_lang(lang: str) -> Optional[str]:
    """
    Lightweight script/family routing without importing tokenizer modules
    (avoids circular imports).
    """
    ll = (lang or "").lower().replace("_", "-")
    if ll in {"zh", "zh-cn", "zh-tw", "ja", "ko"}:
        return "cjk"
    if ll in {"th", "lo", "my", "km"}:
        return "brahmic"
    if ll in {"ar", "he", "fa", "ur", "yi", "ps"}:
        return "rtl"
    if ll in {"hi", "bn", "gu", "pa", "mr", "ne", "si", "ta", "te", "kn", "ml", "or", "as", "sa"}:
        return "indic"
    return "space"


def apply_token_quality(tokens: List[object], *, lang: str, text: str) -> List[object]:
    """
    Apply token-quality fixes for a given language.

    `tokens` are expected to be tokmor.base.Token-like objects with:
    - text: str
    - start: int
    - end: int
    """
    if not tokens:
        return tokens

    ll = (lang or "").lower().replace("_", "-")
    fam = _family_for_lang(ll) or ""

    out = tokens
    for fn in _GLOBAL_RULES:
        out = fn(out, ll, text)
        if not out:
            return out

    for fn in _FAMILY_RULES.get(fam, []):
        out = fn(out, ll, text)
        if not out:
            return out

    for fn in _LANG_RULES.get(ll, []):
        out = fn(out, ll, text)
        if not out:
            return out

    return out


def _rule_merge_digit_groups(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge common digit-group / decimal splits that happen in whitespace tokenizers.

    Examples:
    - "22,000" tokenized as ["22", "000"] -> ["22,000"]
    - "1.28" tokenized as ["1", "28"] -> ["1.28"]

    Safety:
    - only when separated by a single char in the original text
    - separator must be one of {',', '.', '٬', '٫'}
    - both sides must be all digits
    - bounded total length to avoid over-merging
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok_from_span(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_digits(s: str) -> bool:
        return bool(s) and all(ch.isdigit() for ch in s)

    SEPS = {",", ".", "٬", "٫"}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        if i + 1 < n:
            b = tokens[i + 1]
            a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if 0 <= a_s <= a_e <= len(text) and 0 <= b_s <= b_e <= len(text):
                if b_s == a_e + 1:
                    sep = text[a_e:b_s]
                    if sep in SEPS:
                        at = getattr(a, "text", "") or text[a_s:a_e]
                        bt = getattr(b, "text", "") or text[b_s:b_e]
                        if _is_digits(at) and _is_digits(bt):
                            merged_txt = text[a_s:b_e]
                            if 1 <= len(merged_txt) <= 32:
                                out.append(_tok_from_span(a_s, b_e))
                                i += 2
                                continue
        out.append(a)
        i += 1
    return out

def _rule_zh(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Chinese token quality fixes (conservative):
    - Split over-merged suffix+verbish chunks: ...港聘用 -> ...港 + 聘用
    - Merge dot-connected names (·), allowing small whitespace gaps: 米拉 · 万 托斯 -> 米拉·万托斯
    - Merge short geo-name chains ending with 港 (recover common over-splits): 维 + 伦德 + 尔港 -> 维伦德尔港
    - Split stuck function char in very limited cases: 在维... -> 在 + 维...
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_cjk(ch: str) -> bool:
        return bool(ch) and (("\u4e00" <= ch <= "\u9fff") or ("\u3400" <= ch <= "\u4dbf"))

    def _is_name_piece(s: str) -> bool:
        if not s:
            return False
        for ch in s:
            if ch.isalnum():
                continue
            if _is_cjk(ch):
                continue
            return False
        return True

    def _gap(a_end: int, b_start: int) -> str:
        if a_end < 0 or b_start < 0 or b_start < a_end or b_start > len(text):
            return ""
        return text[a_end:b_start]

    def _gap_is_ws(g: str) -> bool:
        return bool(g) and len(g) <= 3 and g.isspace()

    VERBISH = {
        "抵达",
        "聘用",
        "发布",
        "宣布",
        "表示",
        "前往",
        "返回",
        "访问",
        "会见",
        "举行",
        "发生",
        "完成",
        "启动",
        "加入",
        "离开",
        "进入",
    }
    VERB_TAIL = {"", "了", "着", "过"}

    def _is_verbish_chunk(s: str) -> bool:
        if not s:
            return False
        for v in VERBISH:
            for tail in VERB_TAIL:
                if s == (v + tail):
                    return True
                if s.startswith(v + tail) and len(s) <= len(v + tail) + 1:
                    return True
        return False

    NAME_ENDINGS = {"斯", "尔", "德", "特", "姆", "克", "夫", "诺", "拉", "娜", "尼", "亚", "里", "罗", "多", "恩"}

    def _looks_like_foreign_name_prefix(s: str) -> bool:
        if not s or len(s) < 2 or len(s) > 8:
            return False
        if not all(_is_cjk(ch) for ch in s):
            return False
        return s[-1] in NAME_ENDINGS

    # pass 1: split obvious over-merges inside a single token
    split_out: List[object] = []
    for t in tokens:
        txt = getattr(t, "text", "") or ""
        if len(txt) < 3:
            split_out.append(t)
            continue

        did_split = False

        # 0) Dot-name over-merge guard:
        # e.g., "米拉·万·托斯抵达维伦德尔港" -> "米拉·万·托斯" + "抵达" + "维伦德尔港"
        if ("·" in txt) and (not did_split):
            for v in VERBISH:
                for tail in VERB_TAIL:
                    vv = v + tail
                    idx_v = txt.find(vv)
                    if idx_v <= 0:
                        continue
                    if "·" not in txt[:idx_v]:
                        continue
                    # Ensure offsets align with text length (avoid corrupt spans).
                    s0 = int(getattr(t, "start"))
                    e0 = int(getattr(t, "end"))
                    if (e0 - s0) != len(txt):
                        continue
                    if len(txt) > 80:
                        continue
                    left = txt[:idx_v]
                    rem = txt[idx_v + len(vv) :]
                    split_out.append(_tok(left, s0, s0 + len(left)))
                    split_out.append(_tok(vv, s0 + idx_v, s0 + idx_v + len(vv)))
                    if rem:
                        split_out.append(_tok(rem, s0 + idx_v + len(vv), e0))
                    did_split = True
                    break
                if did_split:
                    break
            if did_split:
                continue

        idx = txt.rfind("港")
        if idx != -1 and idx < len(txt) - 1:
            rem = txt[idx + 1 :]
            if _is_verbish_chunk(rem):
                left = txt[: idx + 1]
                right = txt[idx + 1 :]
                mid = getattr(t, "start") + len(left)
                split_out.append(_tok(left, getattr(t, "start"), mid))
                split_out.append(_tok(right, mid, getattr(t, "end")))
                did_split = True
        if did_split:
            continue

        for v in VERBISH:
            for tail in VERB_TAIL:
                suffix = v + tail
                if txt.endswith(suffix) and len(txt) > len(suffix):
                    prefix = txt[: -len(suffix)]
                    if _looks_like_foreign_name_prefix(prefix):
                        mid = getattr(t, "end") - len(suffix)
                        split_out.append(_tok(prefix, getattr(t, "start"), mid))
                        split_out.append(_tok(suffix, mid, getattr(t, "end")))
                        did_split = True
                        break
            if did_split:
                break
        if not did_split:
            split_out.append(t)

    split_out.sort(key=lambda x: getattr(x, "start"))

    # pass 1.5: split limited stuck function char prefixes (very conservative)
    FUNC_PREFIX = {"在", "到", "于", "从", "往", "去"}
    split2: List[object] = []
    for idx, t in enumerate(split_out):
        txt = getattr(t, "text", "") or ""
        if len(txt) >= 2 and txt[0] in FUNC_PREFIX:
            rem = txt[1:]
            nxt = split_out[idx + 1] if idx + 1 < len(split_out) else None
            if rem and all(_is_cjk(ch) for ch in rem) and nxt and _is_name_piece(getattr(nxt, "text", "")):
                s0 = getattr(t, "start")
                e0 = getattr(t, "end")
                if (s0 + 1) <= e0:
                    split2.append(_tok(txt[0], s0, s0 + 1))
                    split2.append(_tok(rem, s0 + 1, e0))
                    continue
        split2.append(t)
    split2.sort(key=lambda x: getattr(x, "start"))

    # pass 1.6: merge short geo-name chains ending with 港 (contiguous, bounded)
    geo_merged: List[object] = []
    i = 0
    while i < len(split2):
        t0 = split2[i]
        if getattr(t0, "text", None) in FUNC_PREFIX:
            geo_merged.append(t0)
            i += 1
            continue

        parts: List[object] = []
        j = i
        merged = False
        while j < len(split2) and len(parts) < 5:
            cur = split2[j]
            if parts and getattr(parts[-1], "end") != getattr(cur, "start"):
                break
            cur_txt = getattr(cur, "text", "") or ""
            if not cur_txt or not all(_is_cjk(ch) for ch in cur_txt):
                break
            # Never merge verbish chunks into toponyms (keeps dot-name split + verb separate).
            if _is_verbish_chunk(cur_txt):
                break
            parts.append(cur)
            joined = "".join(getattr(p, "text", "") or "" for p in parts)
            if len(joined) > 12:
                break
            if joined.endswith("港") and len(parts) >= 2:
                geo_merged.append(_tok(joined, getattr(parts[0], "start"), getattr(parts[-1], "end")))
                i = j + 1
                merged = True
                break
            j += 1
        if not merged:
            geo_merged.append(t0)
            i += 1

    # pass 2: merge dot-connected names (·) (gap-aware, bounded)
    merged_names: List[object] = []
    i = 0
    while i < len(geo_merged):
        t0 = geo_merged[i]
        if not _is_name_piece(getattr(t0, "text", "")):
            merged_names.append(t0)
            i += 1
            continue

        parts = [t0]
        connectors: List[str] = []
        j = i + 1
        saw_dot = False

        while j < len(geo_merged):
            prev = parts[-1]
            nxt = geo_merged[j]
            g = _gap(getattr(prev, "end"), getattr(nxt, "start"))
            g_strip = g.strip()
            nxt_text = getattr(nxt, "text", "") or ""

            # Hard stop: do NOT let a dot-name merge swallow verbs / location chunks.
            # Example to avoid: "米拉·万·托斯抵达维伦德尔港" becoming one token.
            if _is_verbish_chunk(nxt_text):
                break
            if nxt_text and (nxt_text == "港" or nxt_text.endswith("港") or nxt_text.endswith("市") or nxt_text.endswith("省") or nxt_text.endswith("县") or nxt_text.endswith("区") or nxt_text.endswith("州")):
                break

            if g_strip == "·" and _is_name_piece(getattr(nxt, "text", "")):
                connectors.append("dot")
                parts.append(nxt)
                saw_dot = True
                j += 1
                continue

            if (g == "" or _gap_is_ws(g)) and (getattr(nxt, "text", "") == "·") and (j + 1) < len(geo_merged):
                nxt2 = geo_merged[j + 1]
                g2 = _gap(getattr(nxt, "end"), getattr(nxt2, "start"))
                if (g2 == "" or _gap_is_ws(g2)) and _is_name_piece(getattr(nxt2, "text", "")):
                    connectors.append("dot")
                    parts.append(nxt2)
                    saw_dot = True
                    j += 2
                    continue

            # Allow limited no-dot concatenation only *within* a dot-name span:
            # - must already have seen a dot
            # - and the previous join must have been a dot (immediately-after-dot window)
            # - and token must be short (name piece), and must not be verbish (checked above)
            if saw_dot and (g == "" or _gap_is_ws(g)) and _is_name_piece(nxt_text):
                prev_join = connectors[-1] if connectors else ""
                if prev_join == "dot" and 1 <= len(nxt_text) <= 3:
                    connectors.append("ws")
                    parts.append(nxt)
                    j += 1
                    continue

            break

        if saw_dot and len(parts) >= 2:
            if len(parts) <= 6 and sum(len(getattr(p, "text", "") or "") for p in parts) <= 40:
                out_txt = getattr(parts[0], "text", "") or ""
                for k in range(1, len(parts)):
                    conn = connectors[k - 1] if (k - 1) < len(connectors) else "ws"
                    if conn == "dot":
                        out_txt += "·" + (getattr(parts[k], "text", "") or "")
                    else:
                        out_txt += (getattr(parts[k], "text", "") or "")
                merged_names.append(_tok(out_txt, getattr(parts[0], "start"), getattr(parts[-1], "end")))
                i = j
                continue

        merged_names.append(t0)
        i += 1

    # pass 3: merge X + 港 -> X港 (contiguous)
    out3: List[object] = []
    i = 0
    while i < len(merged_names):
        a = merged_names[i]
        if i + 1 < len(merged_names):
            b = merged_names[i + 1]
            if getattr(a, "end") == getattr(b, "start") and getattr(b, "text", "") == "港":
                at = getattr(a, "text", "") or ""
                if at and len(at) <= 12 and all((_is_cjk(ch) or ch.isalnum()) for ch in at):
                    out3.append(_tok(at + "港", getattr(a, "start"), getattr(b, "end")))
                    i += 2
                    continue
        out3.append(a)
        i += 1

    return out3

def _rule_ja(tokens: List[object], _lang: str, _text: str) -> List[object]:
    """
    Japanese compound re-joining (conservative):
    - Katakana + Kanji suffix merge: モルディン + 港 -> モルディン港
      suffix candidates: 港/駅/空港/都/道/府/県/市/区/町/村
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    JA_SUFFIXES = {"港", "駅", "空港", "都", "道", "府", "県", "市", "区", "町", "村"}

    def _is_katakana(ch: str) -> bool:
        return "\u30a0" <= ch <= "\u30ff"

    def _is_katakana_run(s: str) -> bool:
        if not s:
            return False
        for ch in s:
            if _is_katakana(ch) or ch in {"ー", "・"}:
                continue
            return False
        return True

    out: List[object] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        if i + 1 < len(tokens):
            b = tokens[i + 1]
            if getattr(a, "end") == getattr(b, "start") and _is_katakana_run(getattr(a, "text", "") or "") and (getattr(b, "text", "") in JA_SUFFIXES):
                at = getattr(a, "text", "") or ""
                bt = getattr(b, "text", "") or ""
                if len(at) <= 24 and len(at + bt) <= 28:
                    out.append(_tok(at + bt, getattr(a, "start"), getattr(b, "end")))
                    i += 2
                    continue
        out.append(a)
        i += 1
    return out

def _rule_merge_simple_punct_runs(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous single-character punctuation tokens into runs.

    Example:
    - ["!", "!", "!"] -> ["!!!"]
    - [".", ".", "."] -> ["..."]
    - ["?", "?", "!", "!"] -> ["??!!"]

    This helps SNS discourse tagging and reduces downstream fragmentation.
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok_from_span(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    P = {"!", "！", "?", "？", ".", "…", "~", "～"}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if len(at) == 1 and at in P and 0 <= a_s <= a_e <= len(text):
            j = i + 1
            end = a_e
            while j < n:
                b = tokens[j]
                bt = getattr(b, "text", "") or ""
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if not (len(bt) == 1 and bt in P and 0 <= b_s <= b_e <= len(text)):
                    break
                if b_s != end:
                    break
                end = b_e
                j += 1
            if j > i + 1:
                out.append(_tok_from_span(a_s, end))
                i = j
                continue
        out.append(a)
        i += 1
    return out

def _rule_demesh_hangul_keysmash_inside_token(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix (demesh):
    Split tokens that contain an internal Hangul Jamo "keysmash/garble" run.

    Example (noisy SNS):
    "아ㅣ마ㅓㅣ넣ㄹ아이고" -> ["아", "ㅣ마ㅓㅣ넣ㄹ", "아이고"]

    Safety:
    - triggers only when the token contains >=3 Hangul Jamo chars (U+3131..U+3163)
    - requires at least one Hangul syllable char (U+AC00..U+D7AF) in the same token
    - splits only around the *longest contiguous jamo run* (length>=3)
    - bounded total token length
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x3131 <= o <= 0x3163

    def _is_syllable(ch: str) -> bool:
        o = ord(ch)
        return 0xAC00 <= o <= 0xD7AF

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        a_s, a_e = int(getattr(t, "start")), int(getattr(t, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not tt:
            out.append(t)
            continue
        if len(tt) > 48:
            out.append(t)
            continue

        jamo_total = sum(1 for ch in tt if _is_jamo(ch))
        if jamo_total < 3:
            out.append(t)
            continue
        if not any(_is_syllable(ch) for ch in tt):
            out.append(t)
            continue

        # find longest contiguous jamo run
        best = None  # (len, start_idx, end_idx)
        i = 0
        while i < len(tt):
            if not _is_jamo(tt[i]):
                i += 1
                continue
            j = i
            while j < len(tt) and _is_jamo(tt[j]):
                j += 1
            run_len = j - i
            if run_len >= 3:
                if best is None or run_len > best[0]:
                    best = (run_len, i, j)
            i = j

        if not best:
            out.append(t)
            continue

        _, rs, re_ = best
        # map to absolute spans in original text
        b_s = a_s + rs
        b_e = a_s + re_
        # Keep only meaningful splits; avoid empty tokens
        if a_s < b_s:
            out.append(_tok(a_s, b_s))
        out.append(_tok(b_s, b_e))
        if b_e < a_e:
            out.append(_tok(b_e, a_e))

    return [x for x in out if (getattr(x, "text", "") or "")]

def _rule_mesh_hangul_keysmash_runs(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix (mesh):
    Merge fragmented Hangul keysmash/garble pieces into a single token.

    Example:
    ["ㅣ", "마", "ㅓㅣ", "넣", "ㄹ"] -> ["ㅣ마ㅓㅣ넣ㄹ"]

    Safety:
    - merges only contiguous tokens (b.start == a.end)
    - merges only short pieces (<=6 chars each) and bounded total length
    - requires >=3 Hangul Jamo chars across the merged run
    - requires presence of vowel jamo (ㅏ..ㅣ) somewhere in the run
    - avoids swallowing clear words: stops when it sees a token with >=3 Hangul syllables
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x3131 <= o <= 0x3163

    def _is_vowel_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x314F <= o <= 0x3163  # ㅏ..ㅣ

    def _syllable_count(s: str) -> int:
        return sum(1 for ch in s if 0xAC00 <= ord(ch) <= 0xD7AF)

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not at:
            out.append(a)
            i += 1
            continue

        # start a candidate run only if this token has any jamo or is a tiny mixed piece
        if len(at) > 6 or (_syllable_count(at) >= 3 and not any(_is_jamo(ch) for ch in at)):
            out.append(a)
            i += 1
            continue

        jamo_cnt = sum(1 for ch in at if _is_jamo(ch))
        has_vowel = any(_is_vowel_jamo(ch) for ch in at)
        start = a_s
        end = a_e
        j = i + 1
        parts = [at]

        while j < n:
            b = tokens[j]
            bt = getattr(b, "text", "") or ""
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if not bt or not (0 <= b_s <= b_e <= len(text)) or b_s != end:
                break
            if len(bt) > 6:
                break
            # stop before swallowing a clear multi-syllable word chunk
            if _syllable_count(bt) >= 3 and not any(_is_jamo(ch) for ch in bt):
                break
            # cap merged length
            if (b_e - start) > 24:
                break
            parts.append(bt)
            end = b_e
            jamo_cnt += sum(1 for ch in bt if _is_jamo(ch))
            has_vowel = has_vowel or any(_is_vowel_jamo(ch) for ch in bt)
            j += 1

        if j > i + 1 and jamo_cnt >= 3 and has_vowel:
            out.append(_tok(start, end))
            i = j
            continue

        out.append(a)
        i += 1

    return out

def _rule_merge_base64_and_heart(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix:
    - Re-merge base64/opaque blobs that got split into many tiny punctuation tokens.
    - Re-merge SNS heart emoticon "<3" if it was split into "<" + "3".

    This is about robustness/UX for "non-sentences", not linguistic correctness.
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_base64_char(ch: str) -> bool:
        o = ord(ch)
        if 48 <= o <= 57 or 65 <= o <= 90 or 97 <= o <= 122:
            return True
        return ch in {"+", "/", "="}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if not at:
            i += 1
            continue

        # Merge "<3"
        if at == "<" and i + 1 < n:
            b = tokens[i + 1]
            bt = getattr(b, "text", "") or ""
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if bt == "3" and b_s == a_e:
                out.append(_tok(a_s, b_e))
                i += 2
                continue

        # Merge base64/opaque blob runs split into many tiny tokens.
        # Conditions:
        # - contiguous in text (no spaces)
        # - chars are base64 set [A-Za-z0-9+/=]
        # - total length bounded
        # - must contain at least one of "+/=" to avoid swallowing normal words
        if len(at) <= 128 and all(_is_base64_char(ch) for ch in at) and not any(ch.isspace() for ch in at):
            start = a_s
            end = a_e
            has_sig = any(ch in at for ch in {"+", "/", "="})
            sig_count = sum(1 for ch in at if ch in {"+", "/", "="})
            total_len = len(at)
            j = i + 1
            while j < n:
                b = tokens[j]
                bt = getattr(b, "text", "") or ""
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if not bt or b_s != end:
                    break
                if any(ch.isspace() for ch in bt):
                    break
                if not all(_is_base64_char(ch) for ch in bt):
                    break
                if len(bt) > 128:
                    break
                total_len += len(bt)
                if total_len > 256:
                    break
                has_sig = has_sig or any(ch in bt for ch in {"+", "/", "="})
                sig_count += sum(1 for ch in bt if ch in {"+", "/", "="})
                end = b_e
                j += 1

            # Require enough "signal" chars to avoid swallowing normal words like "internationalization"
            if j > i + 1 and total_len >= 16 and has_sig and sig_count >= 2:
                out.append(_tok(start, end))
                i = j
                continue

        out.append(a)
        i += 1

    return out

def _rule_split_punct_before_social_marker(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: split punctuation runs that accidentally swallow a social marker.

    Example:
    - "!!!#发布会" might yield a token "!!!#" -> split into "!!!" + "#"

    This makes the downstream `_rule_merge_social_handles` effective.
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        s = int(getattr(t, "start"))
        e = int(getattr(t, "end"))
        if not (0 <= s <= e <= len(text)) or len(tt) < 2:
            out.append(t)
            continue
        last = tt[-1]
        if last in {"#", "@", "$"}:
            head = tt[:-1]
            # Only split when head looks like punctuation/emphasis noise.
            # Keep it conservative to avoid breaking things like "C#".
            if head and all((not ch.isalnum()) for ch in head):
                mid = e - 1
                out.append(_tok(s, mid))
                out.append(_tok(mid, e))
                continue
        out.append(t)
    return [x for x in out if (getattr(x, "text", "") or "")]

def _rule_split_punct_digit_ellipsis_clumps(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: split SNS-ish clumps like "!!!23333……" that sometimes appear in corpora.

    This helps downstream SNS marker tagging and avoids treating such clumps as a single token.

    Safety:
    - only splits when the token is entirely: (punct-run){2,} + (digits){2,} + optional (ellipsis-run){2,}
    - does not touch normal words or mixed alnum words.
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    rx = re.compile(r"^([!！?？~～]{2,})(\d{2,})([.…\.]{2,}|…{2,})?$")

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        s = int(getattr(t, "start"))
        e = int(getattr(t, "end"))
        if not (0 <= s <= e <= len(text)) or len(tt) < 5:
            out.append(t)
            continue
        m = rx.fullmatch(tt)
        if not m:
            out.append(t)
            continue
        g1, g2, g3 = m.group(1), m.group(2), m.group(3)
        i1 = s + len(g1)
        i2 = i1 + len(g2)
        if not (s < i1 < i2 <= e):
            out.append(t)
            continue
        out.append(_tok(s, i1))
        out.append(_tok(i1, i2))
        if g3:
            i3 = i2 + len(g3)
            if i2 < i3 <= e:
                out.append(_tok(i2, i3))
            else:
                out.append(_tok(i2, e))
        continue
    return [x for x in out if (getattr(x, "text", "") or "")]

def _rule_merge_social_handles(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous social handles and tags.

    Examples:
    - "#AI" split as ["#", "AI"] -> ["#AI"]
    - "@user" split as ["@", "user"] -> ["@user"]
    - "$TSLA" split as ["$", "TSLA"] -> ["$TSLA"]
    - "#发布会" split as ["#", "发布", "会"] -> ["#发布会"]

    Safety:
    - only when contiguous in the original text (b.start == a.end)
    - tail must be "handle-like" (letters/numbers/marks plus _.- and CJK)
    - bounded tail length
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_tail_ok(s: str) -> bool:
        if not s or len(s) > 64:
            return False
        for ch in s:
            if ch.isalnum():
                continue
            # allow underscore/dot/hyphen
            if ch in {"_", ".", "-"}:
                continue
            # allow CJK characters in tags
            o = ord(ch)
            if 0x4E00 <= o <= 0x9FFF:
                continue
            # allow Japanese kana in tags
            if 0x3040 <= o <= 0x30FF:
                continue
            # allow Hangul in tags
            if 0xAC00 <= o <= 0xD7AF:
                continue
            return False
        return True

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        if i + 1 < n:
            b = tokens[i + 1]
            at = getattr(a, "text", "") or ""
            bt = getattr(b, "text", "") or ""
            if at in {"#", "@", "$"}:
                a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if 0 <= a_s <= a_e <= len(text) and 0 <= b_s <= b_e <= len(text) and b_s == a_e:
                    # Merge a multi-token tail when it stays contiguous and handle-like.
                    parts = [bt]
                    end = b_e
                    j = i + 2
                    while j < n:
                        c = tokens[j]
                        ct = getattr(c, "text", "") or ""
                        c_s, c_e = int(getattr(c, "start")), int(getattr(c, "end"))
                        if c_s != end:
                            break
                        # stop if too long
                        if sum(len(p) for p in parts) + len(ct) > 64:
                            break
                        # accept only handle-like chunks
                        if not _is_tail_ok(ct):
                            break
                        parts.append(ct)
                        end = c_e
                        j += 1
                    merged_tail = "".join(parts)
                    if _is_tail_ok(merged_tail):
                        out.append(_tok(at + merged_tail, a_s, end))
                        i = j
                        continue
        out.append(a)
        i += 1
    return out

def _rule_merge_emoji_sequences(tokens: List[object], _lang: str, _text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous emoji sequences into a single token.

    Handles common emoji composition characters:
    - ZWJ (U+200D)
    - variation selectors (FE0E/FE0F)
    - skin tone modifiers (U+1F3FB..U+1F3FF)
    - regional indicators (flags) (U+1F1E6..U+1F1FF)

    Safety:
    - only merges when tokens are contiguous (a.end == b.start)
    - only merges tokens that are "emojiish-only"
    - bounded total length
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_emojiish_char(ch: str) -> bool:
        o = ord(ch)
        if ch == "\u200d":  # ZWJ
            return True
        if o in {0xFE0E, 0xFE0F}:  # variation selectors
            return True
        if 0x1F3FB <= o <= 0x1F3FF:  # skin tone modifiers
            return True
        if 0x1F1E6 <= o <= 0x1F1FF:  # regional indicators
            return True
        # common emoji blocks
        if 0x1F300 <= o <= 0x1FAFF:
            return True
        # misc symbols / dingbats often used as emoji
        if 0x2600 <= o <= 0x26FF:
            return True
        if 0x2700 <= o <= 0x27BF:
            return True
        return False

    def _is_emojiish_token(s: str) -> bool:
        return bool(s) and all(_is_emojiish_char(ch) for ch in s)

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        if not _is_emojiish_token(at):
            out.append(a)
            i += 1
            continue

        start = int(getattr(a, "start"))
        end = int(getattr(a, "end"))
        parts = [at]
        j = i + 1
        while j < n:
            b = tokens[j]
            bt = getattr(b, "text", "") or ""
            if not _is_emojiish_token(bt):
                break
            if int(getattr(b, "start")) != end:
                break
            # cap length to avoid pathological merges
            if sum(len(p) for p in parts) + len(bt) > 32:
                break
            parts.append(bt)
            end = int(getattr(b, "end"))
            j += 1

        if j > i + 1:
            out.append(_tok("".join(parts), start, end))
            i = j
            continue
        out.append(a)
        i += 1
    return out

def _rule_ko_sns_markers(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Korean SNS marker splitting (neutral preprocessing):
    - Split embedded/emergent discourse/emotion markers like ㅋㅋ/ㅎㅎ/ㅠㅠ/ㅜㅜ/ㄷㄷ/ㅇㅇ/ㄹㅇ when they
      are glued to neighboring words.

    Examples:
    - "실화냐ㅋㅋ" -> ["실화냐", "ㅋㅋ"]
    - "아ㅋㅋ진짜" -> ["아", "ㅋㅋ", "진짜"]
    - "미쳤네ㅠㅠ" -> ["미쳤네", "ㅠㅠ"]
    - "ㄱㄱ!!!" -> ["ㄱㄱ", "!!!"] (only splits when punctuation is present)

    Safety:
    - only triggers for Hangul/Jamo marker runs (rare in formal text)
    - preserves offsets by slicing the original `text`
    """
    if not tokens:
        return tokens

    ll = (_lang or "").lower().replace("_", "-")
    if ll != "ko":
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    # Core SNS marker runs: laughter/cry/surprise/affirmation/emphasis.
    # - laughter/emotion: ㅋ/ㅎ/ㅠ/ㅜ repeated
    # - surprise: ㄷㄷ
    # - affirmation: ㅇㅇ
    # - emphasis slang: ㄹㅇ, ㅈㄴ, ㅅㅂ, ㅇㅋ etc. (consonant-only runs)
    import re

    rx_marker = re.compile(r"(?:[ㅋㅎㅠㅜ]{2,}|ㄷ{2,}|ㅇ{2,}|[ㄱ-ㅎ]{2,5})")
    rx_punct_tail = re.compile(r"[!！?？~…]+$")

    out: List[object] = []
    for t in tokens:
        s0 = getattr(t, "text", "") or ""
        a_s, a_e = int(getattr(t, "start")), int(getattr(t, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not s0:
            out.append(t)
            continue

        # Split trailing punctuation runs (SNS intensity) when attached to marker-only token.
        # e.g., "ㄱㄱ!!!" -> "ㄱㄱ" + "!!!"
        m_tail = rx_punct_tail.search(s0)
        if m_tail and m_tail.start() > 0:
            head = s0[: m_tail.start()]
            tail = s0[m_tail.start() :]
            # Only do this for jamo/consonant-like heads (avoid splitting normal words like "go!!!")
            if rx_marker.fullmatch(head):
                mid = a_s + m_tail.start()
                out.append(_tok(a_s, mid))
                out.append(_tok(mid, a_e))
                continue

        # Find embedded marker runs; if none, keep as-is.
        hits = list(rx_marker.finditer(s0))
        if not hits:
            out.append(t)
            continue

        # If the whole token is just a marker run, keep as-is (already good token).
        if len(hits) == 1 and hits[0].start() == 0 and hits[0].end() == len(s0):
            out.append(t)
            continue

        # Split into segments using marker spans.
        cur = 0
        any_split = False
        for h in hits:
            hs, he = h.start(), h.end()
            if hs > cur:
                out.append(_tok(a_s + cur, a_s + hs))
            out.append(_tok(a_s + hs, a_s + he))
            if hs != 0 or he != len(s0):
                any_split = True
            cur = he
        if cur < len(s0):
            out.append(_tok(a_s + cur, a_e))
        if not any_split:
            # fallback: no meaningful split detected
            out.append(t)

    # Drop empty tokens defensively
    out2: List[object] = []
    for x in out:
        if (getattr(x, "text", "") or ""):
            out2.append(x)
    return out2

# Register built-in rules (core defaults)
register_global_rule(_rule_merge_digit_groups)
register_global_rule(_rule_demesh_hangul_keysmash_inside_token)
register_global_rule(_rule_mesh_hangul_keysmash_runs)
register_global_rule(_rule_merge_simple_punct_runs)
register_global_rule(_rule_split_punct_before_social_marker)
register_global_rule(_rule_split_punct_digit_ellipsis_clumps)
register_global_rule(_rule_merge_base64_and_heart)
register_global_rule(_rule_merge_social_handles)
register_global_rule(_rule_merge_emoji_sequences)
register_lang_rule("ko", _rule_ko_sns_markers)
register_lang_rule("zh", _rule_zh)
register_lang_rule("zh-cn", _rule_zh)
register_lang_rule("zh-tw", _rule_zh)
register_lang_rule("ja", _rule_ja)
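
For reference, a minimal usage sketch of this module's public entry points, assuming only the Token contract stated in `apply_token_quality` (objects exposing `text`/`start`/`end`). `SimpleToken`, the hand-built over-split token list, and the `drop_empty` rule are illustrative stand-ins and are not part of the package.

# Illustrative sketch (not shipped in the wheel). SimpleToken stands in for a
# tokmor.base.Token-like object; any class with text/start/end attributes works.
from dataclasses import dataclass

from tokmor.token_quality import apply_token_quality, register_lang_rule


@dataclass
class SimpleToken:
    text: str
    start: int
    end: int


def drop_empty(tokens, lang, text):
    # A custom rule has the RuleFn shape: (tokens, lang, text) -> tokens.
    return [t for t in tokens if (t.text or "").strip()]


# Language-specific mini rule, per the module's extension model.
register_lang_rule("en", drop_empty)

text = "Price: 22,000 #AI"
tokens = [
    SimpleToken("Price", 0, 5),
    SimpleToken(":", 5, 6),
    SimpleToken("22", 7, 9),    # over-split "22,000"
    SimpleToken("000", 10, 13),
    SimpleToken("#", 14, 15),   # over-split "#AI"
    SimpleToken("AI", 15, 17),
]

fixed = apply_token_quality(tokens, lang="en", text=text)
print([t.text for t in fixed])  # expected: ['Price', ':', '22,000', '#AI']

The global digit-group rule rejoins "22,000" because the two digit tokens are separated by exactly one comma in the original text, and the social-handle rule rejoins "#AI" because the tail is contiguous and handle-like; all offsets are recomputed by slicing the original text.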