tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1185 @@
"""
Token quality postprocessing (OSS core)
========================================

Goal:
- Neutral token quality fixes only (NOT NER policy).
- Applied uniformly across all tokenizers via TokenizerResult.__post_init__.

Extension model (what you asked for):
- Add **global rules** (safe, language-agnostic)
- Add **script/family rules** (e.g., cjk/brahmic/rtl/indic/space)
- Add **language-specific mini rules** (zh/ja/...)

All rules are automatically applied to every tokenizer output.

Safety guards:
- adjacency checks (offset continuity) when merging
- bounded lookahead
- hard length limits to avoid aggressive merges
"""

from __future__ import annotations

import re
from typing import Callable, Dict, List, Optional


RuleFn = Callable[[List[object], str, str], List[object]]

_GLOBAL_RULES: List[RuleFn] = []
_FAMILY_RULES: Dict[str, List[RuleFn]] = {}
_LANG_RULES: Dict[str, List[RuleFn]] = {}


def register_global_rule(fn: RuleFn) -> None:
    _GLOBAL_RULES.append(fn)


def register_family_rule(family: str, fn: RuleFn) -> None:
    family = (family or "").strip().lower()
    if not family:
        return
    _FAMILY_RULES.setdefault(family, []).append(fn)


def register_lang_rule(lang: str, fn: RuleFn) -> None:
    lang = (lang or "").strip().lower().replace("_", "-")
    if not lang:
        return
    _LANG_RULES.setdefault(lang, []).append(fn)


def _family_for_lang(lang: str) -> Optional[str]:
    """
    Lightweight script/family routing without importing tokenizer modules
    (avoids circular imports).
    """
    ll = (lang or "").lower().replace("_", "-")
    if ll in {"zh", "zh-cn", "zh-tw", "ja", "ko"}:
        return "cjk"
    if ll in {"th", "lo", "my", "km"}:
        return "brahmic"
    if ll in {"ar", "he", "fa", "ur", "yi", "ps"}:
        return "rtl"
    if ll in {"hi", "bn", "gu", "pa", "mr", "ne", "si", "ta", "te", "kn", "ml", "or", "as", "sa"}:
        return "indic"
    return "space"


def apply_token_quality(tokens: List[object], *, lang: str, text: str) -> List[object]:
    """
    Apply token-quality fixes for a given language.

    `tokens` are expected to be tokmor.base.Token-like objects with:
    - text: str
    - start: int
    - end: int
    """
    if not tokens:
        return tokens

    ll = (lang or "").lower().replace("_", "-")
    fam = _family_for_lang(ll) or ""

    out = tokens
    for fn in _GLOBAL_RULES:
        out = fn(out, ll, text)
        if not out:
            return out

    for fn in _FAMILY_RULES.get(fam, []):
        out = fn(out, ll, text)
        if not out:
            return out

    for fn in _LANG_RULES.get(ll, []):
        out = fn(out, ll, text)
        if not out:
            return out

    return out


def _rule_merge_digit_groups(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge common digit-group / decimal splits that happen in whitespace tokenizers.

    Examples:
    - "22,000" tokenized as ["22", "000"] -> ["22,000"]
    - "1.28" tokenized as ["1", "28"] -> ["1.28"]

    Safety:
    - only when separated by a single char in the original text
    - separator must be one of {',', '.', '٬', '٫'}
    - both sides must be all digits
    - bounded total length to avoid over-merging
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok_from_span(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_digits(s: str) -> bool:
        return bool(s) and all(ch.isdigit() for ch in s)

    SEPS = {",", ".", "٬", "٫"}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        if i + 1 < n:
            b = tokens[i + 1]
            a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if 0 <= a_s <= a_e <= len(text) and 0 <= b_s <= b_e <= len(text):
                if b_s == a_e + 1:
                    sep = text[a_e:b_s]
                    if sep in SEPS:
                        at = getattr(a, "text", "") or text[a_s:a_e]
                        bt = getattr(b, "text", "") or text[b_s:b_e]
                        if _is_digits(at) and _is_digits(bt):
                            merged_txt = text[a_s:b_e]
                            if 1 <= len(merged_txt) <= 32:
                                out.append(_tok_from_span(a_s, b_e))
                                i += 2
                                continue
        out.append(a)
        i += 1
    return out


def _rule_zh(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Chinese token quality fixes (conservative):
    - Split over-merged suffix+verbish chunks: ...港聘用 -> ...港 + 聘用
    - Merge dot-connected names (·), allowing small whitespace gaps: 米拉 · 万 托斯 -> 米拉·万托斯
    - Merge short geo-name chains ending with 港 (recover common over-splits): 维 + 伦德 + 尔港 -> 维伦德尔港
    - Split stuck function char in very limited cases: 在维... -> 在 + 维...
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_cjk(ch: str) -> bool:
        return bool(ch) and (("\u4e00" <= ch <= "\u9fff") or ("\u3400" <= ch <= "\u4dbf"))

    def _is_name_piece(s: str) -> bool:
        if not s:
            return False
        for ch in s:
            if ch.isalnum():
                continue
            if _is_cjk(ch):
                continue
            return False
        return True

    def _gap(a_end: int, b_start: int) -> str:
        if a_end < 0 or b_start < 0 or b_start < a_end or b_start > len(text):
            return ""
        return text[a_end:b_start]

    def _gap_is_ws(g: str) -> bool:
        return bool(g) and len(g) <= 3 and g.isspace()

    VERBISH = {
        "抵达",
        "聘用",
        "发布",
        "宣布",
        "表示",
        "前往",
        "返回",
        "访问",
        "会见",
        "举行",
        "发生",
        "完成",
        "启动",
        "加入",
        "离开",
        "进入",
    }
    VERB_TAIL = {"", "了", "着", "过"}

    def _is_verbish_chunk(s: str) -> bool:
        if not s:
            return False
        for v in VERBISH:
            for tail in VERB_TAIL:
                if s == (v + tail):
                    return True
                if s.startswith(v + tail) and len(s) <= len(v + tail) + 1:
                    return True
        return False

    NAME_ENDINGS = {"斯", "尔", "德", "特", "姆", "克", "夫", "诺", "拉", "娜", "尼", "亚", "里", "罗", "多", "恩"}

    def _looks_like_foreign_name_prefix(s: str) -> bool:
        if not s or len(s) < 2 or len(s) > 8:
            return False
        if not all(_is_cjk(ch) for ch in s):
            return False
        return s[-1] in NAME_ENDINGS

    # pass 1: split obvious over-merges inside a single token
    split_out: List[object] = []
    for t in tokens:
        txt = getattr(t, "text", "") or ""
        if len(txt) < 3:
            split_out.append(t)
            continue

        did_split = False

        # 0) Dot-name over-merge guard:
        #    e.g., "米拉·万·托斯抵达维伦德尔港" -> "米拉·万·托斯" + "抵达" + "维伦德尔港"
        if ("·" in txt) and (not did_split):
            for v in VERBISH:
                for tail in VERB_TAIL:
                    vv = v + tail
                    idx_v = txt.find(vv)
                    if idx_v <= 0:
                        continue
                    if "·" not in txt[:idx_v]:
                        continue
                    # Ensure offsets align with text length (avoid corrupt spans).
                    s0 = int(getattr(t, "start"))
                    e0 = int(getattr(t, "end"))
                    if (e0 - s0) != len(txt):
                        continue
                    if len(txt) > 80:
                        continue
                    left = txt[:idx_v]
                    rem = txt[idx_v + len(vv) :]
                    split_out.append(_tok(left, s0, s0 + len(left)))
                    split_out.append(_tok(vv, s0 + idx_v, s0 + idx_v + len(vv)))
                    if rem:
                        split_out.append(_tok(rem, s0 + idx_v + len(vv), e0))
                    did_split = True
                    break
                if did_split:
                    break
        if did_split:
            continue

        idx = txt.rfind("港")
        if idx != -1 and idx < len(txt) - 1:
            rem = txt[idx + 1 :]
            if _is_verbish_chunk(rem):
                left = txt[: idx + 1]
                right = txt[idx + 1 :]
                mid = getattr(t, "start") + len(left)
                split_out.append(_tok(left, getattr(t, "start"), mid))
                split_out.append(_tok(right, mid, getattr(t, "end")))
                did_split = True
        if did_split:
            continue

        for v in VERBISH:
            for tail in VERB_TAIL:
                suffix = v + tail
                if txt.endswith(suffix) and len(txt) > len(suffix):
                    prefix = txt[: -len(suffix)]
                    if _looks_like_foreign_name_prefix(prefix):
                        mid = getattr(t, "end") - len(suffix)
                        split_out.append(_tok(prefix, getattr(t, "start"), mid))
                        split_out.append(_tok(suffix, mid, getattr(t, "end")))
                        did_split = True
                        break
            if did_split:
                break
        if not did_split:
            split_out.append(t)

    split_out.sort(key=lambda x: getattr(x, "start"))

    # pass 1.5: split limited stuck function char prefixes (very conservative)
    FUNC_PREFIX = {"在", "到", "于", "从", "往", "去"}
    split2: List[object] = []
    for idx, t in enumerate(split_out):
        txt = getattr(t, "text", "") or ""
        if len(txt) >= 2 and txt[0] in FUNC_PREFIX:
            rem = txt[1:]
            nxt = split_out[idx + 1] if idx + 1 < len(split_out) else None
            if rem and all(_is_cjk(ch) for ch in rem) and nxt and _is_name_piece(getattr(nxt, "text", "")):
                s0 = getattr(t, "start")
                e0 = getattr(t, "end")
                if (s0 + 1) <= e0:
                    split2.append(_tok(txt[0], s0, s0 + 1))
                    split2.append(_tok(rem, s0 + 1, e0))
                    continue
        split2.append(t)
    split2.sort(key=lambda x: getattr(x, "start"))

    # pass 1.6: merge short geo-name chains ending with 港 (contiguous, bounded)
    geo_merged: List[object] = []
    i = 0
    while i < len(split2):
        t0 = split2[i]
        if getattr(t0, "text", None) in FUNC_PREFIX:
            geo_merged.append(t0)
            i += 1
            continue

        parts: List[object] = []
        j = i
        merged = False
        while j < len(split2) and len(parts) < 5:
            cur = split2[j]
            if parts and getattr(parts[-1], "end") != getattr(cur, "start"):
                break
            cur_txt = getattr(cur, "text", "") or ""
            if not cur_txt or not all(_is_cjk(ch) for ch in cur_txt):
                break
            # Never merge verbish chunks into toponyms (keeps dot-name split + verb separate).
            if _is_verbish_chunk(cur_txt):
                break
            parts.append(cur)
            joined = "".join(getattr(p, "text", "") or "" for p in parts)
            if len(joined) > 12:
                break
            if joined.endswith("港") and len(parts) >= 2:
                geo_merged.append(_tok(joined, getattr(parts[0], "start"), getattr(parts[-1], "end")))
                i = j + 1
                merged = True
                break
            j += 1
        if not merged:
            geo_merged.append(t0)
            i += 1

    # pass 2: merge dot-connected names (·) (gap-aware, bounded)
    merged_names: List[object] = []
    i = 0
    while i < len(geo_merged):
        t0 = geo_merged[i]
        if not _is_name_piece(getattr(t0, "text", "")):
            merged_names.append(t0)
            i += 1
            continue

        parts = [t0]
        connectors: List[str] = []
        j = i + 1
        saw_dot = False

        while j < len(geo_merged):
            prev = parts[-1]
            nxt = geo_merged[j]
            g = _gap(getattr(prev, "end"), getattr(nxt, "start"))
            g_strip = g.strip()
            nxt_text = getattr(nxt, "text", "") or ""

            # Hard stop: do NOT let a dot-name merge swallow verbs / location chunks.
            # Example to avoid: "米拉·万·托斯抵达维伦德尔港" becoming one token.
            if _is_verbish_chunk(nxt_text):
                break
            if nxt_text and (nxt_text == "港" or nxt_text.endswith("港") or nxt_text.endswith("市") or nxt_text.endswith("省") or nxt_text.endswith("县") or nxt_text.endswith("区") or nxt_text.endswith("州")):
                break

            if g_strip == "·" and _is_name_piece(getattr(nxt, "text", "")):
                connectors.append("dot")
                parts.append(nxt)
                saw_dot = True
                j += 1
                continue

            if (g == "" or _gap_is_ws(g)) and (getattr(nxt, "text", "") == "·") and (j + 1) < len(geo_merged):
                nxt2 = geo_merged[j + 1]
                g2 = _gap(getattr(nxt, "end"), getattr(nxt2, "start"))
                if (g2 == "" or _gap_is_ws(g2)) and _is_name_piece(getattr(nxt2, "text", "")):
                    connectors.append("dot")
                    parts.append(nxt2)
                    saw_dot = True
                    j += 2
                    continue

            # Allow limited no-dot concatenation only *within* a dot-name span:
            # - must already have seen a dot
            # - and the previous join must have been a dot (immediately-after-dot window)
            # - and token must be short (name piece), and must not be verbish (checked above)
            if saw_dot and (g == "" or _gap_is_ws(g)) and _is_name_piece(nxt_text):
                prev_join = connectors[-1] if connectors else ""
                if prev_join == "dot" and 1 <= len(nxt_text) <= 3:
                    connectors.append("ws")
                    parts.append(nxt)
                    j += 1
                    continue

            break

        if saw_dot and len(parts) >= 2:
            if len(parts) <= 6 and sum(len(getattr(p, "text", "") or "") for p in parts) <= 40:
                out_txt = getattr(parts[0], "text", "") or ""
                for k in range(1, len(parts)):
                    conn = connectors[k - 1] if (k - 1) < len(connectors) else "ws"
                    if conn == "dot":
                        out_txt += "·" + (getattr(parts[k], "text", "") or "")
                    else:
                        out_txt += (getattr(parts[k], "text", "") or "")
                merged_names.append(_tok(out_txt, getattr(parts[0], "start"), getattr(parts[-1], "end")))
                i = j
                continue

        merged_names.append(t0)
        i += 1

    # pass 3: merge X + 港 -> X港 (contiguous)
    out3: List[object] = []
    i = 0
    while i < len(merged_names):
        a = merged_names[i]
        if i + 1 < len(merged_names):
            b = merged_names[i + 1]
            if getattr(a, "end") == getattr(b, "start") and getattr(b, "text", "") == "港":
                at = getattr(a, "text", "") or ""
                if at and len(at) <= 12 and all((_is_cjk(ch) or ch.isalnum()) for ch in at):
                    out3.append(_tok(at + "港", getattr(a, "start"), getattr(b, "end")))
                    i += 2
                    continue
        out3.append(a)
        i += 1

    return out3


def _rule_ja(tokens: List[object], _lang: str, _text: str) -> List[object]:
    """
    Japanese compound re-joining (conservative):
    - Katakana + Kanji suffix merge: モルディン + 港 -> モルディン港
      suffix candidates: 港/駅/空港/都/道/府/県/市/区/町/村
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    JA_SUFFIXES = {"港", "駅", "空港", "都", "道", "府", "県", "市", "区", "町", "村"}

    def _is_katakana(ch: str) -> bool:
        return "\u30a0" <= ch <= "\u30ff"

    def _is_katakana_run(s: str) -> bool:
        if not s:
            return False
        for ch in s:
            if _is_katakana(ch) or ch in {"ー", "・"}:
                continue
            return False
        return True

    out: List[object] = []
    i = 0
    while i < len(tokens):
        a = tokens[i]
        if i + 1 < len(tokens):
            b = tokens[i + 1]
            if getattr(a, "end") == getattr(b, "start") and _is_katakana_run(getattr(a, "text", "") or "") and (getattr(b, "text", "") in JA_SUFFIXES):
                at = getattr(a, "text", "") or ""
                bt = getattr(b, "text", "") or ""
                if len(at) <= 24 and len(at + bt) <= 28:
                    out.append(_tok(at + bt, getattr(a, "start"), getattr(b, "end")))
                    i += 2
                    continue
        out.append(a)
        i += 1
    return out


def _rule_merge_simple_punct_runs(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous single-character punctuation tokens into runs.

    Example:
    - ["!", "!", "!"] -> ["!!!"]
    - [".", ".", "."] -> ["..."]
    - ["?", "?", "!", "!"] -> ["??!!"]

    This helps SNS discourse tagging and reduces downstream fragmentation.
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok_from_span(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    P = {"!", "！", "?", "？", ".", "…", "~", "～"}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if len(at) == 1 and at in P and 0 <= a_s <= a_e <= len(text):
            j = i + 1
            end = a_e
            while j < n:
                b = tokens[j]
                bt = getattr(b, "text", "") or ""
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if not (len(bt) == 1 and bt in P and 0 <= b_s <= b_e <= len(text)):
                    break
                if b_s != end:
                    break
                end = b_e
                j += 1
            if j > i + 1:
                out.append(_tok_from_span(a_s, end))
                i = j
                continue
        out.append(a)
        i += 1
    return out


def _rule_demesh_hangul_keysmash_inside_token(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix (demesh):
    Split tokens that contain an internal Hangul Jamo "keysmash/garble" run.

    Example (noisy SNS):
    "아ㅣ마ㅓㅣ넣ㄹ아이고" -> ["아", "ㅣ마ㅓㅣ넣ㄹ", "아이고"]

    Safety:
    - triggers only when the token contains >=3 Hangul Jamo chars (U+3131..U+3163)
    - requires at least one Hangul syllable char (U+AC00..U+D7AF) in the same token
    - splits only around the *longest contiguous jamo run* (length>=3)
    - bounded total token length
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x3131 <= o <= 0x3163

    def _is_syllable(ch: str) -> bool:
        o = ord(ch)
        return 0xAC00 <= o <= 0xD7AF

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        a_s, a_e = int(getattr(t, "start")), int(getattr(t, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not tt:
            out.append(t)
            continue
        if len(tt) > 48:
            out.append(t)
            continue

        jamo_total = sum(1 for ch in tt if _is_jamo(ch))
        if jamo_total < 3:
            out.append(t)
            continue
        if not any(_is_syllable(ch) for ch in tt):
            out.append(t)
            continue

        # find longest contiguous jamo run
        best = None  # (len, start_idx, end_idx)
        i = 0
        while i < len(tt):
            if not _is_jamo(tt[i]):
                i += 1
                continue
            j = i
            while j < len(tt) and _is_jamo(tt[j]):
                j += 1
            run_len = j - i
            if run_len >= 3:
                if best is None or run_len > best[0]:
                    best = (run_len, i, j)
            i = j

        if not best:
            out.append(t)
            continue

        _, rs, re_ = best
        # map to absolute spans in original text
        b_s = a_s + rs
        b_e = a_s + re_
        # Keep only meaningful splits; avoid empty tokens
        if a_s < b_s:
            out.append(_tok(a_s, b_s))
        out.append(_tok(b_s, b_e))
        if b_e < a_e:
            out.append(_tok(b_e, a_e))

    return [x for x in out if (getattr(x, "text", "") or "")]


def _rule_mesh_hangul_keysmash_runs(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix (mesh):
    Merge fragmented Hangul keysmash/garble pieces into a single token.

    Example:
    ["ㅣ", "마", "ㅓㅣ", "넣", "ㄹ"] -> ["ㅣ마ㅓㅣ넣ㄹ"]

    Safety:
    - merges only contiguous tokens (b.start == a.end)
    - merges only short pieces (<=6 chars each) and bounded total length
    - requires >=3 Hangul Jamo chars across the merged run
    - requires presence of vowel jamo (ㅏ..ㅣ) somewhere in the run
    - avoids swallowing clear words: stops when it sees a token with >=3 Hangul syllables
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x3131 <= o <= 0x3163

    def _is_vowel_jamo(ch: str) -> bool:
        o = ord(ch)
        return 0x314F <= o <= 0x3163  # ㅏ..ㅣ

    def _syllable_count(s: str) -> int:
        return sum(1 for ch in s if 0xAC00 <= ord(ch) <= 0xD7AF)

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not at:
            out.append(a)
            i += 1
            continue

        # start a candidate run only if this token has any jamo or is a tiny mixed piece
        if len(at) > 6 or (_syllable_count(at) >= 3 and not any(_is_jamo(ch) for ch in at)):
            out.append(a)
            i += 1
            continue

        jamo_cnt = sum(1 for ch in at if _is_jamo(ch))
        has_vowel = any(_is_vowel_jamo(ch) for ch in at)
        start = a_s
        end = a_e
        j = i + 1
        parts = [at]

        while j < n:
            b = tokens[j]
            bt = getattr(b, "text", "") or ""
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if not bt or not (0 <= b_s <= b_e <= len(text)) or b_s != end:
                break
            if len(bt) > 6:
                break
            # stop before swallowing a clear multi-syllable word chunk
            if _syllable_count(bt) >= 3 and not any(_is_jamo(ch) for ch in bt):
                break
            # cap merged length
            if (b_e - start) > 24:
                break
            parts.append(bt)
            end = b_e
            jamo_cnt += sum(1 for ch in bt if _is_jamo(ch))
            has_vowel = has_vowel or any(_is_vowel_jamo(ch) for ch in bt)
            j += 1

        if j > i + 1 and jamo_cnt >= 3 and has_vowel:
            out.append(_tok(start, end))
            i = j
            continue

        out.append(a)
        i += 1

    return out


def _rule_merge_base64_and_heart(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix:
    - Re-merge base64/opaque blobs that got split into many tiny punctuation tokens.
    - Re-merge SNS heart emoticon "<3" if it was split into "<" + "3".

    This is about robustness/UX for "non-sentences", not linguistic correctness.
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    def _is_base64_char(ch: str) -> bool:
        o = ord(ch)
        if 48 <= o <= 57 or 65 <= o <= 90 or 97 <= o <= 122:
            return True
        return ch in {"+", "/", "="}

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
        if not at:
            i += 1
            continue

        # Merge "<3"
        if at == "<" and i + 1 < n:
            b = tokens[i + 1]
            bt = getattr(b, "text", "") or ""
            b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
            if bt == "3" and b_s == a_e:
                out.append(_tok(a_s, b_e))
                i += 2
                continue

        # Merge base64/opaque blob runs split into many tiny tokens.
        # Conditions:
        # - contiguous in text (no spaces)
        # - chars are base64 set [A-Za-z0-9+/=]
        # - total length bounded
        # - must contain at least one of "+/=" to avoid swallowing normal words
        if len(at) <= 128 and all(_is_base64_char(ch) for ch in at) and not any(ch.isspace() for ch in at):
            start = a_s
            end = a_e
            has_sig = any(ch in at for ch in {"+", "/", "="})
            sig_count = sum(1 for ch in at if ch in {"+", "/", "="})
            total_len = len(at)
            j = i + 1
            while j < n:
                b = tokens[j]
                bt = getattr(b, "text", "") or ""
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if not bt or b_s != end:
                    break
                if any(ch.isspace() for ch in bt):
                    break
                if not all(_is_base64_char(ch) for ch in bt):
                    break
                if len(bt) > 128:
                    break
                total_len += len(bt)
                if total_len > 256:
                    break
                has_sig = has_sig or any(ch in bt for ch in {"+", "/", "="})
                sig_count += sum(1 for ch in bt if ch in {"+", "/", "="})
                end = b_e
                j += 1

            # Require enough "signal" chars to avoid swallowing normal words like "internationalization"
            if j > i + 1 and total_len >= 16 and has_sig and sig_count >= 2:
                out.append(_tok(start, end))
                i = j
                continue

        out.append(a)
        i += 1

    return out


def _rule_split_punct_before_social_marker(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: split punctuation runs that accidentally swallow a social marker.

    Example:
    - "!!!#发布会" might yield a token "!!!#" -> split into "!!!" + "#"

    This makes the downstream `_rule_merge_social_handles` effective.
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        s = int(getattr(t, "start"))
        e = int(getattr(t, "end"))
        if not (0 <= s <= e <= len(text)) or len(tt) < 2:
            out.append(t)
            continue
        last = tt[-1]
        if last in {"#", "@", "$"}:
            head = tt[:-1]
            # Only split when head looks like punctuation/emphasis noise.
            # Keep it conservative to avoid breaking things like "C#".
            if head and all((not ch.isalnum()) for ch in head):
                mid = e - 1
                out.append(_tok(s, mid))
                out.append(_tok(mid, e))
                continue
        out.append(t)
    return [x for x in out if (getattr(x, "text", "") or "")]


def _rule_split_punct_digit_ellipsis_clumps(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: split SNS-ish clumps like "!!!23333……" that sometimes appear in corpora.

    This helps downstream SNS marker tagging and avoids treating such clumps as a single token.

    Safety:
    - only splits when the token is entirely: (punct-run){2,} + (digits){2,} + optional (ellipsis-run){2,}
    - does not touch normal words or mixed alnum words.
    """
    if not tokens:
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    rx = re.compile(r"^([!！?？~～]{2,})(\d{2,})([.…\.]{2,}|…{2,})?$")

    out: List[object] = []
    for t in tokens:
        tt = getattr(t, "text", "") or ""
        s = int(getattr(t, "start"))
        e = int(getattr(t, "end"))
        if not (0 <= s <= e <= len(text)) or len(tt) < 5:
            out.append(t)
            continue
        m = rx.fullmatch(tt)
        if not m:
            out.append(t)
            continue
        g1, g2, g3 = m.group(1), m.group(2), m.group(3)
        i1 = s + len(g1)
        i2 = i1 + len(g2)
        if not (s < i1 < i2 <= e):
            out.append(t)
            continue
        out.append(_tok(s, i1))
        out.append(_tok(i1, i2))
        if g3:
            i3 = i2 + len(g3)
            if i2 < i3 <= e:
                out.append(_tok(i2, i3))
            else:
                out.append(_tok(i2, e))
        continue
    return [x for x in out if (getattr(x, "text", "") or "")]


def _rule_merge_social_handles(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous social handles and tags.

    Examples:
    - "#AI" split as ["#", "AI"] -> ["#AI"]
    - "@user" split as ["@", "user"] -> ["@user"]
    - "$TSLA" split as ["$", "TSLA"] -> ["$TSLA"]
    - "#发布会" split as ["#", "发布", "会"] -> ["#发布会"]

    Safety:
    - only when contiguous in the original text (b.start == a.end)
    - tail must be "handle-like" (letters/numbers/marks plus _.- and CJK)
    - bounded tail length
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_tail_ok(s: str) -> bool:
        if not s or len(s) > 64:
            return False
        for ch in s:
            if ch.isalnum():
                continue
            # allow underscore/dot/hyphen
            if ch in {"_", ".", "-"}:
                continue
            # allow CJK characters in tags
            o = ord(ch)
            if 0x4E00 <= o <= 0x9FFF:
                continue
            # allow Japanese kana in tags
            if 0x3040 <= o <= 0x30FF:
                continue
            # allow Hangul in tags
            if 0xAC00 <= o <= 0xD7AF:
                continue
            return False
        return True

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        if i + 1 < n:
            b = tokens[i + 1]
            at = getattr(a, "text", "") or ""
            bt = getattr(b, "text", "") or ""
            if at in {"#", "@", "$"}:
                a_s, a_e = int(getattr(a, "start")), int(getattr(a, "end"))
                b_s, b_e = int(getattr(b, "start")), int(getattr(b, "end"))
                if 0 <= a_s <= a_e <= len(text) and 0 <= b_s <= b_e <= len(text) and b_s == a_e:
                    # Merge a multi-token tail when it stays contiguous and handle-like.
                    parts = [bt]
                    end = b_e
                    j = i + 2
                    while j < n:
                        c = tokens[j]
                        ct = getattr(c, "text", "") or ""
                        c_s, c_e = int(getattr(c, "start")), int(getattr(c, "end"))
                        if c_s != end:
                            break
                        # stop if too long
                        if sum(len(p) for p in parts) + len(ct) > 64:
                            break
                        # accept only handle-like chunks
                        if not _is_tail_ok(ct):
                            break
                        parts.append(ct)
                        end = c_e
                        j += 1
                    merged_tail = "".join(parts)
                    if _is_tail_ok(merged_tail):
                        out.append(_tok(at + merged_tail, a_s, end))
                        i = j
                        continue
        out.append(a)
        i += 1
    return out


def _rule_merge_emoji_sequences(tokens: List[object], _lang: str, _text: str) -> List[object]:
    """
    Global neutral fix: merge contiguous emoji sequences into a single token.

    Handles common emoji composition characters:
    - ZWJ (U+200D)
    - variation selectors (FE0E/FE0F)
    - skin tone modifiers (U+1F3FB..U+1F3FF)
    - regional indicators (flags) (U+1F1E6..U+1F1FF)

    Safety:
    - only merges when tokens are contiguous (a.end == b.start)
    - only merges tokens that are "emojiish-only"
    - bounded total length
    """
    if not tokens or len(tokens) < 2:
        return tokens

    TokenType = type(tokens[0])

    def _tok(text_: str, start: int, end: int):
        return TokenType(text=text_, start=start, end=end)

    def _is_emojiish_char(ch: str) -> bool:
        o = ord(ch)
        if ch == "\u200d":  # ZWJ
            return True
        if o in {0xFE0E, 0xFE0F}:  # variation selectors
            return True
        if 0x1F3FB <= o <= 0x1F3FF:  # skin tone modifiers
            return True
        if 0x1F1E6 <= o <= 0x1F1FF:  # regional indicators
            return True
        # common emoji blocks
        if 0x1F300 <= o <= 0x1FAFF:
            return True
        # misc symbols / dingbats often used as emoji
        if 0x2600 <= o <= 0x26FF:
            return True
        if 0x2700 <= o <= 0x27BF:
            return True
        return False

    def _is_emojiish_token(s: str) -> bool:
        return bool(s) and all(_is_emojiish_char(ch) for ch in s)

    out: List[object] = []
    i = 0
    n = len(tokens)
    while i < n:
        a = tokens[i]
        at = getattr(a, "text", "") or ""
        if not _is_emojiish_token(at):
            out.append(a)
            i += 1
            continue

        start = int(getattr(a, "start"))
        end = int(getattr(a, "end"))
        parts = [at]
        j = i + 1
        while j < n:
            b = tokens[j]
            bt = getattr(b, "text", "") or ""
            if not _is_emojiish_token(bt):
                break
            if int(getattr(b, "start")) != end:
                break
            # cap length to avoid pathological merges
            if sum(len(p) for p in parts) + len(bt) > 32:
                break
            parts.append(bt)
            end = int(getattr(b, "end"))
            j += 1

        if j > i + 1:
            out.append(_tok("".join(parts), start, end))
            i = j
            continue
        out.append(a)
        i += 1
    return out


def _rule_ko_sns_markers(tokens: List[object], _lang: str, text: str) -> List[object]:
    """
    Korean SNS marker splitting (neutral preprocessing):
    - Split embedded/emergent discourse/emotion markers like ㅋㅋ/ㅎㅎ/ㅠㅠ/ㅜㅜ/ㄷㄷ/ㅇㅇ/ㄹㅇ when they
      are glued to neighboring words.

    Examples:
    - "실화냐ㅋㅋ" -> ["실화냐", "ㅋㅋ"]
    - "아ㅋㅋ진짜" -> ["아", "ㅋㅋ", "진짜"]
    - "미쳤네ㅠㅠ" -> ["미쳤네", "ㅠㅠ"]
    - "ㄱㄱ!!!" -> ["ㄱㄱ", "!!!"] (only splits when punctuation is present)

    Safety:
    - only triggers for Hangul/Jamo marker runs (rare in formal text)
    - preserves offsets by slicing the original `text`
    """
    if not tokens:
        return tokens

    ll = (_lang or "").lower().replace("_", "-")
    if ll != "ko":
        return tokens

    TokenType = type(tokens[0])

    def _tok(s: int, e: int):
        return TokenType(text=text[s:e], start=s, end=e)

    # Core SNS marker runs: laughter/cry/surprise/affirmation/emphasis.
    # - laughter/emotion: ㅋ/ㅎ/ㅠ/ㅜ repeated
    # - surprise: ㄷㄷ
    # - affirmation: ㅇㅇ
    # - emphasis slang: ㄹㅇ, ㅈㄴ, ㅅㅂ, ㅇㅋ etc. (consonant-only runs)
    import re

    rx_marker = re.compile(r"(?:[ㅋㅎㅠㅜ]{2,}|ㄷ{2,}|ㅇ{2,}|[ㄱ-ㅎ]{2,5})")
    rx_punct_tail = re.compile(r"[!！?？~…]+$")

    out: List[object] = []
    for t in tokens:
        s0 = getattr(t, "text", "") or ""
        a_s, a_e = int(getattr(t, "start")), int(getattr(t, "end"))
        if not (0 <= a_s <= a_e <= len(text)) or not s0:
            out.append(t)
            continue

        # Split trailing punctuation runs (SNS intensity) when attached to marker-only token.
        # e.g., "ㄱㄱ!!!" -> "ㄱㄱ" + "!!!"
        m_tail = rx_punct_tail.search(s0)
        if m_tail and m_tail.start() > 0:
            head = s0[: m_tail.start()]
            tail = s0[m_tail.start() :]
            # Only do this for jamo/consonant-like heads (avoid splitting normal words like "go!!!")
            if rx_marker.fullmatch(head):
                mid = a_s + m_tail.start()
                out.append(_tok(a_s, mid))
                out.append(_tok(mid, a_e))
                continue

        # Find embedded marker runs; if none, keep as-is.
        hits = list(rx_marker.finditer(s0))
        if not hits:
            out.append(t)
            continue

        # If the whole token is just a marker run, keep as-is (already good token).
        if len(hits) == 1 and hits[0].start() == 0 and hits[0].end() == len(s0):
            out.append(t)
            continue

        # Split into segments using marker spans.
        cur = 0
        any_split = False
        for h in hits:
            hs, he = h.start(), h.end()
            if hs > cur:
                out.append(_tok(a_s + cur, a_s + hs))
            out.append(_tok(a_s + hs, a_s + he))
            if hs != 0 or he != len(s0):
                any_split = True
            cur = he
        if cur < len(s0):
            out.append(_tok(a_s + cur, a_e))
        if not any_split:
            # fallback: no meaningful split detected
            out.append(t)

    # Drop empty tokens defensively
    out2: List[object] = []
    for x in out:
        if (getattr(x, "text", "") or ""):
            out2.append(x)
    return out2


# Register built-in rules (core defaults)
register_global_rule(_rule_merge_digit_groups)
register_global_rule(_rule_demesh_hangul_keysmash_inside_token)
register_global_rule(_rule_mesh_hangul_keysmash_runs)
register_global_rule(_rule_merge_simple_punct_runs)
register_global_rule(_rule_split_punct_before_social_marker)
register_global_rule(_rule_split_punct_digit_ellipsis_clumps)
register_global_rule(_rule_merge_base64_and_heart)
register_global_rule(_rule_merge_social_handles)
register_global_rule(_rule_merge_emoji_sequences)
register_lang_rule("ko", _rule_ko_sns_markers)
register_lang_rule("zh", _rule_zh)
register_lang_rule("zh-cn", _rule_zh)
register_lang_rule("zh-tw", _rule_zh)
register_lang_rule("ja", _rule_ja)
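
For reference, a minimal usage sketch of the registry API defined in this module. The `Tok` dataclass, the sample text, and the `_drop_whitespace_tokens` rule are illustrative stand-ins (the real objects are tokmor.base.Token-like tokens produced by the package's tokenizers); `apply_token_quality` and `register_lang_rule` are the functions shown above.

    from dataclasses import dataclass

    from tokmor.token_quality import apply_token_quality, register_lang_rule


    @dataclass
    class Tok:
        # Stand-in for a tokmor.base.Token-like object (text/start/end fields).
        text: str
        start: int
        end: int


    def _drop_whitespace_tokens(tokens, lang, text):
        # Hypothetical language-specific rule: runs after the global rules for "en" input.
        return [t for t in tokens if (t.text or "").strip()]


    register_lang_rule("en", _drop_whitespace_tokens)

    text = "Revenue grew to 22,000 units #AI !!!"
    pieces = ["Revenue", "grew", "to", "22", "000", "units", "#", "AI", "!", "!", "!"]
    raw, pos = [], 0
    for piece in pieces:
        # Simulate a whitespace-ish tokenizer that over-splits "22,000", "#AI", and "!!!".
        pos = text.find(piece, pos)
        raw.append(Tok(piece, pos, pos + len(piece)))
        pos += len(piece)

    fixed = apply_token_quality(raw, lang="en", text=text)
    print([t.text for t in fixed])
    # With the built-in global rules registered above, this should print:
    # ['Revenue', 'grew', 'to', '22,000', 'units', '#AI', '!!!']

The digit-group, social-handle, and punctuation-run merges all require offset contiguity in the original `text`, which is why the sketch constructs tokens with real start/end spans rather than bare strings.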