tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/brahmic.py ADDED
@@ -0,0 +1,516 @@
1
+ """
2
+ Brahmic Tokenizer (No External Dependencies)
3
+ =============================================
4
+
5
+ Tokenizer for Thai, Lao, Myanmar, and Khmer
6
+ Implemented in pure Python, with no external libraries
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ import re
13
+ import unicodedata
14
+ from typing import List, Set, Dict, Optional
15
+ from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
16
+ from .resources import resolve_seg_lexicon_path, resolve_sea_wordlist_path
17
+ from .morphology.thai_native import ThaiNativeAnalyzer
18
+
19
+
20
+ # ============================================================
21
+ # Built-in seed dictionaries (REMOVED in OSS core)
22
+ # ============================================================
23
+ #
24
+ # OSS policy:
25
+ # - Do not embed language wordlists in the core code distribution.
26
+ # - For SEA no-space tokenization quality, provide optional offline assets via:
27
+ # TOKMOR_DATA_DIR/seg_lexicon/{lang}_wordfreq.pkl and/or {lang}_wordlist.(pkl|txt)
28
+ #
29
+ # Keep these as empty sets so the tokenizer still works, but quality depends on assets.
30
+ THAI_DICT: Set[str] = set()
31
+ LAO_DICT: Set[str] = set()
32
+ MYANMAR_DICT: Set[str] = set()
33
+ KHMER_DICT: Set[str] = set()
34
+
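The OSS policy above says segmentation quality depends on optional offline assets under TOKMOR_DATA_DIR/seg_lexicon. As a rough, hypothetical sketch of that layout (not part of the package), the snippet below writes a tiny Thai wordlist pickle into such a directory; the example words and the assumption that TOKMOR_DATA_DIR is already set are illustrative only.

    # Illustrative only: place a {lang}_wordlist.pkl where the layout above expects it.
    import os
    import pickle
    from pathlib import Path

    data_dir = Path(os.environ["TOKMOR_DATA_DIR"])        # assumed to be set by the user
    lexicon_dir = data_dir / "seg_lexicon"
    lexicon_dir.mkdir(parents=True, exist_ok=True)

    words = {"ประเทศไทย", "มาตรการ"}                       # toy entries; a real list would be much larger
    (lexicon_dir / "th_wordlist.pkl").write_bytes(pickle.dumps(words))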
35
+
36
+ class BrahmicTokenizer(BaseTokenizer):
37
+ """
38
+ Brahmic script tokenizer (No External Dependencies)
39
+ """
40
+
41
+ SUPPORTED_LANGUAGES = {'th', 'lo', 'my', 'km'}
42
+
43
+ # Unicode ranges
44
+ THAI = '\u0e00-\u0e7f'
45
+ LAO = '\u0e80-\u0eff'
46
+ MYANMAR = '\u1000-\u109f'
47
+ KHMER = '\u1780-\u17ff'
48
+
49
+ # Per-language dictionaries
50
+ DICTIONARIES = {
51
+ 'th': THAI_DICT,
52
+ 'lo': LAO_DICT,
53
+ 'my': MYANMAR_DICT,
54
+ 'km': KHMER_DICT,
55
+ }
56
+
57
+ def __init__(self, lang: str, use_morphology: bool = False):
58
+ super().__init__(lang, use_morphology)
59
+ self._setup_patterns()
60
+ # Keep a copy of the small built-in dictionary (high-precision).
61
+ self._builtin_dictionary: Set[str] = set(self.DICTIONARIES.get(lang, set()))
62
+ self._dictionary = set(self._builtin_dictionary)
63
+ self._max_word_len = max(len(w) for w in self._dictionary) if self._dictionary else 20
64
+ self._wordfreq: Optional[Dict[str, int]] = None
65
+ self._wordfreq_max_len: int = 2
66
+ self._wordlist_size: int = 0
67
+ self._load_seg_lexicon()
68
+ self._load_wordlist()
69
+ self._max_word_len = max(len(w) for w in self._dictionary) if self._dictionary else 20
70
+
71
+ def _load_wordlist(self) -> None:
72
+ """
73
+ Optional SEA tokenizer wordlist (offline), used for longest-match segmentation.
74
+
75
+ File:
76
+ seg_lexicon/{lang}_wordlist.pkl (set[str]) or .txt (one token per line)
77
+ """
78
+ p = resolve_sea_wordlist_path(self.lang)
79
+ if not p:
80
+ return
81
+ try:
82
+ words: set[str] = set()
83
+ if p.suffix.lower() == ".pkl":
84
+ import pickle
85
+ obj = pickle.loads(p.read_bytes())
86
+ if isinstance(obj, set):
87
+ words = {w for w in obj if isinstance(w, str) and w}
88
+ elif isinstance(obj, dict):
89
+ # allow {word:freq} too
90
+ words = {w for w in obj.keys() if isinstance(w, str) and w}
91
+ else:
92
+ # txt
93
+ for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
94
+ w = line.strip()
95
+ if w:
96
+ words.add(w)
97
+ if not words:
98
+ return
99
+ # Filter very short entries for km/lo/my to avoid syllable-chunking
100
+ if self.lang in {"km", "lo", "my"}:
101
+ words = {w for w in words if len(w) >= 2}
102
+ self._wordlist_size = len(words)
103
+ self._dictionary = set(self._dictionary) | set(words)
104
+ except Exception:
105
+ return
106
+
107
+ def _load_seg_lexicon(self) -> None:
108
+ """
109
+ Optional segmentation lexicon for no-space scripts:
110
+ seg_lexicon/{lang}_wordfreq.pkl (dict[str,int])
111
+ """
112
+ p = resolve_seg_lexicon_path(self.lang)
113
+ if not p:
114
+ return
115
+ try:
116
+ import pickle
117
+ obj = pickle.loads(p.read_bytes())
118
+ if not isinstance(obj, dict):
119
+ return
120
+ wf: Dict[str, int] = {}
121
+ mx = 1
122
+ for k, v in obj.items():
123
+ if isinstance(k, str) and k and isinstance(v, int) and v > 0:
124
+ # For SEA scripts (km/lo/my), 2-gram frequencies tend to be too "syllable-like"
125
+ # and can cause degenerate segmentation into short chunks.
126
+ # Filter out very short entries to keep Viterbi candidates more word-like.
127
+ if self.lang in {"km", "lo", "my"} and len(k) < 3:
128
+ continue
129
+ wf[k] = v
130
+ if len(k) > mx:
131
+ mx = len(k)
132
+ if wf:
133
+ # keep bounds conservative (Thai/Lao/Myanmar/Khmer words rarely exceed 12 chars)
134
+ self._wordfreq = wf
135
+ self._wordfreq_max_len = max(2, min(int(mx), 12))
136
+ except Exception:
137
+ return
138
+
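The loader above accepts a plain dict[str, int] pickle. A minimal sketch of producing one from an already-segmented corpus, assuming a hypothetical one-sentence-per-line, space-separated file; the resulting th_wordfreq.pkl would then be copied into the seg_lexicon directory described earlier.

    import pickle
    from collections import Counter
    from pathlib import Path

    counts = Counter()
    with open("th_corpus_segmented.txt", encoding="utf-8") as fh:   # hypothetical pre-segmented corpus
        for line in fh:
            counts.update(tok for tok in line.split() if tok)

    Path("th_wordfreq.pkl").write_bytes(pickle.dumps(dict(counts)))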
139
+ def _setup_patterns(self):
140
+ pattern_map = {
141
+ 'th': self.THAI,
142
+ 'lo': self.LAO,
143
+ 'my': self.MYANMAR,
144
+ 'km': self.KHMER,
145
+ }
146
+ script_range = pattern_map.get(self.lang, self.THAI)
147
+ self._script_pattern = re.compile(f'[{script_range}]+')
148
+ self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
149
+
150
+ def _init_morphology(self):
151
+ # External dependencies removed - use the built-in dictionary
152
+ self._morphology_analyzer = None
153
+
154
+ def tokenize(self, text: str) -> TokenizerResult:
155
+ text = self.clean_text(text)
156
+ if not text:
157
+ return TokenizerResult(tokens=[], text=text, lang=self.lang)
158
+
159
+ tokens: List[Token] = []
160
+
161
+ for match in self._script_pattern.finditer(text):
162
+ chunk = match.group()
163
+ # NOTE:
164
+ # For km/lo/my, naive n-gram wordfreq lexicons tend to over-segment into short chunks.
165
+ # Until we have a word-level lexicon, keep the robust longest-match+unknown-grouping path.
166
+ # For Thai:
167
+ # - If we have a big token-level wordlist, prefer longest-match with that wordlist (more word-like).
168
+ # - Otherwise, fall back to Viterbi over wordfreq (may be n-gram-like, but still better than the tiny dict).
169
+ # Split native digit runs inside the script chunk BEFORE segmentation.
170
+ # This prevents mixed letter+digit tokens (e.g., Myanmar: 'က၂' + '၀၂၅') and keeps digits intact.
171
+ start0 = match.start()
172
+ cur = 0
173
+ for part in self._split_native_digit_runs(chunk):
174
+ if not part:
175
+ continue
176
+ if self._is_native_digit_token(part):
177
+ tokens.append(Token(text=part, start=start0 + cur, end=start0 + cur + len(part)))
178
+ cur += len(part)
179
+ continue
180
+ if self._wordfreq and self.lang == "th" and self._wordlist_size <= 0:
181
+ part_tokens = self._segment_viterbi(part)
182
+ else:
183
+ part_tokens = self._segment_longest_match(part)
184
+ for t in part_tokens:
185
+ if not t:
186
+ continue
187
+ tokens.append(Token(text=t, start=start0 + cur, end=start0 + cur + len(t)))
188
+ cur += len(t)
189
+
190
+ # Latin letters / digits
191
+ for match in self._latin_pattern.finditer(text):
192
+ overlaps = any(
193
+ t.start <= match.start() < t.end or t.start < match.end() <= t.end
194
+ for t in tokens
195
+ )
196
+ if not overlaps:
197
+ tokens.append(Token(text=match.group(), start=match.start(), end=match.end()))
198
+
199
+ # punctuation / symbols: include as single-char tokens (helps SBD & downstream)
200
+ # Only add chars not already covered by script/latin tokens.
201
+ covered = [False] * (len(text) + 1)
202
+ for t in tokens:
203
+ s = max(0, min(len(text), int(t.start)))
204
+ e = max(0, min(len(text), int(t.end)))
205
+ for i in range(s, e):
206
+ covered[i] = True
207
+ for i, ch in enumerate(text):
208
+ if covered[i] or ch.isspace():
209
+ continue
210
+ # skip characters that are part of our main scripts (should have been covered)
211
+ if self._script_pattern.match(ch) or self._latin_pattern.match(ch):
212
+ continue
213
+ tokens.append(Token(text=ch, start=i, end=i + 1))
214
+
215
+ tokens.sort(key=lambda t: t.start)
216
+ tokens = self._postprocess_marks_and_digits(tokens, text)
217
+ return TokenizerResult(tokens=tokens, text=text, lang=self.lang, morphology_used=False)
218
+
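A minimal usage sketch for the tokenize() path above, assuming tokmor 1.2.9 is installed; segmentation quality depends on whether the optional offline assets are present.

    from tokmor.brahmic import BrahmicTokenizer

    tok = BrahmicTokenizer("km")
    result = tok.tokenize("ភាសាខ្មែរ ២០២៤ OK!")
    for t in result.tokens:
        print(repr(t.text), t.start, t.end)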
219
+ def _postprocess_marks_and_digits(self, tokens: List[Token], text: str) -> List[Token]:
220
+ """
221
+ Postprocess for SEA scripts:
222
+ - Never allow a token to start with a combining mark (Mn/Mc/Me). If it does, merge into previous token
223
+ when contiguous. This fixes cases like Khmer coeng/virama or Myanmar vowel signs splitting.
224
+ - Merge contiguous native-digit runs (Thai/Lao/Myanmar/Khmer digits) into a single token to avoid
225
+ 'per-digit' fragmentation.
226
+ """
227
+ if not tokens:
228
+ return tokens
229
+
230
+ def _is_mark(ch: str) -> bool:
231
+ try:
232
+ return unicodedata.category(ch) in {"Mn", "Mc", "Me"}
233
+ except Exception:
234
+ return False
235
+
236
+ # (native digit helpers are shared with pre-seg split)
237
+
238
+ out: List[Token] = []
239
+ for t in tokens:
240
+ if out:
241
+ prev = out[-1]
242
+ # Merge leading combining marks into previous token if contiguous
243
+ if t.text and _is_mark(t.text[0]) and prev.end == t.start:
244
+ prev.text += t.text
245
+ prev.end = t.end
246
+ continue
247
+ # Merge native-digit runs if contiguous (e.g., Myanmar digits)
248
+ if self._is_native_digit_token(prev.text) and self._is_native_digit_token(t.text) and prev.end == t.start:
249
+ prev.text += t.text
250
+ prev.end = t.end
251
+ continue
252
+ out.append(t)
253
+
254
+ return out
255
+
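For reference, the combining-mark categories the merge rule above keys on can be checked with the standard library alone; the Khmer coeng (U+17D2) and the Myanmar vowel sign UU (U+102F) are both category Mn, for example.

    import unicodedata

    print(unicodedata.category("\u17d2"), unicodedata.category("\u102f"))   # -> Mn Mn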
256
+ def _segment_longest_match(self, text: str) -> List[str]:
257
+ """
258
+ Longest-match algorithm (improved fallback).
259
+
260
+ Key change vs previous version:
261
+ - If no dictionary match, do NOT emit single-character tokens (degenerate).
262
+ Instead, group an "unknown span" until the next plausible dictionary boundary,
263
+ capped to a reasonable max length.
264
+ """
265
+ tokens = []
266
+ pos = 0
267
+
268
+ while pos < len(text):
269
+ # Length-first longest match (product-safe).
270
+ # Use wordfreq only as a tie-breaker among same-length candidates.
271
+ best_match = None
272
+ maxL = min(self._max_word_len, len(text) - pos)
273
+ wf = self._wordfreq or {}
274
+ for length in range(maxL, 0, -1):
275
+ best_freq = -1
276
+ cand = None
277
+ candidate = text[pos:pos + length]
278
+ if candidate in self._dictionary:
279
+ cand = candidate
280
+ best_freq = wf.get(candidate, 0)
281
+ # Optionally allow a small set of high-precision builtins to win even if freq is missing.
282
+ if cand:
283
+ best_match = cand
284
+ # Prefer built-in dictionary compounds for Thai (e.g., ประเทศไทย, มาตรการ).
285
+ if self.lang == "th" and cand in self._builtin_dictionary:
286
+ break
287
+ # Otherwise, accept the first (longest) match.
288
+ break
289
+
290
+ if best_match:
291
+ tokens.append(best_match)
292
+ pos += len(best_match)
293
+ else:
294
+ # Unknown: group forward until we hit a position that can start a known word,
295
+ # or until we reach a max span length.
296
+ max_unknown = 8 if self.lang in {"th", "lo"} else 10
297
+ end_pos = min(len(text), pos + 1)
298
+ while end_pos < len(text) and (end_pos - pos) < max_unknown:
299
+ # stop if the remainder begins with a dictionary word
300
+ found = False
301
+ for length in range(min(self._max_word_len, len(text) - end_pos), 1, -1):
302
+ if text[end_pos:end_pos + length] in self._dictionary:
303
+ found = True
304
+ break
305
+ if found:
306
+ break
307
+ end_pos += 1
308
+ if end_pos <= pos:
309
+ end_pos = pos + 1
310
+ tokens.append(text[pos:end_pos])
311
+ pos = end_pos
312
+
313
+ return tokens
314
+
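The unknown-span grouping described in the docstring is easiest to see on a toy example. The hypothetical toy_longest_match below reimplements just that idea over an ASCII dictionary; it is a simplified illustration, not the production method above.

    def toy_longest_match(text, dictionary, max_unknown=8):
        out, pos, max_len = [], 0, max(map(len, dictionary))
        while pos < len(text):
            # longest dictionary match starting at pos, if any
            match = next((text[pos:pos + L]
                          for L in range(min(max_len, len(text) - pos), 0, -1)
                          if text[pos:pos + L] in dictionary), None)
            if match:
                out.append(match)
                pos += len(match)
            else:
                # group the unknown span until a known word (len >= 2) can start, capped at max_unknown
                end = pos + 1
                while end < len(text) and (end - pos) < max_unknown and not any(
                        text[end:end + L] in dictionary for L in range(2, max_len + 1)):
                    end += 1
                out.append(text[pos:end])
                pos = end
        return out

    print(toy_longest_match("thequickxyzfox", {"the", "quick", "fox"}))   # -> ['the', 'quick', 'xyz', 'fox']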
315
+ def _is_native_digit_char(self, ch: str) -> bool:
316
+ o = ord(ch)
317
+ if self.lang == "th":
318
+ return 0x0E50 <= o <= 0x0E59
319
+ if self.lang == "lo":
320
+ return 0x0ED0 <= o <= 0x0ED9
321
+ if self.lang == "my":
322
+ return 0x1040 <= o <= 0x1049
323
+ if self.lang == "km":
324
+ return 0x17E0 <= o <= 0x17E9
325
+ return False
326
+
327
+ def _is_native_digit_token(self, s: str) -> bool:
328
+ return bool(s) and all(self._is_native_digit_char(c) for c in s)
329
+
330
+ def _split_native_digit_runs(self, s: str) -> List[str]:
331
+ """
332
+ Split a SEA-script run into alternating [letters] / [native-digit-runs].
333
+ Digits are kept as-is to preserve offsets.
334
+ """
335
+ if not s:
336
+ return []
337
+ out: List[str] = []
338
+ i = 0
339
+ n = len(s)
340
+ while i < n:
341
+ ch = s[i]
342
+ if self._is_native_digit_char(ch):
343
+ j = i + 1
344
+ while j < n and self._is_native_digit_char(s[j]):
345
+ j += 1
346
+ out.append(s[i:j])
347
+ i = j
348
+ else:
349
+ j = i + 1
350
+ while j < n and (not self._is_native_digit_char(s[j])):
351
+ j += 1
352
+ out.append(s[i:j])
353
+ i = j
354
+ return out
355
+
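The code-point ranges above correspond to each script's native digit block; a quick standard-library sanity check (the digit "5" in Thai, Lao, Myanmar, and Khmer):

    import unicodedata

    for ch in "๕", "໕", "၅", "៥":
        print(ch, hex(ord(ch)), unicodedata.name(ch))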
356
+ def _segment_viterbi(self, run: str) -> List[str]:
357
+ """
358
+ Viterbi segmentation over a pure-script run (Thai/Lao/Myanmar/Khmer).
359
+
360
+ Candidates are any substrings; scoring prefers:
361
+ - known words from wordfreq lexicon (coverage)
362
+ - words from small built-in dictionary (precision)
363
+ - multi-char groupings over per-char tokens (anti-degenerate)
364
+ """
365
+ wf = self._wordfreq or {}
366
+ max_len = max(self._max_word_len, self._wordfreq_max_len)
367
+ max_len = max(2, min(int(max_len), 12))
368
+ n = len(run)
369
+ if n <= 0:
370
+ return []
371
+
372
+ # Tuned to avoid per-character segmentation even when lexicon is sparse.
373
+ len_bonus = 0.55
374
+ single_penalty = 1.0
375
+ dict_bonus = 1.8
376
+ unk_base = -1.2
377
+ unk_len_penalty = 0.25
378
+
379
+ best = [-1e100] * (n + 1)
380
+ back = [-1] * (n + 1)
381
+ back_len = [1] * (n + 1)
382
+ best[0] = 0.0
383
+
384
+ for i in range(n):
385
+ if best[i] <= -1e90:
386
+ continue
387
+ maxL = min(max_len, n - i)
388
+ for L in range(1, maxL + 1):
389
+ w = run[i:i + L]
390
+ freq = wf.get(w, 0)
391
+ in_dict = w in self._dictionary
392
+ sc = best[i]
393
+ if in_dict or freq > 0:
394
+ sc += math.log(float(freq) + 1.0)
395
+ else:
396
+ sc += unk_base - unk_len_penalty * L
397
+ sc += len_bonus * (L - 1)
398
+ if L == 1:
399
+ sc -= single_penalty
400
+ if in_dict:
401
+ sc += dict_bonus
402
+ j = i + L
403
+ if sc > best[j]:
404
+ best[j] = sc
405
+ back[j] = i
406
+ back_len[j] = L
407
+
408
+ if back[n] < 0:
409
+ # fallback safety: never return per-char; use unknown grouping
410
+ return self._segment_longest_match(run)
411
+
412
+ spans = []
413
+ j = n
414
+ while j > 0:
415
+ i = back[j]
416
+ if i < 0:
417
+ i = j - 1
418
+ spans.append((i, j))
419
+ j = i
420
+ spans.reverse()
421
+ return [run[i:j] for i, j in spans]
422
+
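To see why the constants above steer Viterbi away from per-character output, compare the score contribution of one 3-character in-lexicon word (the frequency of 50 is made up) with three unknown single characters:

    import math

    len_bonus, single_penalty, dict_bonus = 0.55, 1.0, 1.8
    unk_base, unk_len_penalty = -1.2, 0.25

    word = math.log(50 + 1.0) + len_bonus * (3 - 1) + dict_bonus      # one 3-char dictionary word
    chars = 3 * (unk_base - unk_len_penalty * 1 - single_penalty)     # three unknown 1-char tokens
    print(round(word, 2), round(chars, 2))                            # -> 6.83 -7.35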
423
+ def extract_ngrams(self, text: str, min_n: int = 2, max_n: int = 8) -> List[str]:
424
+ ngrams = []
425
+ for match in self._script_pattern.finditer(text):
426
+ chunk = match.group()
427
+ for n in range(min_n, min(max_n + 1, len(chunk) + 1)):
428
+ for i in range(len(chunk) - n + 1):
429
+ ngrams.append(chunk[i:i+n])
430
+ return ngrams
431
+
432
+
433
+ # Language-specific subclasses
434
+ class ThaiTokenizer(BrahmicTokenizer):
435
+ SUPPORTED_LANGUAGES = {'th'}
436
+ def __init__(self, use_morphology: bool = False):
437
+ super().__init__('th', use_morphology)
438
+ # Ensure the native analyzer is available even when use_morphology=False,
439
+ # so Thai segmentation stays high-quality by default.
440
+ if getattr(self, "_morphology_analyzer", None) is None:
441
+ try:
442
+ self._morphology_analyzer = ThaiNativeAnalyzer()
443
+ except Exception:
444
+ self._morphology_analyzer = None
445
+
446
+ def _init_morphology(self):
447
+ # Native Thai analyzer (offline) provides better segmentation than the tiny dictionary.
448
+ try:
449
+ self._morphology_analyzer = ThaiNativeAnalyzer()
450
+ except Exception:
451
+ self._morphology_analyzer = None
452
+
453
+ def tokenize(self, text: str) -> TokenizerResult:
454
+ """
455
+ For Thai, always prefer the internal native segmenter (offline) if available.
456
+ This is a tokenizer-quality feature, not an external dependency.
457
+
458
+ - If morphology is enabled: include lemma/pos where possible and mark morphology_used=True.
459
+ - If morphology is disabled: return tokens only (no lemma/pos) but still use the better segmenter.
460
+ """
461
+ text = self.clean_text(text)
462
+ if not text:
463
+ return TokenizerResult(tokens=[], text=text, lang=self.lang)
464
+
465
+ # For Thai, prefer the corpus-derived wordlist longest-match path when available.
466
+ # This tends to produce more word-like tokens on real text than the tiny native dictionary.
467
+ if (not self.use_morphology) and getattr(self, "_wordlist_size", 0) > 0:
468
+ return super().tokenize(text)
469
+
470
+ if self._morphology_analyzer is not None:
471
+ try:
472
+ morps = self._morphology_analyzer.analyze(text)
473
+ toks: List[Token] = []
474
+ for m in morps:
475
+ if self.use_morphology:
476
+ toks.append(Token(text=m.surface, start=m.start, end=m.end, lemma=m.lemma, pos=m.pos))
477
+ else:
478
+ toks.append(Token(text=m.surface, start=m.start, end=m.end))
479
+ # include non-covered punctuation/symbols
480
+ covered = [False] * (len(text) + 1)
481
+ for t in toks:
482
+ for i in range(max(0, t.start), min(len(text), t.end)):
483
+ covered[i] = True
484
+ for i, ch in enumerate(text):
485
+ if covered[i] or ch.isspace():
486
+ continue
487
+ toks.append(Token(text=ch, start=i, end=i + 1))
488
+ toks.sort(key=lambda t: t.start)
489
+ return TokenizerResult(tokens=toks, text=text, lang=self.lang, morphology_used=bool(self.use_morphology))
490
+ except Exception:
491
+ # fallback to base behavior
492
+ pass
493
+
494
+ return super().tokenize(text)
495
+
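A usage sketch for the Thai path, assuming the package is installed and that Token exposes optional lemma/pos fields (an assumption suggested by the constructor calls above):

    from tokmor.brahmic import ThaiTokenizer

    res_plain = ThaiTokenizer().tokenize("ประเทศไทยมีมาตรการใหม่")                       # tokens only
    res_morph = ThaiTokenizer(use_morphology=True).tokenize("ประเทศไทยมีมาตรการใหม่")    # lemma/pos where possible

    print([t.text for t in res_plain.tokens])
    print(res_morph.morphology_used, [(t.text, t.pos) for t in res_morph.tokens])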
496
+
497
+ class LaoTokenizer(BrahmicTokenizer):
498
+ SUPPORTED_LANGUAGES = {'lo'}
499
+ def __init__(self, use_morphology: bool = False):
500
+ super().__init__('lo', use_morphology)
501
+
502
+
503
+ class MyanmarTokenizer(BrahmicTokenizer):
504
+ SUPPORTED_LANGUAGES = {'my'}
505
+ def __init__(self, use_morphology: bool = False):
506
+ super().__init__('my', use_morphology)
507
+
508
+
509
+ class KhmerTokenizer(BrahmicTokenizer):
510
+ SUPPORTED_LANGUAGES = {'km'}
511
+ def __init__(self, use_morphology: bool = False):
512
+ super().__init__('km', use_morphology)
513
+
514
+
515
+ # ThaiMorphologyAnalyzer removed (external dependency removed)
516
+ # pythainlp and attacut are not required