tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/brahmic.py
ADDED
@@ -0,0 +1,516 @@
+"""
+Brahmic Tokenizer (No External Dependencies)
+=============================================
+
+Tokenizer for Thai, Lao, Burmese, and Khmer.
+Implemented in pure Python with no external libraries.
+"""
+
+from __future__ import annotations
+
+import math
+import re
+import unicodedata
+from typing import List, Set, Dict, Optional
+from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
+from .resources import resolve_seg_lexicon_path, resolve_sea_wordlist_path
+from .morphology.thai_native import ThaiNativeAnalyzer
+
+
+# ============================================================
+# Built-in seed dictionaries (REMOVED in OSS core)
+# ============================================================
+#
+# OSS policy:
+# - Do not embed language wordlists in the core code distribution.
+# - For SEA no-space tokenization quality, provide optional offline assets via:
+#     TOKMOR_DATA_DIR/seg_lexicon/{lang}_wordfreq.pkl and/or {lang}_wordlist.(pkl|txt)
+#
+# Keep these as empty sets so the tokenizer still works, but quality depends on assets.
+THAI_DICT: Set[str] = set()
+LAO_DICT: Set[str] = set()
+MYANMAR_DICT: Set[str] = set()
+KHMER_DICT: Set[str] = set()
+
+
+class BrahmicTokenizer(BaseTokenizer):
+    """
+    Brahmic-script tokenizer (no external dependencies).
+    """
+
+    SUPPORTED_LANGUAGES = {'th', 'lo', 'my', 'km'}
+
+    # Unicode ranges
+    THAI = '\u0e00-\u0e7f'
+    LAO = '\u0e80-\u0eff'
+    MYANMAR = '\u1000-\u109f'
+    KHMER = '\u1780-\u17ff'
+
+    # Per-language dictionaries
+    DICTIONARIES = {
+        'th': THAI_DICT,
+        'lo': LAO_DICT,
+        'my': MYANMAR_DICT,
+        'km': KHMER_DICT,
+    }
+
+    def __init__(self, lang: str, use_morphology: bool = False):
+        super().__init__(lang, use_morphology)
+        self._setup_patterns()
+        # Keep a copy of the small built-in dictionary (high-precision).
+        self._builtin_dictionary: Set[str] = set(self.DICTIONARIES.get(lang, set()))
+        self._dictionary = set(self._builtin_dictionary)
+        self._max_word_len = max(len(w) for w in self._dictionary) if self._dictionary else 20
+        self._wordfreq: Optional[Dict[str, int]] = None
+        self._wordfreq_max_len: int = 2
+        self._wordlist_size: int = 0
+        self._load_seg_lexicon()
+        self._load_wordlist()
+        self._max_word_len = max(len(w) for w in self._dictionary) if self._dictionary else 20
+
+    def _load_wordlist(self) -> None:
+        """
+        Optional SEA tokenizer wordlist (offline), used for longest-match segmentation.
+
+        File:
+          seg_lexicon/{lang}_wordlist.pkl (set[str]) or .txt (one token per line)
+        """
+        p = resolve_sea_wordlist_path(self.lang)
+        if not p:
+            return
+        try:
+            words: set[str] = set()
+            if p.suffix.lower() == ".pkl":
+                import pickle
+                obj = pickle.loads(p.read_bytes())
+                if isinstance(obj, set):
+                    words = {w for w in obj if isinstance(w, str) and w}
+                elif isinstance(obj, dict):
+                    # allow {word:freq} too
+                    words = {w for w in obj.keys() if isinstance(w, str) and w}
+            else:
+                # txt
+                for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
+                    w = line.strip()
+                    if w:
+                        words.add(w)
+            if not words:
+                return
+            # Filter very short entries for km/lo/my to avoid syllable-chunking
+            if self.lang in {"km", "lo", "my"}:
+                words = {w for w in words if len(w) >= 2}
+            self._wordlist_size = len(words)
+            self._dictionary = set(self._dictionary) | set(words)
+        except Exception:
+            return
+
+    def _load_seg_lexicon(self) -> None:
+        """
+        Optional segmentation lexicon for no-space scripts:
+          seg_lexicon/{lang}_wordfreq.pkl (dict[str,int])
+        """
+        p = resolve_seg_lexicon_path(self.lang)
+        if not p:
+            return
+        try:
+            import pickle
+            obj = pickle.loads(p.read_bytes())
+            if not isinstance(obj, dict):
+                return
+            wf: Dict[str, int] = {}
+            mx = 1
+            for k, v in obj.items():
+                if isinstance(k, str) and k and isinstance(v, int) and v > 0:
+                    # For SEA scripts (km/lo/my), 2-gram frequencies tend to be too "syllable-like"
+                    # and can cause degenerate segmentation into short chunks.
+                    # Filter out very short entries to keep Viterbi candidates more word-like.
+                    if self.lang in {"km", "lo", "my"} and len(k) < 3:
+                        continue
+                    wf[k] = v
+                    if len(k) > mx:
+                        mx = len(k)
+            if wf:
+                # keep bounds conservative (Thai/Lao/Myanmar/Khmer words rarely exceed 12 chars)
+                self._wordfreq = wf
+                self._wordfreq_max_len = max(2, min(int(mx), 12))
+        except Exception:
+            return
+
+    def _setup_patterns(self):
+        pattern_map = {
+            'th': self.THAI,
+            'lo': self.LAO,
+            'my': self.MYANMAR,
+            'km': self.KHMER,
+        }
+        script_range = pattern_map.get(self.lang, self.THAI)
+        self._script_pattern = re.compile(f'[{script_range}]+')
+        self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
+
+    def _init_morphology(self):
+        # External dependency removed - use the built-in dictionary
+        self._morphology_analyzer = None
+
+    def tokenize(self, text: str) -> TokenizerResult:
+        text = self.clean_text(text)
+        if not text:
+            return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+        tokens: List[Token] = []
+
+        for match in self._script_pattern.finditer(text):
+            chunk = match.group()
+            # NOTE:
+            # For km/lo/my, naive n-gram wordfreq lexicons tend to over-segment into short chunks.
+            # Until we have a word-level lexicon, keep the robust longest-match+unknown-grouping path.
+            # For Thai:
+            # - If we have a big token-level wordlist, prefer longest-match with that wordlist (more word-like).
+            # - Otherwise, fall back to Viterbi over wordfreq (may be n-gramy, but still better than tiny dict).
+            # Split native digit runs inside the script chunk BEFORE segmentation.
+            # This prevents mixed letter+digit tokens (e.g., Myanmar: 'က၂' + '၀၂၅') and keeps digits intact.
+            start0 = match.start()
+            cur = 0
+            for part in self._split_native_digit_runs(chunk):
+                if not part:
+                    continue
+                if self._is_native_digit_token(part):
+                    tokens.append(Token(text=part, start=start0 + cur, end=start0 + cur + len(part)))
+                    cur += len(part)
+                    continue
+                if self._wordfreq and self.lang == "th" and self._wordlist_size <= 0:
+                    part_tokens = self._segment_viterbi(part)
+                else:
+                    part_tokens = self._segment_longest_match(part)
+                for t in part_tokens:
+                    if not t:
+                        continue
+                    tokens.append(Token(text=t, start=start0 + cur, end=start0 + cur + len(t)))
+                    cur += len(t)
+
+        # Latin letters / ASCII digits
+        for match in self._latin_pattern.finditer(text):
+            overlaps = any(
+                t.start <= match.start() < t.end or t.start < match.end() <= t.end
+                for t in tokens
+            )
+            if not overlaps:
+                tokens.append(Token(text=match.group(), start=match.start(), end=match.end()))
+
+        # punctuation / symbols: include as single-char tokens (helps SBD & downstream)
+        # Only add chars not already covered by script/latin tokens.
+        covered = [False] * (len(text) + 1)
+        for t in tokens:
+            s = max(0, min(len(text), int(t.start)))
+            e = max(0, min(len(text), int(t.end)))
+            for i in range(s, e):
+                covered[i] = True
+        for i, ch in enumerate(text):
+            if covered[i] or ch.isspace():
+                continue
+            # skip characters that are part of our main scripts (should have been covered)
+            if self._script_pattern.match(ch) or self._latin_pattern.match(ch):
+                continue
+            tokens.append(Token(text=ch, start=i, end=i + 1))
+
+        tokens.sort(key=lambda t: t.start)
+        tokens = self._postprocess_marks_and_digits(tokens, text)
+        return TokenizerResult(tokens=tokens, text=text, lang=self.lang, morphology_used=False)
+
+    def _postprocess_marks_and_digits(self, tokens: List[Token], text: str) -> List[Token]:
+        """
+        Postprocess for SEA scripts:
+        - Never allow a token to start with a combining mark (Mn/Mc/Me). If it does, merge into previous token
+          when contiguous. This fixes cases like Khmer coeng/virama or Myanmar vowel signs splitting.
+        - Merge contiguous native-digit runs (Thai/Lao/Myanmar/Khmer digits) into a single token to avoid
+          'per-digit' fragmentation.
+        """
+        if not tokens:
+            return tokens
+
+        def _is_mark(ch: str) -> bool:
+            try:
+                return unicodedata.category(ch) in {"Mn", "Mc", "Me"}
+            except Exception:
+                return False
+
+        # (native digit helpers are shared with pre-seg split)
+
+        out: List[Token] = []
+        for t in tokens:
+            if out:
+                prev = out[-1]
+                # Merge leading combining marks into previous token if contiguous
+                if t.text and _is_mark(t.text[0]) and prev.end == t.start:
+                    prev.text += t.text
+                    prev.end = t.end
+                    continue
+                # Merge native-digit runs if contiguous (e.g., Myanmar digits)
+                if self._is_native_digit_token(prev.text) and self._is_native_digit_token(t.text) and prev.end == t.start:
+                    prev.text += t.text
+                    prev.end = t.end
+                    continue
+            out.append(t)
+
+        return out
+
+    def _segment_longest_match(self, text: str) -> List[str]:
+        """
+        Longest-match algorithm (improved fallback).
+
+        Key change vs previous version:
+        - If no dictionary match, do NOT emit single-character tokens (degenerate).
+          Instead, group an "unknown span" until the next plausible dictionary boundary,
+          capped to a reasonable max length.
+        """
+        tokens = []
+        pos = 0
+
+        while pos < len(text):
+            # Length-first longest match (product-safe).
+            # Use wordfreq only as a tie-breaker among same-length candidates.
+            best_match = None
+            maxL = min(self._max_word_len, len(text) - pos)
+            wf = self._wordfreq or {}
+            for length in range(maxL, 0, -1):
+                best_freq = -1
+                cand = None
+                candidate = text[pos:pos + length]
+                if candidate in self._dictionary:
+                    cand = candidate
+                    best_freq = wf.get(candidate, 0)
+                # Optionally allow a small set of high-precision builtins to win even if freq is missing.
+                if cand:
+                    best_match = cand
+                    # Prefer built-in dictionary compounds for Thai (e.g., ประเทศไทย, มาตรการ).
+                    if self.lang == "th" and cand in self._builtin_dictionary:
+                        break
+                    # Otherwise, accept the first (longest) match.
+                    break
+
+            if best_match:
+                tokens.append(best_match)
+                pos += len(best_match)
+            else:
+                # Unknown: group forward until we hit a position that can start a known word,
+                # or until we reach a max span length.
+                max_unknown = 8 if self.lang in {"th", "lo"} else 10
+                end_pos = min(len(text), pos + 1)
+                while end_pos < len(text) and (end_pos - pos) < max_unknown:
+                    # stop if the remainder begins with a dictionary word
+                    found = False
+                    for length in range(min(self._max_word_len, len(text) - end_pos), 1, -1):
+                        if text[end_pos:end_pos + length] in self._dictionary:
+                            found = True
+                            break
+                    if found:
+                        break
+                    end_pos += 1
+                if end_pos <= pos:
+                    end_pos = pos + 1
+                tokens.append(text[pos:end_pos])
+                pos = end_pos
+
+        return tokens
+
+    def _is_native_digit_char(self, ch: str) -> bool:
+        o = ord(ch)
+        if self.lang == "th":
+            return 0x0E50 <= o <= 0x0E59
+        if self.lang == "lo":
+            return 0x0ED0 <= o <= 0x0ED9
+        if self.lang == "my":
+            return 0x1040 <= o <= 0x1049
+        if self.lang == "km":
+            return 0x17E0 <= o <= 0x17E9
+        return False
+
+    def _is_native_digit_token(self, s: str) -> bool:
+        return bool(s) and all(self._is_native_digit_char(c) for c in s)
+
+    def _split_native_digit_runs(self, s: str) -> List[str]:
+        """
+        Split a SEA-script run into alternating [letters] / [native-digit-runs].
+        Digits are kept as-is to preserve offsets.
+        """
+        if not s:
+            return []
+        out: List[str] = []
+        i = 0
+        n = len(s)
+        while i < n:
+            ch = s[i]
+            if self._is_native_digit_char(ch):
+                j = i + 1
+                while j < n and self._is_native_digit_char(s[j]):
+                    j += 1
+                out.append(s[i:j])
+                i = j
+            else:
+                j = i + 1
+                while j < n and (not self._is_native_digit_char(s[j])):
+                    j += 1
+                out.append(s[i:j])
+                i = j
+        return out
+
+    def _segment_viterbi(self, run: str) -> List[str]:
+        """
+        Viterbi segmentation over a pure-script run (Thai/Lao/Myanmar/Khmer).
+
+        Candidates are any substrings; scoring prefers:
+        - known words from wordfreq lexicon (coverage)
+        - words from small built-in dictionary (precision)
+        - multi-char groupings over per-char tokens (anti-degenerate)
+        """
+        wf = self._wordfreq or {}
+        max_len = max(self._max_word_len, self._wordfreq_max_len)
+        max_len = max(2, min(int(max_len), 12))
+        n = len(run)
+        if n <= 0:
+            return []
+
+        # Tuned to avoid per-character segmentation even when lexicon is sparse.
+        len_bonus = 0.55
+        single_penalty = 1.0
+        dict_bonus = 1.8
+        unk_base = -1.2
+        unk_len_penalty = 0.25
+
+        best = [-1e100] * (n + 1)
+        back = [-1] * (n + 1)
+        back_len = [1] * (n + 1)
+        best[0] = 0.0
+
+        for i in range(n):
+            if best[i] <= -1e90:
+                continue
+            maxL = min(max_len, n - i)
+            for L in range(1, maxL + 1):
+                w = run[i:i + L]
+                freq = wf.get(w, 0)
+                in_dict = w in self._dictionary
+                sc = best[i]
+                if in_dict or freq > 0:
+                    sc += math.log(float(freq) + 1.0)
+                else:
+                    sc += unk_base - unk_len_penalty * L
+                sc += len_bonus * (L - 1)
+                if L == 1:
+                    sc -= single_penalty
+                if in_dict:
+                    sc += dict_bonus
+                j = i + L
+                if sc > best[j]:
+                    best[j] = sc
+                    back[j] = i
+                    back_len[j] = L
+
+        if back[n] < 0:
+            # fallback safety: never return per-char; use unknown grouping
+            return self._segment_longest_match(run)
+
+        spans = []
+        j = n
+        while j > 0:
+            i = back[j]
+            if i < 0:
+                i = j - 1
+            spans.append((i, j))
+            j = i
+        spans.reverse()
+        return [run[i:j] for i, j in spans]
+
+    def extract_ngrams(self, text: str, min_n: int = 2, max_n: int = 8) -> List[str]:
+        ngrams = []
+        for match in self._script_pattern.finditer(text):
+            chunk = match.group()
+            for n in range(min_n, min(max_n + 1, len(chunk) + 1)):
+                for i in range(len(chunk) - n + 1):
+                    ngrams.append(chunk[i:i+n])
+        return ngrams
+
+
+# Language-specific classes
+class ThaiTokenizer(BrahmicTokenizer):
+    SUPPORTED_LANGUAGES = {'th'}
+    def __init__(self, use_morphology: bool = False):
+        super().__init__('th', use_morphology)
+        # Ensure the native analyzer is available even when use_morphology=False,
+        # so Thai segmentation stays high-quality by default.
+        if getattr(self, "_morphology_analyzer", None) is None:
+            try:
+                self._morphology_analyzer = ThaiNativeAnalyzer()
+            except Exception:
+                self._morphology_analyzer = None
+
+    def _init_morphology(self):
+        # Native Thai analyzer (offline) provides better segmentation than the tiny dictionary.
+        try:
+            self._morphology_analyzer = ThaiNativeAnalyzer()
+        except Exception:
+            self._morphology_analyzer = None
+
+    def tokenize(self, text: str) -> TokenizerResult:
+        """
+        For Thai, always prefer the internal native segmenter (offline) if available.
+        This is a tokenizer-quality feature, not an external dependency.
+
+        - If morphology is enabled: include lemma/pos where possible and mark morphology_used=True.
+        - If morphology is disabled: return tokens only (no lemma/pos) but still use the better segmenter.
+        """
+        text = self.clean_text(text)
+        if not text:
+            return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+        # For Thai, prefer the corpus-derived wordlist longest-match path when available.
+        # This tends to produce more word-like tokens on real text than the tiny native dictionary.
+        if (not self.use_morphology) and getattr(self, "_wordlist_size", 0) > 0:
+            return super().tokenize(text)
+
+        if self._morphology_analyzer is not None:
+            try:
+                morps = self._morphology_analyzer.analyze(text)
+                toks: List[Token] = []
+                for m in morps:
+                    if self.use_morphology:
+                        toks.append(Token(text=m.surface, start=m.start, end=m.end, lemma=m.lemma, pos=m.pos))
+                    else:
+                        toks.append(Token(text=m.surface, start=m.start, end=m.end))
+                # include non-covered punctuation/symbols
+                covered = [False] * (len(text) + 1)
+                for t in toks:
+                    for i in range(max(0, t.start), min(len(text), t.end)):
+                        covered[i] = True
+                for i, ch in enumerate(text):
+                    if covered[i] or ch.isspace():
+                        continue
+                    toks.append(Token(text=ch, start=i, end=i + 1))
+                toks.sort(key=lambda t: t.start)
+                return TokenizerResult(tokens=toks, text=text, lang=self.lang, morphology_used=bool(self.use_morphology))
+            except Exception:
+                # fallback to base behavior
+                pass
+
+        return super().tokenize(text)
+
+
+class LaoTokenizer(BrahmicTokenizer):
+    SUPPORTED_LANGUAGES = {'lo'}
+    def __init__(self, use_morphology: bool = False):
+        super().__init__('lo', use_morphology)
+
+
+class MyanmarTokenizer(BrahmicTokenizer):
+    SUPPORTED_LANGUAGES = {'my'}
+    def __init__(self, use_morphology: bool = False):
+        super().__init__('my', use_morphology)
+
+
+class KhmerTokenizer(BrahmicTokenizer):
+    SUPPORTED_LANGUAGES = {'km'}
+    def __init__(self, use_morphology: bool = False):
+        super().__init__('km', use_morphology)
+
+
+# ThaiMorphologyAnalyzer removed (external dependency dropped)
+# pythainlp and attacut are no longer required
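
For orientation, the sketch below (an editor's note, not part of the released wheel) shows how the tokenizer classes added in this file might be used and how the optional offline assets they look for could be prepared. The import path follows the package layout above; the data directory, sample words, and frequencies are illustrative assumptions, and asset paths are resolved by tokmor.resources (per the module comments, under TOKMOR_DATA_DIR/seg_lexicon/). Without those assets the tokenizers still run, with reduced segmentation quality.

# Editor's sketch: basic use of the tokenizers defined in tokmor/brahmic.py.
import pickle
from pathlib import Path

from tokmor.brahmic import ThaiTokenizer, KhmerTokenizer

th = ThaiTokenizer()                          # use_morphology defaults to False
result = th.tokenize("สวัสดีครับ ราคา 100 บาท")
for tok in result.tokens:
    print(tok.text, tok.start, tok.end)       # surface form plus character offsets

km = KhmerTokenizer()
print([t.text for t in km.tokenize("ភាសាខ្មែរ 2024").tokens])

# Preparing the optional seg_lexicon assets. Formats mirror _load_seg_lexicon /
# _load_wordlist above; the directory is hypothetical and the frequencies are made up.
seg_dir = Path("/path/to/tokmor-data/seg_lexicon")
seg_dir.mkdir(parents=True, exist_ok=True)
# {lang}_wordfreq.pkl: dict[str, int], used for Viterbi scoring.
(seg_dir / "th_wordfreq.pkl").write_bytes(pickle.dumps({"ประเทศไทย": 1000, "มาตรการ": 500}))
# {lang}_wordlist.pkl: set[str] (or a .txt file, one token per line), used for longest-match.
(seg_dir / "th_wordlist.pkl").write_bytes(pickle.dumps({"ประเทศไทย", "มาตรการ"}))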
|