tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/chinese.py
@@ -0,0 +1,736 @@
+ """
+ Chinese Morphological Analyzer - Self-Contained Implementation
+ ==============================================================
+
+ A Chinese analyzer implemented in pure Python, with no external libraries.
+
+ Features:
+ - Longest-match word segmentation (Maximum Matching)
+ - Reverse Maximum Matching
+ - Bidirectional comparison to pick the better segmentation
+
+ """
+
+ import os
+ import re
+ import math
+ import pickle
+ from typing import List, Tuple, Set, Optional
+ from dataclasses import dataclass
+
+ from ..resources import resolve_seg_lexicon_path, resolve_extra_dict_path
+
+
+ @dataclass
+ class Morpheme:
+     """A morpheme/word."""
+     surface: str
+     lemma: str
+     pos: str
+     start: int
+     end: int
+
+     def __repr__(self):
+         return f"{self.surface}/{self.pos}"
+
+
+ class ChineseAnalyzer:
+     """
+     Chinese analyzer.
+
+     Usage:
+         analyzer = ChineseAnalyzer()
+         result = analyzer.analyze("阿里巴巴集团在杭州宣布")
+     """
+
+     def __init__(self, join_dates: Optional[bool] = None):
+         # Product option: join common date spans into a single token for news/SNS.
+         # Default is OFF to avoid unexpected evaluation/style mismatches.
+         if join_dates is None:
+             self.join_dates = os.getenv("TOKMOR_ZH_JOIN_DATES", "0").strip().lower() in ("1", "true", "yes", "y", "on")
+         else:
+             self.join_dates = bool(join_dates)
+         # Optional segmentation lexicon (offline, generated from large corpora):
+         # {word(str): freq(int)}. If present, we use a DP/Viterbi segmenter for Hanzi runs.
+         self._wordfreq = None
+         self._wordfreq_max_len = 4
+         self._build_dictionary()
+         self._load_seg_lexicon()
+         self._load_extra_dict()
+
+     def _load_extra_dict(self) -> None:
+         """
+         Optional runtime extension dictionary (offline).
+         File: seg_lexicon/zh_extra_dict.json (token -> pos)
+         """
+         p = resolve_extra_dict_path("zh")
+         if not p:
+             return
+         try:
+             import json
+
+             obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
+             if not isinstance(obj, dict):
+                 return
+             # Merge into dictionary; keep only sane entries.
+             for k, v in obj.items():
+                 if not isinstance(k, str) or not k:
+                     continue
+                 if not isinstance(v, str) or not v:
+                     v = "n"
+                 self.dictionary[k] = v
+             # Update max word length
+             self.max_word_len = max(len(w) for w in self.dictionary) if self.dictionary else self.max_word_len
+         except Exception:
+             return
+
+     def _load_seg_lexicon(self):
+         p = resolve_seg_lexicon_path("zh")
+         if not p:
+             return
+         try:
+             obj = pickle.loads(p.read_bytes())
+             if isinstance(obj, dict):
+                 # keep only str->int like entries
+                 wf = {}
+                 mx = 1
+                 for k, v in obj.items():
+                     if isinstance(k, str) and isinstance(v, int) and k:
+                         wf[k] = v
+                         if len(k) > mx:
+                             mx = len(k)
+                 self._wordfreq = wf
+                 self._wordfreq_max_len = max(2, min(int(mx), 8))
+         except Exception:
+             return
+
+     def _build_dictionary(self):
+         """Build the word dictionary."""
+
+         # Core word dictionary (word: POS tag).
+         # NOTE: a few single characters (e.g. 在, 和) appear in more than one section below;
+         #       later duplicate keys silently override earlier ones, so the last POS listed wins.
+         self.dictionary = {
+             # ============================================================
+             # News / SNS domain (high-impact for segmentation)
+             # ============================================================
+             # News agencies / media outlets
+             '新华社': 'nrt', '中新社': 'nrt', '人民网': 'nrt', '央视': 'nrt', '央视网': 'nrt',
+             '澎湃新闻': 'nrt', '环球时报': 'nrt', '财新': 'nrt', '凤凰网': 'nrt', '网易': 'nrt', '新浪': 'nrt',
+             # News boilerplate phrases
+             '日电': 'n', '日讯': 'n', '消息': 'n', '报道': 'n', '记者': 'n',
+             '原标题': 'n', '来源': 'n', '编辑': 'n', '评论': 'n',
+             # Frequent news terms (stabilize segmentation)
+             '直击': 'v', '强震': 'n', '灾区': 'n',
+             # SNS
+             '微博': 'n', '微信': 'n', '抖音': 'n', '快手': 'n', '小红书': 'n',
+             '网友': 'n', '点赞': 'v', '转发': 'v', '评论区': 'n',
+
+             # Place names
+             '北京': 'ns', '上海': 'ns', '广州': 'ns', '深圳': 'ns',
+             '杭州': 'ns', '南京': 'ns', '武汉': 'ns', '成都': 'ns',
+             '香港': 'ns', '台湾': 'ns', '中国': 'ns', '美国': 'ns',
+             '日本': 'ns', '韩国': 'ns', '英国': 'ns', '法国': 'ns',
+             '德国': 'ns', '俄罗斯': 'ns', '印度': 'ns', '越南': 'ns',
+             '泰国': 'ns', '新加坡': 'ns', '马来西亚': 'ns',
+             '土耳其': 'ns',
+
+             # Companies / organizations
+             '阿里巴巴': 'nrt', '腾讯': 'nrt', '百度': 'nrt', '华为': 'nrt',
+             '小米': 'nrt', '京东': 'nrt', '美团': 'nrt', '字节跳动': 'nrt',
+             '苹果': 'nrt', '谷歌': 'nrt', '微软': 'nrt', '三星': 'nrt',
+             '特斯拉': 'nrt', '丰田': 'nrt', '奔驰': 'nrt', '宝马': 'nrt',
+             '集团': 'n', '公司': 'n', '企业': 'n', '银行': 'n',
+             '政府': 'n', '学校': 'n', '大学': 'n', '医院': 'n',
+
+             # Common nouns
+             '人': 'n', '人们': 'n', '时间': 'n', '地方': 'n',
+             '问题': 'n', '情况': 'n', '工作': 'n', '生活': 'n',
+             '发展': 'n', '经济': 'n', '社会': 'n', '文化': 'n',
+             '技术': 'n', '产品': 'n', '服务': 'n', '市场': 'n',
+             '国家': 'n', '世界': 'n', '城市': 'n', '地区': 'n',
+
+             # Verbs
+             '是': 'v', '有': 'v', '在': 'v', '说': 'v', '做': 'v',
+             '去': 'v', '来': 'v', '看': 'v', '想': 'v', '知道': 'v',
+             '发表': 'v', '宣布': 'v', '公布': 'v', '发布': 'v',
+             '开始': 'v', '结束': 'v', '进行': 'v', '完成': 'v',
+             '研究': 'v', '开发': 'v', '生产': 'v', '销售': 'v',
+             '投资': 'v', '合作': 'v', '成立': 'v', '成为': 'v',
+
+             # Adjectives
+             '大': 'a', '小': 'a', '多': 'a', '少': 'a',
+             '好': 'a', '新': 'a', '高': 'a', '重要': 'a',
+
+             # Adverbs
+             '不': 'd', '也': 'd', '就': 'd', '都': 'd', '还': 'd',
+             '很': 'd', '最': 'd', '已经': 'd', '正在': 'd',
+
+             # Prepositions
+             '在': 'p', '从': 'p', '向': 'p', '对': 'p', '把': 'p',
+             '被': 'p', '比': 'p', '跟': 'p', '和': 'p', '与': 'p',
+
+             # Particles / modal particles
+             '的': 'u', '地': 'u', '得': 'u', '了': 'u', '着': 'u', '过': 'u',
+             '吗': 'u', '呢': 'u', '吧': 'u', '啊': 'u',
+
+             # Pronouns
+             '我': 'r', '你': 'r', '他': 'r', '她': 'r', '它': 'r',
+             '我们': 'r', '你们': 'r', '他们': 'r', '这': 'r', '那': 'r',
+             '这个': 'r', '那个': 'r', '什么': 'r', '谁': 'r',
+
+             # Conjunctions
+             '和': 'c', '或': 'c', '但': 'c', '但是': 'c', '因为': 'c',
+             '所以': 'c', '如果': 'c', '虽然': 'c',
+
+             # Numerals
+             '一': 'm', '二': 'm', '三': 'm', '四': 'm', '五': 'm',
+             '六': 'm', '七': 'm', '八': 'm', '九': 'm', '十': 'm',
+             '百': 'm', '千': 'm', '万': 'm', '亿': 'm',
+
+             # Measure words
+             '个': 'q', '年': 'q', '月': 'q', '日': 'q', '号': 'q',
+             '次': 'q', '种': 'q', '件': 'q', '位': 'q',
+         }
193
+
194
+ # 최대 단어 길이
195
+ self.max_word_len = max(len(w) for w in self.dictionary) if self.dictionary else 4
196
+
197
+ # 한자 패턴
198
+ self.hanzi = re.compile(r'[\u4e00-\u9fff]+')
199
+
200
+ # 날짜/시간 패턴 (뉴스에서 매우 자주 등장)
201
+ # Examples: 2025年12月31日, 12月31日, 12月31日电
202
+ # NOTE: only used when self.join_dates is True
203
+ self._date_ymd = re.compile(r'^[0-9]{2,4}年[0-9]{1,2}月[0-9]{1,2}[日号]')
204
+ self._date_md = re.compile(r'^[0-9]{1,2}月[0-9]{1,2}[日号]')
205
+ self._date_md_with_dian = re.compile(r'^[0-9]{1,2}月[0-9]{1,2}[日号]电')
206
+
+     def analyze(self, text: str) -> List[Morpheme]:
+         """
+         Analyze text into morphemes.
+         """
+         if not text:
+             return []
+
+         # If we have a segmentation lexicon, prefer DP segmentation (more stable on real corpora).
+         if self._wordfreq:
+             out = self._segment_with_lexicon(text)
+             out = self._postprocess_fix_mixed_function_word_tokens(out)
+             return self._postprocess_merge_common_suffixes(out)
+
+         # Fallback: forward/backward maximum matching
+         forward = self._forward_max_match(text)
+         backward = self._backward_max_match(text)
+         out = forward if len(forward) <= len(backward) else backward
+         out = self._postprocess_fix_mixed_function_word_tokens(out)
+         return self._postprocess_merge_common_suffixes(out)
+
+     def _postprocess_fix_mixed_function_word_tokens(self, toks: List[Morpheme]) -> List[Morpheme]:
+         """
+         Fix a common segmentation error from n-gram lexicons:
+         - function word + first char of a toponym gets merged (e.g., 在北 + 京市)
+
+         We do a conservative local split/merge:
+         - If token is length-2 and starts with a function char (在/对/从/与/和/将/为/把/被/给/向/于)
+           and the 2nd char is a plausible toponym initial (first char of any known ns entry),
+           split it into two single-char tokens.
+         - If token is length-2 and ends with an admin suffix (市/省/区/县/州/国/镇/村/旗/盟),
+           split it into (head char, suffix char).
+         - Then merge adjacent 2-char toponyms when they exist in the hand dictionary as ns.
+         """
+         if not toks:
+             return toks
+
+         func0 = {"在", "对", "从", "与", "和", "将", "为", "把", "被", "给", "向", "于"}
+         admin1 = {"市", "省", "县", "区", "州", "国", "镇", "村", "旗", "盟"}
+
+         # derive plausible toponym initials from known ns tokens
+         topo_initials = set()
+         try:
+             for w, p in self.dictionary.items():
+                 if p == "ns" and isinstance(w, str) and len(w) >= 2:
+                     topo_initials.add(w[0])
+         except Exception:
+             topo_initials = set()
+
+         # step1: split tokens conservatively
+         split: List[Morpheme] = []
+         for m in toks:
+             s = m.surface
+             if isinstance(s, str) and len(s) == 2:
+                 if (s[0] in func0) and (s[1] in topo_initials) and (s not in self.dictionary):
+                     # split: 在北 -> 在 + 北
+                     split.append(Morpheme(s[0], s[0], self.dictionary.get(s[0], "x"), m.start, m.start + 1))
+                     split.append(Morpheme(s[1], s[1], self.dictionary.get(s[1], "x"), m.start + 1, m.end))
+                     continue
+                 if (s[1] in admin1) and (s not in self.dictionary):
+                     # split: 京市 -> 京 + 市
+                     split.append(Morpheme(s[0], s[0], self.dictionary.get(s[0], "x"), m.start, m.start + 1))
+                     split.append(Morpheme(s[1], s[1], self.dictionary.get(s[1], "x"), m.start + 1, m.end))
+                     continue
+             split.append(m)
+
+         # step2: merge adjacent 2-char toponyms when present in dictionary
+         out: List[Morpheme] = []
+         i = 0
+         while i < len(split):
+             if i + 1 < len(split):
+                 a, b = split[i], split[i + 1]
+                 if a.end == b.start:
+                     comb = a.surface + b.surface
+                     if self.dictionary.get(comb) == "ns":
+                         out.append(Morpheme(comb, comb, "ns", a.start, b.end))
+                         i += 2
+                         continue
+             out.append(split[i])
+             i += 1
+         return out
+
+     def _postprocess_merge_common_suffixes(self, toks: List[Morpheme]) -> List[Morpheme]:
+         """
+         Postprocess merges to improve segmentation quality for product use:
+         - Location suffixes: 北京 + 市 -> 北京市 (when previous token is a location-like ns)
+         - Organization suffixes: 阿里巴巴 + 集团 -> 阿里巴巴集团 (when previous is nrt)
+         - Numeric unit tails: 10 + 亿 -> 10亿 (handled in CJKTokenizer too, but keep safe here when we see it)
+
+         This is conservative: it only merges when contiguity is exact (prev.end == next.start),
+         and when previous token is already strongly typed (ns/nrt) or combined form exists in lexicon.
+         """
+         if not toks:
+             return toks
+
+         admin_suffix = {"市", "省", "县", "区", "州", "国", "镇", "村", "旗", "盟"}
+         # multi-char admin tails that frequently appear split
+         admin_suffix_multi = {"自治区", "自治州", "自治县", "特别行政区", "行政区"}
+         # NOTE: exclude "政府" here (it's a standalone noun too often; merging X+政府 is risky).
+         org_suffix_strong = {"集团", "公司", "银行", "大学"}
+         org_suffix_generic = {"委员会", "协会", "研究院", "研究所", "法院", "检察院", "公安局"}
+         # multi-char org tails
+         org_suffix_multi = {"有限公司", "有限责任公司", "股份有限公司", "集团公司"}
+
+         wf = self._wordfreq or {}
+
+         out: List[Morpheme] = []
+         i = 0
+         while i < len(toks):
+             cur = toks[i]
+             if out and cur.start == out[-1].end:
+                 prev = out[-1]
+                 comb = prev.surface + cur.surface
+                 # 1) merge if combined form exists in hand dictionary
+                 if comb in self.dictionary:
+                     pos = self.dictionary.get(comb, prev.pos)
+                     out[-1] = Morpheme(comb, comb, pos, prev.start, cur.end)
+                     i += 1
+                     continue
+                 # 2) merge location + admin suffix (prev already ns)
+                 if prev.pos == "ns" and cur.surface in admin_suffix:
+                     # Only merge if lexicon suggests it's a real unit OR prev is very short (typical toponyms)
+                     if wf.get(comb, 0) > 0 or len(prev.surface) <= 3:
+                         out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                         i += 1
+                         continue
+                 # 2a) merge toponym + multi-char admin suffix (lexicon supported)
+                 if cur.surface in admin_suffix_multi and 1 <= len(prev.surface) <= 6 and cur.start == prev.end:
+                     f_comb = int(wf.get(comb, 0) or 0)
+                     if prev.pos == "ns" or f_comb >= 200:
+                         out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                         i += 1
+                         continue
+                 # 2b) merge toponym + admin suffix even if prev not tagged, when lexicon strongly supports comb.
+                 if cur.surface in admin_suffix and 1 <= len(prev.surface) <= 4:
+                     f_comb = int(wf.get(comb, 0) or 0)
+                     if f_comb >= 100:
+                         out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                         i += 1
+                         continue
+                 # 3) merge organization + suffix (prev already nrt)
+                 if prev.pos == "nrt" and (cur.surface in org_suffix_strong or cur.surface in org_suffix_generic):
+                     # Prefer lexicon evidence, but be permissive for strong organization tails.
+                     if wf.get(comb, 0) > 0 or len(prev.surface) >= 2:
+                         out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                         i += 1
+                         continue
+                 # 3a) merge org + multi-char suffix (lexicon supported)
+                 if cur.surface in org_suffix_multi and 1 <= len(prev.surface) <= 8 and cur.start == prev.end:
+                     f_comb = int(wf.get(comb, 0) or 0)
+                     if prev.pos == "nrt" or f_comb >= 200:
+                         out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                         i += 1
+                         continue
+                 # 3b) merge org tail even if prev not tagged, when lexicon supports comb.
+                 #     Avoid merging verb+bank (e.g., 支持 + 银行) via a tiny stoplist.
+                 if cur.start == prev.end and 1 <= len(prev.surface) <= 8:
+                     stop_prev = {
+                         "支持", "提供", "表示", "认为", "指出", "强调", "包括", "进行", "开展", "推动", "加强",
+                         "将", "对", "在", "与", "和", "或", "但", "而", "为", "把", "被", "从", "向",
+                     }
+                     if prev.surface not in stop_prev:
+                         f_comb = int(wf.get(comb, 0) or 0)
+                         # strong tails: lower threshold
+                         if cur.surface in org_suffix_strong and f_comb >= 20:
+                             out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                             i += 1
+                             continue
+                         # generic tails: keep strict
+                         if cur.surface in org_suffix_generic and f_comb >= 200:
+                             out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                             i += 1
+                             continue
+             out.append(cur)
+             i += 1
+         return out
+
+     def _segment_with_lexicon(self, text: str) -> List[Morpheme]:
+         """
+         Segment full text. For Hanzi runs, use Viterbi over word candidates from:
+         - hand dictionary (high precision)
+         - wordfreq lexicon (coverage)
+         Non-Hanzi parts follow the same rules as the native segmenter.
+         """
+         out: List[Morpheme] = []
+         pos = 0
+         n = len(text)
+
+         while pos < n:
+             # optional date joins (reuse native date patterns)
+             if self.join_dates:
+                 m = self._date_md_with_dian.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     core = s[:-1]
+                     out.append(Morpheme(core, core, 't', pos, pos + len(core)))
+                     out.append(Morpheme('电', '电', 'n', pos + len(core), pos + len(s)))
+                     pos += len(s)
+                     continue
+                 m = self._date_ymd.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     out.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                     pos += len(s)
+                     continue
+                 m = self._date_md.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     out.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                     pos += len(s)
+                     continue
+
+             ch = text[pos]
+             # whitespace
+             if ch.isspace():
+                 pos += 1
+                 continue
+
+             # latin/digit chunk
+             if not self.hanzi.match(ch):
+                 match = re.match(r'[a-zA-Z0-9]+', text[pos:])
+                 if match:
+                     w = match.group()
+                     out.append(Morpheme(w, w, 'x', pos, pos + len(w)))
+                     pos += len(w)
+                 else:
+                     out.append(Morpheme(ch, ch, 'x', pos, pos + 1))
+                     pos += 1
+                 continue
+
+             # hanzi run
+             m = self.hanzi.match(text[pos:])
+             if not m:
+                 out.append(Morpheme(ch, ch, 'x', pos, pos + 1))
+                 pos += 1
+                 continue
+             run = m.group()
+             run_start = pos
+             out.extend(self._viterbi_hanzi_run(run, run_start))
+             pos += len(run)
+
+         return out
+
+     def _viterbi_hanzi_run(self, run: str, offset: int) -> List[Morpheme]:
+         """
+         Viterbi segmentation over a pure-Hanzi run.
+         Score = log(freq+1) + len_bonus*(len-1) - single_penalty(if len==1) + dict_bonus(if in hand dict)
+         """
+         wf = self._wordfreq or {}
+         max_len = max(self.max_word_len, self._wordfreq_max_len)
+         max_len = max(2, min(int(max_len), 8))
+         n = len(run)
+
+         len_bonus = 0.8
+         single_penalty = 1.2
+         # Hand dictionary entries should beat high-frequency short n-grams.
+         # Otherwise entities like "阿里巴巴" can get split into "阿里"+"巴巴".
+         dict_bonus = 3.5
+         dict_len_bonus = 0.6
+         entity_bonus = 4.0
+         entity_freq_floor = 50_000
+         # When lexicon doesn't contain a span, still allow 2-4 char grouping
+         # to avoid degenerate per-character segmentation.
+         unk_base = -1.5
+         unk_len_penalty = 0.35
+         freq_cap = 200_000
+
+         best_score = [-1e100] * (n + 1)
+         back = [-1] * (n + 1)
+         back_len = [1] * (n + 1)
+         best_score[0] = 0.0
+
+         for i in range(n):
+             if best_score[i] <= -1e90:
+                 continue
+             # try candidates
+             for L in range(1, min(max_len, n - i) + 1):
+                 w = run[i:i+L]
+                 # require either hand dict or wordfreq for multi-char; allow single-char always
+                 freq = wf.get(w, 0)
+                 in_dict = w in self.dictionary
+                 # If it's a known entity/location in the hand dictionary, treat it as high-confidence.
+                 # This prevents frequent short n-grams from splitting proper nouns (e.g., 阿里巴巴 -> 阿里 + 巴巴).
+                 if in_dict:
+                     pos_tag = self.dictionary.get(w, "")
+                     if pos_tag in ("nrt", "ns"):
+                         freq = max(freq, entity_freq_floor)
+                 sc = best_score[i]
+                 if in_dict or freq > 0:
+                     # base by frequency
+                     sc += math.log(min(freq, freq_cap) + 1.0)
+                 else:
+                     # unknown grouping: prefer 2-3 char chunks over 1-char
+                     sc += unk_base - unk_len_penalty * L
+                 # length preference (avoid too many singles)
+                 sc += len_bonus * (L - 1)
+                 if L == 1:
+                     sc -= single_penalty
+                 if in_dict:
+                     sc += dict_bonus + dict_len_bonus * (L - 1)
+                     pos_tag = self.dictionary.get(w, "")
+                     if pos_tag in ("nrt", "ns"):
+                         sc += entity_bonus
+                 j = i + L
+                 if sc > best_score[j]:
+                     best_score[j] = sc
+                     back[j] = i
+                     back_len[j] = L
+
+         # if something went wrong, fallback to single chars
+         if back[n] < 0:
+             return [Morpheme(run[i], run[i], self.dictionary.get(run[i], "n"), offset + i, offset + i + 1) for i in range(n)]
+
+         # backtrack
+         toks: List[Tuple[int, int]] = []
+         j = n
+         while j > 0:
+             i = back[j]
+             if i < 0:
+                 # safety
+                 i = j - 1
+             L = back_len[j]
+             toks.append((i, j))
+             j = i
+         toks.reverse()
+
+         out: List[Morpheme] = []
+         for i, j in toks:
+             w = run[i:j]
+             pos = self.dictionary.get(w, "x")
+             out.append(Morpheme(w, w, pos, offset + i, offset + j))
+         return out
+
+     def _forward_max_match(self, text: str) -> List[Morpheme]:
+         """Forward maximum matching."""
+         result = []
+         pos = 0
+
+         while pos < len(text):
+             # Handle date patterns first (digits + 年/月/日/号) - optional
+             if self.join_dates:
+                 m = self._date_md_with_dian.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     # split: 12月31日 + 电
+                     core = s[:-1]
+                     result.append(Morpheme(core, core, 't', pos, pos + len(core)))
+                     result.append(Morpheme('电', '电', 'n', pos + len(core), pos + len(s)))
+                     pos += len(s)
+                     continue
+                 m = self._date_ymd.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     result.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                     pos += len(s)
+                     continue
+                 m = self._date_md.match(text[pos:])
+                 if m:
+                     s = m.group()
+                     result.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                     pos += len(s)
+                     continue
+
+             # Skip whitespace / handle non-Hanzi symbols
+             if not self.hanzi.match(text[pos:pos+1]):
+                 if text[pos].isspace():
+                     pos += 1
+                     continue
+                 # digits / Latin letters
+                 match = re.match(r'[a-zA-Z0-9]+', text[pos:])
+                 if match:
+                     word = match.group()
+                     result.append(Morpheme(word, word, 'x', pos, pos + len(word)))
+                     pos += len(word)
+                 else:
+                     result.append(Morpheme(text[pos], text[pos], 'x', pos, pos + 1))
+                     pos += 1
+                 continue
+
+             # Longest match
+             matched = False
+             for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
+                 word = text[pos:pos+length]
+                 if word in self.dictionary:
+                     result.append(Morpheme(
+                         word, word, self.dictionary[word],
+                         pos, pos + length
+                     ))
+                     pos += length
+                     matched = True
+                     break
+
+             if not matched:
+                 # Unknown word: group characters until the next dictionary word starts (prefer 2-4 chars)
+                 end_pos = pos + 1
+                 while end_pos < len(text) and end_pos - pos < 4:
+                     if not self.hanzi.match(text[end_pos:end_pos+1]):
+                         break
+                     # stop if a dictionary word starts at this position
+                     found_dict_word = False
+                     for length in range(min(self.max_word_len, len(text) - end_pos), 0, -1):
+                         if text[end_pos:end_pos+length] in self.dictionary:
+                             found_dict_word = True
+                             break
+                     if found_dict_word:
+                         break
+                     end_pos += 1
+
+                 # treat 2+ characters as a single word
+                 if end_pos - pos >= 2:
+                     word = text[pos:end_pos]
+                     result.append(Morpheme(
+                         word, word, 'nz',  # nz = unknown proper noun
+                         pos, end_pos
+                     ))
+                     pos = end_pos
+                 else:
+                     # keep a single character as-is
+                     result.append(Morpheme(
+                         text[pos], text[pos], 'n',
+                         pos, pos + 1
+                     ))
+                     pos += 1
+
+         return result
+
+     def _backward_max_match(self, text: str) -> List[Morpheme]:
+         """Backward (reverse) maximum matching."""
+         result = []
+         pos = len(text)
+
+         while pos > 0:
+             # Skip whitespace
+             if pos > 0 and text[pos-1].isspace():
+                 pos -= 1
+                 continue
+
+             # Non-Hanzi
+             if pos > 0 and not self.hanzi.match(text[pos-1:pos]):
+                 # collect a run of non-Hanzi characters
+                 end = pos
+                 while pos > 0 and not self.hanzi.match(text[pos-1:pos]) and not text[pos-1].isspace():
+                     pos -= 1
+                 if pos < end:
+                     word = text[pos:end]
+                     result.insert(0, Morpheme(word, word, 'x', pos, end))
+                 continue
+
+             # Date patterns (backward, optional): "...12月31日电" / "...2025年12月31日"
+             if self.join_dates:
+                 lookback = max(0, pos - 16)
+                 chunk = text[lookback:pos]
+                 m = re.search(r'[0-9]{1,2}月[0-9]{1,2}[日号]电$', chunk)
+                 if m:
+                     s = m.group()
+                     core = s[:-1]
+                     start = pos - len(s)
+                     result.insert(0, Morpheme('电', '电', 'n', pos - 1, pos))
+                     result.insert(0, Morpheme(core, core, 't', start, start + len(core)))
+                     pos -= len(s)
+                     continue
+                 m = re.search(r'[0-9]{2,4}年[0-9]{1,2}月[0-9]{1,2}[日号]$', chunk)
+                 if m:
+                     s = m.group()
+                     start = pos - len(s)
+                     result.insert(0, Morpheme(s, s, 't', start, pos))
+                     pos -= len(s)
+                     continue
+                 m = re.search(r'[0-9]{1,2}月[0-9]{1,2}[日号]$', chunk)
+                 if m:
+                     s = m.group()
+                     start = pos - len(s)
+                     result.insert(0, Morpheme(s, s, 't', start, pos))
+                     pos -= len(s)
+                     continue
+
+             # Longest match (backward)
+             matched = False
+             for length in range(min(self.max_word_len, pos), 0, -1):
+                 word = text[pos-length:pos]
+                 if word in self.dictionary:
+                     result.insert(0, Morpheme(
+                         word, word, self.dictionary[word],
+                         pos - length, pos
+                     ))
+                     pos -= length
+                     matched = True
+                     break
+
+             if not matched:
+                 # Unknown word: group backward until a preceding dictionary word (2-4 chars)
+                 start_pos = pos - 1
+                 while start_pos > 0 and pos - start_pos < 4:
+                     if not self.hanzi.match(text[start_pos-1:start_pos]):
+                         break
+                     # stop if a dictionary word ends at this position
+                     found_dict_word = False
+                     for length in range(min(self.max_word_len, start_pos), 0, -1):
+                         if text[start_pos-length:start_pos] in self.dictionary:
+                             found_dict_word = True
+                             break
+                     if found_dict_word:
+                         break
+                     start_pos -= 1
+
+                 # treat 2+ characters as a single word
+                 if pos - start_pos >= 2:
+                     word = text[start_pos:pos]
+                     result.insert(0, Morpheme(
+                         word, word, 'nz',  # nz = unknown proper noun
+                         start_pos, pos
+                     ))
+                     pos = start_pos
+                 else:
+                     result.insert(0, Morpheme(
+                         text[pos-1], text[pos-1], 'n',
+                         pos - 1, pos
+                     ))
+                     pos -= 1
+
+         return result
+
+     def segment(self, text: str) -> List[str]:
+         """Word segmentation (convenience wrapper)."""
+         morphemes = self.analyze(text)
+         return [m.surface for m in morphemes]
+
+     def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+         """Part-of-speech tagging."""
+         morphemes = self.analyze(text)
+         return [(m.surface, m.pos) for m in morphemes]
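
For orientation, a minimal usage sketch of the ChineseAnalyzer API shown in this diff. It relies only on names visible above (ChineseAnalyzer, analyze, segment, pos_tag, the Morpheme fields, and the TOKMOR_ZH_JOIN_DATES option); the example outputs in the comments are illustrative guesses, since the actual splits depend on whether the optional seg_lexicon pickles ship with the wheel.

    from tokmor.morphology.chinese import ChineseAnalyzer

    analyzer = ChineseAnalyzer()  # join_dates defaults to off unless TOKMOR_ZH_JOIN_DATES is set

    # Plain segmentation (surface forms only); roughly ['阿里巴巴集团', '在', '杭州', '宣布'] if merging applies.
    print(analyzer.segment("阿里巴巴集团在杭州宣布"))

    # (surface, pos) pairs, e.g. ('新华社', 'nrt'), ('记者', 'n'), ...
    print(analyzer.pos_tag("新华社记者报道"))

    # Full Morpheme objects with character offsets back into the input string.
    for m in analyzer.analyze("腾讯公司发布新产品"):
        print(m.surface, m.lemma, m.pos, m.start, m.end)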
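The candidate scoring inside _viterbi_hanzi_run can be read directly off the constants in the diff. As a rough standalone restatement (candidate_score is a hypothetical helper, not part of the package; the default arguments copy the constants above), the per-candidate arithmetic looks like this:

    import math

    def candidate_score(prev_best, freq, length, in_dict, is_entity,
                        len_bonus=0.8, single_penalty=1.2, dict_bonus=3.5,
                        dict_len_bonus=0.6, entity_bonus=4.0,
                        entity_freq_floor=50_000, unk_base=-1.5,
                        unk_len_penalty=0.35, freq_cap=200_000):
        # Illustrative restatement of the scoring in _viterbi_hanzi_run.
        if in_dict and is_entity:
            freq = max(freq, entity_freq_floor)        # entity floor: 阿里巴巴 should beat 阿里 + 巴巴
        sc = prev_best
        if in_dict or freq > 0:
            sc += math.log(min(freq, freq_cap) + 1.0)  # frequency term (capped)
        else:
            sc += unk_base - unk_len_penalty * length  # unknown span: still allow short groupings
        sc += len_bonus * (length - 1)                 # prefer longer words
        if length == 1:
            sc -= single_penalty                       # discourage per-character output
        if in_dict:
            sc += dict_bonus + dict_len_bonus * (length - 1)
            if is_entity:
                sc += entity_bonus                     # ns/nrt entries get an extra boost
        return sc

The Viterbi loop then keeps, for every end position, the highest-scoring (start, end) candidate and backtracks from the end of the Hanzi run to recover the segmentation.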