tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/chinese.py
@@ -0,0 +1,736 @@
+"""
+Chinese Morphological Analyzer - In-house Implementation
+=========================================
+
+A Chinese analyzer implemented in pure Python, with no external libraries
+
+Features:
+- Longest-match segmentation (Maximum Matching)
+- Reverse longest match (Reverse Maximum Matching)
+- Bidirectional comparison to pick the best segmentation
+
+"""
+
+import os
+import re
+import math
+import pickle
+from typing import List, Tuple, Set, Optional
+from dataclasses import dataclass
+
+from ..resources import resolve_seg_lexicon_path, resolve_extra_dict_path
+
+
+@dataclass
+class Morpheme:
+    """Morpheme/word"""
+    surface: str
+    lemma: str
+    pos: str
+    start: int
+    end: int
+
+    def __repr__(self):
+        return f"{self.surface}/{self.pos}"
+
+
+class ChineseAnalyzer:
+    """
+    Chinese analyzer
+
+    Usage:
+        analyzer = ChineseAnalyzer()
+        result = analyzer.analyze("阿里巴巴集团在杭州宣布")
+    """
+
+    def __init__(self, join_dates: Optional[bool] = None):
+        # Product option: join common date spans into a single token for news/SNS.
+        # Default is OFF to avoid unexpected evaluation/style mismatches.
+        if join_dates is None:
+            self.join_dates = os.getenv("TOKMOR_ZH_JOIN_DATES", "0").strip().lower() in ("1", "true", "yes", "y", "on")
+        else:
+            self.join_dates = bool(join_dates)
+        # Optional segmentation lexicon (offline, generated from large corpora):
+        # {word(str): freq(int)}. If present, we use a DP/Viterbi segmenter for Hanzi runs.
+        self._wordfreq = None
+        self._wordfreq_max_len = 4
+        self._build_dictionary()
+        self._load_seg_lexicon()
+        self._load_extra_dict()
+
+    def _load_extra_dict(self) -> None:
+        """
+        Optional runtime extension dictionary (offline).
+        File: seg_lexicon/zh_extra_dict.json (token -> pos)
+        """
+        p = resolve_extra_dict_path("zh")
+        if not p:
+            return
+        try:
+            import json
+
+            obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
+            if not isinstance(obj, dict):
+                return
+            # Merge into dictionary; keep only sane entries.
+            for k, v in obj.items():
+                if not isinstance(k, str) or not k:
+                    continue
+                if not isinstance(v, str) or not v:
+                    v = "n"
+                self.dictionary[k] = v
+            # Update max word length
+            self.max_word_len = max(len(w) for w in self.dictionary) if self.dictionary else self.max_word_len
+        except Exception:
+            return
+
+    def _load_seg_lexicon(self):
+        p = resolve_seg_lexicon_path("zh")
+        if not p:
+            return
+        try:
+            obj = pickle.loads(p.read_bytes())
+            if isinstance(obj, dict):
+                # keep only str->int like entries
+                wf = {}
+                mx = 1
+                for k, v in obj.items():
+                    if isinstance(k, str) and isinstance(v, int) and k:
+                        wf[k] = v
+                        if len(k) > mx:
+                            mx = len(k)
+                self._wordfreq = wf
+                self._wordfreq_max_len = max(2, min(int(mx), 8))
+        except Exception:
+            return
+
+    def _build_dictionary(self):
+        """Build the word dictionary"""
+
+        # Core word dictionary (word: POS tag)
+        self.dictionary = {
+            # ============================================================
+            # News / SNS domain (high-impact for segmentation)
+            # ============================================================
+            # News agencies / media
+            '新华社': 'nrt', '中新社': 'nrt', '人民网': 'nrt', '央视': 'nrt', '央视网': 'nrt',
+            '澎湃新闻': 'nrt', '环球时报': 'nrt', '财新': 'nrt', '凤凰网': 'nrt', '网易': 'nrt', '新浪': 'nrt',
+            # News set phrases
+            '日电': 'n', '日讯': 'n', '消息': 'n', '报道': 'n', '记者': 'n',
+            '原标题': 'n', '来源': 'n', '编辑': 'n', '评论': 'n',
+            # News staples (segmentation stabilization)
+            '直击': 'v', '强震': 'n', '灾区': 'n',
+            # SNS
+            '微博': 'n', '微信': 'n', '抖音': 'n', '快手': 'n', '小红书': 'n',
+            '网友': 'n', '点赞': 'v', '转发': 'v', '评论区': 'n',
+
+            # Place names
+            '北京': 'ns', '上海': 'ns', '广州': 'ns', '深圳': 'ns',
+            '杭州': 'ns', '南京': 'ns', '武汉': 'ns', '成都': 'ns',
+            '香港': 'ns', '台湾': 'ns', '中国': 'ns', '美国': 'ns',
+            '日本': 'ns', '韩国': 'ns', '英国': 'ns', '法国': 'ns',
+            '德国': 'ns', '俄罗斯': 'ns', '印度': 'ns', '越南': 'ns',
+            '泰国': 'ns', '新加坡': 'ns', '马来西亚': 'ns',
+            '土耳其': 'ns',
+
+            # Companies / organizations
+            '阿里巴巴': 'nrt', '腾讯': 'nrt', '百度': 'nrt', '华为': 'nrt',
+            '小米': 'nrt', '京东': 'nrt', '美团': 'nrt', '字节跳动': 'nrt',
+            '苹果': 'nrt', '谷歌': 'nrt', '微软': 'nrt', '三星': 'nrt',
+            '特斯拉': 'nrt', '丰田': 'nrt', '奔驰': 'nrt', '宝马': 'nrt',
+            '集团': 'n', '公司': 'n', '企业': 'n', '银行': 'n',
+            '政府': 'n', '学校': 'n', '大学': 'n', '医院': 'n',
+
+            # Common nouns
+            '人': 'n', '人们': 'n', '时间': 'n', '地方': 'n',
+            '问题': 'n', '情况': 'n', '工作': 'n', '生活': 'n',
+            '发展': 'n', '经济': 'n', '社会': 'n', '文化': 'n',
+            '技术': 'n', '产品': 'n', '服务': 'n', '市场': 'n',
+            '国家': 'n', '世界': 'n', '城市': 'n', '地区': 'n',
+
+            # Verbs
+            '是': 'v', '有': 'v', '在': 'v', '说': 'v', '做': 'v',
+            '去': 'v', '来': 'v', '看': 'v', '想': 'v', '知道': 'v',
+            '发表': 'v', '宣布': 'v', '公布': 'v', '发布': 'v',
+            '开始': 'v', '结束': 'v', '进行': 'v', '完成': 'v',
+            '研究': 'v', '开发': 'v', '生产': 'v', '销售': 'v',
+            '投资': 'v', '合作': 'v', '成立': 'v', '成为': 'v',
+
+            # Adjectives
+            '大': 'a', '小': 'a', '多': 'a', '少': 'a',
+            '好': 'a', '新': 'a', '高': 'a', '重要': 'a',
+
+            # Adverbs
+            '不': 'd', '也': 'd', '就': 'd', '都': 'd', '还': 'd',
+            '很': 'd', '最': 'd', '已经': 'd', '正在': 'd',
+
+            # Prepositions (coverbs)
+            '在': 'p', '从': 'p', '向': 'p', '对': 'p', '把': 'p',
+            '被': 'p', '比': 'p', '跟': 'p', '和': 'p', '与': 'p',
+
+            # Particles / modal particles
+            '的': 'u', '地': 'u', '得': 'u', '了': 'u', '着': 'u', '过': 'u',
+            '吗': 'u', '呢': 'u', '吧': 'u', '啊': 'u',
+
+            # Pronouns
+            '我': 'r', '你': 'r', '他': 'r', '她': 'r', '它': 'r',
+            '我们': 'r', '你们': 'r', '他们': 'r', '这': 'r', '那': 'r',
+            '这个': 'r', '那个': 'r', '什么': 'r', '谁': 'r',
+
+            # Conjunctions
+            '和': 'c', '或': 'c', '但': 'c', '但是': 'c', '因为': 'c',
+            '所以': 'c', '如果': 'c', '虽然': 'c',
+
+            # Numerals
+            '一': 'm', '二': 'm', '三': 'm', '四': 'm', '五': 'm',
+            '六': 'm', '七': 'm', '八': 'm', '九': 'm', '十': 'm',
+            '百': 'm', '千': 'm', '万': 'm', '亿': 'm',
+
+            # Measure words
+            '个': 'q', '年': 'q', '月': 'q', '日': 'q', '号': 'q',
+            '次': 'q', '种': 'q', '件': 'q', '位': 'q',
+        }
+
+        # Maximum word length
+        self.max_word_len = max(len(w) for w in self.dictionary) if self.dictionary else 4
+
+        # Hanzi pattern
+        self.hanzi = re.compile(r'[\u4e00-\u9fff]+')
+
+        # Date/time patterns (very frequent in news)
+        # Examples: 2025年12月31日, 12月31日, 12月31日电
+        # NOTE: only used when self.join_dates is True
+        self._date_ymd = re.compile(r'^[0-9]{2,4}年[0-9]{1,2}月[0-9]{1,2}[日号]')
+        self._date_md = re.compile(r'^[0-9]{1,2}月[0-9]{1,2}[日号]')
+        self._date_md_with_dian = re.compile(r'^[0-9]{1,2}月[0-9]{1,2}[日号]电')
+
+    def analyze(self, text: str) -> List[Morpheme]:
+        """
+        Analyze text into morphemes.
+        """
+        if not text:
+            return []
+
+        # If we have a segmentation lexicon, prefer DP segmentation (more stable on real corpora).
+        if self._wordfreq:
+            out = self._segment_with_lexicon(text)
+            out = self._postprocess_fix_mixed_function_word_tokens(out)
+            return self._postprocess_merge_common_suffixes(out)
+
+        # Fallback: forward/backward maximum matching
+        forward = self._forward_max_match(text)
+        backward = self._backward_max_match(text)
+        out = forward if len(forward) <= len(backward) else backward
+        out = self._postprocess_fix_mixed_function_word_tokens(out)
+        return self._postprocess_merge_common_suffixes(out)
+
+    def _postprocess_fix_mixed_function_word_tokens(self, toks: List[Morpheme]) -> List[Morpheme]:
+        """
+        Fix a common segmentation error from n-gram lexicons:
+        - function word + first char of a toponym gets merged (e.g., 在北 + 京市)
+
+        We do a conservative local split/merge:
+        - If token is length-2 and starts with a function char (在/对/从/与/和/将/为/把/被/给/向/于)
+          and the 2nd char is a plausible toponym initial (first char of any known ns entry),
+          split it into two single-char tokens.
+        - If token is length-2 and ends with an admin suffix (市/省/区/县/州/国/镇/村/旗/盟),
+          split it into (head char, suffix char).
+        - Then merge adjacent 2-char toponyms when they exist in the hand dictionary as ns.
+        """
+        if not toks:
+            return toks
+
+        func0 = {"在", "对", "从", "与", "和", "将", "为", "把", "被", "给", "向", "于"}
+        admin1 = {"市", "省", "县", "区", "州", "国", "镇", "村", "旗", "盟"}
+
+        # derive plausible toponym initials from known ns tokens
+        topo_initials = set()
+        try:
+            for w, p in self.dictionary.items():
+                if p == "ns" and isinstance(w, str) and len(w) >= 2:
+                    topo_initials.add(w[0])
+        except Exception:
+            topo_initials = set()
+
+        # step1: split tokens conservatively
+        split: List[Morpheme] = []
+        for m in toks:
+            s = m.surface
+            if isinstance(s, str) and len(s) == 2:
+                if (s[0] in func0) and (s[1] in topo_initials) and (s not in self.dictionary):
+                    # split: 在北 -> 在 + 北
+                    split.append(Morpheme(s[0], s[0], self.dictionary.get(s[0], "x"), m.start, m.start + 1))
+                    split.append(Morpheme(s[1], s[1], self.dictionary.get(s[1], "x"), m.start + 1, m.end))
+                    continue
+                if (s[1] in admin1) and (s not in self.dictionary):
+                    # split: 京市 -> 京 + 市
+                    split.append(Morpheme(s[0], s[0], self.dictionary.get(s[0], "x"), m.start, m.start + 1))
+                    split.append(Morpheme(s[1], s[1], self.dictionary.get(s[1], "x"), m.start + 1, m.end))
+                    continue
+            split.append(m)
+
+        # step2: merge adjacent 2-char toponyms when present in dictionary
+        out: List[Morpheme] = []
+        i = 0
+        while i < len(split):
+            if i + 1 < len(split):
+                a, b = split[i], split[i + 1]
+                if a.end == b.start:
+                    comb = a.surface + b.surface
+                    if self.dictionary.get(comb) == "ns":
+                        out.append(Morpheme(comb, comb, "ns", a.start, b.end))
+                        i += 2
+                        continue
+            out.append(split[i])
+            i += 1
+        return out
+
+    def _postprocess_merge_common_suffixes(self, toks: List[Morpheme]) -> List[Morpheme]:
+        """
+        Postprocess merges to improve segmentation quality for product use:
+        - Location suffixes: 北京 + 市 -> 北京市 (when previous token is a location-like ns)
+        - Organization suffixes: 阿里巴巴 + 集团 -> 阿里巴巴集团 (when previous is nrt)
+        - Numeric unit tails: 10 + 亿 -> 10亿 (handled in CJKTokenizer too, but keep safe here when we see it)
+
+        This is conservative: it only merges when contiguity is exact (prev.end == next.start),
+        and when previous token is already strongly typed (ns/nrt) or combined form exists in lexicon.
+        """
+        if not toks:
+            return toks
+
+        admin_suffix = {"市", "省", "县", "区", "州", "国", "镇", "村", "旗", "盟"}
+        # multi-char admin tails that frequently appear split
+        admin_suffix_multi = {"自治区", "自治州", "自治县", "特别行政区", "行政区"}
+        # NOTE: exclude "政府" here (it's a standalone noun too often; merging X+政府 is risky).
+        org_suffix_strong = {"集团", "公司", "银行", "大学"}
+        org_suffix_generic = {"委员会", "协会", "研究院", "研究所", "法院", "检察院", "公安局"}
+        # multi-char org tails
+        org_suffix_multi = {"有限公司", "有限责任公司", "股份有限公司", "集团公司"}
+
+        wf = self._wordfreq or {}
+
+        out: List[Morpheme] = []
+        i = 0
+        while i < len(toks):
+            cur = toks[i]
+            if out and cur.start == out[-1].end:
+                prev = out[-1]
+                comb = prev.surface + cur.surface
+                # 1) merge if combined form exists in hand dictionary
+                if comb in self.dictionary:
+                    pos = self.dictionary.get(comb, prev.pos)
+                    out[-1] = Morpheme(comb, comb, pos, prev.start, cur.end)
+                    i += 1
+                    continue
+                # 2) merge location + admin suffix (prev already ns)
+                if prev.pos == "ns" and cur.surface in admin_suffix:
+                    # Only merge if lexicon suggests it's a real unit OR prev is very short (typical toponyms)
+                    if wf.get(comb, 0) > 0 or len(prev.surface) <= 3:
+                        out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                        i += 1
+                        continue
+                # 2a) merge toponym + multi-char admin suffix (lexicon supported)
+                if cur.surface in admin_suffix_multi and 1 <= len(prev.surface) <= 6 and cur.start == prev.end:
+                    f_comb = int(wf.get(comb, 0) or 0)
+                    if prev.pos == "ns" or f_comb >= 200:
+                        out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                        i += 1
+                        continue
+                # 2b) merge toponym + admin suffix even if prev not tagged, when lexicon strongly supports comb.
+                if cur.surface in admin_suffix and 1 <= len(prev.surface) <= 4:
+                    f_comb = int(wf.get(comb, 0) or 0)
+                    if f_comb >= 100:
+                        out[-1] = Morpheme(comb, comb, "ns", prev.start, cur.end)
+                        i += 1
+                        continue
+                # 3) merge organization + suffix (prev already nrt)
+                if prev.pos == "nrt" and (cur.surface in org_suffix_strong or cur.surface in org_suffix_generic):
+                    # Prefer lexicon evidence, but be permissive for strong organization tails.
+                    if wf.get(comb, 0) > 0 or len(prev.surface) >= 2:
+                        out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                        i += 1
+                        continue
+                # 3a) merge org + multi-char suffix (lexicon supported)
+                if cur.surface in org_suffix_multi and 1 <= len(prev.surface) <= 8 and cur.start == prev.end:
+                    f_comb = int(wf.get(comb, 0) or 0)
+                    if prev.pos == "nrt" or f_comb >= 200:
+                        out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                        i += 1
+                        continue
+                # 3b) merge org tail even if prev not tagged, when lexicon supports comb.
+                # Avoid merging verb+bank (e.g., 支持 + 银行) via a tiny stoplist.
+                if cur.start == prev.end and 1 <= len(prev.surface) <= 8:
+                    stop_prev = {
+                        "支持", "提供", "表示", "认为", "指出", "强调", "包括", "进行", "开展", "推动", "加强",
+                        "将", "对", "在", "与", "和", "或", "但", "而", "为", "把", "被", "从", "向",
+                    }
+                    if prev.surface not in stop_prev:
+                        f_comb = int(wf.get(comb, 0) or 0)
+                        # strong tails: lower threshold
+                        if cur.surface in org_suffix_strong and f_comb >= 20:
+                            out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                            i += 1
+                            continue
+                        # generic tails: keep strict
+                        if cur.surface in org_suffix_generic and f_comb >= 200:
+                            out[-1] = Morpheme(comb, comb, "nrt", prev.start, cur.end)
+                            i += 1
+                            continue
+            out.append(cur)
+            i += 1
+        return out
+
+    def _segment_with_lexicon(self, text: str) -> List[Morpheme]:
+        """
+        Segment full text. For Hanzi runs, use Viterbi over word candidates from:
+        - hand dictionary (high precision)
+        - wordfreq lexicon (coverage)
+        Non-Hanzi parts follow the same rules as the native segmenter.
+        """
+        out: List[Morpheme] = []
+        pos = 0
+        n = len(text)
+
+        while pos < n:
+            # optional date joins (reuse native date patterns)
+            if self.join_dates:
+                m = self._date_md_with_dian.match(text[pos:])
+                if m:
+                    s = m.group()
+                    core = s[:-1]
+                    out.append(Morpheme(core, core, 't', pos, pos + len(core)))
+                    out.append(Morpheme('电', '电', 'n', pos + len(core), pos + len(s)))
+                    pos += len(s)
+                    continue
+                m = self._date_ymd.match(text[pos:])
+                if m:
+                    s = m.group()
+                    out.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                    pos += len(s)
+                    continue
+                m = self._date_md.match(text[pos:])
+                if m:
+                    s = m.group()
+                    out.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                    pos += len(s)
+                    continue
+
+            ch = text[pos]
+            # whitespace
+            if ch.isspace():
+                pos += 1
+                continue
+
+            # latin/digit chunk
+            if not self.hanzi.match(ch):
+                match = re.match(r'[a-zA-Z0-9]+', text[pos:])
+                if match:
+                    w = match.group()
+                    out.append(Morpheme(w, w, 'x', pos, pos + len(w)))
+                    pos += len(w)
+                else:
+                    out.append(Morpheme(ch, ch, 'x', pos, pos + 1))
+                    pos += 1
+                continue
+
+            # hanzi run
+            m = self.hanzi.match(text[pos:])
+            if not m:
+                out.append(Morpheme(ch, ch, 'x', pos, pos + 1))
+                pos += 1
+                continue
+            run = m.group()
+            run_start = pos
+            out.extend(self._viterbi_hanzi_run(run, run_start))
+            pos += len(run)
+
+        return out
+
+    def _viterbi_hanzi_run(self, run: str, offset: int) -> List[Morpheme]:
+        """
+        Viterbi segmentation over a pure-Hanzi run.
+        Score = log(freq+1) + len_bonus*(len-1) - single_penalty(if len==1) + dict_bonus(if in hand dict)
+        """
+        wf = self._wordfreq or {}
+        max_len = max(self.max_word_len, self._wordfreq_max_len)
+        max_len = max(2, min(int(max_len), 8))
+        n = len(run)
+
+        len_bonus = 0.8
+        single_penalty = 1.2
+        # Hand dictionary entries should beat high-frequency short n-grams.
+        # Otherwise entities like "阿里巴巴" can get split into "阿里"+"巴巴".
+        dict_bonus = 3.5
+        dict_len_bonus = 0.6
+        entity_bonus = 4.0
+        entity_freq_floor = 50_000
+        # When lexicon doesn't contain a span, still allow 2-4 char grouping
+        # to avoid degenerate per-character segmentation.
+        unk_base = -1.5
+        unk_len_penalty = 0.35
+        freq_cap = 200_000
+
+        best_score = [-1e100] * (n + 1)
+        back = [-1] * (n + 1)
+        back_len = [1] * (n + 1)
+        best_score[0] = 0.0
+
+        for i in range(n):
+            if best_score[i] <= -1e90:
+                continue
+            # try candidates
+            for L in range(1, min(max_len, n - i) + 1):
+                w = run[i:i+L]
+                # require either hand dict or wordfreq for multi-char; allow single-char always
+                freq = wf.get(w, 0)
+                in_dict = w in self.dictionary
+                # If it's a known entity/location in the hand dictionary, treat it as high-confidence.
+                # This prevents frequent short n-grams from splitting proper nouns (e.g., 阿里巴巴 -> 阿里 + 巴巴).
+                if in_dict:
+                    pos_tag = self.dictionary.get(w, "")
+                    if pos_tag in ("nrt", "ns"):
+                        freq = max(freq, entity_freq_floor)
+                sc = best_score[i]
+                if in_dict or freq > 0:
+                    # base by frequency
+                    sc += math.log(min(freq, freq_cap) + 1.0)
+                else:
+                    # unknown grouping: prefer 2-3 char chunks over 1-char
+                    sc += unk_base - unk_len_penalty * L
+                # length preference (avoid too many singles)
+                sc += len_bonus * (L - 1)
+                if L == 1:
+                    sc -= single_penalty
+                if in_dict:
+                    sc += dict_bonus + dict_len_bonus * (L - 1)
+                    pos_tag = self.dictionary.get(w, "")
+                    if pos_tag in ("nrt", "ns"):
+                        sc += entity_bonus
+                j = i + L
+                if sc > best_score[j]:
+                    best_score[j] = sc
+                    back[j] = i
+                    back_len[j] = L
+
+        # if something went wrong, fallback to single chars
+        if back[n] < 0:
+            return [Morpheme(run[i], run[i], self.dictionary.get(run[i], "n"), offset + i, offset + i + 1) for i in range(n)]
+
+        # backtrack
+        toks: List[Tuple[int, int]] = []
+        j = n
+        while j > 0:
+            i = back[j]
+            if i < 0:
+                # safety
+                i = j - 1
+            L = back_len[j]
+            toks.append((i, j))
+            j = i
+        toks.reverse()
+
+        out: List[Morpheme] = []
+        for i, j in toks:
+            w = run[i:j]
+            pos = self.dictionary.get(w, "x")
+            out.append(Morpheme(w, w, pos, offset + i, offset + j))
+        return out
+
+    def _forward_max_match(self, text: str) -> List[Morpheme]:
+        """Forward maximum matching"""
+        result = []
+        pos = 0
+
+        while pos < len(text):
+            # Handle date patterns first (digits + 年/月/日/号) - optional
+            if self.join_dates:
+                m = self._date_md_with_dian.match(text[pos:])
+                if m:
+                    s = m.group()
+                    # split: 12月31日 + 电
+                    core = s[:-1]
+                    result.append(Morpheme(core, core, 't', pos, pos + len(core)))
+                    result.append(Morpheme('电', '电', 'n', pos + len(core), pos + len(s)))
+                    pos += len(s)
+                    continue
+                m = self._date_ymd.match(text[pos:])
+                if m:
+                    s = m.group()
+                    result.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                    pos += len(s)
+                    continue
+                m = self._date_md.match(text[pos:])
+                if m:
+                    s = m.group()
+                    result.append(Morpheme(s, s, 't', pos, pos + len(s)))
+                    pos += len(s)
+                    continue
+
+            # Skip whitespace/symbols
+            if not self.hanzi.match(text[pos:pos+1]):
+                if text[pos].isspace():
+                    pos += 1
+                    continue
+                # Digits/Latin letters
+                match = re.match(r'[a-zA-Z0-9]+', text[pos:])
+                if match:
+                    word = match.group()
+                    result.append(Morpheme(word, word, 'x', pos, pos + len(word)))
+                    pos += len(word)
+                else:
+                    result.append(Morpheme(text[pos], text[pos], 'x', pos, pos + 1))
+                    pos += 1
+                continue
+
+            # Longest match
+            matched = False
+            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
+                word = text[pos:pos+length]
+                if word in self.dictionary:
+                    result.append(Morpheme(
+                        word, word, self.dictionary[word],
+                        pos, pos + length
+                    ))
+                    pos += length
+                    matched = True
+                    break
+
+            if not matched:
+                # Unknown word: group until the next dictionary word (prefer 2-4 chars)
+                end_pos = pos + 1
+                while end_pos < len(text) and end_pos - pos < 4:
+                    if not self.hanzi.match(text[end_pos:end_pos+1]):
+                        break
+                    # If a dictionary word starts at this position, stop here
+                    found_dict_word = False
+                    for length in range(min(self.max_word_len, len(text) - end_pos), 0, -1):
+                        if text[end_pos:end_pos+length] in self.dictionary:
+                            found_dict_word = True
+                            break
+                    if found_dict_word:
+                        break
+                    end_pos += 1
+
+                # If 2+ chars, keep as a single word
+                if end_pos - pos >= 2:
+                    word = text[pos:end_pos]
+                    result.append(Morpheme(
+                        word, word, 'nz',  # nz = unregistered proper noun
+                        pos, end_pos
+                    ))
+                    pos = end_pos
+                else:
+                    # A single char stays as-is
+                    result.append(Morpheme(
+                        text[pos], text[pos], 'n',
+                        pos, pos + 1
+                    ))
+                    pos += 1
+
+        return result
+
+    def _backward_max_match(self, text: str) -> List[Morpheme]:
+        """Backward maximum matching"""
+        result = []
+        pos = len(text)
+
+        while pos > 0:
+            # Skip whitespace
+            if pos > 0 and text[pos-1].isspace():
+                pos -= 1
+                continue
+
+            # Non-Hanzi
+            if pos > 0 and not self.hanzi.match(text[pos-1:pos]):
+                # Collect the run of consecutive non-Hanzi chars
+                end = pos
+                while pos > 0 and not self.hanzi.match(text[pos-1:pos]) and not text[pos-1].isspace():
+                    pos -= 1
+                if pos < end:
+                    word = text[pos:end]
+                    result.insert(0, Morpheme(word, word, 'x', pos, end))
+                continue
+
+            # Date patterns (backward, optional): "...12月31日电" / "...2025年12月31日"
+            if self.join_dates:
+                lookback = max(0, pos - 16)
+                chunk = text[lookback:pos]
+                m = re.search(r'[0-9]{1,2}月[0-9]{1,2}[日号]电$', chunk)
+                if m:
+                    s = m.group()
+                    core = s[:-1]
+                    start = pos - len(s)
+                    result.insert(0, Morpheme('电', '电', 'n', pos - 1, pos))
+                    result.insert(0, Morpheme(core, core, 't', start, start + len(core)))
+                    pos -= len(s)
+                    continue
+                m = re.search(r'[0-9]{2,4}年[0-9]{1,2}月[0-9]{1,2}[日号]$', chunk)
+                if m:
+                    s = m.group()
+                    start = pos - len(s)
+                    result.insert(0, Morpheme(s, s, 't', start, pos))
+                    pos -= len(s)
+                    continue
+                m = re.search(r'[0-9]{1,2}月[0-9]{1,2}[日号]$', chunk)
+                if m:
+                    s = m.group()
+                    start = pos - len(s)
+                    result.insert(0, Morpheme(s, s, 't', start, pos))
+                    pos -= len(s)
+                    continue
+
+            # Longest match (backward)
+            matched = False
+            for length in range(min(self.max_word_len, pos), 0, -1):
+                word = text[pos-length:pos]
+                if word in self.dictionary:
+                    result.insert(0, Morpheme(
+                        word, word, self.dictionary[word],
+                        pos - length, pos
+                    ))
+                    pos -= length
+                    matched = True
+                    break
+
+            if not matched:
+                # Unknown word: group back to the previous dictionary word (backward, 2-4 chars)
+                start_pos = pos - 1
+                while start_pos > 0 and pos - start_pos < 4:
+                    if not self.hanzi.match(text[start_pos-1:start_pos]):
+                        break
+                    # If a dictionary word ends at this position, stop here
+                    found_dict_word = False
+                    for length in range(min(self.max_word_len, start_pos), 0, -1):
+                        if text[start_pos-length:start_pos] in self.dictionary:
+                            found_dict_word = True
+                            break
+                    if found_dict_word:
+                        break
+                    start_pos -= 1
+
+                # If 2+ chars, keep as a single word
+                if pos - start_pos >= 2:
+                    word = text[start_pos:pos]
+                    result.insert(0, Morpheme(
+                        word, word, 'nz',  # nz = unregistered proper noun
+                        start_pos, pos
+                    ))
+                    pos = start_pos
+                else:
+                    result.insert(0, Morpheme(
+                        text[pos-1], text[pos-1], 'n',
+                        pos - 1, pos
+                    ))
+                    pos -= 1
+
+        return result
+
+    def segment(self, text: str) -> List[str]:
+        """Split into words (convenience function)"""
+        morphemes = self.analyze(text)
+        return [m.surface for m in morphemes]
+
+    def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+        """POS tagging"""
+        morphemes = self.analyze(text)
+        return [(m.surface, m.pos) for m in morphemes]