tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chinese Advanced Morphological Analyzer
|
|
3
|
+
=======================================
|
|
4
|
+
|
|
5
|
+
5가지 고급 기능을 지원하는 중국어 형태소 분석기
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
1. NER Gazetteer Integration - 개체명 경계 보존
|
|
9
|
+
2. Real-time Dictionary Extension - 런타임 사전 확장
|
|
10
|
+
3. Domain Adaptation - 도메인별 분석 최적화
|
|
11
|
+
4. Code-switching - 영중 혼용 텍스트 처리
|
|
12
|
+
5. N-best Analysis - 다중 후보 + 신뢰도 점수
|
|
13
|
+
|
|
14
|
+
Algorithm: Bidirectional Maximum Matching
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from typing import List, Tuple, Dict, Set, Optional, Any
|
|
19
|
+
|
|
20
|
+
from .advanced_base import (
|
|
21
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ChineseAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Chinese advanced morphological analyzer.

    Supports five advanced features:
      1. NER gazetteer integration     - preserves named-entity boundaries
      2. Real-time dictionary extension - runtime user dictionary
      3. Domain adaptation             - domain-specific sense selection
      4. Code-switching                - mixed Chinese/English text
      5. N-best analysis               - multiple candidates with confidence

    Algorithm: bidirectional maximum matching.

    Usage:
        analyzer = ChineseAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("阿里巴巴集团在杭州宣布")

        # Entity preservation
        analyzer.add_entity("阿里巴巴", "ORG")
        result = analyzer.analyze("阿里巴巴在北京", preserve_entities=True)

        # Domain adaptation
        result = analyzer.analyze("苹果很好吃", domain="food")  # apple (fruit)
        result = analyzer.analyze("苹果发布新品", domain="tech")  # Apple Inc.

        # N-best analysis
        result = analyzer.analyze("银行", n_best=3)
    """

    LANG_CODE = "zh"
    LANG_NAME = "Chinese"

    # Character-class patterns used by both matching passes.
    HANZI_PATTERN = re.compile(r'[\u4e00-\u9fff]+')
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()
        # Longest dictionary entry considered by maximum matching.
        self.max_word_len = 8

    def _build_base_dictionary(self) -> None:
        """Build the built-in word -> (lemma, POS) dictionary.

        POS tags follow the jieba-style convention ('ns' place, 'nrt' org,
        'n' noun, 'v' verb, 'a' adjective, 'd' adverb, 'p' preposition,
        'u' particle, 'r' pronoun, 'c' conjunction, 'm' numeral,
        'q' classifier, 'x' other).
        """

        # Place names
        self.places = {
            '北京': 'ns', '上海': 'ns', '广州': 'ns', '深圳': 'ns',
            '杭州': 'ns', '南京': 'ns', '武汉': 'ns', '成都': 'ns',
            '西安': 'ns', '重庆': 'ns', '天津': 'ns', '苏州': 'ns',
            '香港': 'ns', '台湾': 'ns', '澳门': 'ns',
            '中国': 'ns', '美国': 'ns', '日本': 'ns', '韩国': 'ns',
            '英国': 'ns', '法国': 'ns', '德国': 'ns', '俄罗斯': 'ns',
        }

        # Organizations / companies
        self.organizations = {
            '阿里巴巴': 'nrt', '腾讯': 'nrt', '百度': 'nrt', '华为': 'nrt',
            '小米': 'nrt', '京东': 'nrt', '美团': 'nrt', '字节跳动': 'nrt',
            '苹果': 'nrt',  # Apple (tech context)
            '三星': 'nrt', '谷歌': 'nrt', '微软': 'nrt', '亚马逊': 'nrt',
        }

        # Common nouns
        self.common_nouns = {
            # organizations
            '集团': 'n', '公司': 'n', '企业': 'n', '银行': 'n',
            '政府': 'n', '学校': 'n', '大学': 'n', '医院': 'n',
            # general
            '人': 'n', '人们': 'n', '时间': 'n', '地方': 'n',
            '问题': 'n', '情况': 'n', '工作': 'n', '生活': 'n',
            '发展': 'n', '经济': 'n', '社会': 'n', '文化': 'n',
            '技术': 'n', '产品': 'n', '服务': 'n', '市场': 'n',
            '国家': 'n', '世界': 'n', '城市': 'n', '地区': 'n',
            '消息': 'n', '新闻': 'n', '报道': 'n', '信息': 'n',
            # food
            '苹果': 'n',  # apple, the fruit (food context)
            '香蕉': 'n', '橘子': 'n', '西瓜': 'n',
            '米饭': 'n', '面条': 'n', '饺子': 'n',
        }

        # Verbs
        self.verbs = {
            '是': 'v', '有': 'v', '在': 'v', '说': 'v', '做': 'v',
            '去': 'v', '来': 'v', '看': 'v', '想': 'v', '知道': 'v',
            '发表': 'v', '宣布': 'v', '公布': 'v', '发布': 'v',
            '开始': 'v', '结束': 'v', '进行': 'v', '完成': 'v',
            '研究': 'v', '开发': 'v', '生产': 'v', '销售': 'v',
            '投资': 'v', '合作': 'v', '成立': 'v', '成为': 'v',
            '表示': 'v', '认为': 'v', '希望': 'v', '需要': 'v',
            '吃': 'v', '喝': 'v', '买': 'v', '卖': 'v',
        }

        # Adjectives
        self.adjectives = {
            '大': 'a', '小': 'a', '多': 'a', '少': 'a',
            '好': 'a', '新': 'a', '高': 'a', '低': 'a',
            '重要': 'a', '主要': 'a', '不同': 'a', '相同': 'a',
        }

        # Adverbs
        self.adverbs = {
            '不': 'd', '也': 'd', '就': 'd', '都': 'd', '还': 'd',
            '很': 'd', '最': 'd', '已经': 'd', '正在': 'd',
            '可能': 'd', '一定': 'd', '非常': 'd', '比较': 'd',
        }

        # Prepositions (coverbs)
        self.prepositions = {
            '在': 'p', '从': 'p', '向': 'p', '对': 'p', '把': 'p',
            '被': 'p', '比': 'p', '跟': 'p', '和': 'p', '与': 'p',
        }

        # Particles
        self.particles = {
            '的': 'u', '地': 'u', '得': 'u', '了': 'u', '着': 'u', '过': 'u',
            '吗': 'u', '呢': 'u', '吧': 'u', '啊': 'u',
        }

        # Pronouns
        self.pronouns = {
            '我': 'r', '你': 'r', '他': 'r', '她': 'r', '它': 'r',
            '我们': 'r', '你们': 'r', '他们': 'r',
            '这': 'r', '那': 'r', '这个': 'r', '那个': 'r',
            '什么': 'r', '谁': 'r', '哪': 'r', '怎么': 'r',
        }

        # Conjunctions
        self.conjunctions = {
            '和': 'c', '或': 'c', '但': 'c', '但是': 'c',
            '因为': 'c', '所以': 'c', '如果': 'c', '虽然': 'c',
        }

        # Numerals
        self.numerals = {
            '一': 'm', '二': 'm', '三': 'm', '四': 'm', '五': 'm',
            '六': 'm', '七': 'm', '八': 'm', '九': 'm', '十': 'm',
            '百': 'm', '千': 'm', '万': 'm', '亿': 'm',
            '两': 'm', '几': 'm', '多': 'm',
        }

        # Classifiers (measure words)
        self.classifiers = {
            '个': 'q', '年': 'q', '月': 'q', '日': 'q', '号': 'q',
            '次': 'q', '种': 'q', '件': 'q', '位': 'q', '家': 'q',
        }

        # Merge into the unified lookup table.  Later dictionaries win on
        # duplicate keys (e.g. '苹果' org -> noun, '在' verb -> preposition),
        # so the ordering below is deliberate.
        self._dictionary = {}
        for d in [self.places, self.organizations, self.common_nouns,
                  self.verbs, self.adjectives, self.adverbs,
                  self.prepositions, self.particles, self.pronouns,
                  self.conjunctions, self.numerals, self.classifiers]:
            for word, pos in d.items():
                self._dictionary[word] = (word, pos)

    def _build_domain_dictionaries(self) -> None:
        """Build per-domain sense overrides: word -> (lemma, POS)."""

        # FOOD domain
        self._domain_dictionaries[Domain.FOOD] = {
            '苹果': ('苹果', 'n'),  # apple, the fruit
            '小米': ('小米', 'n'),  # millet
            '银行': ('银行', 'n'),  # ordinary bank
        }

        # TECH domain
        self._domain_dictionaries[Domain.TECH] = {
            '苹果': ('苹果公司', 'nrt'),  # Apple Inc.
            '小米': ('小米科技', 'nrt'),  # Xiaomi
            '华为': ('华为技术', 'nrt'),
            '云': ('云计算', 'n'),
        }

        # FINANCE domain
        self._domain_dictionaries[Domain.FINANCE] = {
            '银行': ('银行', 'n'),
            '股票': ('股票', 'n'),
            '基金': ('基金', 'n'),
            '投资': ('投资', 'n'),
        }

        # ENTERTAINMENT domain
        self._domain_dictionaries[Domain.ENTERTAINMENT] = {
            '苹果': ('苹果', 'n'),  # plain apple
            '明星': ('明星', 'n'),
            '演员': ('演员', 'n'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Generate candidates via bidirectional maximum matching.

        Returns the forward and backward segmentations as two scored
        candidates.  The segmentation with fewer morphemes — the standard
        MaxMatch disambiguation heuristic — is ranked first; the other is
        penalized by 0.9.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        forward_result = AnalysisResult(
            morphemes=self._forward_max_match(text, domain),
            score=1.0,
            domain=domain,
        )
        backward_result = AnalysisResult(
            morphemes=self._backward_max_match(text, domain),
            score=0.95,
            domain=domain,
        )

        # Prefer the candidate with fewer morphemes (ties go to forward).
        if len(forward_result.morphemes) <= len(backward_result.morphemes):
            winner, loser = forward_result, backward_result
        else:
            winner, loser = backward_result, forward_result
        winner.score = self._score_analysis(winner)
        loser.score = self._score_analysis(loser) * 0.9
        return [winner, loser]

    def _forward_max_match(self, text: str, domain: Domain) -> List[Morpheme]:
        """Forward (left-to-right) maximum matching.

        Lookup priority per position: runtime user dictionary, then the
        domain dictionary, then the base dictionary; unmatched hanzi fall
        back to single characters tagged 'n'.
        """
        result = []
        pos = 0

        while pos < len(text):
            # Skip whitespace.
            if text[pos].isspace():
                pos += 1
                continue

            # Non-hanzi handling (code-switching support).
            if not self.HANZI_PATTERN.match(text[pos:pos+1]):
                # Latin run; user dictionary is keyed lowercase for Latin.
                match = self.LATIN_PATTERN.match(text[pos:])
                if match:
                    word = match.group()
                    if word.lower() in self._user_dictionary:
                        lemma, pos_tag, _ = self._user_dictionary[word.lower()]
                        result.append(Morpheme(word, lemma, pos_tag, pos, pos + len(word)))
                    else:
                        result.append(Morpheme(word, word, 'x', pos, pos + len(word)))
                    pos += len(word)
                    continue

                # Digit run, tagged as a numeral.
                match = self.NUMBER_PATTERN.match(text[pos:])
                if match:
                    word = match.group()
                    result.append(Morpheme(word, word, 'm', pos, pos + len(word)))
                    pos += len(word)
                    continue

                # Other symbol: emit one character.
                result.append(Morpheme(text[pos], text[pos], 'x', pos, pos + 1))
                pos += 1
                continue

            # Longest match, runtime dictionary first.
            matched = False

            # Runtime user dictionary.
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                if word in self._user_dictionary:
                    lemma, pos_tag, _ = self._user_dictionary[word]
                    result.append(Morpheme(word, lemma, pos_tag, pos, pos + length))
                    pos += length
                    matched = True
                    break
            if matched:
                continue

            # Domain dictionary.
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                sense = self._get_domain_sense(word, domain)
                if sense:
                    result.append(Morpheme(word, sense[0], sense[1], pos, pos + length))
                    pos += length
                    matched = True
                    break
            if matched:
                continue

            # Base dictionary.
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                if word in self._dictionary:
                    lemma, pos_tag = self._dictionary[word]
                    result.append(Morpheme(word, lemma, pos_tag, pos, pos + length))
                    pos += length
                    matched = True
                    break

            if not matched:
                # Out-of-vocabulary: emit one character, tagged 'n'.
                result.append(Morpheme(text[pos], text[pos], 'n', pos, pos + 1))
                pos += 1

        return result

    def _backward_max_match(self, text: str, domain: Domain) -> List[Morpheme]:
        """Backward (right-to-left) maximum matching.

        Fix: the domain dictionary is now consulted here too (it was
        previously checked only in the forward pass, so domain adaptation
        was silently lost whenever the backward candidate won).  The result
        is accumulated right-to-left and reversed once at the end instead
        of the original O(n^2) ``insert(0, ...)``.
        """
        rev = []  # morphemes in right-to-left order; reversed before return
        pos = len(text)

        while pos > 0:
            # Skip whitespace.
            if text[pos-1].isspace():
                pos -= 1
                continue

            # Non-hanzi: consume the contiguous run as a single 'x' token.
            # NOTE(review): unlike the forward pass this does not split
            # Latin/digit runs or tag digits 'm'; kept as-is to preserve
            # the original backward-candidate behavior.
            if not self.HANZI_PATTERN.match(text[pos-1:pos]):
                end = pos
                while pos > 0 and not self.HANZI_PATTERN.match(text[pos-1:pos]) and not text[pos-1].isspace():
                    pos -= 1
                if pos < end:
                    word = text[pos:end]
                    rev.append(Morpheme(word, word, 'x', pos, end))
                continue

            # Longest match ending at pos, runtime dictionary first.
            matched = False

            # Runtime user dictionary.
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                if word in self._user_dictionary:
                    lemma, pos_tag, _ = self._user_dictionary[word]
                    rev.append(Morpheme(word, lemma, pos_tag, pos - length, pos))
                    pos -= length
                    matched = True
                    break
            if matched:
                continue

            # Domain dictionary (mirrors the forward pass).
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                sense = self._get_domain_sense(word, domain)
                if sense:
                    rev.append(Morpheme(word, sense[0], sense[1], pos - length, pos))
                    pos -= length
                    matched = True
                    break
            if matched:
                continue

            # Base dictionary.
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                if word in self._dictionary:
                    lemma, pos_tag = self._dictionary[word]
                    rev.append(Morpheme(word, lemma, pos_tag, pos - length, pos))
                    pos -= length
                    matched = True
                    break

            if not matched:
                # Out-of-vocabulary: emit one character, tagged 'n'.
                rev.append(Morpheme(text[pos-1], text[pos-1], 'n', pos - 1, pos))
                pos -= 1

        rev.reverse()  # restore left-to-right order
        return rev

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Generate alternative analyses by re-running under other domains.

        The bidirectional pass already yields two candidates for the primary
        domain; this adds the top candidate from up to ``count`` other
        domains, each discounted by 0.85.
        """
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]

        for alt_domain in other_domains:
            candidates = self._generate_candidates(text, alt_domain)
            for c in candidates[:1]:
                c.score *= 0.85
                c.domain = alt_domain
                alternatives.append(c)

        return alternatives
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# Alias kept for backward compatibility with callers importing the older name.
ChineseAnalyzer = ChineseAdvancedAnalyzer
|