tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,425 @@
1
+ """
2
+ Chinese Advanced Morphological Analyzer
3
+ =======================================
4
+
5
+ 5가지 고급 기능을 지원하는 중국어 형태소 분석기
6
+
7
+ Features:
8
+ 1. NER Gazetteer Integration - 개체명 경계 보존
9
+ 2. Real-time Dictionary Extension - 런타임 사전 확장
10
+ 3. Domain Adaptation - 도메인별 분석 최적화
11
+ 4. Code-switching - 영중 혼용 텍스트 처리
12
+ 5. N-best Analysis - 다중 후보 + 신뢰도 점수
13
+
14
+ Algorithm: Bidirectional Maximum Matching
15
+ """
16
+
17
+ import re
18
+ from typing import List, Tuple, Dict, Set, Optional, Any
19
+
20
+ from .advanced_base import (
21
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
22
+ )
23
+
24
+
25
class ChineseAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Chinese advanced morphological analyzer.

    Segments Chinese text with bidirectional maximum matching (BMM) against
    three lookup layers, in priority order: the runtime user dictionary,
    the per-domain sense dictionaries, and the built-in base dictionary.
    The dictionaries and helpers (`_user_dictionary`, `_domain_dictionaries`,
    `_get_domain_sense`, `_score_analysis`) come from
    AdvancedMorphologicalAnalyzer.

    Usage:
        analyzer = ChineseAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("阿里巴巴集团在杭州宣布")

        # Named-entity boundary preservation
        analyzer.add_entity("阿里巴巴", "ORG")
        result = analyzer.analyze("阿里巴巴在北京", preserve_entities=True)

        # Domain adaptation
        result = analyzer.analyze("苹果很好吃", domain="food")   # apple (fruit)
        result = analyzer.analyze("苹果发布新品", domain="tech")  # Apple Inc.

        # N-best analysis
        result = analyzer.analyze("银行", n_best=3)
    """

    LANG_CODE = "zh"
    LANG_NAME = "Chinese"

    # Character-class patterns used to route spans that are not hanzi.
    HANZI_PATTERN = re.compile(r'[\u4e00-\u9fff]+')
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()
        self.max_word_len = 8  # longest dictionary entry considered by max-match

    def _build_base_dictionary(self):
        """Build the built-in word dictionary, grouped by POS class."""

        # =================================================================
        # Place names (POS 'ns')
        # =================================================================
        self.places = {
            '北京': 'ns', '上海': 'ns', '广州': 'ns', '深圳': 'ns',
            '杭州': 'ns', '南京': 'ns', '武汉': 'ns', '成都': 'ns',
            '西安': 'ns', '重庆': 'ns', '天津': 'ns', '苏州': 'ns',
            '香港': 'ns', '台湾': 'ns', '澳门': 'ns',
            '中国': 'ns', '美国': 'ns', '日本': 'ns', '韩国': 'ns',
            '英国': 'ns', '法国': 'ns', '德国': 'ns', '俄罗斯': 'ns',
        }

        # =================================================================
        # Organizations / companies (POS 'nrt')
        # =================================================================
        self.organizations = {
            '阿里巴巴': 'nrt', '腾讯': 'nrt', '百度': 'nrt', '华为': 'nrt',
            '小米': 'nrt', '京东': 'nrt', '美团': 'nrt', '字节跳动': 'nrt',
            '苹果': 'nrt',  # Apple Inc. (tech context)
            '三星': 'nrt', '谷歌': 'nrt', '微软': 'nrt', '亚马逊': 'nrt',
        }

        # =================================================================
        # Common nouns (POS 'n')
        # =================================================================
        self.common_nouns = {
            # organizations
            '集团': 'n', '公司': 'n', '企业': 'n', '银行': 'n',
            '政府': 'n', '学校': 'n', '大学': 'n', '医院': 'n',
            # general
            '人': 'n', '人们': 'n', '时间': 'n', '地方': 'n',
            '问题': 'n', '情况': 'n', '工作': 'n', '生活': 'n',
            '发展': 'n', '经济': 'n', '社会': 'n', '文化': 'n',
            '技术': 'n', '产品': 'n', '服务': 'n', '市场': 'n',
            '国家': 'n', '世界': 'n', '城市': 'n', '地区': 'n',
            '消息': 'n', '新闻': 'n', '报道': 'n', '信息': 'n',
            # food
            '苹果': 'n',  # apple, the fruit (food context)
            '香蕉': 'n', '橘子': 'n', '西瓜': 'n',
            '米饭': 'n', '面条': 'n', '饺子': 'n',
        }

        # =================================================================
        # Verbs (POS 'v')
        # =================================================================
        self.verbs = {
            '是': 'v', '有': 'v', '在': 'v', '说': 'v', '做': 'v',
            '去': 'v', '来': 'v', '看': 'v', '想': 'v', '知道': 'v',
            '发表': 'v', '宣布': 'v', '公布': 'v', '发布': 'v',
            '开始': 'v', '结束': 'v', '进行': 'v', '完成': 'v',
            '研究': 'v', '开发': 'v', '生产': 'v', '销售': 'v',
            '投资': 'v', '合作': 'v', '成立': 'v', '成为': 'v',
            '表示': 'v', '认为': 'v', '希望': 'v', '需要': 'v',
            '吃': 'v', '喝': 'v', '买': 'v', '卖': 'v',
        }

        # =================================================================
        # Adjectives (POS 'a')
        # =================================================================
        self.adjectives = {
            '大': 'a', '小': 'a', '多': 'a', '少': 'a',
            '好': 'a', '新': 'a', '高': 'a', '低': 'a',
            '重要': 'a', '主要': 'a', '不同': 'a', '相同': 'a',
        }

        # =================================================================
        # Adverbs (POS 'd')
        # =================================================================
        self.adverbs = {
            '不': 'd', '也': 'd', '就': 'd', '都': 'd', '还': 'd',
            '很': 'd', '最': 'd', '已经': 'd', '正在': 'd',
            '可能': 'd', '一定': 'd', '非常': 'd', '比较': 'd',
        }

        # =================================================================
        # Prepositions / coverbs (POS 'p')
        # =================================================================
        self.prepositions = {
            '在': 'p', '从': 'p', '向': 'p', '对': 'p', '把': 'p',
            '被': 'p', '比': 'p', '跟': 'p', '和': 'p', '与': 'p',
        }

        # =================================================================
        # Particles (POS 'u')
        # =================================================================
        self.particles = {
            '的': 'u', '地': 'u', '得': 'u', '了': 'u', '着': 'u', '过': 'u',
            '吗': 'u', '呢': 'u', '吧': 'u', '啊': 'u',
        }

        # =================================================================
        # Pronouns (POS 'r')
        # =================================================================
        self.pronouns = {
            '我': 'r', '你': 'r', '他': 'r', '她': 'r', '它': 'r',
            '我们': 'r', '你们': 'r', '他们': 'r',
            '这': 'r', '那': 'r', '这个': 'r', '那个': 'r',
            '什么': 'r', '谁': 'r', '哪': 'r', '怎么': 'r',
        }

        # =================================================================
        # Conjunctions (POS 'c')
        # =================================================================
        self.conjunctions = {
            '和': 'c', '或': 'c', '但': 'c', '但是': 'c',
            '因为': 'c', '所以': 'c', '如果': 'c', '虽然': 'c',
        }

        # =================================================================
        # Numerals (POS 'm')
        # =================================================================
        self.numerals = {
            '一': 'm', '二': 'm', '三': 'm', '四': 'm', '五': 'm',
            '六': 'm', '七': 'm', '八': 'm', '九': 'm', '十': 'm',
            '百': 'm', '千': 'm', '万': 'm', '亿': 'm',
            '两': 'm', '几': 'm', '多': 'm',
        }

        # =================================================================
        # Measure words / classifiers (POS 'q')
        # =================================================================
        self.classifiers = {
            '个': 'q', '年': 'q', '月': 'q', '日': 'q', '号': 'q',
            '次': 'q', '种': 'q', '件': 'q', '位': 'q', '家': 'q',
        }

        # Merge every group into one lookup table: word -> (lemma, pos).
        # Later groups win on duplicate keys (e.g. '苹果' and '小米' end up
        # with their common-noun senses; domain dictionaries re-disambiguate).
        self._dictionary = {}
        for d in [self.places, self.organizations, self.common_nouns,
                  self.verbs, self.adjectives, self.adverbs,
                  self.prepositions, self.particles, self.pronouns,
                  self.conjunctions, self.numerals, self.classifiers]:
            for word, pos in d.items():
                self._dictionary[word] = (word, pos)

    def _build_domain_dictionaries(self):
        """Register domain-specific word senses: word -> (lemma, pos)."""

        # FOOD domain
        self._domain_dictionaries[Domain.FOOD] = {
            '苹果': ('苹果', 'n'),  # apple, the fruit
            '小米': ('小米', 'n'),  # millet
            '银行': ('银行', 'n'),  # ordinary bank
        }

        # TECH domain
        self._domain_dictionaries[Domain.TECH] = {
            '苹果': ('苹果公司', 'nrt'),  # Apple Inc.
            '小米': ('小米科技', 'nrt'),  # Xiaomi
            '华为': ('华为技术', 'nrt'),
            '云': ('云计算', 'n'),
        }

        # FINANCE domain
        self._domain_dictionaries[Domain.FINANCE] = {
            '银行': ('银行', 'n'),
            '股票': ('股票', 'n'),
            '基金': ('基金', 'n'),
            '投资': ('投资', 'n'),
        }

        # ENTERTAINMENT domain
        self._domain_dictionaries[Domain.ENTERTAINMENT] = {
            '苹果': ('苹果', 'n'),  # plain apple
            '明星': ('明星', 'n'),
            '演员': ('演员', 'n'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Generate ranked analysis candidates via bidirectional max-match.

        Runs both forward and backward maximum matching, scores each with
        `_score_analysis`, and penalizes (x0.9) the pass that produced more
        morphemes — the classic BMM heuristic that fewer segments is better.
        Returns the candidates best-first.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        candidates = []

        # Forward pass
        forward_morphemes = self._forward_max_match(text, domain)
        forward_result = AnalysisResult(
            morphemes=forward_morphemes,
            score=1.0,
            domain=domain
        )

        # Backward pass
        backward_morphemes = self._backward_max_match(text, domain)
        backward_result = AnalysisResult(
            morphemes=backward_morphemes,
            score=0.95,
            domain=domain
        )

        # Prefer the segmentation with fewer morphemes (BMM heuristic).
        if len(forward_morphemes) <= len(backward_morphemes):
            forward_result.score = self._score_analysis(forward_result)
            candidates.append(forward_result)
            backward_result.score = self._score_analysis(backward_result) * 0.9
            candidates.append(backward_result)
        else:
            backward_result.score = self._score_analysis(backward_result)
            candidates.append(backward_result)
            forward_result.score = self._score_analysis(forward_result) * 0.9
            candidates.append(forward_result)

        return candidates

    def _forward_max_match(self, text: str, domain: Domain) -> List[Morpheme]:
        """Forward (left-to-right) maximum matching.

        Lookup priority per position: runtime user dictionary, then domain
        senses, then the base dictionary, longest match first. Latin runs,
        digit runs, and other symbols are emitted as single tokens; OOV
        hanzi fall back to single characters tagged 'n'.
        """
        result = []
        pos = 0

        while pos < len(text):
            # Skip whitespace
            if text[pos].isspace():
                pos += 1
                continue

            # Non-hanzi handling
            if not self.HANZI_PATTERN.match(text[pos:pos+1]):
                # Latin run (code-switching); user dictionary is checked
                # case-insensitively for e.g. product names.
                match = self.LATIN_PATTERN.match(text[pos:])
                if match:
                    word = match.group()
                    if word.lower() in self._user_dictionary:
                        lemma, pos_tag, _ = self._user_dictionary[word.lower()]
                        result.append(Morpheme(word, lemma, pos_tag, pos, pos + len(word)))
                    else:
                        result.append(Morpheme(word, word, 'x', pos, pos + len(word)))
                    pos += len(word)
                    continue

                # Digit run
                match = self.NUMBER_PATTERN.match(text[pos:])
                if match:
                    word = match.group()
                    result.append(Morpheme(word, word, 'm', pos, pos + len(word)))
                    pos += len(word)
                    continue

                # Any other symbol: single character
                result.append(Morpheme(text[pos], text[pos], 'x', pos, pos + 1))
                pos += 1
                continue

            # Longest match, user dictionary first
            matched = False

            # Runtime user dictionary
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                if word in self._user_dictionary:
                    lemma, pos_tag, _ = self._user_dictionary[word]
                    result.append(Morpheme(word, lemma, pos_tag, pos, pos + length))
                    pos += length
                    matched = True
                    break

            if matched:
                continue

            # Domain-specific senses
            domain_sense = None
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                domain_sense = self._get_domain_sense(word, domain)
                if domain_sense:
                    result.append(Morpheme(word, domain_sense[0], domain_sense[1], pos, pos + length))
                    pos += length
                    matched = True
                    break

            if matched:
                continue

            # Base dictionary
            for length in range(min(self.max_word_len, len(text) - pos), 0, -1):
                word = text[pos:pos+length]
                if word in self._dictionary:
                    lemma, pos_tag = self._dictionary[word]
                    result.append(Morpheme(word, lemma, pos_tag, pos, pos + length))
                    pos += length
                    matched = True
                    break

            if not matched:
                # OOV hanzi: emit one character at a time, tagged 'n'.
                result.append(Morpheme(text[pos], text[pos], 'n', pos, pos + 1))
                pos += 1

        return result

    def _backward_max_match(self, text: str, domain: Domain) -> List[Morpheme]:
        """Backward (right-to-left) maximum matching.

        Mirrors `_forward_max_match`'s lookup order — user dictionary,
        domain senses, base dictionary, longest match first — so the two
        BMM candidates differ only in segmentation, not in which
        dictionaries they consult.
        """
        result = []
        pos = len(text)

        while pos > 0:
            # Skip whitespace
            if text[pos-1].isspace():
                pos -= 1
                continue

            # Non-hanzi: consume the maximal contiguous non-hanzi run.
            if not self.HANZI_PATTERN.match(text[pos-1:pos]):
                end = pos
                while pos > 0 and not self.HANZI_PATTERN.match(text[pos-1:pos]) and not text[pos-1].isspace():
                    pos -= 1
                if pos < end:
                    word = text[pos:end]
                    # Bug fix: keep POS consistent with the forward pass —
                    # user-dict entries win (case-insensitive, as forward
                    # does for Latin runs) and pure digit runs are numerals
                    # ('m'), not generic symbols ('x').
                    if word.lower() in self._user_dictionary:
                        lemma, pos_tag, _ = self._user_dictionary[word.lower()]
                        result.insert(0, Morpheme(word, lemma, pos_tag, pos, end))
                    elif word.isdigit():
                        result.insert(0, Morpheme(word, word, 'm', pos, end))
                    else:
                        result.insert(0, Morpheme(word, word, 'x', pos, end))
                continue

            # Longest match ending at `pos`
            matched = False

            # Runtime user dictionary
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                if word in self._user_dictionary:
                    lemma, pos_tag, _ = self._user_dictionary[word]
                    result.insert(0, Morpheme(word, lemma, pos_tag, pos - length, pos))
                    pos -= length
                    matched = True
                    break

            if matched:
                continue

            # Domain-specific senses — bug fix: the forward pass consulted
            # the domain dictionary but the backward pass did not, so domain
            # adaptation only influenced half of the BMM candidates.
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                domain_sense = self._get_domain_sense(word, domain)
                if domain_sense:
                    result.insert(0, Morpheme(word, domain_sense[0], domain_sense[1], pos - length, pos))
                    pos -= length
                    matched = True
                    break

            if matched:
                continue

            # Base dictionary
            for length in range(min(self.max_word_len, pos), 0, -1):
                word = text[pos-length:pos]
                if word in self._dictionary:
                    lemma, pos_tag = self._dictionary[word]
                    result.insert(0, Morpheme(word, lemma, pos_tag, pos - length, pos))
                    pos -= length
                    matched = True
                    break

            if not matched:
                # OOV hanzi: single character tagged 'n'.
                result.insert(0, Morpheme(text[pos-1], text[pos-1], 'n', pos - 1, pos))
                pos -= 1

        return result

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Generate alternative analyses by re-running under other domains.

        The bidirectional pass already yields two same-domain candidates;
        here each other domain contributes its single best candidate, with
        its score discounted (x0.85) relative to the requested domain.
        """
        alternatives = []

        # Analyze under up to `count` other domains.
        other_domains = [d for d in Domain if d != domain][:count]

        for alt_domain in other_domains:
            candidates = self._generate_candidates(text, alt_domain)
            for c in candidates[:1]:
                c.score *= 0.85
                c.domain = alt_domain
                alternatives.append(c)

        return alternatives
422
+
423
+
424
# Alias for backward compatibility: older code imports `ChineseAnalyzer`.
ChineseAnalyzer = ChineseAdvancedAnalyzer