tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,472 @@
+ """
+ Advanced Morphological Analyzer Base
+ =====================================
+
+ Base class for morphological analyzers supporting five advanced features.
+
+ Features:
+ 1. NER Gazetteer Integration - preserves named-entity boundaries
+ 2. Real-time Dictionary Extension - extends the dictionary at runtime
+ 3. Domain Adaptation - domain-specific analysis
+ 4. Code-switching - handles mixed-language text
+ 5. N-best Analysis - multiple candidates with confidence scores
+ """
+
+ import re
+ from abc import ABC, abstractmethod
+ from typing import List, Dict, Tuple, Optional, Set, Any
+ from dataclasses import dataclass, field
+ from enum import Enum
+
+
+ class Domain(Enum):
+     """Analysis domain."""
+     GENERAL = "general"
+     TECH = "tech"
+     FOOD = "food"
+     SPORTS = "sports"
+     MEDICAL = "medical"
+     LEGAL = "legal"
+     FINANCE = "finance"
+     ENTERTAINMENT = "entertainment"
+     NEWS = "news"
+     SNS = "sns"
+
+
+ @dataclass
+ class Morpheme:
+     """A single morpheme."""
+     surface: str  # surface form
+     lemma: str  # base form
+     pos: str  # part of speech
+     start: int  # start offset
+     end: int  # end offset
+     score: float = 1.0  # confidence score
+     features: Dict[str, Any] = field(default_factory=dict)
+
+     def __repr__(self):
+         return f"{self.surface}/{self.pos}"
+
+
+ @dataclass
+ class AnalysisResult:
+     """A single analysis result."""
+     morphemes: List[Morpheme]
+     score: float = 1.0
+     domain: Domain = Domain.GENERAL
+     detected_languages: Set[str] = field(default_factory=set)
+
+     def __repr__(self):
+         return " + ".join(str(m) for m in self.morphemes)
+
+
+ @dataclass
+ class NBestResult:
+     """N-best analysis results."""
+     results: List[AnalysisResult]
+
+     @property
+     def best(self) -> AnalysisResult:
+         return self.results[0] if self.results else AnalysisResult([])
+
+     def __iter__(self):
+         return iter(self.results)
+
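A quick illustrative sketch (not part of the package diff) of how the data classes above behave:

    m = Morpheme(surface="cats", lemma="cat", pos="NNS", start=0, end=4)
    repr(m)                    # 'cats/NNS' - Morpheme renders as surface/POS
    repr(AnalysisResult([m]))  # 'cats/NNS' - joins its morphemes with ' + '
    NBestResult([]).best       # falls back to an empty AnalysisResult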
+
+ class AdvancedMorphologicalAnalyzer(ABC):
+     """
+     Base class for advanced morphological analyzers.
+
+     Every language-specific analyzer inherits from this class.
+     """
+
+     # Overridden by subclasses
+     LANG_CODE: str = ""
+     LANG_NAME: str = ""
+
+     def __init__(self):
+         # Base dictionary
+         self._dictionary: Dict[str, Tuple[str, str]] = {}  # word -> (lemma, pos)
+
+         # Runtime user dictionary
+         self._user_dictionary: Dict[str, Tuple[str, str, Optional[str]]] = {}  # word -> (lemma, pos, domain)
+
+         # Named-entity gazetteer (populated externally)
+         self._gazetteer: Set[str] = set()
+         self._gazetteer_entities: Dict[str, str] = {}  # entity -> type (PER, ORG, LOC, etc.)
+
+         # Per-domain dictionaries
+         self._domain_dictionaries: Dict[Domain, Dict[str, Tuple[str, str]]] = {
+             domain: {} for domain in Domain
+         }
+
+         # Initialization
+         self._build_base_dictionary()
+         self._build_domain_dictionaries()
+
+     @abstractmethod
+     def _build_base_dictionary(self):
+         """Build the base dictionary - implemented by subclasses."""
+         pass
+
+     def _build_domain_dictionaries(self):
+         """Build domain dictionaries - subclasses may override."""
+         pass
+
+     # =========================================================================
+     # Feature 1: NER Gazetteer Integration
+     # =========================================================================
+
+     def load_gazetteer(self, gazetteer: Set[str], entity_types: Optional[Dict[str, str]] = None):
+         """
+         Load a named-entity gazetteer.
+
+         Args:
+             gazetteer: set of entity strings
+             entity_types: entity -> type mapping (PER, ORG, LOC, etc.)
+         """
+         self._gazetteer = gazetteer
+         if entity_types:
+             self._gazetteer_entities = entity_types
+
+     def add_entity(self, entity: str, entity_type: str = "NNP"):
+         """Add a single named entity."""
+         self._gazetteer.add(entity)
+         self._gazetteer_entities[entity] = entity_type
+
+     def _find_entities_in_text(self, text: str) -> List[Tuple[int, int, str, str]]:
+         """
+         Find named-entity spans in the text.
+
+         Returns:
+             List of (start, end, entity, entity_type)
+         """
+         entities = []
+
+         # Match longer entities first (greedy)
+         sorted_gazetteer = sorted(self._gazetteer, key=len, reverse=True)
+
+         used_positions = set()
+
+         for entity in sorted_gazetteer:
+             start = 0
+             while True:
+                 idx = text.find(entity, start)
+                 if idx == -1:
+                     break
+
+                 # Skip spans that overlap an already-claimed position
+                 positions = set(range(idx, idx + len(entity)))
+                 if not positions & used_positions:
+                     entity_type = self._gazetteer_entities.get(entity, "NNP")
+                     entities.append((idx, idx + len(entity), entity, entity_type))
+                     used_positions |= positions
+
+                 start = idx + 1
+
+         # Sort by position
+         entities.sort(key=lambda x: x[0])
+         return entities
+
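A minimal sketch of the gazetteer flow. `_DemoAnalyzer` is a hypothetical subclass invented here for illustration (whitespace segmentation, empty base dictionary); longer entities claim their spans first, so an overlapping shorter entry loses:

    from tokmor.morphology.advanced_base import (
        AdvancedMorphologicalAnalyzer, AnalysisResult, Domain, Morpheme,
    )

    class _DemoAnalyzer(AdvancedMorphologicalAnalyzer):
        """Hypothetical demo subclass: naive whitespace segmentation."""

        def _build_base_dictionary(self):
            self._dictionary = {}  # no built-in vocabulary

        def _generate_candidates(self, text, domain):
            # Single candidate: split on whitespace, tag everything UNK.
            morphemes, offset = [], 0
            for tok in text.split():
                start = text.index(tok, offset)
                morphemes.append(Morpheme(tok, tok, "UNK", start, start + len(tok)))
                offset = start + len(tok)
            return [AnalysisResult(morphemes, domain=domain)]

    an = _DemoAnalyzer()
    an.load_gazetteer({"New Jeans", "New"}, {"New Jeans": "ORG"})
    an._find_entities_in_text("New Jeans debuted")
    # -> [(0, 9, 'New Jeans', 'ORG')]; "New" is skipped, its span is taken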
+     # =========================================================================
+     # Feature 2: Real-time Dictionary Extension
+     # =========================================================================
+
+     def add_word(self, word: str, pos: str, lemma: Optional[str] = None,
+                  domain: Optional[str] = None):
+         """
+         Extend the dictionary at runtime.
+
+         Args:
+             word: the word to add
+             pos: part-of-speech tag
+             lemma: base form (defaults to word)
+             domain: domain name (omit to apply across all domains)
+         """
+         lemma = lemma or word
+         self._user_dictionary[word] = (lemma, pos, domain)
+
+         # Also add to the matching domain dictionary
+         if domain:
+             try:
+                 dom = Domain(domain)
+                 self._domain_dictionaries[dom][word] = (lemma, pos)
+             except ValueError:
+                 pass
+
+     def add_words(self, words: List[Dict[str, str]]):
+         """
+         Add multiple words at once.
+
+         Args:
+             words: [{"word": "뉴진스", "pos": "NNP", "domain": "entertainment"}, ...]
+         """
+         for w in words:
+             self.add_word(
+                 word=w["word"],
+                 pos=w["pos"],
+                 lemma=w.get("lemma"),
+                 domain=w.get("domain")
+             )
+
+     def remove_word(self, word: str):
+         """Remove a word from the user and domain dictionaries."""
+         self._user_dictionary.pop(word, None)
+         for dom_dict in self._domain_dictionaries.values():
+             dom_dict.pop(word, None)
+
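Runtime dictionary extension, continuing the `_DemoAnalyzer` sketch above: entries land in the user dictionary and, when the domain string names a `Domain` member, in that domain's dictionary as well.

    an = _DemoAnalyzer()
    an.add_word("NewJeans", pos="NNP", domain="entertainment")
    an.add_words([{"word": "tokamak", "pos": "NN", "domain": "tech"}])
    an._user_dictionary["NewJeans"]                  # ('NewJeans', 'NNP', 'entertainment')
    an._domain_dictionaries[Domain.TECH]["tokamak"]  # ('tokamak', 'NN')
    an.remove_word("NewJeans")   # removed from user and domain dictionaries alike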
+     # =========================================================================
+     # Feature 3: Domain Adaptation
+     # =========================================================================
+
+     def set_domain_words(self, domain: Domain, words: Dict[str, Tuple[str, str]]):
+         """
+         Set the dictionary for a domain.
+
+         Args:
+             domain: the target domain
+             words: {word: (lemma, pos), ...}
+         """
+         self._domain_dictionaries[domain] = words
+
+     def _get_domain_sense(self, word: str, domain: Domain) -> Optional[Tuple[str, str]]:
+         """Look up a word's domain-specific sense."""
+         if domain in self._domain_dictionaries:
+             return self._domain_dictionaries[domain].get(word)
+         return None
+
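Domain adaptation on the same sketch: `_get_domain_sense` consults only the requested domain's table, with no cross-domain fallback.

    an.set_domain_words(Domain.MEDICAL, {"stat": ("stat", "ADV")})
    an._get_domain_sense("stat", Domain.MEDICAL)   # ('stat', 'ADV')
    an._get_domain_sense("stat", Domain.GENERAL)   # None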
+     # =========================================================================
+     # Feature 4: Code-switching Detection
+     # =========================================================================
+
+     # Per-script character patterns (Unicode ranges)
+     SCRIPT_PATTERNS = {
+         'ko': re.compile(r'[\uac00-\ud7af\u1100-\u11ff\u3130-\u318f]+'),  # Hangul
+         'ja': re.compile(r'[\u3040-\u309f\u30a0-\u30ff]+'),  # Hiragana/Katakana
+         'zh': re.compile(r'[\u4e00-\u9fff]+'),  # Han (CJK ideographs)
+         'ar': re.compile(r'[\u0600-\u06ff\u0750-\u077f]+'),  # Arabic
+         'hi': re.compile(r'[\u0900-\u097f]+'),  # Devanagari
+         'ru': re.compile(r'[\u0400-\u04ff]+'),  # Cyrillic
+         'th': re.compile(r'[\u0e00-\u0e7f]+'),  # Thai
+         'he': re.compile(r'[\u0590-\u05ff]+'),  # Hebrew
+         'latin': re.compile(r'[a-zA-ZÀ-ÿ]+'),  # Latin
+     }
+
+     def detect_languages(self, text: str) -> Set[str]:
+         """
+         Detect the languages (scripts) used in the text.
+
+         Returns:
+             Set of detected language codes
+         """
+         detected = set()
+
+         for lang, pattern in self.SCRIPT_PATTERNS.items():
+             if pattern.search(text):
+                 detected.add(lang)
+
+         return detected
+
+     def _split_by_language(self, text: str) -> List[Tuple[str, str, int, int]]:
+         """
+         Split the text into per-language segments.
+
+         Returns:
+             List of (segment, language, start, end)
+         """
+         segments = []
+
+         # Collect matches from every script pattern
+         all_matches = []
+         for lang, pattern in self.SCRIPT_PATTERNS.items():
+             for match in pattern.finditer(text):
+                 all_matches.append((match.start(), match.end(), match.group(), lang))
+
+         # Digit runs
+         for match in re.finditer(r'[0-9]+', text):
+             all_matches.append((match.start(), match.end(), match.group(), 'num'))
+
+         # Sort by position
+         all_matches.sort(key=lambda x: x[0])
+
+         # Keep non-overlapping segments
+         used = set()
+         for start, end, segment, lang in all_matches:
+             positions = set(range(start, end))
+             if not positions & used:
+                 segments.append((segment, lang, start, end))
+                 used |= positions
+
+         segments.sort(key=lambda x: x[2])
+         return segments
+
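Code-switching detection, continuing the sketch: each script pattern reports independently, and digit runs only surface as 'num' segments during splitting.

    an.detect_languages("Hello 안녕 123")
    # {'ko', 'latin'} (a set, so unordered; digits are not a script)
    an._split_by_language("Hello 안녕 123")
    # [('Hello', 'latin', 0, 5), ('안녕', 'ko', 6, 8), ('123', 'num', 9, 12)]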
+     # =========================================================================
+     # Feature 5: N-best Analysis
+     # =========================================================================
+
+     @abstractmethod
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         """
+         Generate analysis candidates - implemented by subclasses.
+
+         Returns:
+             Analysis results sorted by score
+         """
+         pass
+
+     def _score_analysis(self, result: AnalysisResult) -> float:
+         """
+         Score an analysis result.
+
+         Default heuristics:
+         - fewer morphemes is better
+         - more known words is better
+         - bonus for a domain match
+         """
+         if not result.morphemes:
+             return 0.0
+
+         # Base score
+         score = 1.0
+
+         # Penalty per morpheme (fewer is better)
+         score -= len(result.morphemes) * 0.01
+
+         # Bonus for known words
+         known_count = sum(
+             1 for m in result.morphemes
+             if m.surface in self._dictionary or m.surface in self._user_dictionary
+         )
+         score += known_count * 0.05
+
+         return max(0.0, min(1.0, score))
+
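The default scoring heuristic worked through on the sketch: score = 1.0 - 0.01 * morpheme_count + 0.05 * known_words, clamped to [0, 1].

    r = AnalysisResult([Morpheme("hi", "hi", "UH", 0, 2)])
    an._dictionary["hi"] = ("hi", "UH")  # make the word "known"
    an._score_analysis(r)   # 1.0 - 0.01 + 0.05 = 1.04 -> clamped to 1.0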
+     # =========================================================================
+     # Main Analysis Methods
+     # =========================================================================
+
+     def analyze(self, text: str,
+                 preserve_entities: bool = True,
+                 domain: Optional[str] = None,
+                 n_best: int = 1) -> NBestResult:
+         """
+         Morphological analysis with the advanced features integrated.
+
+         Args:
+             text: input text
+             preserve_entities: whether to preserve named-entity boundaries
+             domain: analysis domain
+             n_best: number of candidates to return
+
+         Returns:
+             NBestResult: the n-best analyses
+         """
+         if not text or not text.strip():
+             return NBestResult([AnalysisResult([])])
+
+         # Resolve the domain (raises ValueError for an unknown domain string)
+         dom = Domain(domain) if domain else Domain.GENERAL
+
+         # Detect languages (code-switching)
+         detected_langs = self.detect_languages(text)
+
+         # Locate named entities
+         entity_spans = []
+         if preserve_entities and self._gazetteer:
+             entity_spans = self._find_entities_in_text(text)
+
+         # Segment the text, preserving entity boundaries
+         segments = self._segment_text(text, entity_spans)
+
+         # Analyze each segment
+         all_morphemes = []
+         for seg_text, seg_start, is_entity, entity_type in segments:
+             if is_entity:
+                 # Keep the entity intact as a single morpheme
+                 all_morphemes.append(Morpheme(
+                     surface=seg_text,
+                     lemma=seg_text,
+                     pos=entity_type,
+                     start=seg_start,
+                     end=seg_start + len(seg_text),
+                     score=1.0,
+                     features={"is_entity": True}
+                 ))
+             else:
+                 # Analyze ordinary text
+                 segment_results = self._generate_candidates(seg_text, dom)
+                 if segment_results:
+                     # Shift offsets back into the original text
+                     for m in segment_results[0].morphemes:
+                         m.start += seg_start
+                         m.end += seg_start
+                     all_morphemes.extend(segment_results[0].morphemes)
+
+         # Final result
+         main_result = AnalysisResult(
+             morphemes=all_morphemes,
+             score=self._score_analysis(AnalysisResult(all_morphemes)),
+             domain=dom,
+             detected_languages=detected_langs
+         )
+
+         # N-best list (currently a single result)
+         results = [main_result]
+
+         # Generate additional candidates when n_best > 1
+         if n_best > 1:
+             additional = self._generate_alternatives(text, dom, n_best - 1)
+             results.extend(additional)
+
+         return NBestResult(results[:n_best])
+
+     def _segment_text(self, text: str, entity_spans: List[Tuple[int, int, str, str]]) -> List[Tuple[str, int, bool, str]]:
+         """
+         Split the text around entity spans.
+
+         Returns:
+             List of (segment_text, start_offset, is_entity, entity_type)
+         """
+         if not entity_spans:
+             return [(text, 0, False, "")]
+
+         segments = []
+         prev_end = 0
+
+         for start, end, entity, entity_type in entity_spans:
+             # Text before the entity
+             if start > prev_end:
+                 segments.append((text[prev_end:start], prev_end, False, ""))
+
+             # The entity itself
+             segments.append((entity, start, True, entity_type))
+             prev_end = end
+
+         # Text after the last entity
+         if prev_end < len(text):
+             segments.append((text[prev_end:], prev_end, False, ""))
+
+         return segments
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         """Generate alternative analyses - subclasses may override."""
+         return []
+
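End to end with the hypothetical `_DemoAnalyzer`: the gazetteer entry survives `analyze` as a single morpheme while the surrounding text is segmented normally.

    an = _DemoAnalyzer()
    an.add_entity("New Jeans", "ORG")
    nbest = an.analyze("fans love New Jeans", preserve_entities=True)
    print(nbest.best)                   # fans/UNK + love/UNK + New Jeans/ORG
    nbest.best.morphemes[-1].features   # {'is_entity': True}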
+     # =========================================================================
+     # Convenience Methods
+     # =========================================================================
+
+     def tokenize(self, text: str) -> List[str]:
+         """Convenience tokenization."""
+         result = self.analyze(text, preserve_entities=False, n_best=1)
+         return [m.surface for m in result.best.morphemes]
+
+     def pos_tag(self, text: str) -> List[Tuple[str, str]]:
+         """Part-of-speech tagging."""
+         result = self.analyze(text, preserve_entities=False, n_best=1)
+         return [(m.surface, m.pos) for m in result.best.morphemes]
+
+     def lemmatize(self, text: str) -> List[str]:
+         """Lemma extraction."""
+         result = self.analyze(text, preserve_entities=False, n_best=1)
+         return [m.lemma for m in result.best.morphemes]
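The convenience wrappers on the same sketch; each routes through `analyze` with entity preservation off.

    an.tokenize("fans love music")    # ['fans', 'love', 'music']
    an.pos_tag("fans love music")     # [('fans', 'UNK'), ('love', 'UNK'), ('music', 'UNK')]
    an.lemmatize("fans love music")   # ['fans', 'love', 'music']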