tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,603 @@
1
+ """
2
+ Korean Advanced Morphological Analyzer
3
+ ======================================
4
+
5
+ 5가지 고급 기능을 지원하는 한국어 형태소 분석기
6
+
7
+ Features:
8
+ 1. NER Gazetteer Integration - 개체명 경계 보존
9
+ 2. Real-time Dictionary Extension - 런타임 사전 확장
10
+ 3. Domain Adaptation - 도메인별 분석 최적화
11
+ 4. Code-switching - 영한 혼용 텍스트 처리
12
+ 5. N-best Analysis - 다중 후보 + 신뢰도 점수
13
+ """
14
+
15
+ import re
16
+ import json
17
+ from pathlib import Path
18
+ from typing import List, Tuple, Dict, Set, Optional, Any
19
+ from dataclasses import dataclass
20
+
21
+ from .advanced_base import (
22
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
23
+ )
24
+
25
# Packaged-resource helper; resolves the extended-dictionary base path.
from .. import resources

# Optional external asset dir (default: none). If you want extended dictionaries,
# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
DICT_DIR = resources.data_dir() / "extended_dict"
31
+
32
+
33
class KoreanAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Korean advanced morphological analyzer.

    Usage:
        analyzer = KoreanAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("삼성전자가 서울에서 발표했다")

        # Named-entity boundary preservation
        analyzer.add_entity("삼성전자", "ORG")
        result = analyzer.analyze("삼성전자가 발표했다", preserve_entities=True)

        # Domain adaptation (ambiguous '배': fruit vs. body part)
        result = analyzer.analyze("배가 아파요", domain="food")     # fruit (pear)
        result = analyzer.analyze("배가 아파요", domain="medical")  # body part

        # Runtime dictionary extension
        analyzer.add_word("뉴진스", pos="NNP", domain="entertainment")

        # N-best analysis
        result = analyzer.analyze("사과", n_best=3)
    """

    # ISO 639-1 code / English name, read by the factory/routing layers.
    LANG_CODE = "ko"
    LANG_NAME = "Korean"
60
+
61
    def __init__(self):
        # Hangul jamo tables following the Unicode syllable decomposition:
        # initial consonants (choseong), medial vowels (jungseong), and final
        # consonants (jongseong; the leading space means "no final").
        self.CHO = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
        self.JUNG = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ'
        self.JONG = ' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ'

        # Must come last: the base initializer is the hook that invokes
        # _build_base_dictionary()/_build_domain_dictionaries().
        super().__init__()
68
+
69
    def _build_base_dictionary(self):
        """Build the built-in base dictionaries.

        Populates particle (josa), ending (eomi), pre-final ending, suffix,
        noun, conjugated-verb-form, verb-stem and adverb tables, then merges
        the optional external extended dictionary on top.
        POS tags follow the Sejong-style tagset (JKS, JKO, EF, EC, NNG, ...).
        """

        # =================================================================
        # Particle (josa) dictionary
        # =================================================================
        self.josa = {
            # Case particles
            '이': 'JKS', '가': 'JKS', '께서': 'JKS',
            '을': 'JKO', '를': 'JKO',
            '의': 'JKG',
            '에': 'JKB', '에서': 'JKB', '에게': 'JKB', '한테': 'JKB',
            '로': 'JKB', '으로': 'JKB', '에게로': 'JKB',
            '와': 'JKB', '과': 'JKB', '랑': 'JKB', '이랑': 'JKB',
            '보다': 'JKB', '처럼': 'JKB', '같이': 'JKB', '만큼': 'JKB',
            # Auxiliary particles
            '은': 'JX', '는': 'JX', '도': 'JX', '만': 'JX', '까지': 'JX',
            '부터': 'JX', '마저': 'JX', '조차': 'JX', '밖에': 'JX',
            '라도': 'JX', '이라도': 'JX', '나': 'JX', '이나': 'JX',
            '든지': 'JX', '이든지': 'JX', '야': 'JX', '이야': 'JX',
            # Conjunctive particles
            '하고': 'JC', '이며': 'JC', '며': 'JC',
        }

        # =================================================================
        # Ending (eomi) dictionary
        # =================================================================
        self.eomi = {
            # Sentence-final endings
            '다': 'EF', 'ㄴ다': 'EF', '는다': 'EF', '니다': 'EF', '습니다': 'EF',
            '어요': 'EF', '아요': 'EF', '여요': 'EF', '해요': 'EF',
            '어': 'EF', '아': 'EF', '지': 'EF', '네': 'EF', '군': 'EF', '구나': 'EF',
            '냐': 'EF', '니': 'EF', '나': 'EF', '자': 'EF', '세요': 'EF', '십시오': 'EF',
            '라': 'EF', '거라': 'EF', '렴': 'EF', '려무나': 'EF',
            # Connective endings
            '고': 'EC', '며': 'EC', '면서': 'EC', '면': 'EC', '으면': 'EC',
            '서': 'EC', '아서': 'EC', '어서': 'EC', '여서': 'EC',
            '니까': 'EC', '으니까': 'EC',
            '지만': 'EC', '는데': 'EC', '은데': 'EC', 'ㄴ데': 'EC',
            '도록': 'EC', '게': 'EC', '려고': 'EC', '으려고': 'EC',
            '러': 'EC', '으러': 'EC', '자마자': 'EC',
            # Adnominal (noun-modifying) endings
            '는': 'ETM', '은': 'ETM', 'ㄴ': 'ETM', 'ㄹ': 'ETM', '을': 'ETM',
            # Nominalizing endings
            '음': 'ETN', '기': 'ETN', 'ㅁ': 'ETN',
        }

        # =================================================================
        # Pre-final endings (tense/honorific markers)
        # =================================================================
        self.prefinal = {
            '았': 'EP', '었': 'EP', '였': 'EP', '겠': 'EP',
            '시': 'EP', '으시': 'EP', '셨': 'EP', '으셨': 'EP',
        }

        # =================================================================
        # Derivational suffixes (verbalizers / adjectivizers)
        # =================================================================
        self.suffix = {
            '하': 'XSV', '되': 'XSV', '시키': 'XSV', '당하': 'XSV',
            '스럽': 'XSA', '롭': 'XSA', '답': 'XSA', '적': 'XSA',
        }

        # =================================================================
        # Noun dictionary
        # =================================================================
        self.nouns = {
            # Common nouns
            '것': 'NNG', '수': 'NNG', '등': 'NNG', '때': 'NNG', '곳': 'NNG',
            '사람': 'NNG', '사람들': 'NNG', '일': 'NNG', '말': 'NNG', '생각': 'NNG',
            '문제': 'NNG', '경우': 'NNG', '사실': 'NNG', '점': 'NNG', '시간': 'NNG',
            '세계': 'NNG', '나라': 'NNG', '정부': 'NNG', '사회': 'NNG', '국가': 'NNG',
            '회사': 'NNG', '기업': 'NNG', '시장': 'NNG', '경제': 'NNG', '산업': 'NNG',
            '기술': 'NNG', '발표': 'NNG', '개발': 'NNG', '연구': 'NNG', '조사': 'NNG',
            '결과': 'NNG', '계획': 'NNG', '방법': 'NNG', '이유': 'NNG', '내용': 'NNG',
            '오늘': 'NNG', '내일': 'NNG', '어제': 'NNG', '올해': 'NNG', '작년': 'NNG',
            '학교': 'NNG', '대학': 'NNG', '학생': 'NNG', '선생': 'NNG', '공부': 'NNG',
            '집': 'NNG', '방': 'NNG', '문': 'NNG', '길': 'NNG', '차': 'NNG',
            '돈': 'NNG', '물': 'NNG', '밥': 'NNG', '책': 'NNG', '글': 'NNG',
            '배': 'NNG', '사과': 'NNG', '과일': 'NNG', '음식': 'NNG',
            # Proper nouns
            '삼성': 'NNP', '현대': 'NNP', '서울': 'NNP', '부산': 'NNP', '한국': 'NNP',
            '미국': 'NNP', '중국': 'NNP', '일본': 'NNP',
            '삼성전자': 'NNP', '현대자동차': 'NNP', 'LG전자': 'NNP', 'SK하이닉스': 'NNP',
            # Bound (dependent) nouns
            '데': 'NNB', '바': 'NNB', '뿐': 'NNB', '줄': 'NNB', '리': 'NNB',
            # Pronouns
            '나': 'NP', '너': 'NP', '저': 'NP', '우리': 'NP', '그': 'NP', '이': 'NP',
            '누구': 'NP', '무엇': 'NP', '어디': 'NP', '언제': 'NP',
            # Numerals
            '하나': 'NR', '둘': 'NR', '셋': 'NR', '넷': 'NR', '다섯': 'NR',
        }

        # =================================================================
        # Conjugated verb forms (precomputed stem+ending decompositions)
        # =================================================================
        self.verb_forms = {
            # 가다 (to go)
            '간다': [('가', 'VV'), ('ㄴ다', 'EF')],
            '갔다': [('가', 'VV'), ('았', 'EP'), ('다', 'EF')],
            '가면': [('가', 'VV'), ('면', 'EC')],
            '가고': [('가', 'VV'), ('고', 'EC')],
            '가서': [('가', 'VV'), ('서', 'EC')],
            # 오다 (to come)
            '온다': [('오', 'VV'), ('ㄴ다', 'EF')],
            '왔다': [('오', 'VV'), ('았', 'EP'), ('다', 'EF')],
            # 하다 (to do)
            '한다': [('하', 'VV'), ('ㄴ다', 'EF')],
            '했다': [('하', 'VV'), ('였', 'EP'), ('다', 'EF')],
            '하고': [('하', 'VV'), ('고', 'EC')],
            '하면': [('하', 'VV'), ('면', 'EC')],
            '하는': [('하', 'VV'), ('는', 'ETM')],
            # 되다 (to become)
            '된다': [('되', 'VV'), ('ㄴ다', 'EF')],
            '됐다': [('되', 'VV'), ('었', 'EP'), ('다', 'EF')],
            # 있다/없다 (to exist / to not exist)
            '있다': [('있', 'VX'), ('다', 'EF')],
            '있는': [('있', 'VX'), ('는', 'ETM')],
            '없다': [('없', 'VX'), ('다', 'EF')],
            '없는': [('없', 'VX'), ('는', 'ETM')],
            # 이다 (copula)
            '이다': [('이', 'VCP'), ('다', 'EF')],
            # 발표하다 (to announce)
            '발표했다': [('발표', 'NNG'), ('하', 'XSV'), ('였', 'EP'), ('다', 'EF')],
            '발표한다': [('발표', 'NNG'), ('하', 'XSV'), ('ㄴ다', 'EF')],
            '발표하는': [('발표', 'NNG'), ('하', 'XSV'), ('는', 'ETM')],
            # 아프다 (to hurt / be sick)
            '아파요': [('아프', 'VA'), ('아요', 'EF')],
            '아프다': [('아프', 'VA'), ('다', 'EF')],
        }

        # =================================================================
        # Verb/adjective stem dictionary
        # =================================================================
        self.verbs = {
            '하': 'VV', '되': 'VV', '가': 'VV', '오': 'VV', '보': 'VV',
            '알': 'VV', '모르': 'VV', '주': 'VV', '받': 'VV', '만들': 'VV',
            '말하': 'VV', '생각하': 'VV', '사용하': 'VV', '발표하': 'VV',
            '나오': 'VV', '들어가': 'VV', '나가': 'VV', '들어오': 'VV',
            '있': 'VV', '없': 'VV', '같': 'VV',
            '크': 'VA', '작': 'VA', '많': 'VA', '적': 'VA', '좋': 'VA', '나쁘': 'VA',
            '높': 'VA', '낮': 'VA', '길': 'VA', '짧': 'VA', '넓': 'VA', '좁': 'VA',
            '새롭': 'VA', '어렵': 'VA', '쉽': 'VA', '아름답': 'VA', '아프': 'VA',
            '이': 'VCP',
        }

        # =================================================================
        # Adverbs
        # =================================================================
        self.adverbs = {
            '매우': 'MAG', '아주': 'MAG', '너무': 'MAG', '정말': 'MAG', '진짜': 'MAG',
            '가장': 'MAG', '더': 'MAG', '덜': 'MAG', '잘': 'MAG', '못': 'MAG',
            '다': 'MAG', '모두': 'MAG', '함께': 'MAG', '같이': 'MAG', '다시': 'MAG',
            '또': 'MAG', '이미': 'MAG', '아직': 'MAG', '벌써': 'MAG', '곧': 'MAG',
            '그리고': 'MAJ', '그러나': 'MAJ', '그래서': 'MAJ', '하지만': 'MAJ',
        }

        # ================================================================
        # Load extended dictionary (optional external asset)
        # ================================================================
        self._load_extended_dictionary()

        # Vowel-harmony classes: yang (bright) vs. eum (dark) vowels.
        self.yang_vowels = {'ㅏ', 'ㅗ', 'ㅑ', 'ㅛ'}
        self.eum_vowels = {'ㅓ', 'ㅜ', 'ㅡ', 'ㅕ', 'ㅠ', 'ㅣ'}
234
+
235
+ def _load_extended_dictionary(self):
236
+ """Load optional external extended dictionary"""
237
+ dict_path = DICT_DIR / 'ko_extended.json'
238
+ if not dict_path.exists():
239
+ return
240
+
241
+ with open(dict_path, 'r', encoding='utf-8') as f:
242
+ extended = json.load(f)
243
+
244
+ # 기존 사전에 없는 것만 추가
245
+ for word, pos in extended.items():
246
+ if pos in ('NNG', 'NNP', 'NNB'):
247
+ if word not in self.nouns:
248
+ self.nouns[word] = pos
249
+ elif pos == 'VV':
250
+ if word not in self.verbs:
251
+ self.verbs[word] = pos
252
+ elif pos == 'VA':
253
+ if word not in self.verbs:
254
+ self.verbs[word] = pos
255
+ elif pos == 'MAG':
256
+ if word not in self.adverbs:
257
+ self.adverbs[word] = pos
258
+ elif pos == 'NP':
259
+ if word not in self.nouns:
260
+ self.nouns[word] = pos
261
+ elif pos == 'NR':
262
+ if word not in self.nouns:
263
+ self.nouns[word] = pos
264
+ elif pos == 'MM':
265
+ # 관형사는 별도 사전 필요시 추가
266
+ pass
267
+
268
    def _build_domain_dictionaries(self):
        """Build per-domain sense dictionaries for ambiguous words.

        Each map goes surface form -> (lemma, POS) for the reading preferred
        in that domain; e.g. '배' is a fruit (pear) in FOOD but a body part
        (abdomen) in MEDICAL.
        """

        # FOOD domain
        self._domain_dictionaries[Domain.FOOD] = {
            '배': ('배', 'NNG'),  # pear (fruit)
            '사과': ('사과', 'NNG'),  # apple (fruit)
            '밤': ('밤', 'NNG'),  # chestnut
            '감': ('감', 'NNG'),  # persimmon
            '귤': ('귤', 'NNG'),  # tangerine
            '포도': ('포도', 'NNG'),  # grape
        }

        # MEDICAL domain
        self._domain_dictionaries[Domain.MEDICAL] = {
            '배': ('배', 'NNG'),  # body part (abdomen)
            '머리': ('머리', 'NNG'),  # head
            '다리': ('다리', 'NNG'),  # leg
            '팔': ('팔', 'NNG'),  # arm
            '목': ('목', 'NNG'),  # neck
            '허리': ('허리', 'NNG'),  # lower back / waist
        }

        # TECH domain
        self._domain_dictionaries[Domain.TECH] = {
            # NOTE(review): original comment questioned this sense
            # ("first syllable of 배열/array?") — confirm it belongs here.
            '배': ('배', 'NNG'),
            '모델': ('모델', 'NNG'),  # model
            '서버': ('서버', 'NNG'),  # server
            '클라우드': ('클라우드', 'NNG'),  # cloud
            '메모리': ('메모리', 'NNG'),  # memory
            '프로세서': ('프로세서', 'NNG'),  # processor
        }

        # SPORTS domain
        self._domain_dictionaries[Domain.SPORTS] = {
            '배': ('배', 'NNG'),  # as in 배드민턴 (badminton) / 배구 (volleyball)
            '공': ('공', 'NNG'),  # ball
            '골': ('골', 'NNG'),  # goal
            '경기': ('경기', 'NNG'),  # match/game
            '선수': ('선수', 'NNG'),  # player/athlete
        }

        # ENTERTAINMENT domain
        self._domain_dictionaries[Domain.ENTERTAINMENT] = {
            '배': ('배', 'NNG'),  # as in 배우 (actor)
            '가수': ('가수', 'NNG'),  # singer
            '배우': ('배우', 'NNG'),  # actor
            '아이돌': ('아이돌', 'NNG'),  # idol
            '뉴진스': ('뉴진스', 'NNP'),  # NewJeans (group)
            'BTS': ('BTS', 'NNP'),
        }

        # FINANCE domain
        self._domain_dictionaries[Domain.FINANCE] = {
            '배': ('배', 'NNG'),  # as in 배당 (dividend)
            '주식': ('주식', 'NNG'),  # stock
            '채권': ('채권', 'NNG'),  # bond
            '펀드': ('펀드', 'NNG'),  # fund
            '배당': ('배당', 'NNG'),  # dividend
        }
328
+
329
+ def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
330
+ """분석 후보 생성"""
331
+ if not text or not text.strip():
332
+ return [AnalysisResult([])]
333
+
334
+ candidates = []
335
+
336
+ # 기본 분석
337
+ main_morphemes = self._analyze_text(text, domain)
338
+ main_result = AnalysisResult(
339
+ morphemes=main_morphemes,
340
+ score=1.0,
341
+ domain=domain
342
+ )
343
+ main_result.score = self._score_analysis(main_result)
344
+ candidates.append(main_result)
345
+
346
+ return candidates
347
+
348
+ def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
349
+ """텍스트 분석 (메인 로직)"""
350
+ if not text:
351
+ return []
352
+
353
+ result = []
354
+ segments = self._segment(text)
355
+
356
+ offset = 0
357
+ for segment in segments:
358
+ if not segment.strip():
359
+ offset += len(segment)
360
+ continue
361
+
362
+ morphemes = self._analyze_segment(segment, offset, domain)
363
+ result.extend(morphemes)
364
+ offset += len(segment)
365
+
366
+ return result
367
+
368
+ def _segment(self, text: str) -> List[str]:
369
+ """공백/구두점으로 분리"""
370
+ segments = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+|\s+', text)
371
+ return segments
372
+
373
+ def _analyze_segment(self, segment: str, offset: int, domain: Domain) -> List[Morpheme]:
374
+ """단일 세그먼트 분석"""
375
+
376
+ # 비한글 (영어, 숫자, 기호)
377
+ if not re.match(r'[가-힣]', segment):
378
+ # Code-switching: 영어 단어 분석
379
+ if re.match(r'[a-zA-Z]', segment):
380
+ pos = 'SL'
381
+ # 런타임 사전에서 영어 단어 확인
382
+ if segment.lower() in self._user_dictionary:
383
+ lemma, pos, _ = self._user_dictionary[segment.lower()]
384
+ return [Morpheme(
385
+ surface=segment, lemma=lemma, pos=pos,
386
+ start=offset, end=offset + len(segment)
387
+ )]
388
+ return [Morpheme(
389
+ surface=segment, lemma=segment, pos=pos,
390
+ start=offset, end=offset + len(segment)
391
+ )]
392
+ elif segment.isdigit():
393
+ return [Morpheme(
394
+ surface=segment, lemma=segment, pos='SN',
395
+ start=offset, end=offset + len(segment)
396
+ )]
397
+ else:
398
+ return [Morpheme(
399
+ surface=segment, lemma=segment, pos='SW',
400
+ start=offset, end=offset + len(segment)
401
+ )]
402
+
403
+ # 런타임 사전 확인 (우선순위 최고)
404
+ if segment in self._user_dictionary:
405
+ lemma, pos, _ = self._user_dictionary[segment]
406
+ return [Morpheme(
407
+ surface=segment, lemma=lemma, pos=pos,
408
+ start=offset, end=offset + len(segment)
409
+ )]
410
+
411
+ # 용언 활용형 직접 매칭
412
+ if segment in self.verb_forms:
413
+ results = []
414
+ pos = offset
415
+ for surface, tag in self.verb_forms[segment]:
416
+ results.append(Morpheme(
417
+ surface=surface, lemma=surface, pos=tag,
418
+ start=pos, end=pos + len(surface)
419
+ ))
420
+ pos += len(surface)
421
+ return results
422
+
423
+ # 도메인 사전 확인
424
+ domain_sense = self._get_domain_sense(segment, domain)
425
+ if domain_sense:
426
+ return [Morpheme(
427
+ surface=segment, lemma=domain_sense[0], pos=domain_sense[1],
428
+ start=offset, end=offset + len(segment)
429
+ )]
430
+
431
+ # 기본 사전 매칭
432
+ if segment in self.nouns:
433
+ return [Morpheme(
434
+ surface=segment, lemma=segment, pos=self.nouns[segment],
435
+ start=offset, end=offset + len(segment)
436
+ )]
437
+ if segment in self.adverbs:
438
+ return [Morpheme(
439
+ surface=segment, lemma=segment, pos=self.adverbs[segment],
440
+ start=offset, end=offset + len(segment)
441
+ )]
442
+
443
+ # 형태소 분석 (조사/어미 분리)
444
+ return self._morpheme_analysis(segment, offset, domain)
445
+
446
+ def _morpheme_analysis(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
447
+ """형태소 분석 (체언+조사, 용언+어미 분리)"""
448
+ results = []
449
+
450
+ # 1. 조사 분리 시도
451
+ for josa, pos in sorted(self.josa.items(), key=lambda x: -len(x[0])):
452
+ if word.endswith(josa) and len(word) > len(josa):
453
+ stem = word[:-len(josa)]
454
+
455
+ # 음운 조건 확인
456
+ if self._check_josa_condition(stem, josa):
457
+ stem_morphs = self._analyze_stem(stem, offset, domain)
458
+ if stem_morphs:
459
+ results = stem_morphs
460
+ results.append(Morpheme(
461
+ surface=josa, lemma=josa, pos=pos,
462
+ start=offset + len(stem), end=offset + len(word)
463
+ ))
464
+ return results
465
+
466
+ # 2. 어미 분리 시도 (용언)
467
+ verb_result = self._analyze_verb(word, offset)
468
+ if verb_result:
469
+ return verb_result
470
+
471
+ # 3. 미등록어 처리
472
+ return [Morpheme(
473
+ surface=word, lemma=word, pos='NNG',
474
+ start=offset, end=offset + len(word)
475
+ )]
476
+
477
+ def _check_josa_condition(self, stem: str, josa: str) -> bool:
478
+ """조사 음운 조건 확인"""
479
+ if not stem:
480
+ return False
481
+
482
+ last_char = stem[-1]
483
+ has_jong = self._has_jongseong(last_char)
484
+
485
+ # 이/가, 을/를, 은/는, 과/와 등 받침에 따른 이형태
486
+ if josa in ('이', '을', '은', '과', '으로', '이랑', '이나', '이라도', '이든지', '이야'):
487
+ return has_jong
488
+ if josa in ('가', '를', '는', '와', '로', '랑', '나', '라도', '든지', '야'):
489
+ return not has_jong
490
+
491
+ return True
492
+
493
+ def _analyze_stem(self, stem: str, offset: int, domain: Domain) -> List[Morpheme]:
494
+ """어간 분석"""
495
+ # 런타임 사전
496
+ if stem in self._user_dictionary:
497
+ lemma, pos, _ = self._user_dictionary[stem]
498
+ return [Morpheme(surface=stem, lemma=lemma, pos=pos, start=offset, end=offset + len(stem))]
499
+
500
+ # 도메인 사전
501
+ domain_sense = self._get_domain_sense(stem, domain)
502
+ if domain_sense:
503
+ return [Morpheme(
504
+ surface=stem, lemma=domain_sense[0], pos=domain_sense[1],
505
+ start=offset, end=offset + len(stem)
506
+ )]
507
+
508
+ # 명사 사전
509
+ if stem in self.nouns:
510
+ return [Morpheme(surface=stem, lemma=stem, pos=self.nouns[stem], start=offset, end=offset + len(stem))]
511
+
512
+ # 미등록 명사
513
+ if len(stem) >= 2:
514
+ return [Morpheme(surface=stem, lemma=stem, pos='NNG', start=offset, end=offset + len(stem))]
515
+
516
+ return []
517
+
518
+ def _analyze_verb(self, word: str, offset: int) -> List[Morpheme]:
519
+ """용언 분석 (어간 + 어미)"""
520
+ results = []
521
+
522
+ # 선어말어미 + 어말어미 조합 탐색
523
+ for prefinal, pf_pos in sorted(self.prefinal.items(), key=lambda x: -len(x[0])):
524
+ for eomi, em_pos in sorted(self.eomi.items(), key=lambda x: -len(x[0])):
525
+ suffix = prefinal + eomi
526
+ if word.endswith(suffix) and len(word) > len(suffix):
527
+ stem = word[:-len(suffix)]
528
+ verb_stem = self._find_verb_stem(stem)
529
+ if verb_stem:
530
+ results.append(Morpheme(
531
+ surface=stem, lemma=verb_stem[0], pos=verb_stem[1],
532
+ start=offset, end=offset + len(stem)
533
+ ))
534
+ results.append(Morpheme(
535
+ surface=prefinal, lemma=prefinal, pos=pf_pos,
536
+ start=offset + len(stem), end=offset + len(stem) + len(prefinal)
537
+ ))
538
+ results.append(Morpheme(
539
+ surface=eomi, lemma=eomi, pos=em_pos,
540
+ start=offset + len(stem) + len(prefinal), end=offset + len(word)
541
+ ))
542
+ return results
543
+
544
+ # 어말어미만
545
+ for eomi, em_pos in sorted(self.eomi.items(), key=lambda x: -len(x[0])):
546
+ if word.endswith(eomi) and len(word) > len(eomi):
547
+ stem = word[:-len(eomi)]
548
+ verb_stem = self._find_verb_stem(stem)
549
+ if verb_stem:
550
+ results.append(Morpheme(
551
+ surface=stem, lemma=verb_stem[0], pos=verb_stem[1],
552
+ start=offset, end=offset + len(stem)
553
+ ))
554
+ results.append(Morpheme(
555
+ surface=eomi, lemma=eomi, pos=em_pos,
556
+ start=offset + len(stem), end=offset + len(word)
557
+ ))
558
+ return results
559
+
560
+ return []
561
+
562
+ def _find_verb_stem(self, stem: str) -> Optional[Tuple[str, str]]:
563
+ """용언 어간 찾기"""
564
+ if stem in self.verbs:
565
+ return (stem, self.verbs[stem])
566
+
567
+ # 접미사 분리
568
+ for suffix, pos in self.suffix.items():
569
+ if stem.endswith(suffix) and len(stem) > len(suffix):
570
+ noun_part = stem[:-len(suffix)]
571
+ if noun_part in self.nouns or len(noun_part) >= 2:
572
+ return (stem, 'VV')
573
+
574
+ return None
575
+
576
+ def _has_jongseong(self, char: str) -> bool:
577
+ """받침 유무"""
578
+ if not ('\uAC00' <= char <= '\uD7A3'):
579
+ return False
580
+ return (ord(char) - 0xAC00) % 28 != 0
581
+
582
+ def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
583
+ """대안 분석 결과 생성 (N-best용)"""
584
+ alternatives = []
585
+
586
+ # 다른 도메인으로 분석
587
+ other_domains = [d for d in Domain if d != domain][:count]
588
+
589
+ for alt_domain in other_domains:
590
+ morphemes = self._analyze_text(text, alt_domain)
591
+ result = AnalysisResult(
592
+ morphemes=morphemes,
593
+ score=0.8, # 약간 낮은 점수
594
+ domain=alt_domain
595
+ )
596
+ result.score = self._score_analysis(result) * 0.9
597
+ alternatives.append(result)
598
+
599
+ return alternatives
600
+
601
+
602
# Backward-compatibility alias: older callers import ``KoreanAnalyzer``.
KoreanAnalyzer = KoreanAdvancedAnalyzer