tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,534 @@
1
+ """
2
+ Korean Morphological Analyzer - 자체 구현
3
+ =========================================
4
+
5
+ 외부 라이브러리 없이 순수 Python으로 구현한 한국어 형태소 분석기
6
+
7
+ 알고리즘:
8
+ 1. 사전 기반 최장일치
9
+ 2. 조사/어미 분리 규칙
10
+ 3. 불규칙 활용 처리
11
+ 4. 미등록어 처리
12
+
13
+ 품사 태그:
14
+ - NNG: 일반명사
15
+ - NNP: 고유명사
16
+ - VV: 동사
17
+ - VA: 형용사
18
+ - JKS: 주격조사
19
+ - JKO: 목적격조사
20
+ - JKB: 부사격조사
21
+ - JX: 보조사
22
+ - EC: 연결어미
23
+ - EF: 종결어미
24
+ - EP: 선어말어미
25
+ - ETM: 관형형전성어미
26
+ - XSV: 동사파생접미사
27
+ - XSA: 형용사파생접미사
28
+ """
29
+
30
+ import re
31
+ from typing import List, Tuple, Dict, Set, Optional
32
+ from dataclasses import dataclass
33
+
34
+
35
@dataclass
class Morpheme:
    """A single morpheme produced by the analyzer.

    Holds the surface form as it appeared in the input, its dictionary
    (base) form, a POS tag, character offsets into the input text, and
    an analysis cost (lower is preferred).
    """
    surface: str       # surface form as seen in the text
    lemma: str         # base / dictionary form
    pos: str           # part-of-speech tag
    start: int         # start offset in the input
    end: int           # end offset (exclusive)
    cost: float = 0.0  # analysis cost; lower is better

    def __repr__(self):
        return f"{self.surface}/{self.pos}"
47
+
48
+
49
class KoreanAnalyzer:
    """Dictionary- and rule-based Korean morphological analyzer.

    Pure-Python implementation with no external dependencies.
    Strategy: longest-match dictionary lookup, particle/ending
    separation rules, irregular-conjugation recovery, and an
    unknown-word fallback.

    Usage:
        analyzer = KoreanAnalyzer()
        result = analyzer.analyze("삼성전자가 서울에서 발표했다")
        for m in result:
            print(f"{m.surface} [{m.pos}]")
    """

    def __init__(self):
        # Lexicons first, then the phonological rule tables.
        self._build_dictionary()
        self._build_rules()
63
+
64
+ def _build_dictionary(self):
65
+ """사전 구축"""
66
+
67
+ # 조사 사전
68
+ self.josa = {
69
+ # 격조사
70
+ '이': 'JKS', '가': 'JKS', '께서': 'JKS',
71
+ '을': 'JKO', '를': 'JKO',
72
+ '의': 'JKG',
73
+ '에': 'JKB', '에서': 'JKB', '에게': 'JKB', '한테': 'JKB',
74
+ '로': 'JKB', '으로': 'JKB', '에게로': 'JKB',
75
+ '와': 'JKB', '과': 'JKB', '랑': 'JKB', '이랑': 'JKB',
76
+ '보다': 'JKB', '처럼': 'JKB', '같이': 'JKB', '만큼': 'JKB',
77
+ # 보조사
78
+ '은': 'JX', '는': 'JX', '도': 'JX', '만': 'JX', '까지': 'JX',
79
+ '부터': 'JX', '마저': 'JX', '조차': 'JX', '밖에': 'JX',
80
+ '라도': 'JX', '이라도': 'JX', '나': 'JX', '이나': 'JX',
81
+ '든지': 'JX', '이든지': 'JX', '야': 'JX', '이야': 'JX',
82
+ # 접속조사
83
+ '와': 'JC', '과': 'JC', '하고': 'JC', '이며': 'JC', '며': 'JC',
84
+ }
85
+
86
+ # 어미 사전
87
+ self.eomi = {
88
+ # 종결어미
89
+ '다': 'EF', '니다': 'EF', '습니다': 'EF', '어요': 'EF', '아요': 'EF',
90
+ '어': 'EF', '아': 'EF', '지': 'EF', '네': 'EF', '군': 'EF', '구나': 'EF',
91
+ '냐': 'EF', '니': 'EF', '나': 'EF', '자': 'EF', '세요': 'EF', '십시오': 'EF',
92
+ '라': 'EF', '거라': 'EF', '렴': 'EF', '려무나': 'EF',
93
+ # 연결어미
94
+ '고': 'EC', '며': 'EC', '면서': 'EC', '면': 'EC', '으면': 'EC',
95
+ '서': 'EC', '아서': 'EC', '어서': 'EC', '니까': 'EC', '으니까': 'EC',
96
+ '지만': 'EC', '는데': 'EC', '은데': 'EC', 'ㄴ데': 'EC',
97
+ '도록': 'EC', '게': 'EC', '려고': 'EC', '으려고': 'EC',
98
+ '러': 'EC', '으러': 'EC', '자': 'EC', '자마자': 'EC',
99
+ # 관형형전성어미
100
+ '는': 'ETM', '은': 'ETM', 'ㄴ': 'ETM', 'ㄹ': 'ETM', '을': 'ETM',
101
+ # 명사형전성어미
102
+ '음': 'ETN', '기': 'ETN', 'ㅁ': 'ETN',
103
+ }
104
+
105
+ # 선어말어미
106
+ self.prefinal = {
107
+ '았': 'EP', '었': 'EP', '였': 'EP', '겠': 'EP',
108
+ '시': 'EP', '으시': 'EP', '셨': 'EP', '으셨': 'EP',
109
+ }
110
+
111
+ # 접미사
112
+ self.suffix = {
113
+ # 동사파생
114
+ '하': 'XSV', '되': 'XSV', '시키': 'XSV', '당하': 'XSV',
115
+ # 형용사파생
116
+ '스럽': 'XSA', '롭': 'XSA', '답': 'XSA', '적': 'XSA',
117
+ }
118
+
119
+ # 기본 체언 사전 (고빈도 명사)
120
+ self.nouns = {
121
+ # 일반명사
122
+ '것': 'NNG', '수': 'NNG', '등': 'NNG', '때': 'NNG', '곳': 'NNG',
123
+ '사람': 'NNG', '사람들': 'NNG', '일': 'NNG', '말': 'NNG', '생각': 'NNG',
124
+ '문제': 'NNG', '경우': 'NNG', '사실': 'NNG', '점': 'NNG', '시간': 'NNG',
125
+ '세계': 'NNG', '나라': 'NNG', '정부': 'NNG', '사회': 'NNG', '국가': 'NNG',
126
+ '회사': 'NNG', '기업': 'NNG', '시장': 'NNG', '경제': 'NNG', '산업': 'NNG',
127
+ '기술': 'NNG', '발표': 'NNG', '개발': 'NNG', '연구': 'NNG', '조사': 'NNG',
128
+ '결과': 'NNG', '계획': 'NNG', '방법': 'NNG', '이유': 'NNG', '내용': 'NNG',
129
+ '오늘': 'NNG', '내일': 'NNG', '어제': 'NNG', '올해': 'NNG', '작년': 'NNG',
130
+ '학교': 'NNG', '대학': 'NNG', '학생': 'NNG', '선생': 'NNG', '공부': 'NNG',
131
+ '집': 'NNG', '방': 'NNG', '문': 'NNG', '길': 'NNG', '차': 'NNG',
132
+ '돈': 'NNG', '물': 'NNG', '밥': 'NNG', '책': 'NNG', '글': 'NNG',
133
+ '삼성': 'NNP', '현대': 'NNP', '서울': 'NNP', '부산': 'NNP', '한국': 'NNP',
134
+ '미국': 'NNP', '중국': 'NNP', '일본': 'NNP',
135
+ '삼성전자': 'NNP', '현대자동차': 'NNP', 'LG전자': 'NNP', 'SK하이닉스': 'NNP',
136
+ # 의존명사
137
+ '데': 'NNB', '바': 'NNB', '뿐': 'NNB', '줄': 'NNB', '리': 'NNB',
138
+ # 대명사
139
+ '나': 'NP', '너': 'NP', '저': 'NP', '우리': 'NP', '그': 'NP', '이': 'NP',
140
+ '누구': 'NP', '무엇': 'NP', '어디': 'NP', '언제': 'NP',
141
+ # 수사
142
+ '하나': 'NR', '둘': 'NR', '셋': 'NR', '넷': 'NR', '다섯': 'NR',
143
+ '일': 'NR', '이': 'NR', '삼': 'NR', '사': 'NR', '오': 'NR',
144
+ }
145
+
146
+ # 용언 활용형 직접 등록 (어간+어미 결합형)
147
+ self.verb_forms = {
148
+ # 가다
149
+ '간다': [('가', 'VV'), ('ㄴ다', 'EF')],
150
+ '갔다': [('가', 'VV'), ('았', 'EP'), ('다', 'EF')],
151
+ '가면': [('가', 'VV'), ('면', 'EC')],
152
+ '가고': [('가', 'VV'), ('고', 'EC')],
153
+ '가서': [('가', 'VV'), ('서', 'EC')],
154
+ # 오다
155
+ '온다': [('오', 'VV'), ('ㄴ다', 'EF')],
156
+ '왔다': [('오', 'VV'), ('았', 'EP'), ('다', 'EF')],
157
+ # 하다
158
+ '한다': [('하', 'VV'), ('ㄴ다', 'EF')],
159
+ '했다': [('하', 'VV'), ('였', 'EP'), ('다', 'EF')],
160
+ '하고': [('하', 'VV'), ('고', 'EC')],
161
+ '하면': [('하', 'VV'), ('면', 'EC')],
162
+ '하는': [('하', 'VV'), ('는', 'ETM')],
163
+ # 되다
164
+ '된다': [('되', 'VV'), ('ㄴ다', 'EF')],
165
+ '됐다': [('되', 'VV'), ('었', 'EP'), ('다', 'EF')],
166
+ # 있다/없다
167
+ '있다': [('있', 'VX'), ('다', 'EF')],
168
+ '있는': [('있', 'VX'), ('는', 'ETM')],
169
+ '없다': [('없', 'VX'), ('다', 'EF')],
170
+ '없는': [('없', 'VX'), ('는', 'ETM')],
171
+ # 이다
172
+ '이다': [('이', 'VCP'), ('다', 'EF')],
173
+ # 발표하다
174
+ '발표했다': [('발표', 'NNG'), ('하', 'XSV'), ('였', 'EP'), ('다', 'EF')],
175
+ '발표한다': [('발표', 'NNG'), ('하', 'XSV'), ('ㄴ다', 'EF')],
176
+ '발표하는': [('발표', 'NNG'), ('하', 'XSV'), ('는', 'ETM')],
177
+ # 공부하다
178
+ '공부한다': [('공부', 'NNG'), ('하', 'XSV'), ('ㄴ다', 'EF')],
179
+ '공부했다': [('공부', 'NNG'), ('하', 'XSV'), ('였', 'EP'), ('다', 'EF')],
180
+ }
181
+
182
+ # 용언 어간 사전 (기본형)
183
+ self.verbs = {
184
+ # 동사
185
+ '하': 'VV', '되': 'VV', '가': 'VV', '오': 'VV', '보': 'VV',
186
+ '알': 'VV', '모르': 'VV', '주': 'VV', '받': 'VV', '만들': 'VV',
187
+ '말하': 'VV', '생각하': 'VV', '사용하': 'VV', '발표하': 'VV',
188
+ '나오': 'VV', '들어가': 'VV', '나가': 'VV', '들어오': 'VV',
189
+ '있': 'VV', '없': 'VV', '같': 'VV',
190
+ # 형용사
191
+ '크': 'VA', '작': 'VA', '많': 'VA', '적': 'VA', '좋': 'VA', '나쁘': 'VA',
192
+ '높': 'VA', '낮': 'VA', '길': 'VA', '짧': 'VA', '넓': 'VA', '좁': 'VA',
193
+ '새롭': 'VA', '어렵': 'VA', '쉽': 'VA', '아름답': 'VA',
194
+ '이': 'VCP', # 서술격조사 '이다'
195
+ }
196
+
197
+ # 부사
198
+ self.adverbs = {
199
+ '매우': 'MAG', '아주': 'MAG', '너무': 'MAG', '정말': 'MAG', '진짜': 'MAG',
200
+ '가장': 'MAG', '더': 'MAG', '덜': 'MAG', '잘': 'MAG', '못': 'MAG',
201
+ '다': 'MAG', '모두': 'MAG', '함께': 'MAG', '같이': 'MAG', '다시': 'MAG',
202
+ '또': 'MAG', '이미': 'MAG', '아직': 'MAG', '벌써': 'MAG', '곧': 'MAG',
203
+ '그리고': 'MAJ', '그러나': 'MAJ', '그래서': 'MAJ', '하지만': 'MAJ',
204
+ }
205
+
206
+ # 관형사
207
+ self.determiners = {
208
+ '이': 'MM', '그': 'MM', '저': 'MM', '어떤': 'MM', '무슨': 'MM',
209
+ '모든': 'MM', '각': 'MM', '온': 'MM', '전': 'MM', '새': 'MM', '헌': 'MM',
210
+ }
211
+
212
+ # 한글 자모 분리용
213
+ self.CHO = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
214
+ self.JUNG = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ'
215
+ self.JONG = ' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ'
216
+
217
+ def _build_rules(self):
218
+ """음운 규칙 구축"""
219
+ # 모음조화: 양성모음(ㅏㅗ) vs 음성모음(ㅓㅜㅡ)
220
+ self.yang_vowels = {'ㅏ', 'ㅗ', 'ㅑ', 'ㅛ'}
221
+ self.eum_vowels = {'ㅓ', 'ㅜ', 'ㅡ', 'ㅕ', 'ㅠ', 'ㅣ'}
222
+
223
+ def _decompose(self, char: str) -> Tuple[str, str, str]:
224
+ """한글 음절을 자모로 분리"""
225
+ if not self._is_hangul(char):
226
+ return ('', char, '')
227
+
228
+ code = ord(char) - 0xAC00
229
+ cho = code // 588
230
+ jung = (code % 588) // 28
231
+ jong = code % 28
232
+
233
+ return (self.CHO[cho], self.JUNG[jung], self.JONG[jong])
234
+
235
+ def _compose(self, cho: str, jung: str, jong: str = ' ') -> str:
236
+ """자모를 음절로 조합"""
237
+ if cho not in self.CHO or jung not in self.JUNG:
238
+ return cho + jung + (jong if jong != ' ' else '')
239
+
240
+ cho_idx = self.CHO.index(cho)
241
+ jung_idx = self.JUNG.index(jung)
242
+ jong_idx = self.JONG.index(jong) if jong in self.JONG else 0
243
+
244
+ code = 0xAC00 + cho_idx * 588 + jung_idx * 28 + jong_idx
245
+ return chr(code)
246
+
247
+ def _is_hangul(self, char: str) -> bool:
248
+ """한글 음절 여부"""
249
+ return '\uAC00' <= char <= '\uD7A3'
250
+
251
+ def _has_jongseong(self, char: str) -> bool:
252
+ """받침 유무"""
253
+ if not self._is_hangul(char):
254
+ return False
255
+ return (ord(char) - 0xAC00) % 28 != 0
256
+
257
+ def _get_jongseong(self, char: str) -> str:
258
+ """받침 반환"""
259
+ if not self._is_hangul(char):
260
+ return ''
261
+ jong_idx = (ord(char) - 0xAC00) % 28
262
+ return self.JONG[jong_idx]
263
+
264
def analyze(self, text: str) -> List[Morpheme]:
    """Run morphological analysis on *text*.

    Args:
        text: input string.

    Returns:
        Flat list of Morpheme objects carrying offsets into *text*.
    """
    if not text:
        return []

    morphemes = []
    cursor = 0
    for chunk in self._segment(text):
        # Whitespace chunks only advance the offset.
        if chunk.strip():
            morphemes.extend(self._analyze_segment(chunk, cursor))
        cursor += len(chunk)
    return morphemes
291
+
292
+ def _segment(self, text: str) -> List[str]:
293
+ """공백/구두점으로 분리"""
294
+ # 공백과 한글 어절 분리
295
+ segments = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+|\s+', text)
296
+ return segments
297
+
298
def _analyze_segment(self, segment: str, offset: int) -> List[Morpheme]:
    """Analyze one whitespace-free segment starting at *offset*."""
    # Non-Hangul segments get a single catch-all tag.
    if not re.match(r'[가-힣]', segment):
        if re.match(r'[a-zA-Z]', segment):
            tag = 'SL'          # Latin letters
        elif segment.isdigit():
            tag = 'SN'          # number
        else:
            tag = 'SW'          # other symbol
        return [Morpheme(segment, segment, tag, offset, offset + len(segment))]

    # Pre-registered conjugated forms: exact whole-word match.
    if segment in self.verb_forms:
        out = []
        cursor = offset
        for piece, tag in self.verb_forms[segment]:
            out.append(Morpheme(piece, piece, tag, cursor, cursor + len(piece)))
            cursor += len(piece)
        return out

    # Whole-word dictionary hits (particles/endings not stripped).
    if segment in self.nouns:
        tag = self.nouns[segment]
        return [Morpheme(segment, segment, tag, offset, offset + len(segment))]
    if segment in self.adverbs:
        tag = self.adverbs[segment]
        return [Morpheme(segment, segment, tag, offset, offset + len(segment))]

    # Fall back to full stem + particle/ending analysis.
    return self._morpheme_analysis(segment, offset)
322
+
323
def _morpheme_analysis(self, word: str, offset: int) -> List[Morpheme]:
    """Split *word* into stem + particle, or stem + ending, else fall back."""
    # 1) Particle (josa) separation, longest particle first.
    for particle, tag in sorted(self.josa.items(), key=lambda kv: -len(kv[0])):
        if not (word.endswith(particle) and len(word) > len(particle)):
            continue
        stem = word[:-len(particle)]

        # Respect the batchim-dependent allomorph choice (이/가 etc.).
        if not self._check_josa_condition(stem, particle):
            continue
        stem_morphs = self._analyze_stem(stem, offset)
        if stem_morphs:
            stem_morphs.append(Morpheme(
                particle, particle, tag,
                offset + len(stem), offset + len(word)))
            return stem_morphs

    # 2) Predicate analysis (stem + ending).
    conjugated = self._analyze_verb(word, offset)
    if conjugated:
        return conjugated

    # 3) Unknown word: keep it whole as a common noun.
    return [Morpheme(word, word, 'NNG', offset, offset + len(word))]
350
+
351
+ def _check_josa_condition(self, stem: str, josa: str) -> bool:
352
+ """조사 음운 조건 확인"""
353
+ if not stem:
354
+ return False
355
+
356
+ last_char = stem[-1]
357
+ has_jong = self._has_jongseong(last_char)
358
+
359
+ # 이/가, 을/를, 은/는, 과/와 등 받침에 따른 이형태
360
+ if josa in ('이', '을', '은', '과', '으로', '이랑', '이나', '이라도', '이든지', '이야'):
361
+ return has_jong
362
+ if josa in ('가', '를', '는', '와', '로', '랑', '나', '라도', '든지', '야'):
363
+ return not has_jong
364
+
365
+ return True
366
+
367
def _analyze_stem(self, stem: str, offset: int) -> List[Morpheme]:
    """Analyze the nominal part left after a particle has been stripped."""
    # Known noun: single morpheme.
    if stem in self.nouns:
        return [Morpheme(stem, stem, self.nouns[stem],
                         offset, offset + len(stem))]

    # Try a compound-noun split.
    parts = self._decompose_compound(stem, offset)
    if parts:
        return parts

    # Unknown stem: accept as a noun only when two or more syllables.
    if len(stem) >= 2:
        return [Morpheme(stem, stem, 'NNG', offset, offset + len(stem))]
    return []
383
+
384
def _decompose_compound(self, word: str, offset: int) -> List[Morpheme]:
    """Greedy left-to-right longest-match split of *word* into known nouns.

    Candidates are 2..10 syllables long (single-syllable nouns never
    match here); an unmatched tail becomes one unknown-noun chunk.
    Returns [] unless the word splits into at least two pieces.
    """
    pieces = []
    cursor = 0
    total = len(word)

    while cursor < total:
        matched = 0
        # Longest candidate first.
        for size in range(min(total - cursor, 10), 1, -1):
            candidate = word[cursor:cursor + size]
            if candidate in self.nouns:
                pieces.append(Morpheme(
                    candidate, candidate, self.nouns[candidate],
                    offset + cursor, offset + cursor + size))
                matched = size
                break
        if matched:
            cursor += matched
        else:
            # Emit whatever is left as an unknown noun and stop.
            tail = word[cursor:]
            if tail:
                pieces.append(Morpheme(
                    tail, tail, 'NNG',
                    offset + cursor, offset + total))
            break

    return pieces if len(pieces) > 1 else []
415
+
416
def _analyze_verb(self, word: str, offset: int) -> List[Morpheme]:
    """Split a conjugated predicate into stem (+ pre-final ending) + ending."""

    # -----------------------------------------------------------------
    # Ultra-common polite ending 합니다 (e.g. 사랑합니다 -> 사랑 + 합니다).
    # Intentionally conservative and substring-based so offsets stay exact.
    # -----------------------------------------------------------------
    polite = "합니다"
    if (word.endswith(polite) and len(word) > len(polite)
            and re.fullmatch(r"[가-힣]+", word)):
        head = word[:-len(polite)]
        if head:
            cut = offset + len(head)
            return [
                Morpheme(head, head, "NNG", offset, cut),
                Morpheme(polite, polite, "EF", cut, offset + len(word)),
            ]

    longest_first = lambda kv: -len(kv[0])

    # Pre-final + final ending combinations, longest candidates first.
    for pre, pre_tag in sorted(self.prefinal.items(), key=longest_first):
        for ending, end_tag in sorted(self.eomi.items(), key=longest_first):
            tail = pre + ending
            if not (word.endswith(tail) and len(word) > len(tail)):
                continue
            head = word[:-len(tail)]
            stem_info = self._find_verb_stem(head)
            # Unknown stem: still split on long/polite endings so that
            # single tokens like 먹었습니다 come apart.
            if not stem_info and (len(ending) >= 2 or len(pre) >= 2
                                  or pre in {"았", "었", "였", "겠", "시", "셨"}):
                if head and re.fullmatch(r"[가-힣]+", head):
                    stem_info = (head, "VV")
            if stem_info:
                a = offset + len(head)
                b = a + len(pre)
                return [
                    Morpheme(head, stem_info[0], stem_info[1], offset, a),
                    Morpheme(pre, pre, pre_tag, a, b),
                    Morpheme(ending, ending, end_tag, b, offset + len(word)),
                ]

    # Final ending alone.
    for ending, end_tag in sorted(self.eomi.items(), key=longest_first):
        if not (word.endswith(ending) and len(word) > len(ending)):
            continue
        head = word[:-len(ending)]
        stem_info = self._find_verb_stem(head)
        if not stem_info:
            # Only 2+-char endings, to avoid false splits (바다 -> 바 + 다).
            if len(ending) >= 2 and head and re.fullmatch(r"[가-힣]+", head):
                stem_info = (head, "VV")
        if stem_info:
            cut = offset + len(head)
            return [
                Morpheme(head, stem_info[0], stem_info[1], offset, cut),
                Morpheme(ending, ending, end_tag, cut, offset + len(word)),
            ]

    return []
482
+
483
+ def _find_verb_stem(self, stem: str) -> Optional[Tuple[str, str]]:
484
+ """용언 어간 찾기"""
485
+ # 직접 매칭
486
+ if stem in self.verbs:
487
+ return (stem, self.verbs[stem])
488
+
489
+ # 접미사 분리 (하다, 되다 등)
490
+ for suffix, pos in self.suffix.items():
491
+ if stem.endswith(suffix) and len(stem) > len(suffix):
492
+ noun_part = stem[:-len(suffix)]
493
+ if noun_part in self.nouns or len(noun_part) >= 2:
494
+ return (stem, 'VV') # 파생동사
495
+
496
+ # 불규칙 활용 처리
497
+ irregular = self._check_irregular(stem)
498
+ if irregular:
499
+ return irregular
500
+
501
+ return None
502
+
503
+ def _check_irregular(self, stem: str) -> Optional[Tuple[str, str]]:
504
+ """불규칙 활용 확인"""
505
+ if not stem:
506
+ return None
507
+
508
+ # ㅂ 불규칙: 아름다우 -> 아름답
509
+ if stem.endswith('우') or stem.endswith('워'):
510
+ base = stem[:-1]
511
+ # 기본형 복원 시도
512
+ if base + 'ㅂ' in self.verbs:
513
+ return (base + 'ㅂ', self.verbs[base + 'ㅂ'])
514
+
515
+ # ㄷ 불규칙: 들어 -> 듣
516
+ # ㅅ 불규칙: 나아 -> 낫
517
+ # 르 불규칙: 모라 -> 모르
518
+
519
+ return None
520
+
521
def pos_tag(self, text: str) -> List[Tuple[str, str]]:
    """Return (surface, POS-tag) pairs for *text*."""
    return [(m.surface, m.pos) for m in self.analyze(text)]
525
+
526
def nouns(self, text: str) -> List[str]:
    """Extract noun surfaces (POS tags starting with 'N') from *text*.

    NOTE(review): ``_build_dictionary`` assigns the instance attribute
    ``self.nouns`` (a dict), which shadows this method on instances —
    ``analyzer.nouns(text)`` raises TypeError at runtime.  Left
    untouched here to keep the public interface unchanged; one of the
    two names should be renamed upstream — confirm intent.
    """
    return [m.surface for m in self.analyze(text) if m.pos.startswith('N')]
530
+
531
def lemmatize(self, text: str) -> List[str]:
    """Return the base form (lemma) of every morpheme in *text*."""
    return [m.lemma for m in self.analyze(text)]