tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Korean Advanced Morphological Analyzer
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
5가지 고급 기능을 지원하는 한국어 형태소 분석기
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
1. NER Gazetteer Integration - 개체명 경계 보존
|
|
9
|
+
2. Real-time Dictionary Extension - 런타임 사전 확장
|
|
10
|
+
3. Domain Adaptation - 도메인별 분석 최적화
|
|
11
|
+
4. Code-switching - 영한 혼용 텍스트 처리
|
|
12
|
+
5. N-best Analysis - 다중 후보 + 신뢰도 점수
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import json
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import List, Tuple, Dict, Set, Optional, Any
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
from .advanced_base import (
|
|
22
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# 확장 사전 경로
|
|
26
|
+
from .. import resources
|
|
27
|
+
|
|
28
|
+
# Optional external asset dir (default: none). If you want extended dictionaries,
|
|
29
|
+
# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
|
|
30
|
+
DICT_DIR = resources.data_dir() / "extended_dict"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class KoreanAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Korean advanced morphological analyzer.

    Implements five advanced features on top of the shared
    AdvancedMorphologicalAnalyzer base:

    1. NER gazetteer integration  - preserve named-entity boundaries
    2. Real-time dictionary extension - runtime lexicon updates
    3. Domain adaptation          - domain-specific sense selection
    4. Code-switching             - mixed Korean/English text handling
    5. N-best analysis            - multiple candidates with confidence scores

    Usage:
        analyzer = KoreanAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("삼성전자가 서울에서 발표했다")

        # Entity-boundary preservation
        analyzer.add_entity("삼성전자", "ORG")
        result = analyzer.analyze("삼성전자가 발표했다", preserve_entities=True)

        # Domain adaptation ('배' is ambiguous: pear / abdomen / ...)
        result = analyzer.analyze("배가 아파요", domain="food")     # fruit sense
        result = analyzer.analyze("배가 아파요", domain="medical")  # body-part sense

        # Runtime dictionary extension
        analyzer.add_word("뉴진스", pos="NNP", domain="entertainment")

        # N-best analysis
        result = analyzer.analyze("사과", n_best=3)
    """

    # Language identifiers consumed by the analyzer factory/registry layer.
    LANG_CODE = "ko"
    LANG_NAME = "Korean"
|
|
60
|
+
|
|
61
|
+
def __init__(self):
    # Hangul jamo inventories: initial consonants (choseong), medial
    # vowels (jungseong), and final consonants (jongseong), used for
    # syllable-level checks.
    self.CHO = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
    self.JUNG = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ'
    # NOTE: the leading space entry represents "no final consonant".
    self.JONG = ' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ'

    # Base-class __init__ triggers _build_base_dictionary() /
    # _build_domain_dictionaries(), so jamo tables must exist first.
    super().__init__()
|
|
68
|
+
|
|
69
|
+
def _build_base_dictionary(self):
    """Build the built-in Korean lexicons.

    Populates particle (josa), ending (eomi), pre-final ending, suffix,
    noun, conjugated-form, verb-stem, and adverb dictionaries, then
    merges the optional external extended dictionary and defines the
    vowel-harmony classes.  Values are Sejong-style POS tags.
    """

    # =================================================================
    # Particles (josa)
    # =================================================================
    self.josa = {
        # Case particles
        '이': 'JKS', '가': 'JKS', '께서': 'JKS',
        '을': 'JKO', '를': 'JKO',
        '의': 'JKG',
        '에': 'JKB', '에서': 'JKB', '에게': 'JKB', '한테': 'JKB',
        '로': 'JKB', '으로': 'JKB', '에게로': 'JKB',
        '와': 'JKB', '과': 'JKB', '랑': 'JKB', '이랑': 'JKB',
        '보다': 'JKB', '처럼': 'JKB', '같이': 'JKB', '만큼': 'JKB',
        # Auxiliary particles
        '은': 'JX', '는': 'JX', '도': 'JX', '만': 'JX', '까지': 'JX',
        '부터': 'JX', '마저': 'JX', '조차': 'JX', '밖에': 'JX',
        '라도': 'JX', '이라도': 'JX', '나': 'JX', '이나': 'JX',
        '든지': 'JX', '이든지': 'JX', '야': 'JX', '이야': 'JX',
        # Conjunctive particles
        '하고': 'JC', '이며': 'JC', '며': 'JC',
    }

    # =================================================================
    # Verb/adjective endings (eomi)
    # =================================================================
    self.eomi = {
        # Sentence-final endings
        '다': 'EF', 'ㄴ다': 'EF', '는다': 'EF', '니다': 'EF', '습니다': 'EF',
        '어요': 'EF', '아요': 'EF', '여요': 'EF', '해요': 'EF',
        '어': 'EF', '아': 'EF', '지': 'EF', '네': 'EF', '군': 'EF', '구나': 'EF',
        '냐': 'EF', '니': 'EF', '나': 'EF', '자': 'EF', '세요': 'EF', '십시오': 'EF',
        '라': 'EF', '거라': 'EF', '렴': 'EF', '려무나': 'EF',
        # Connective endings
        '고': 'EC', '며': 'EC', '면서': 'EC', '면': 'EC', '으면': 'EC',
        '서': 'EC', '아서': 'EC', '어서': 'EC', '여서': 'EC',
        '니까': 'EC', '으니까': 'EC',
        '지만': 'EC', '는데': 'EC', '은데': 'EC', 'ㄴ데': 'EC',
        '도록': 'EC', '게': 'EC', '려고': 'EC', '으려고': 'EC',
        '러': 'EC', '으러': 'EC', '자마자': 'EC',
        # Adnominal (modifier-forming) endings
        '는': 'ETM', '은': 'ETM', 'ㄴ': 'ETM', 'ㄹ': 'ETM', '을': 'ETM',
        # Nominalizing endings
        '음': 'ETN', '기': 'ETN', 'ㅁ': 'ETN',
    }

    # =================================================================
    # Pre-final endings (tense/honorific markers)
    # =================================================================
    self.prefinal = {
        '았': 'EP', '었': 'EP', '였': 'EP', '겠': 'EP',
        '시': 'EP', '으시': 'EP', '셨': 'EP', '으셨': 'EP',
    }

    # =================================================================
    # Derivational suffixes (verbalizers / adjectivizers)
    # =================================================================
    self.suffix = {
        '하': 'XSV', '되': 'XSV', '시키': 'XSV', '당하': 'XSV',
        '스럽': 'XSA', '롭': 'XSA', '답': 'XSA', '적': 'XSA',
    }

    # =================================================================
    # Nouns
    # =================================================================
    self.nouns = {
        # Common nouns
        '것': 'NNG', '수': 'NNG', '등': 'NNG', '때': 'NNG', '곳': 'NNG',
        '사람': 'NNG', '사람들': 'NNG', '일': 'NNG', '말': 'NNG', '생각': 'NNG',
        '문제': 'NNG', '경우': 'NNG', '사실': 'NNG', '점': 'NNG', '시간': 'NNG',
        '세계': 'NNG', '나라': 'NNG', '정부': 'NNG', '사회': 'NNG', '국가': 'NNG',
        '회사': 'NNG', '기업': 'NNG', '시장': 'NNG', '경제': 'NNG', '산업': 'NNG',
        '기술': 'NNG', '발표': 'NNG', '개발': 'NNG', '연구': 'NNG', '조사': 'NNG',
        '결과': 'NNG', '계획': 'NNG', '방법': 'NNG', '이유': 'NNG', '내용': 'NNG',
        '오늘': 'NNG', '내일': 'NNG', '어제': 'NNG', '올해': 'NNG', '작년': 'NNG',
        '학교': 'NNG', '대학': 'NNG', '학생': 'NNG', '선생': 'NNG', '공부': 'NNG',
        '집': 'NNG', '방': 'NNG', '문': 'NNG', '길': 'NNG', '차': 'NNG',
        '돈': 'NNG', '물': 'NNG', '밥': 'NNG', '책': 'NNG', '글': 'NNG',
        '배': 'NNG', '사과': 'NNG', '과일': 'NNG', '음식': 'NNG',
        # Proper nouns
        '삼성': 'NNP', '현대': 'NNP', '서울': 'NNP', '부산': 'NNP', '한국': 'NNP',
        '미국': 'NNP', '중국': 'NNP', '일본': 'NNP',
        '삼성전자': 'NNP', '현대자동차': 'NNP', 'LG전자': 'NNP', 'SK하이닉스': 'NNP',
        # Bound (dependent) nouns
        '데': 'NNB', '바': 'NNB', '뿐': 'NNB', '줄': 'NNB', '리': 'NNB',
        # Pronouns
        '나': 'NP', '너': 'NP', '저': 'NP', '우리': 'NP', '그': 'NP', '이': 'NP',
        '누구': 'NP', '무엇': 'NP', '어디': 'NP', '언제': 'NP',
        # Numerals
        '하나': 'NR', '둘': 'NR', '셋': 'NR', '넷': 'NR', '다섯': 'NR',
    }

    # =================================================================
    # Known conjugated predicate forms (stem + ending decompositions)
    # =================================================================
    self.verb_forms = {
        # 가다 (to go)
        '간다': [('가', 'VV'), ('ㄴ다', 'EF')],
        '갔다': [('가', 'VV'), ('았', 'EP'), ('다', 'EF')],
        '가면': [('가', 'VV'), ('면', 'EC')],
        '가고': [('가', 'VV'), ('고', 'EC')],
        '가서': [('가', 'VV'), ('서', 'EC')],
        # 오다 (to come)
        '온다': [('오', 'VV'), ('ㄴ다', 'EF')],
        '왔다': [('오', 'VV'), ('았', 'EP'), ('다', 'EF')],
        # 하다 (to do)
        '한다': [('하', 'VV'), ('ㄴ다', 'EF')],
        '했다': [('하', 'VV'), ('였', 'EP'), ('다', 'EF')],
        '하고': [('하', 'VV'), ('고', 'EC')],
        '하면': [('하', 'VV'), ('면', 'EC')],
        '하는': [('하', 'VV'), ('는', 'ETM')],
        # 되다 (to become)
        '된다': [('되', 'VV'), ('ㄴ다', 'EF')],
        '됐다': [('되', 'VV'), ('었', 'EP'), ('다', 'EF')],
        # 있다/없다 (to exist / to not exist)
        '있다': [('있', 'VX'), ('다', 'EF')],
        '있는': [('있', 'VX'), ('는', 'ETM')],
        '없다': [('없', 'VX'), ('다', 'EF')],
        '없는': [('없', 'VX'), ('는', 'ETM')],
        # 이다 (copula)
        '이다': [('이', 'VCP'), ('다', 'EF')],
        # 발표하다 (to announce)
        '발표했다': [('발표', 'NNG'), ('하', 'XSV'), ('였', 'EP'), ('다', 'EF')],
        '발표한다': [('발표', 'NNG'), ('하', 'XSV'), ('ㄴ다', 'EF')],
        '발표하는': [('발표', 'NNG'), ('하', 'XSV'), ('는', 'ETM')],
        # 아프다 (to hurt)
        '아파요': [('아프', 'VA'), ('아요', 'EF')],
        '아프다': [('아프', 'VA'), ('다', 'EF')],
    }

    # =================================================================
    # Predicate stems (verbs VV, adjectives VA, copula VCP)
    # =================================================================
    self.verbs = {
        '하': 'VV', '되': 'VV', '가': 'VV', '오': 'VV', '보': 'VV',
        '알': 'VV', '모르': 'VV', '주': 'VV', '받': 'VV', '만들': 'VV',
        '말하': 'VV', '생각하': 'VV', '사용하': 'VV', '발표하': 'VV',
        '나오': 'VV', '들어가': 'VV', '나가': 'VV', '들어오': 'VV',
        '있': 'VV', '없': 'VV', '같': 'VV',
        '크': 'VA', '작': 'VA', '많': 'VA', '적': 'VA', '좋': 'VA', '나쁘': 'VA',
        '높': 'VA', '낮': 'VA', '길': 'VA', '짧': 'VA', '넓': 'VA', '좁': 'VA',
        '새롭': 'VA', '어렵': 'VA', '쉽': 'VA', '아름답': 'VA', '아프': 'VA',
        '이': 'VCP',
    }

    # =================================================================
    # Adverbs (general MAG, conjunctive MAJ)
    # =================================================================
    self.adverbs = {
        '매우': 'MAG', '아주': 'MAG', '너무': 'MAG', '정말': 'MAG', '진짜': 'MAG',
        '가장': 'MAG', '더': 'MAG', '덜': 'MAG', '잘': 'MAG', '못': 'MAG',
        '다': 'MAG', '모두': 'MAG', '함께': 'MAG', '같이': 'MAG', '다시': 'MAG',
        '또': 'MAG', '이미': 'MAG', '아직': 'MAG', '벌써': 'MAG', '곧': 'MAG',
        '그리고': 'MAJ', '그러나': 'MAJ', '그래서': 'MAJ', '하지만': 'MAJ',
    }

    # ================================================================
    # Optional external extended dictionary (merged without overriding)
    # ================================================================
    self._load_extended_dictionary()

    # Vowel-harmony classes ("bright" yang vs. "dark" eum vowels)
    self.yang_vowels = {'ㅏ', 'ㅗ', 'ㅑ', 'ㅛ'}
    self.eum_vowels = {'ㅓ', 'ㅜ', 'ㅡ', 'ㅕ', 'ㅠ', 'ㅣ'}
|
|
234
|
+
|
|
235
|
+
def _load_extended_dictionary(self):
    """Merge the optional external extended dictionary into the lexicons.

    Reads ``TOKMOR_DATA_DIR/extended_dict/ko_extended.json`` (a flat
    ``{word: pos}`` mapping) when present and adds each entry to the
    in-memory dictionary matching its POS tag.  Existing entries always
    win, so the built-in lexicon cannot be overridden.  A missing,
    unreadable, or malformed file is silently ignored — the extended
    dictionary is a purely optional asset and must never block startup.
    """
    dict_path = DICT_DIR / 'ko_extended.json'
    if not dict_path.exists():
        return

    try:
        with open(dict_path, 'r', encoding='utf-8') as f:
            extended = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Optional asset: a broken file must not crash analyzer construction.
        return

    # Route each POS tag to the dictionary that stores it.  This replaces
    # the previous eight near-identical elif branches.  'MM' (determiners)
    # has no dedicated store yet and is therefore skipped, as before.
    target_by_pos = {
        'NNG': self.nouns, 'NNP': self.nouns, 'NNB': self.nouns,
        'NP': self.nouns, 'NR': self.nouns,
        'VV': self.verbs, 'VA': self.verbs,
        'MAG': self.adverbs,
    }

    # Only add words that the base dictionaries do not already define.
    for word, pos in extended.items():
        target = target_by_pos.get(pos)
        if target is not None and word not in target:
            target[word] = pos
|
|
267
|
+
|
|
268
|
+
def _build_domain_dictionaries(self):
    """Build the per-domain sense dictionaries.

    Each dictionary maps surface form -> (lemma, POS) and is consulted
    before the base noun dictionary, so an ambiguous word such as '배'
    (pear / abdomen / ...) resolves to the sense of the active domain.
    """

    # FOOD domain
    self._domain_dictionaries[Domain.FOOD] = {
        '배': ('배', 'NNG'),  # pear (fruit sense)
        '사과': ('사과', 'NNG'),  # apple
        '밤': ('밤', 'NNG'),  # chestnut
        '감': ('감', 'NNG'),  # persimmon
        '귤': ('귤', 'NNG'),  # tangerine
        '포도': ('포도', 'NNG'),  # grape
    }

    # MEDICAL domain
    self._domain_dictionaries[Domain.MEDICAL] = {
        '배': ('배', 'NNG'),  # abdomen (body-part sense)
        '머리': ('머리', 'NNG'),  # head
        '다리': ('다리', 'NNG'),  # leg
        '팔': ('팔', 'NNG'),  # arm
        '목': ('목', 'NNG'),  # neck
        '허리': ('허리', 'NNG'),  # waist / lower back
    }

    # TECH domain
    self._domain_dictionaries[Domain.TECH] = {
        '배': ('배', 'NNG'),
        '모델': ('모델', 'NNG'),  # model
        '서버': ('서버', 'NNG'),  # server
        '클라우드': ('클라우드', 'NNG'),  # cloud
        '메모리': ('메모리', 'NNG'),  # memory
        '프로세서': ('프로세서', 'NNG'),  # processor
    }

    # SPORTS domain
    self._domain_dictionaries[Domain.SPORTS] = {
        '배': ('배', 'NNG'),
        '공': ('공', 'NNG'),  # ball
        '골': ('골', 'NNG'),  # goal
        '경기': ('경기', 'NNG'),  # match / game
        '선수': ('선수', 'NNG'),  # player / athlete
    }

    # ENTERTAINMENT domain
    self._domain_dictionaries[Domain.ENTERTAINMENT] = {
        '배': ('배', 'NNG'),
        '가수': ('가수', 'NNG'),  # singer
        '배우': ('배우', 'NNG'),  # actor
        '아이돌': ('아이돌', 'NNG'),  # idol
        '뉴진스': ('뉴진스', 'NNP'),  # NewJeans (group)
        'BTS': ('BTS', 'NNP'),
    }

    # FINANCE domain
    self._domain_dictionaries[Domain.FINANCE] = {
        '배': ('배', 'NNG'),
        '주식': ('주식', 'NNG'),  # stock
        '채권': ('채권', 'NNG'),  # bond
        '펀드': ('펀드', 'NNG'),  # fund
        '배당': ('배당', 'NNG'),  # dividend
    }
|
|
328
|
+
|
|
329
|
+
def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
    """Produce candidate analyses for *text*.

    Currently yields a single candidate: the full analysis under the
    requested domain, scored by ``_score_analysis``.  Blank input maps
    to one empty result.
    """
    if not text or not text.strip():
        return [AnalysisResult([])]

    candidate = AnalysisResult(
        morphemes=self._analyze_text(text, domain),
        score=1.0,  # placeholder, replaced by the real score below
        domain=domain,
    )
    candidate.score = self._score_analysis(candidate)
    return [candidate]
|
|
347
|
+
|
|
348
|
+
def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
    """Analyze *text*: segment it, then analyze each non-blank segment.

    A running character cursor keeps morpheme offsets aligned with the
    original string; whitespace segments advance the cursor but produce
    no morphemes.
    """
    if not text:
        return []

    morphemes: List[Morpheme] = []
    cursor = 0
    for chunk in self._segment(text):
        if chunk.strip():
            morphemes.extend(self._analyze_segment(chunk, cursor, domain))
        cursor += len(chunk)
    return morphemes
|
|
367
|
+
|
|
368
|
+
def _segment(self, text: str) -> List[str]:
|
|
369
|
+
"""공백/구두점으로 분리"""
|
|
370
|
+
segments = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+|\s+', text)
|
|
371
|
+
return segments
|
|
372
|
+
|
|
373
|
+
def _analyze_segment(self, segment: str, offset: int, domain: Domain) -> List[Morpheme]:
    """Analyze one homogeneous segment produced by ``_segment``.

    Lookup priority for Hangul segments: runtime user dictionary >
    known conjugated predicate forms > domain dictionary > base
    noun/adverb dictionaries > particle/ending decomposition.
    Non-Hangul segments are tagged directly (SL/SN/SW).
    """

    # Non-Hangul segment (Latin letters, digits, symbols)
    if not re.match(r'[가-힣]', segment):
        # Code-switching: Latin-alphabet token
        if re.match(r'[a-zA-Z]', segment):
            pos = 'SL'  # default: foreign-language token
            # Runtime dictionary may override (lookup is lowercase-keyed)
            if segment.lower() in self._user_dictionary:
                lemma, pos, _ = self._user_dictionary[segment.lower()]
                return [Morpheme(
                    surface=segment, lemma=lemma, pos=pos,
                    start=offset, end=offset + len(segment)
                )]
            return [Morpheme(
                surface=segment, lemma=segment, pos=pos,
                start=offset, end=offset + len(segment)
            )]
        elif segment.isdigit():
            # Numeric token
            return [Morpheme(
                surface=segment, lemma=segment, pos='SN',
                start=offset, end=offset + len(segment)
            )]
        else:
            # Symbols / other characters
            return [Morpheme(
                surface=segment, lemma=segment, pos='SW',
                start=offset, end=offset + len(segment)
            )]

    # Runtime user dictionary (highest priority for Hangul)
    if segment in self._user_dictionary:
        lemma, pos, _ = self._user_dictionary[segment]
        return [Morpheme(
            surface=segment, lemma=lemma, pos=pos,
            start=offset, end=offset + len(segment)
        )]

    # Direct match against known conjugated predicate forms
    if segment in self.verb_forms:
        results = []
        # NOTE(review): 'pos' here is a character position, not a POS tag —
        # it shadows the tag-variable naming used elsewhere in this method.
        pos = offset
        for surface, tag in self.verb_forms[segment]:
            results.append(Morpheme(
                surface=surface, lemma=surface, pos=tag,
                start=pos, end=pos + len(surface)
            ))
            pos += len(surface)
            # NOTE(review): jamo-initial endings such as 'ㄴ다' make the
            # summed surface lengths exceed len(segment) (e.g. '간다' ->
            # '가' + 'ㄴ다'), so end offsets can drift past the segment
            # span — confirm whether this is intended.
        return results

    # Domain-specific sense
    domain_sense = self._get_domain_sense(segment, domain)
    if domain_sense:
        return [Morpheme(
            surface=segment, lemma=domain_sense[0], pos=domain_sense[1],
            start=offset, end=offset + len(segment)
        )]

    # Base dictionaries
    if segment in self.nouns:
        return [Morpheme(
            surface=segment, lemma=segment, pos=self.nouns[segment],
            start=offset, end=offset + len(segment)
        )]
    if segment in self.adverbs:
        return [Morpheme(
            surface=segment, lemma=segment, pos=self.adverbs[segment],
            start=offset, end=offset + len(segment)
        )]

    # Morphological decomposition (particle / ending separation)
    return self._morpheme_analysis(segment, offset, domain)
|
|
445
|
+
|
|
446
|
+
def _morpheme_analysis(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
    """Decompose *word* into stem + particle or stem + ending.

    Strategy: (1) strip the longest matching particle whose
    phonological conditions hold and analyze the remaining stem;
    (2) otherwise try predicate (verb/adjective) ending separation;
    (3) otherwise fall back to tagging the whole word as an
    unregistered common noun (NNG).
    """
    results = []

    # 1. Particle (josa) separation — try longest particles first so
    #    e.g. '에서' wins over '서'.
    for josa, pos in sorted(self.josa.items(), key=lambda x: -len(x[0])):
        if word.endswith(josa) and len(word) > len(josa):
            stem = word[:-len(josa)]

            # Phonological compatibility (final-consonant allomorphy)
            if self._check_josa_condition(stem, josa):
                stem_morphs = self._analyze_stem(stem, offset, domain)
                if stem_morphs:
                    results = stem_morphs
                    results.append(Morpheme(
                        surface=josa, lemma=josa, pos=pos,
                        start=offset + len(stem), end=offset + len(word)
                    ))
                    return results

    # 2. Predicate ending separation
    verb_result = self._analyze_verb(word, offset)
    if verb_result:
        return verb_result

    # 3. Unknown word: guess common noun
    return [Morpheme(
        surface=word, lemma=word, pos='NNG',
        start=offset, end=offset + len(word)
    )]
|
|
476
|
+
|
|
477
|
+
def _check_josa_condition(self, stem: str, josa: str) -> bool:
    """Return True when *josa* is phonologically compatible with *stem*.

    Korean particles come in allomorph pairs selected by whether the
    preceding syllable ends in a final consonant (batchim): 이/가,
    을/를, 은/는, 과/와, 으로/로, etc.  Particles outside those pairs
    attach freely.
    """
    if not stem:
        return False

    ends_closed = self._has_jongseong(stem[-1])

    # Allomorphs that require a final consonant on the stem
    needs_batchim = (
        '이', '을', '은', '과', '으로', '이랑', '이나', '이라도', '이든지', '이야',
    )
    # Their counterparts, which require an open (vowel-final) syllable
    rejects_batchim = (
        '가', '를', '는', '와', '로', '랑', '나', '라도', '든지', '야',
    )

    if josa in needs_batchim:
        return ends_closed
    if josa in rejects_batchim:
        return not ends_closed
    return True
|
|
492
|
+
|
|
493
|
+
def _analyze_stem(self, stem: str, offset: int, domain: Domain) -> List[Morpheme]:
    """Analyze the nominal stem left over after particle separation.

    Lookup priority: runtime user dictionary > domain dictionary >
    base noun dictionary > unknown-noun heuristic (length >= 2).
    Returns [] when the stem cannot plausibly be a nominal, which
    makes the caller reject this particle split.
    """
    # Runtime user dictionary (highest priority)
    if stem in self._user_dictionary:
        lemma, pos, _ = self._user_dictionary[stem]
        return [Morpheme(surface=stem, lemma=lemma, pos=pos, start=offset, end=offset + len(stem))]

    # Domain-specific sense
    domain_sense = self._get_domain_sense(stem, domain)
    if domain_sense:
        return [Morpheme(
            surface=stem, lemma=domain_sense[0], pos=domain_sense[1],
            start=offset, end=offset + len(stem)
        )]

    # Base noun dictionary
    if stem in self.nouns:
        return [Morpheme(surface=stem, lemma=stem, pos=self.nouns[stem], start=offset, end=offset + len(stem))]

    # Unknown-noun heuristic: accept only stems of two or more syllables
    # to avoid mis-splitting single-syllable words.
    if len(stem) >= 2:
        return [Morpheme(surface=stem, lemma=stem, pos='NNG', start=offset, end=offset + len(stem))]

    return []
|
|
517
|
+
|
|
518
|
+
def _analyze_verb(self, word: str, offset: int) -> List[Morpheme]:
    """Decompose a predicate into stem (+ pre-final ending) + final ending.

    Tries the richer pattern first — stem + pre-final ending (tense /
    honorific) + final ending — then stem + final ending alone, both
    with longer ending candidates before shorter ones.  Returns [] when
    no known stem fits any split.
    """
    results = []

    # Pre-final ending + final ending combinations, longest first
    for prefinal, pf_pos in sorted(self.prefinal.items(), key=lambda x: -len(x[0])):
        for eomi, em_pos in sorted(self.eomi.items(), key=lambda x: -len(x[0])):
            suffix = prefinal + eomi
            if word.endswith(suffix) and len(word) > len(suffix):
                stem = word[:-len(suffix)]
                verb_stem = self._find_verb_stem(stem)
                if verb_stem:
                    results.append(Morpheme(
                        surface=stem, lemma=verb_stem[0], pos=verb_stem[1],
                        start=offset, end=offset + len(stem)
                    ))
                    results.append(Morpheme(
                        surface=prefinal, lemma=prefinal, pos=pf_pos,
                        start=offset + len(stem), end=offset + len(stem) + len(prefinal)
                    ))
                    results.append(Morpheme(
                        surface=eomi, lemma=eomi, pos=em_pos,
                        start=offset + len(stem) + len(prefinal), end=offset + len(word)
                    ))
                    # First successful split wins.
                    return results

    # Final ending only
    for eomi, em_pos in sorted(self.eomi.items(), key=lambda x: -len(x[0])):
        if word.endswith(eomi) and len(word) > len(eomi):
            stem = word[:-len(eomi)]
            verb_stem = self._find_verb_stem(stem)
            if verb_stem:
                results.append(Morpheme(
                    surface=stem, lemma=verb_stem[0], pos=verb_stem[1],
                    start=offset, end=offset + len(stem)
                ))
                results.append(Morpheme(
                    surface=eomi, lemma=eomi, pos=em_pos,
                    start=offset + len(stem), end=offset + len(word)
                ))
                return results

    return []
|
|
561
|
+
|
|
562
|
+
def _find_verb_stem(self, stem: str) -> Optional[Tuple[str, str]]:
|
|
563
|
+
"""용언 어간 찾기"""
|
|
564
|
+
if stem in self.verbs:
|
|
565
|
+
return (stem, self.verbs[stem])
|
|
566
|
+
|
|
567
|
+
# 접미사 분리
|
|
568
|
+
for suffix, pos in self.suffix.items():
|
|
569
|
+
if stem.endswith(suffix) and len(stem) > len(suffix):
|
|
570
|
+
noun_part = stem[:-len(suffix)]
|
|
571
|
+
if noun_part in self.nouns or len(noun_part) >= 2:
|
|
572
|
+
return (stem, 'VV')
|
|
573
|
+
|
|
574
|
+
return None
|
|
575
|
+
|
|
576
|
+
def _has_jongseong(self, char: str) -> bool:
|
|
577
|
+
"""받침 유무"""
|
|
578
|
+
if not ('\uAC00' <= char <= '\uD7A3'):
|
|
579
|
+
return False
|
|
580
|
+
return (ord(char) - 0xAC00) % 28 != 0
|
|
581
|
+
|
|
582
|
+
def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
    """Build up to *count* alternative analyses for N-best output.

    Each alternative re-analyzes *text* under a different domain and is
    deliberately discounted (x0.9) relative to the primary analysis.
    """
    alternatives: List[AnalysisResult] = []

    for alt_domain in [d for d in Domain if d != domain][:count]:
        alt = AnalysisResult(
            morphemes=self._analyze_text(text, alt_domain),
            score=0.8,  # placeholder; replaced with the discounted score below
            domain=alt_domain,
        )
        alt.score = self._score_analysis(alt) * 0.9
        alternatives.append(alt)

    return alternatives
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
# Backward-compatible alias: older code imports `KoreanAnalyzer`.
KoreanAnalyzer = KoreanAdvancedAnalyzer
|