tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,560 @@
"""
English Advanced Morphological Analyzer
=======================================

English morphological analyzer supporting five advanced features

Features:
    1. NER Gazetteer Integration - preserve named-entity boundaries
    2. Real-time Dictionary Extension - extend dictionaries at runtime
    3. Domain Adaptation - domain-specific analysis
    4. Code-switching - handle mixed-language text
    5. N-best Analysis - multiple candidates + confidence scores
"""

import re
import json
from pathlib import Path
from typing import List, Tuple, Dict, Set, Optional, Any

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)

# Extended dictionary path
from .. import resources

# Optional external asset dir (default: none). If you want extended dictionaries,
# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
DICT_DIR = resources.data_dir() / "extended_dict"


class EnglishAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    English advanced morphological analyzer

    Usage:
        analyzer = EnglishAdvancedAnalyzer()

        # Basic analysis
        result = analyzer.analyze("Apple announced new products")

        # Entity preservation
        analyzer.add_entity("Apple", "ORG")
        result = analyzer.analyze("Apple announced", preserve_entities=True)

        # Domain adaptation
        result = analyzer.analyze("apple", domain="food")  # fruit
        result = analyzer.analyze("apple", domain="tech")  # company

        # N-best analysis
        result = analyzer.analyze("bank", n_best=3)
    """

    LANG_CODE = "en"
    LANG_NAME = "English"

    # Token patterns
    WORD_PATTERN = re.compile(r"[a-zA-Z]+(?:'[a-zA-Z]+)?")
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:\.[0-9]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base dictionary"""

        # =================================================================
        # Irregular Verbs
        # =================================================================
        self.irregular_verbs = {
            # Past tense
            'went': 'go', 'gone': 'go', 'goes': 'go',
            'saw': 'see', 'seen': 'see', 'sees': 'see',
            'came': 'come', 'comes': 'come',
            'took': 'take', 'taken': 'take', 'takes': 'take',
            'made': 'make', 'makes': 'make',
            'said': 'say', 'says': 'say',
            'got': 'get', 'gotten': 'get', 'gets': 'get',
            'knew': 'know', 'known': 'know', 'knows': 'know',
            'thought': 'think', 'thinks': 'think',
            'found': 'find', 'finds': 'find',
            'gave': 'give', 'given': 'give', 'gives': 'give',
            'told': 'tell', 'tells': 'tell',
            'became': 'become', 'becomes': 'become',
            'left': 'leave', 'leaves': 'leave',
            'felt': 'feel', 'feels': 'feel',
            'brought': 'bring', 'brings': 'bring',
            'began': 'begin', 'begun': 'begin', 'begins': 'begin',
            'kept': 'keep', 'keeps': 'keep',
            'held': 'hold', 'holds': 'hold',
            'wrote': 'write', 'written': 'write', 'writes': 'write',
            'stood': 'stand', 'stands': 'stand',
            'heard': 'hear', 'hears': 'hear',
            'let': 'let', 'lets': 'let',
            'meant': 'mean', 'means': 'mean',
            'set': 'set', 'sets': 'set',
            'met': 'meet', 'meets': 'meet',
            'ran': 'run', 'runs': 'run',
            'paid': 'pay', 'pays': 'pay',
            'sat': 'sit', 'sits': 'sit',
            'spoke': 'speak', 'spoken': 'speak', 'speaks': 'speak',
            'lay': 'lie', 'lain': 'lie', 'lies': 'lie',
            'led': 'lead', 'leads': 'lead',
            'read': 'read', 'reads': 'read',
            'grew': 'grow', 'grown': 'grow', 'grows': 'grow',
            'lost': 'lose', 'loses': 'lose',
            'fell': 'fall', 'fallen': 'fall', 'falls': 'fall',
            'sent': 'send', 'sends': 'send',
            'built': 'build', 'builds': 'build',
            'understood': 'understand', 'understands': 'understand',
            'drew': 'draw', 'drawn': 'draw', 'draws': 'draw',
            'broke': 'break', 'broken': 'break', 'breaks': 'break',
            'spent': 'spend', 'spends': 'spend',
            'cut': 'cut', 'cuts': 'cut',
            'hit': 'hit', 'hits': 'hit',
            'put': 'put', 'puts': 'put',
            'shut': 'shut', 'shuts': 'shut',
            # Forms of 'be'
            'am': 'be', 'is': 'be', 'are': 'be', 'was': 'be', 'were': 'be', 'been': 'be',
            # have
            'has': 'have', 'had': 'have',
            # do
            'does': 'do', 'did': 'do', 'done': 'do',
            # will/would/can/could etc. are handled as auxiliaries
        }

        # =================================================================
        # Irregular Plurals
        # =================================================================
        self.irregular_plurals = {
            'men': 'man', 'women': 'woman',
            'children': 'child', 'feet': 'foot', 'teeth': 'tooth',
            'mice': 'mouse', 'geese': 'goose', 'oxen': 'ox',
            'people': 'person', 'lives': 'life', 'knives': 'knife',
            'wives': 'wife', 'selves': 'self', 'leaves': 'leaf',
            'loaves': 'loaf', 'halves': 'half', 'wolves': 'wolf',
            'calves': 'calf', 'shelves': 'shelf', 'thieves': 'thief',
            'phenomena': 'phenomenon', 'criteria': 'criterion',
            'analyses': 'analysis', 'bases': 'basis',
            'crises': 'crisis', 'theses': 'thesis',
            'data': 'datum', 'media': 'medium',
            'indices': 'index', 'matrices': 'matrix',
        }

        # =================================================================
        # Function Words
        # =================================================================
        self.determiners = {
            'the', 'a', 'an', 'this', 'that', 'these', 'those',
            'my', 'your', 'his', 'her', 'its', 'our', 'their',
            'some', 'any', 'no', 'every', 'each', 'all', 'both',
            'few', 'many', 'much', 'several', 'enough',
        }

        self.pronouns = {
            'i', 'you', 'he', 'she', 'it', 'we', 'they',
            'me', 'him', 'her', 'us', 'them',
            'myself', 'yourself', 'himself', 'herself', 'itself',
            'ourselves', 'yourselves', 'themselves',
            'who', 'whom', 'whose', 'which', 'what', 'that',
            'whoever', 'whomever', 'whatever', 'whichever',
        }

        self.prepositions = {
            'in', 'on', 'at', 'to', 'for', 'with', 'by', 'from',
            'of', 'about', 'into', 'through', 'during', 'before',
            'after', 'above', 'below', 'between', 'under', 'over',
            'against', 'among', 'around', 'behind', 'beside',
            'without', 'within', 'along', 'across', 'beyond',
        }

        self.conjunctions = {
            'and', 'or', 'but', 'nor', 'yet', 'so', 'for',
            'because', 'although', 'though', 'while', 'if', 'unless',
            'until', 'when', 'where', 'whether', 'since', 'as',
        }

        self.auxiliaries = {
            'will', 'would', 'shall', 'should', 'can', 'could',
            'may', 'might', 'must', 'need', 'dare', 'ought',
        }

        self.adverbs = {
            'very', 'really', 'quite', 'rather', 'too', 'also',
            'just', 'only', 'even', 'still', 'already', 'always',
            'never', 'often', 'sometimes', 'usually', 'seldom',
            'here', 'there', 'now', 'then', 'today', 'yesterday',
            'tomorrow', 'soon', 'ago', 'well', 'badly', 'quickly',
            'slowly', 'carefully', 'easily', 'hard', 'fast',
        }

        # =================================================================
        # Adjectives
        # =================================================================
        self.adjectives = {
            # Basic adjectives
            'good', 'bad', 'new', 'old', 'young', 'big', 'small', 'large',
            'long', 'short', 'high', 'low', 'great', 'little', 'other',
            'same', 'different', 'important', 'right', 'wrong', 'real',
            'true', 'false', 'sure', 'certain', 'clear', 'free', 'full',
            'empty', 'open', 'close', 'early', 'late', 'easy', 'hard',
            'hot', 'cold', 'warm', 'cool', 'dark', 'light', 'bright',
            'heavy', 'strong', 'weak', 'rich', 'poor', 'happy', 'sad',
            'angry', 'afraid', 'alone', 'alive', 'dead', 'ready', 'busy',
            'simple', 'complex', 'special', 'general', 'common', 'rare',
            'strange', 'normal', 'natural', 'human', 'social', 'political',
            'economic', 'public', 'private', 'local', 'national', 'international',
            'main', 'major', 'minor', 'final', 'total', 'whole', 'single',
            'double', 'various', 'similar', 'recent', 'current', 'present',
            'past', 'future', 'ancient', 'modern', 'traditional', 'popular',
            'famous', 'beautiful', 'pretty', 'ugly', 'nice', 'fine', 'perfect',
            'terrible', 'wonderful', 'excellent', 'amazing', 'incredible',
            'possible', 'impossible', 'necessary', 'available', 'responsible',
            'successful', 'powerful', 'useful', 'dangerous', 'safe', 'healthy',
        }

        # =================================================================
        # Ambiguous Words - different senses per domain
        # =================================================================
        self.ambiguous_words = {
            'apple': {'food': 'fruit', 'tech': 'company', 'default': 'fruit'},
            'bank': {'finance': 'financial institution', 'nature': 'river bank', 'default': 'financial institution'},
            'java': {'tech': 'programming language', 'food': 'coffee', 'default': 'programming language'},
            'python': {'tech': 'programming language', 'nature': 'snake', 'default': 'programming language'},
            'ruby': {'tech': 'programming language', 'default': 'gemstone'},
            'shell': {'tech': 'command shell', 'nature': 'seashell', 'default': 'shell'},
            'bug': {'tech': 'software bug', 'nature': 'insect', 'default': 'insect'},
            'cloud': {'tech': 'cloud computing', 'nature': 'sky cloud', 'default': 'sky cloud'},
            'mouse': {'tech': 'computer mouse', 'nature': 'animal', 'default': 'animal'},
            'server': {'tech': 'computer server', 'food': 'person serving', 'default': 'computer server'},
        }

        # =================================================================
        # Load extended dictionary (optional external asset)
        # =================================================================
        self._load_extended_dictionary()

    def _load_extended_dictionary(self):
        """Load optional external extended dictionary"""
        dict_path = DICT_DIR / 'en_extended.json'
        if not dict_path.exists():
            return

        # Initialize the extended word sets
        self.extended_nouns = set()
        self.extended_verbs = set()

        with open(dict_path, 'r', encoding='utf-8') as f:
            extended = json.load(f)

        # Add entries to the extended dictionaries
        for word, upos in extended.items():
            word_lower = word.lower()
            if upos in ('NOUN', 'PROPN'):
                self.extended_nouns.add(word_lower)
            elif upos == 'VERB' and word_lower not in self.irregular_verbs:
                self.extended_verbs.add(word_lower)
            elif upos == 'ADJ':
                self.adjectives.add(word_lower)
            elif upos == 'ADV':
                self.adverbs.add(word_lower)

    def _build_domain_dictionaries(self):
        """Build domain-specific dictionaries"""

        # TECH domain
        self._domain_dictionaries[Domain.TECH] = {
            'apple': ('Apple', 'NNP'),
            'java': ('Java', 'NNP'),
            'python': ('Python', 'NNP'),
            'ruby': ('Ruby', 'NNP'),
            'shell': ('shell', 'NN'),
            'bug': ('bug', 'NN'),
            'cloud': ('cloud', 'NN'),
            'mouse': ('mouse', 'NN'),
            'server': ('server', 'NN'),
            'google': ('Google', 'NNP'),
            'microsoft': ('Microsoft', 'NNP'),
            'amazon': ('Amazon', 'NNP'),
        }

        # FOOD domain
        self._domain_dictionaries[Domain.FOOD] = {
            'apple': ('apple', 'NN'),
            'java': ('java', 'NN'),  # coffee
            'server': ('server', 'NN'),
            'dish': ('dish', 'NN'),
        }

        # FINANCE domain
        self._domain_dictionaries[Domain.FINANCE] = {
            'bank': ('bank', 'NN'),
            'stock': ('stock', 'NN'),
            'bond': ('bond', 'NN'),
            'market': ('market', 'NN'),
            'apple': ('Apple', 'NNP'),  # stock ticker
            'amazon': ('Amazon', 'NNP'),
        }

        # SPORTS domain
        self._domain_dictionaries[Domain.SPORTS] = {
            'court': ('court', 'NN'),
            'field': ('field', 'NN'),
            'net': ('net', 'NN'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Generate analysis candidates"""
        if not text or not text.strip():
            return [AnalysisResult([])]

        candidates = []

        # Primary analysis
        main_morphemes = self._analyze_text(text, domain)
        main_result = AnalysisResult(
            morphemes=main_morphemes,
            score=1.0,
            domain=domain
        )
        main_result.score = self._score_analysis(main_result)
        candidates.append(main_result)

        return candidates

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Analyze text"""
        if not text:
            return []

        result = []
        pos = 0

        while pos < len(text):
            # Skip whitespace
            if text[pos].isspace():
                pos += 1
                continue

            # Word match
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            # Numbers
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(
                    surface=num, lemma=num, pos='CD',
                    start=pos, end=pos + len(num)
                ))
                pos += len(num)
                continue

            # Everything else (symbols)
            result.append(Morpheme(
                surface=text[pos], lemma=text[pos], pos='SYM',
                start=pos, end=pos + 1
            ))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Analyze a single word"""
        word_lower = word.lower()

        # 1. Check the runtime (user) dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # 2. Check the domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # 3. Check function words
        if word_lower in self.determiners:
            return Morpheme(surface=word, lemma=word_lower, pos='DT', start=offset, end=offset + len(word))
        if word_lower in self.pronouns:
            return Morpheme(surface=word, lemma=word_lower, pos='PRP', start=offset, end=offset + len(word))
        if word_lower in self.prepositions:
            return Morpheme(surface=word, lemma=word_lower, pos='IN', start=offset, end=offset + len(word))
        if word_lower in self.conjunctions:
            return Morpheme(surface=word, lemma=word_lower, pos='CC', start=offset, end=offset + len(word))
        if word_lower in self.auxiliaries:
            return Morpheme(surface=word, lemma=word_lower, pos='MD', start=offset, end=offset + len(word))
        if word_lower in self.adverbs:
            return Morpheme(surface=word, lemma=word_lower, pos='RB', start=offset, end=offset + len(word))
        if word_lower in self.adjectives:
            return Morpheme(surface=word, lemma=word_lower, pos='JJ', start=offset, end=offset + len(word))

        # 4. Irregular verbs
        if word_lower in self.irregular_verbs:
            lemma = self.irregular_verbs[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos='VB', start=offset, end=offset + len(word))

        # 5. Irregular plurals
        if word_lower in self.irregular_plurals:
            lemma = self.irregular_plurals[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos='NNS', start=offset, end=offset + len(word))

        # 6. Extended dictionary (optional external)
        if hasattr(self, 'extended_verbs') and word_lower in self.extended_verbs:
            return Morpheme(surface=word, lemma=word_lower, pos='VB', start=offset, end=offset + len(word))
        if hasattr(self, 'extended_nouns') and word_lower in self.extended_nouns:
            return Morpheme(surface=word, lemma=word_lower, pos='NN', start=offset, end=offset + len(word))

        # 7. Regular inflection analysis
        lemma, pos_tag = self._analyze_morphology(word_lower)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Morphological analysis (lemmatization + POS guessing)"""

        # -ing forms
        if word.endswith('ing') and len(word) > 4:
            stem = word[:-3]
            if stem.endswith(('e',)):
                return (stem + 'e', 'VBG')
            if len(stem) >= 3 and stem[-1] == stem[-2]:  # running -> run
                return (stem[:-1], 'VBG')
            return (stem, 'VBG')

        # -ed forms
        if word.endswith('ed') and len(word) > 3:
            # -ied → -y: carried → carry
            if word.endswith('ied'):
                return (word[:-3] + 'y', 'VBD')

            stem = word[:-2]

            # Undouble the final consonant: stopped → stop, planned → plan
            if len(stem) >= 2 and stem[-1] == stem[-2] and stem[-1] in 'bdgklmnprst':
                return (stem[:-1], 'VBD')

            # Restore silent e: announced → announce, danced → dance
            if stem.endswith(('c', 'v', 'z')):
                return (stem + 'e', 'VBD')

            # Restore e after g preceded by a vowel or nasal: changed → change
            if stem.endswith('g') and len(stem) >= 2 and stem[-2] in 'aeioumn':
                return (stem + 'e', 'VBD')

            # CVC + e pattern: liked → like, hoped → hope
            if len(stem) >= 2 and stem[-2] in 'aeiou' and stem[-1] in 'kptd':
                return (stem + 'e', 'VBD')

            return (stem, 'VBD')

        # -s/-es forms (3rd-person verb or plural noun)
        if word.endswith('ies') and len(word) > 4:
            return (word[:-3] + 'y', 'VBZ')  # or NNS
        if word.endswith('es') and len(word) > 3:
            return (word[:-2], 'VBZ')
        if word.endswith('s') and len(word) > 2:
            return (word[:-1], 'VBZ')

        # -ly forms (adverbs)
        if word.endswith('ly') and len(word) > 3:
            return (word[:-2], 'RB')

        # -ness forms (nouns)
        if word.endswith('ness') and len(word) > 5:
            return (word[:-4], 'NN')

        # -tion/-sion forms (nouns)
        if word.endswith(('tion', 'sion')) and len(word) > 5:
            return (word, 'NN')

        # -or forms (nouns): doctor, actor, director
        if word.endswith('or') and len(word) > 3:
            return (word, 'NN')

        # -est forms (superlatives)
        if word.endswith('est') and len(word) > 4:
            return (word[:-3], 'JJS')

        # -er forms (comparative/noun)
        if word.endswith('er') and len(word) > 3:
            base = word[:-2]
            # Comparative patterns: bigger → big, nicer → nice, taller → tall
            # Doubled consonant
            if len(base) >= 2 and base[-1] == base[-2]:
                return (base[:-1], 'JJR')
            # Dropped e
            if len(base) >= 2 and base[-1] in 'cgkptvlns':
                return (base + 'e', 'JJR')
            # Otherwise treat as a noun (teacher, player)
            return (word, 'NN')

        # =================================================================
        # Adjective Suffix Patterns
        # =================================================================

        # -ful forms (adjectives): beautiful, wonderful, powerful
        if word.endswith('ful') and len(word) > 5:
            return (word[:-3], 'JJ')

        # -less forms (adjectives): useless, helpless, careless
        if word.endswith('less') and len(word) > 5:
            return (word[:-4], 'JJ')

        # -ous forms (adjectives): famous, dangerous, nervous
        if word.endswith('ous') and len(word) > 4:
            return (word, 'JJ')

        # -ive forms (adjectives): active, creative, impressive
        if word.endswith('ive') and len(word) > 4:
            return (word, 'JJ')

        # -able/-ible forms (adjectives): available, possible, incredible
        if word.endswith(('able', 'ible')) and len(word) > 5:
            return (word, 'JJ')

        # -al/-ial/-ical forms (adjectives): natural, social, political
        if word.endswith(('ical', 'ial')) and len(word) > 5:
            return (word, 'JJ')
        if word.endswith('al') and len(word) > 3:
            return (word, 'JJ')

        # -ent/-ant forms (adjectives): different, important, excellent
        if word.endswith(('ent', 'ant')) and len(word) > 4:
            return (word, 'JJ')

        # Starts with an uppercase letter (proper noun)
        if word[0].isupper():
            return (word, 'NNP')

        # Default: noun
        return (word, 'NN')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Generate alternative analysis results"""
        alternatives = []

        # Analyze with the other domains
        other_domains = [d for d in Domain if d != domain][:count]

        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(
                morphemes=morphemes,
                score=0.8,
                domain=alt_domain
            )
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)

        return alternatives


# Alias for backward compatibility
EnglishAnalyzer = EnglishAdvancedAnalyzer