tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/english_advanced.py
@@ -0,0 +1,560 @@
+"""
+English Advanced Morphological Analyzer
+=======================================
+
+An English morphological analyzer supporting five advanced features.
+
+Features:
+1. NER Gazetteer Integration - preserves named-entity boundaries
+2. Real-time Dictionary Extension - extends the dictionary at runtime
+3. Domain Adaptation - adapts the analysis to a domain
+4. Code-switching - handles mixed-language text
+5. N-best Analysis - multiple candidates with confidence scores
+"""
+
+import re
+import json
+from pathlib import Path
+from typing import List, Tuple, Dict, Set, Optional, Any
+
+from .advanced_base import (
+    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+)
+
+# Extended dictionary path
+from .. import resources
+
+# Optional external asset dir (default: none). If you want extended dictionaries,
+# provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
+DICT_DIR = resources.data_dir() / "extended_dict"
+
+
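Judging from _load_extended_dictionary further down, an extended dictionary is a flat JSON object mapping words to universal POS tags (NOUN/PROPN/VERB/ADJ/ADV). A minimal sketch of producing such a file follows; the entries are illustrative, and treating TOKMOR_DATA_DIR as an environment variable is an assumption about how resources.data_dir() resolves it:

    # Hypothetical sketch: write a minimal en_extended.json that
    # _load_extended_dictionary() would pick up on the next run.
    import json, os
    from pathlib import Path

    entries = {
        "blockchain": "NOUN",       # -> extended_nouns
        "refactor": "VERB",         # -> extended_verbs
        "performant": "ADJ",        # merged into self.adjectives
        "asynchronously": "ADV",    # merged into self.adverbs
    }
    # Assumption: TOKMOR_DATA_DIR is the env var behind resources.data_dir().
    target = Path(os.environ["TOKMOR_DATA_DIR"]) / "extended_dict" / "en_extended.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(entries, indent=2), encoding="utf-8")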
+class EnglishAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+    """
+    English advanced morphological analyzer
+
+    Usage:
+        analyzer = EnglishAdvancedAnalyzer()
+
+        # Basic analysis
+        result = analyzer.analyze("Apple announced new products")
+
+        # Entity preservation
+        analyzer.add_entity("Apple", "ORG")
+        result = analyzer.analyze("Apple announced", preserve_entities=True)
+
+        # Domain adaptation
+        result = analyzer.analyze("apple", domain="food")  # fruit
+        result = analyzer.analyze("apple", domain="tech")  # company
+
+        # N-best analysis
+        result = analyzer.analyze("bank", n_best=3)
+    """
+
+    LANG_CODE = "en"
+    LANG_NAME = "English"
+
+    # Token patterns
+    WORD_PATTERN = re.compile(r"[a-zA-Z]+(?:'[a-zA-Z]+)?")
+    NUMBER_PATTERN = re.compile(r'[0-9]+(?:\.[0-9]+)?')
+
+    def __init__(self):
+        super().__init__()
+
+    def _build_base_dictionary(self):
+        """Build the base dictionary"""
+
+        # =================================================================
+        # Irregular Verbs
+        # =================================================================
+        self.irregular_verbs = {
+            # Past tense
+            'went': 'go', 'gone': 'go', 'goes': 'go',
+            'saw': 'see', 'seen': 'see', 'sees': 'see',
+            'came': 'come', 'comes': 'come',
+            'took': 'take', 'taken': 'take', 'takes': 'take',
+            'made': 'make', 'makes': 'make',
+            'said': 'say', 'says': 'say',
+            'got': 'get', 'gotten': 'get', 'gets': 'get',
+            'knew': 'know', 'known': 'know', 'knows': 'know',
+            'thought': 'think', 'thinks': 'think',
+            'found': 'find', 'finds': 'find',
+            'gave': 'give', 'given': 'give', 'gives': 'give',
+            'told': 'tell', 'tells': 'tell',
+            'became': 'become', 'becomes': 'become',
+            'left': 'leave', 'leaves': 'leave',
+            'felt': 'feel', 'feels': 'feel',
+            'brought': 'bring', 'brings': 'bring',
+            'began': 'begin', 'begun': 'begin', 'begins': 'begin',
+            'kept': 'keep', 'keeps': 'keep',
+            'held': 'hold', 'holds': 'hold',
+            'wrote': 'write', 'written': 'write', 'writes': 'write',
+            'stood': 'stand', 'stands': 'stand',
+            'heard': 'hear', 'hears': 'hear',
+            'let': 'let', 'lets': 'let',
+            'meant': 'mean', 'means': 'mean',
+            'set': 'set', 'sets': 'set',
+            'met': 'meet', 'meets': 'meet',
+            'ran': 'run', 'runs': 'run',
+            'paid': 'pay', 'pays': 'pay',
+            'sat': 'sit', 'sits': 'sit',
+            'spoke': 'speak', 'spoken': 'speak', 'speaks': 'speak',
+            'lay': 'lie', 'lain': 'lie', 'lies': 'lie',
+            'led': 'lead', 'leads': 'lead',
+            'read': 'read', 'reads': 'read',
+            'grew': 'grow', 'grown': 'grow', 'grows': 'grow',
+            'lost': 'lose', 'loses': 'lose',
+            'fell': 'fall', 'fallen': 'fall', 'falls': 'fall',
+            'sent': 'send', 'sends': 'send',
+            'built': 'build', 'builds': 'build',
+            'understood': 'understand', 'understands': 'understand',
+            'drew': 'draw', 'drawn': 'draw', 'draws': 'draw',
+            'broke': 'break', 'broken': 'break', 'breaks': 'break',
+            'spent': 'spend', 'spends': 'spend',
+            'cut': 'cut', 'cuts': 'cut',
+            'hit': 'hit', 'hits': 'hit',
+            'put': 'put', 'puts': 'put',
+            'shut': 'shut', 'shuts': 'shut',
+            # Forms of "be"
+            'am': 'be', 'is': 'be', 'are': 'be', 'was': 'be', 'were': 'be', 'been': 'be',
+            # have
+            'has': 'have', 'had': 'have',
+            # do
+            'does': 'do', 'did': 'do', 'done': 'do',
+            # will/would/can/could etc. are handled as auxiliaries
+        }
+
+        # =================================================================
+        # Irregular Plurals
+        # =================================================================
+        self.irregular_plurals = {
+            'men': 'man', 'women': 'woman',
+            'children': 'child', 'feet': 'foot', 'teeth': 'tooth',
+            'mice': 'mouse', 'geese': 'goose', 'oxen': 'ox',
+            'people': 'person', 'lives': 'life', 'knives': 'knife',
+            'wives': 'wife', 'selves': 'self', 'leaves': 'leaf',
+            'loaves': 'loaf', 'halves': 'half', 'wolves': 'wolf',
+            'calves': 'calf', 'shelves': 'shelf', 'thieves': 'thief',
+            'phenomena': 'phenomenon', 'criteria': 'criterion',
+            'analyses': 'analysis', 'bases': 'basis',
+            'crises': 'crisis', 'theses': 'thesis',
+            'data': 'datum', 'media': 'medium',
+            'indices': 'index', 'matrices': 'matrix',
+        }
+
+        # =================================================================
+        # Function Words
+        # =================================================================
+        self.determiners = {
+            'the', 'a', 'an', 'this', 'that', 'these', 'those',
+            'my', 'your', 'his', 'her', 'its', 'our', 'their',
+            'some', 'any', 'no', 'every', 'each', 'all', 'both',
+            'few', 'many', 'much', 'several', 'enough',
+        }
+
+        self.pronouns = {
+            'i', 'you', 'he', 'she', 'it', 'we', 'they',
+            'me', 'him', 'her', 'us', 'them',
+            'myself', 'yourself', 'himself', 'herself', 'itself',
+            'ourselves', 'yourselves', 'themselves',
+            'who', 'whom', 'whose', 'which', 'what', 'that',
+            'whoever', 'whomever', 'whatever', 'whichever',
+        }
+
+        self.prepositions = {
+            'in', 'on', 'at', 'to', 'for', 'with', 'by', 'from',
+            'of', 'about', 'into', 'through', 'during', 'before',
+            'after', 'above', 'below', 'between', 'under', 'over',
+            'against', 'among', 'around', 'behind', 'beside',
+            'without', 'within', 'along', 'across', 'beyond',
+        }
+
+        self.conjunctions = {
+            'and', 'or', 'but', 'nor', 'yet', 'so', 'for',
+            'because', 'although', 'though', 'while', 'if', 'unless',
+            'until', 'when', 'where', 'whether', 'since', 'as',
+        }
+
+        self.auxiliaries = {
+            'will', 'would', 'shall', 'should', 'can', 'could',
+            'may', 'might', 'must', 'need', 'dare', 'ought',
+        }
+
+        self.adverbs = {
+            'very', 'really', 'quite', 'rather', 'too', 'also',
+            'just', 'only', 'even', 'still', 'already', 'always',
+            'never', 'often', 'sometimes', 'usually', 'seldom',
+            'here', 'there', 'now', 'then', 'today', 'yesterday',
+            'tomorrow', 'soon', 'ago', 'well', 'badly', 'quickly',
+            'slowly', 'carefully', 'easily', 'hard', 'fast',
+        }
+
+        # =================================================================
+        # Adjectives
+        # =================================================================
+        self.adjectives = {
+            # Core adjectives
+            'good', 'bad', 'new', 'old', 'young', 'big', 'small', 'large',
+            'long', 'short', 'high', 'low', 'great', 'little', 'other',
+            'same', 'different', 'important', 'right', 'wrong', 'real',
+            'true', 'false', 'sure', 'certain', 'clear', 'free', 'full',
+            'empty', 'open', 'close', 'early', 'late', 'easy', 'hard',
+            'hot', 'cold', 'warm', 'cool', 'dark', 'light', 'bright',
+            'heavy', 'strong', 'weak', 'rich', 'poor', 'happy', 'sad',
+            'angry', 'afraid', 'alone', 'alive', 'dead', 'ready', 'busy',
+            'simple', 'complex', 'special', 'general', 'common', 'rare',
+            'strange', 'normal', 'natural', 'human', 'social', 'political',
+            'economic', 'public', 'private', 'local', 'national', 'international',
+            'main', 'major', 'minor', 'final', 'total', 'whole', 'single',
+            'double', 'various', 'similar', 'recent', 'current', 'present',
+            'past', 'future', 'ancient', 'modern', 'traditional', 'popular',
+            'famous', 'beautiful', 'pretty', 'ugly', 'nice', 'fine', 'perfect',
+            'terrible', 'wonderful', 'excellent', 'amazing', 'incredible',
+            'possible', 'impossible', 'necessary', 'available', 'responsible',
+            'successful', 'powerful', 'useful', 'dangerous', 'safe', 'healthy',
+        }
+
+        # =================================================================
+        # Ambiguous Words - sense differs by domain
+        # =================================================================
+        self.ambiguous_words = {
+            'apple': {'food': 'fruit', 'tech': 'company', 'default': 'fruit'},
+            'bank': {'finance': 'financial institution', 'nature': 'river bank', 'default': 'financial institution'},
+            'java': {'tech': 'programming language', 'food': 'coffee', 'default': 'programming language'},
+            'python': {'tech': 'programming language', 'nature': 'snake', 'default': 'programming language'},
+            'ruby': {'tech': 'programming language', 'default': 'gemstone'},
+            'shell': {'tech': 'command shell', 'nature': 'seashell', 'default': 'shell'},
+            'bug': {'tech': 'software bug', 'nature': 'insect', 'default': 'insect'},
+            'cloud': {'tech': 'cloud computing', 'nature': 'sky cloud', 'default': 'sky cloud'},
+            'mouse': {'tech': 'computer mouse', 'nature': 'animal', 'default': 'animal'},
+            'server': {'tech': 'computer server', 'food': 'person serving', 'default': 'computer server'},
+        }
+
+        # =================================================================
+        # Load the extended dictionary (optional external asset)
+        # =================================================================
+        self._load_extended_dictionary()
+
+    def _load_extended_dictionary(self):
+        """Load the optional external extended dictionary"""
+        dict_path = DICT_DIR / 'en_extended.json'
+        if not dict_path.exists():
+            return
+
+        # Initialize the extended dictionaries
+        self.extended_nouns = set()
+        self.extended_verbs = set()
+
+        with open(dict_path, 'r', encoding='utf-8') as f:
+            extended = json.load(f)
+
+        # Route each entry into the matching dictionary
+        for word, upos in extended.items():
+            word_lower = word.lower()
+            if upos in ('NOUN', 'PROPN'):
+                self.extended_nouns.add(word_lower)
+            elif upos == 'VERB' and word_lower not in self.irregular_verbs:
+                self.extended_verbs.add(word_lower)
+            elif upos == 'ADJ':
+                self.adjectives.add(word_lower)
+            elif upos == 'ADV':
+                self.adverbs.add(word_lower)
+
+    def _build_domain_dictionaries(self):
+        """Build the per-domain dictionaries"""
+
+        # TECH domain
+        self._domain_dictionaries[Domain.TECH] = {
+            'apple': ('Apple', 'NNP'),
+            'java': ('Java', 'NNP'),
+            'python': ('Python', 'NNP'),
+            'ruby': ('Ruby', 'NNP'),
+            'shell': ('shell', 'NN'),
+            'bug': ('bug', 'NN'),
+            'cloud': ('cloud', 'NN'),
+            'mouse': ('mouse', 'NN'),
+            'server': ('server', 'NN'),
+            'google': ('Google', 'NNP'),
+            'microsoft': ('Microsoft', 'NNP'),
+            'amazon': ('Amazon', 'NNP'),
+        }
+
+        # FOOD domain
+        self._domain_dictionaries[Domain.FOOD] = {
+            'apple': ('apple', 'NN'),
+            'java': ('java', 'NN'),  # coffee
+            'server': ('server', 'NN'),
+            'dish': ('dish', 'NN'),
+        }
+
+        # FINANCE domain
+        self._domain_dictionaries[Domain.FINANCE] = {
+            'bank': ('bank', 'NN'),
+            'stock': ('stock', 'NN'),
+            'bond': ('bond', 'NN'),
+            'market': ('market', 'NN'),
+            'apple': ('Apple', 'NNP'),  # stock ticker
+            'amazon': ('Amazon', 'NNP'),
+        }
+
+        # SPORTS domain
+        self._domain_dictionaries[Domain.SPORTS] = {
+            'court': ('court', 'NN'),
+            'field': ('field', 'NN'),
+            'net': ('net', 'NN'),
+        }
+
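To see what the domain tables buy you: the same lowercased surface form resolves to a different lemma/POS pair depending on the domain passed down to _analyze_word. A minimal sketch follows; it calls the private method directly to isolate the lookup, and it assumes _get_domain_sense in advanced_base consults _domain_dictionaries. The public route is analyze(text, domain=...) as in the class docstring.

    analyzer = EnglishAdvancedAnalyzer()

    tech = analyzer._analyze_word("apple", 0, Domain.TECH)
    food = analyzer._analyze_word("apple", 0, Domain.FOOD)

    print(tech.lemma, tech.pos)  # expected: Apple NNP (company sense)
    print(food.lemma, food.pos)  # expected: apple NN  (fruit sense)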
+    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+        """Generate analysis candidates"""
+        if not text or not text.strip():
+            return [AnalysisResult([])]
+
+        candidates = []
+
+        # Primary analysis
+        main_morphemes = self._analyze_text(text, domain)
+        main_result = AnalysisResult(
+            morphemes=main_morphemes,
+            score=1.0,
+            domain=domain
+        )
+        main_result.score = self._score_analysis(main_result)
+        candidates.append(main_result)
+
+        return candidates
+
+    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+        """Analyze a text string into morphemes"""
+        if not text:
+            return []
+
+        result = []
+        pos = 0
+
+        while pos < len(text):
+            # Skip whitespace
+            if text[pos].isspace():
+                pos += 1
+                continue
+
+            # Word match
+            word_match = self.WORD_PATTERN.match(text[pos:])
+            if word_match:
+                word = word_match.group()
+                morpheme = self._analyze_word(word, pos, domain)
+                result.append(morpheme)
+                pos += len(word)
+                continue
+
+            # Numbers
+            num_match = self.NUMBER_PATTERN.match(text[pos:])
+            if num_match:
+                num = num_match.group()
+                result.append(Morpheme(
+                    surface=num, lemma=num, pos='CD',
+                    start=pos, end=pos + len(num)
+                ))
+                pos += len(num)
+                continue
+
+            # Anything else (symbols)
+            result.append(Morpheme(
+                surface=text[pos], lemma=text[pos], pos='SYM',
+                start=pos, end=pos + 1
+            ))
+            pos += 1
+
+        return result
+
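A hedged trace of the scanning loop on a short mixed string, assuming a fresh analyzer and a domain with no entry for "hello" (offsets are character positions):

    for m in EnglishAdvancedAnalyzer()._analyze_text("Hello 123!", Domain.TECH):
        print(m.surface, m.lemma, m.pos, m.start, m.end)

    # Expected:
    #   Hello hello NN  0 5   (lowercased before lookup, so the NNP branch is bypassed)
    #   123   123   CD  6 9   (the space at offset 5 is skipped)
    #   !     !     SYM 9 10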
+    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+        """Analyze a single word"""
+        word_lower = word.lower()
+
+        # 1. Runtime user dictionary
+        if word_lower in self._user_dictionary:
+            lemma, pos_tag, _ = self._user_dictionary[word_lower]
+            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+        # 2. Domain dictionary
+        domain_sense = self._get_domain_sense(word_lower, domain)
+        if domain_sense:
+            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+        # 3. Function words
+        if word_lower in self.determiners:
+            return Morpheme(surface=word, lemma=word_lower, pos='DT', start=offset, end=offset + len(word))
+        if word_lower in self.pronouns:
+            return Morpheme(surface=word, lemma=word_lower, pos='PRP', start=offset, end=offset + len(word))
+        if word_lower in self.prepositions:
+            return Morpheme(surface=word, lemma=word_lower, pos='IN', start=offset, end=offset + len(word))
+        if word_lower in self.conjunctions:
+            return Morpheme(surface=word, lemma=word_lower, pos='CC', start=offset, end=offset + len(word))
+        if word_lower in self.auxiliaries:
+            return Morpheme(surface=word, lemma=word_lower, pos='MD', start=offset, end=offset + len(word))
+        if word_lower in self.adverbs:
+            return Morpheme(surface=word, lemma=word_lower, pos='RB', start=offset, end=offset + len(word))
+        if word_lower in self.adjectives:
+            return Morpheme(surface=word, lemma=word_lower, pos='JJ', start=offset, end=offset + len(word))
+
+        # 4. Irregular verbs
+        if word_lower in self.irregular_verbs:
+            lemma = self.irregular_verbs[word_lower]
+            return Morpheme(surface=word, lemma=lemma, pos='VB', start=offset, end=offset + len(word))
+
+        # 5. Irregular plurals
+        if word_lower in self.irregular_plurals:
+            lemma = self.irregular_plurals[word_lower]
+            return Morpheme(surface=word, lemma=lemma, pos='NNS', start=offset, end=offset + len(word))
+
+        # 6. Extended dictionary (optional external)
+        if hasattr(self, 'extended_verbs') and word_lower in self.extended_verbs:
+            return Morpheme(surface=word, lemma=word_lower, pos='VB', start=offset, end=offset + len(word))
+        if hasattr(self, 'extended_nouns') and word_lower in self.extended_nouns:
+            return Morpheme(surface=word, lemma=word_lower, pos='NN', start=offset, end=offset + len(word))
+
+        # 7. Rule-based inflection analysis
+        lemma, pos_tag = self._analyze_morphology(word_lower)
+        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
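The seven steps above form a strict priority order, so earlier sources shadow later ones; in particular, a runtime entry beats the domain tables. A sketch follows; it writes into _user_dictionary directly because the public registration API lives in AdvancedMorphologicalAnalyzer rather than this file, and the third tuple slot (unused by this lookup) is assumed ignorable:

    analyzer = EnglishAdvancedAnalyzer()

    before = analyzer._analyze_word("apple", 0, Domain.TECH)
    analyzer._user_dictionary["apple"] = ("apple-cultivar", "NN", None)
    after = analyzer._analyze_word("apple", 0, Domain.TECH)

    print(before.lemma, before.pos)  # expected: Apple NNP (step 2, domain dictionary)
    print(after.lemma, after.pos)    # expected: apple-cultivar NN (step 1 wins)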
+    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+        """Morphological analysis (lemmatization + POS guessing)"""
+
+        # -ing forms
+        if word.endswith('ing') and len(word) > 4:
+            stem = word[:-3]
+            if stem.endswith('e'):  # seeing -> see (also shields it from the doubled-letter rule)
+                return (stem, 'VBG')
+            if len(stem) >= 3 and stem[-1] == stem[-2]:  # running -> run
+                return (stem[:-1], 'VBG')
+            return (stem, 'VBG')
+
+        # -ed forms
+        if word.endswith('ed') and len(word) > 3:
+            # -ied -> -y: carried -> carry
+            if word.endswith('ied'):
+                return (word[:-3] + 'y', 'VBD')
+
+            stem = word[:-2]
+
+            # Strip a doubled consonant: stopped -> stop, planned -> plan
+            if len(stem) >= 2 and stem[-1] == stem[-2] and stem[-1] in 'bdgklmnprst':
+                return (stem[:-1], 'VBD')
+
+            # Restore a silent e: announced -> announce, danced -> dance
+            if stem.endswith(('c', 'v', 'z')):
+                return (stem + 'e', 'VBD')
+
+            # Restore e after -g preceded by a vowel or nasal: changed -> change
+            if stem.endswith('g') and len(stem) >= 2 and stem[-2] in 'aeioumn':
+                return (stem + 'e', 'VBD')
+
+            # CVC + e pattern: liked -> like, hoped -> hope
+            if len(stem) >= 2 and stem[-2] in 'aeiou' and stem[-1] in 'kptd':
+                return (stem + 'e', 'VBD')
+
+            return (stem, 'VBD')
+
+        # -s/-es forms (3rd-person verb or plural noun)
+        if word.endswith('ies') and len(word) > 4:
+            return (word[:-3] + 'y', 'VBZ')  # or NNS
+        if word.endswith('es') and len(word) > 3:
+            return (word[:-2], 'VBZ')
+        if word.endswith('s') and len(word) > 2:
+            return (word[:-1], 'VBZ')
+
+        # -ly forms (adverbs)
+        if word.endswith('ly') and len(word) > 3:
+            return (word[:-2], 'RB')
+
+        # -ness forms (nouns)
+        if word.endswith('ness') and len(word) > 5:
+            return (word[:-4], 'NN')
+
+        # -tion/-sion forms (nouns)
+        if word.endswith(('tion', 'sion')) and len(word) > 5:
+            return (word, 'NN')
+
+        # -or forms (nouns): doctor, actor, director
+        if word.endswith('or') and len(word) > 3:
+            return (word, 'NN')
+
+        # -est forms (superlatives)
+        if word.endswith('est') and len(word) > 4:
+            return (word[:-3], 'JJS')
+
+        # -er forms (comparatives / agent nouns)
+        if word.endswith('er') and len(word) > 3:
+            base = word[:-2]
+            # Comparative patterns: bigger -> big, nicer -> nice
+            # Doubled consonant
+            if len(base) >= 2 and base[-1] == base[-2]:
+                return (base[:-1], 'JJR')
+            # Dropped e
+            if len(base) >= 2 and base[-1] in 'cgkptvlns':
+                return (base + 'e', 'JJR')
+            # Otherwise treat as a noun (teacher, player)
+            return (word, 'NN')
+
+        # =================================================================
+        # Adjective Suffix Patterns
+        # =================================================================
+
+        # -ful (adjectives): beautiful, wonderful, powerful
+        if word.endswith('ful') and len(word) > 5:
+            return (word[:-3], 'JJ')
+
+        # -less (adjectives): useless, helpless, careless
+        if word.endswith('less') and len(word) > 5:
+            return (word[:-4], 'JJ')
+
+        # -ous (adjectives): famous, dangerous, nervous
+        if word.endswith('ous') and len(word) > 4:
+            return (word, 'JJ')
+
+        # -ive (adjectives): active, creative, impressive
+        if word.endswith('ive') and len(word) > 4:
+            return (word, 'JJ')
+
+        # -able/-ible (adjectives): available, possible, incredible
+        if word.endswith(('able', 'ible')) and len(word) > 5:
+            return (word, 'JJ')
+
+        # -al/-ial/-ical (adjectives): natural, social, political
+        if word.endswith(('ical', 'ial')) and len(word) > 5:
+            return (word, 'JJ')
+        if word.endswith('al') and len(word) > 3:
+            return (word, 'JJ')
+
+        # -ent/-ant (adjectives): different, important, excellent
+        if word.endswith(('ent', 'ant')) and len(word) > 4:
+            return (word, 'JJ')
+
+        # Capitalized -> proper noun (callers lowercase first, so this fires only on direct calls)
+        if word[0].isupper():
+            return (word, 'NNP')
+
+        # Default: noun
+        return (word, 'NN')
+
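Some hand-traced examples of the suffix rules, calling _analyze_morphology directly (words that already appear in the built-in sets, such as "quickly", would normally be caught by _analyze_word before reaching this method):

    analyzer = EnglishAdvancedAnalyzer()
    for w in ['running', 'carried', 'stopped', 'announced', 'studies', 'tallest', 'teacher']:
        print(w, analyzer._analyze_morphology(w))

    # Expected, tracing the branches above:
    #   running   -> ('run', 'VBG')       doubled 'nn' stripped
    #   carried   -> ('carry', 'VBD')     -ied rewritten to -y
    #   stopped   -> ('stop', 'VBD')      doubled 'pp' stripped
    #   announced -> ('announce', 'VBD')  silent e restored after 'c'
    #   studies   -> ('study', 'VBZ')     -ies rewritten to -y
    #   tallest   -> ('tall', 'JJS')      -est stripped
    #   teacher   -> ('teacher', 'NN')    -er with no comparative pattern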
+    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+        """Generate alternative analyses"""
+        alternatives = []
+
+        # Re-analyze under other domains
+        other_domains = [d for d in Domain if d != domain][:count]
+
+        for alt_domain in other_domains:
+            morphemes = self._analyze_text(text, alt_domain)
+            result = AnalysisResult(
+                morphemes=morphemes,
+                score=0.8,
+                domain=alt_domain
+            )
+            result.score = self._score_analysis(result) * 0.9
+            alternatives.append(result)
+
+        return alternatives
+
+
+# Alias for backward compatibility
+EnglishAnalyzer = EnglishAdvancedAnalyzer
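
For reference, a minimal end-to-end sketch assembled from the class docstring; analyze, add_entity, preserve_entities, domain, and n_best are assumed to be the inherited AdvancedMorphologicalAnalyzer surface, since this file only defines the candidate-generation hooks:

    from tokmor.morphology.english_advanced import EnglishAdvancedAnalyzer

    analyzer = EnglishAdvancedAnalyzer()

    analyzer.analyze("Apple announced new products")              # basic analysis
    analyzer.add_entity("Apple", "ORG")                           # NER gazetteer entry
    analyzer.analyze("Apple announced", preserve_entities=True)   # keep entity boundaries

    analyzer.analyze("apple", domain="food")                      # fruit sense
    analyzer.analyze("apple", domain="tech")                      # company sense

    analyzer.analyze("bank", n_best=3)                            # alternatives with scores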