tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,181 @@
1
+ """
2
+ Brahmic Script Language Template
3
+ ================================
4
+
5
+ 브라흐미 계열 문자 기반 언어용 템플릿 분석기
6
+ Devanagari, Bengali, Tamil, Telugu, Thai, Khmer, Myanmar, etc.
7
+ """
8
+
9
+ import re
10
+ from typing import List, Tuple, Dict, Optional
11
+
12
+ from ..advanced_base import (
13
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
14
+ )
15
+
16
+
17
class BrahmicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in Brahmic-derived scripts.

    Covers:
    - Devanagari: Hindi, Marathi, Nepali, Sanskrit
    - Bengali: Bengali, Assamese
    - Tamil: Tamil
    - Telugu: Telugu
    - Kannada: Kannada
    - Malayalam: Malayalam
    - Gujarati: Gujarati
    - Punjabi (Gurmukhi): Punjabi
    - Odia: Odia
    - Sinhala: Sinhala
    - Thai: Thai
    - Khmer: Khmer
    - Myanmar: Burmese
    - Lao: Lao
    """

    # Unicode ranges for various Brahmic scripts
    SCRIPT_PATTERNS = {
        'devanagari': re.compile(r'[\u0900-\u097F]+'),
        'bengali': re.compile(r'[\u0980-\u09FF]+'),
        'tamil': re.compile(r'[\u0B80-\u0BFF]+'),
        'telugu': re.compile(r'[\u0C00-\u0C7F]+'),
        'kannada': re.compile(r'[\u0C80-\u0CFF]+'),
        'malayalam': re.compile(r'[\u0D00-\u0D7F]+'),
        'gujarati': re.compile(r'[\u0A80-\u0AFF]+'),
        'gurmukhi': re.compile(r'[\u0A00-\u0A7F]+'),
        'oriya': re.compile(r'[\u0B00-\u0B7F]+'),
        'sinhala': re.compile(r'[\u0D80-\u0DFF]+'),
        'thai': re.compile(r'[\u0E00-\u0E7F]+'),
        'lao': re.compile(r'[\u0E80-\u0EFF]+'),
        'khmer': re.compile(r'[\u1780-\u17FF]+'),
        'myanmar': re.compile(r'[\u1000-\u109F]+'),
        'tibetan': re.compile(r'[\u0F00-\u0FFF]+'),
    }

    # Combined pattern for any Brahmic script
    WORD_PATTERN = re.compile(
        r'[\u0900-\u097F'   # Devanagari
        r'\u0980-\u09FF'    # Bengali
        r'\u0A00-\u0A7F'    # Gurmukhi
        r'\u0A80-\u0AFF'    # Gujarati
        r'\u0B00-\u0B7F'    # Oriya
        r'\u0B80-\u0BFF'    # Tamil
        r'\u0C00-\u0C7F'    # Telugu
        r'\u0C80-\u0CFF'    # Kannada
        r'\u0D00-\u0D7F'    # Malayalam
        r'\u0D80-\u0DFF'    # Sinhala
        r'\u0E00-\u0E7F'    # Thai
        r'\u0E80-\u0EFF'    # Lao
        r'\u0F00-\u0FFF'    # Tibetan
        r'\u1000-\u109F'    # Myanmar
        r'\u1780-\u17FF'    # Khmer
        r']+'
    )
    # ASCII digits plus the native digit blocks of the covered scripts
    NUMBER_PATTERN = re.compile(r'[0-9०-९০-৯੦-੯૦-૯୦-୯௦-௯౦-౯೦-೯൦-൯๐-๙]+')

    # Latin-letter runs are tagged FOREIGN (loan words / transliteration);
    # compiled once instead of re.match-ing a literal inside the scan loop.
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')

    # Script-specific setting (override in subclass)
    SCRIPT_NAME: str = 'devanagari'

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Initialize empty lexicons; subclasses populate these."""
        self.postpositions: Dict[str, str] = {}
        self.function_words: Dict[str, str] = {}
        self.verb_stems: Dict[str, str] = {}

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return the single best analysis of *text* for *domain*.

        Empty or whitespace-only input yields one empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit one Morpheme per token.

        Token priority at each position: whitespace (skipped), Brahmic
        word, Latin run, number, then a single PUNCT character.
        """
        result: List[Morpheme] = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Brahmic-script word.  match(text, pos) anchors at pos without
            # copying the tail of the string (the old text[pos:] slice made
            # the scan quadratic on long inputs).
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                result.append(self._analyze_word(word, pos, domain))
                pos += len(word)
                continue

            # Latin run (foreign word / transliteration)
            latin_match = self.LATIN_PATTERN.match(text, pos)
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            # Number (ASCII or native digits)
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Anything else: single punctuation/symbol character
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify one Brahmic-script word.

        Lookup order: user dictionary, domain dictionary, postpositions,
        function words, verb-stem prefix match; unknown words default to N.
        """
        end = offset + len(word)

        # User dictionary has highest priority
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=end)

        # Domain-specific sense
        domain_sense = self._get_domain_sense(word, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=end)

        # Postpositions (case markers etc.)
        if hasattr(self, 'postpositions') and word in self.postpositions:
            return Morpheme(surface=word, lemma=word, pos='PSP', start=offset, end=end)

        # Function words carry their own POS tag in the lexicon
        if hasattr(self, 'function_words') and word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=end)

        # Inflected verb: a known stem followed by a non-empty suffix.
        # Only the stem keys matter here; the stored tag is always 'V'.
        if hasattr(self, 'verb_stems'):
            for stem in self.verb_stems:
                if word.startswith(stem) and len(word) > len(stem):
                    return Morpheme(surface=word, lemma=stem, pos='V', start=offset, end=end)

        # Default: noun
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored so the primary domain's result stays preferred."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
@@ -0,0 +1,168 @@
1
+ """
2
+ Cyrillic Script Language Template
3
+ =================================
4
+
5
+ 키릴 문자 기반 언어용 템플릿 분석기
6
+ Russian, Ukrainian, Bulgarian, Serbian, Macedonian, etc.
7
+ """
8
+
9
+ import re
10
+ from typing import List, Tuple, Dict, Optional
11
+
12
+ from ..advanced_base import (
13
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
14
+ )
15
+
16
+
17
class CyrillicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in Cyrillic script.

    Covers: Russian, Ukrainian, Bulgarian, Serbian (Cyrillic),
    Macedonian, Belarusian, Kazakh, Uzbek, Mongolian, etc.
    """

    # Cyrillic character pattern (extended)
    WORD_PATTERN = re.compile(
        r'[а-яА-ЯёЁ'            # Russian/basic
        r'іїєґІЇЄҐ'             # Ukrainian
        r'ўЎ'                   # Belarusian
        r'ђјљњћџЂЈЉЊЋЏ'         # Serbian
        r'ѓќѕѝЃЌЅЍ'             # Macedonian
        r'әғқңөұүһӘҒҚҢӨҰҮҺ'     # Kazakh
        r'ғқҳўғҚҲЎ'             # Uzbek
        r'өүөҮ]+'               # Mongolian
    )
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    # Latin runs are tagged FOREIGN; compiled once for the scan loop.
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')

    # Override in subclass
    VERB_INFINITIVE_SUFFIX: str = 'ть'  # Default: Russian
    REFLEXIVE_SUFFIX: str = 'ся'

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Initialize empty lexicons; subclasses populate these."""
        self.function_words: Dict[str, str] = {}
        self.irregular_verbs: Dict[str, str] = {}

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return the single best analysis of *text* for *domain*.

        Empty or whitespace-only input yields one empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit one Morpheme per token.

        Token priority at each position: whitespace (skipped), Cyrillic
        word, Latin run, number, then a single PUNCT character.
        """
        result: List[Morpheme] = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Cyrillic word.  match(text, pos) anchors at pos without
            # copying the tail (the old text[pos:] slice was quadratic).
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                result.append(self._analyze_word(word, pos, domain))
                pos += len(word)
                continue

            # Latin run (foreign words)
            latin_match = self.LATIN_PATTERN.match(text, pos)
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            # Number (optionally with one decimal separator)
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Anything else: single punctuation/symbol character
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify one Cyrillic word (lookups are lowercase-keyed).

        Lookup order: user dictionary, domain dictionary, function words,
        irregular verbs, then suffix-based morphology.
        """
        word_lower = word.lower()
        end = offset + len(word)

        # User dictionary has highest priority
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=end)

        # Domain-specific sense
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=end)

        # Function words carry their own POS tag in the lexicon
        if hasattr(self, 'function_words') and word_lower in self.function_words:
            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=end)

        # Irregular verbs map surface form -> lemma
        if hasattr(self, 'irregular_verbs') and word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=end)

        # Fall back to suffix-based morphology
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=end)

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Suffix-based morphological guess for a Cyrillic word.

        Uses the actual lengths of the overridable REFLEXIVE_SUFFIX /
        VERB_INFINITIVE_SUFFIX class attributes (the previous version
        hard-coded Russian suffix lengths, which broke subclass overrides
        of a different length).  Behavior is unchanged for the defaults.
        The noun/adjective suffix lists below are Russian-oriented.
        """
        word_lower = word.lower()

        # Reflexive verb: strip the reflexive particle to get the lemma
        refl = self.REFLEXIVE_SUFFIX
        if word_lower.endswith(refl) and len(word_lower) > len(refl) + 2:
            return (word_lower[:-len(refl)], 'V')

        # Verb infinitive is already the citation form
        inf = self.VERB_INFINITIVE_SUFFIX
        if word_lower.endswith(inf) and len(word_lower) > len(inf) + 1:
            return (word_lower, 'V')

        # Verbal noun suffixes (-ние, -ение, -ание, -ость, -есть)
        if word_lower.endswith(('ние', 'ение', 'ание', 'ость', 'есть')) and len(word_lower) > 5:
            return (word_lower, 'N')

        # Adjective endings (-ый, -ий, -ой, -ая, -яя, -ое, -ее)
        if word_lower.endswith(('ый', 'ий', 'ой', 'ая', 'яя', 'ое', 'ее', 'ые', 'ие')) and len(word_lower) > 3:
            return (word_lower, 'ADJ')

        # Adverb (-о); could equally be a neuter short adjective
        if word_lower.endswith('о') and len(word_lower) > 3:
            return (word_lower, 'ADV')

        # Proper noun (capitalized surface form, kept as-is)
        if word[0].isupper():
            return (word, 'NP')

        # Default: noun
        return (word_lower, 'N')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored so the primary domain's result stays preferred."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
@@ -0,0 +1,235 @@
1
+ """
2
+ Latin Script Language Template
3
+ ==============================
4
+
5
+ 라틴 문자 기반 언어용 템플릿 분석기
6
+ Romance, Germanic, Slavic (Latin), Turkic, etc.
7
+ """
8
+
9
+ import re
10
+ import json
11
+ from pathlib import Path
12
+ from typing import List, Tuple, Dict, Optional, Set
13
+
14
+ from ..advanced_base import (
15
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
16
+ )
17
+
18
+ # 확장 사전 디렉토리
19
+ from ... import resources
20
+
21
+ # Optional external asset dir (default: none). If you want extended dictionaries,
22
+ # provide them under: TOKMOR_DATA_DIR/extended_dict/{lang}_extended.json
23
+ DICT_DIR = resources.data_dir() / "extended_dict"
24
+
25
+
26
class LatinScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template analyzer for languages written in Latin script
    (Romance, Germanic, Slavic-Latin, Turkic, Vietnamese, etc.).

    Subclasses override:
    - LANG_CODE, LANG_NAME
    - _build_base_dictionary()
    - _build_domain_dictionaries()
    - the suffix lists used by _analyze_morphology()
    """

    # Extended Latin characters (covers most European languages)
    WORD_PATTERN = re.compile(
        r"[a-zA-Zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
        r"ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"  # Polish
        r"čďěňřšťůžČĎĚŇŘŠŤŮŽ"  # Czech/Slovak
        r"őűŐŰ"  # Hungarian
        r"ăâîșțĂÂÎȘȚ"  # Romanian
        r"āēīōūĀĒĪŌŪ"  # Latvian/Lithuanian
        r"ğışöüçĞİŞÖÜÇ"  # Turkish
        r"ảạấầẩẫậắằẳẵặẻẽẹếềểễệỉĩịỏọốồổỗộớờởỡợủũụứừửữựỳỵỷỹ"  # Vietnamese
        r"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞß]+"
        r"(?:[-'][a-zA-Zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]+)*"
    )
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    # Common suffixes for morphological analysis (override in subclass)
    VERB_INFINITIVE_SUFFIXES: List[str] = []
    VERB_PARTICIPLE_SUFFIXES: List[str] = []
    NOUN_PLURAL_SUFFIXES: List[str] = []
    ADJECTIVE_SUFFIXES: List[str] = []
    ADVERB_SUFFIXES: List[str] = []

    def __init__(self):
        super().__init__()
        # Extended dictionary is loaded after super().__init__() has run
        # _build_base_dictionary(), so subclass lexicons already exist.
        self._load_extended_dictionary()

    def _build_base_dictionary(self):
        """Initialize empty lexicons; subclasses populate these."""
        self.function_words: Dict[str, str] = {}
        self.irregular_verbs: Dict[str, str] = {}
        self.irregular_nouns: Dict[str, str] = {}

    def _load_extended_dictionary(self):
        """Load the optional external extended dictionary, if present.

        Reads TOKMOR_DATA_DIR/extended_dict/{LANG_CODE}_extended.json,
        a flat {word: UPOS-tag} mapping, and sorts entries into the
        extended_* lexicons.  The asset is optional: a missing, unreadable
        or corrupt file is skipped so analyzer construction never fails
        because of it.
        """
        # Ensure the lexicon attributes exist even when nothing is loaded.
        for attr in ('extended_nouns', 'extended_verbs', 'extended_adjs', 'extended_advs'):
            if not hasattr(self, attr):
                setattr(self, attr, {})

        lang_code = getattr(self, 'LANG_CODE', None)
        if not lang_code:
            return

        dict_path = DICT_DIR / f'{lang_code}_extended.json'
        try:
            # EAFP: open directly instead of exists()+open (avoids the
            # check/use race) and tolerate malformed JSON in the asset.
            with open(dict_path, 'r', encoding='utf-8') as f:
                extended = json.load(f)
        except (OSError, json.JSONDecodeError):
            return

        # Map the asset's UPOS tags onto (target lexicon, internal tag).
        buckets = {
            'NOUN': (self.extended_nouns, 'N'),
            'PROPN': (self.extended_nouns, 'NP'),
            'VERB': (self.extended_verbs, 'V'),
            'ADJ': (self.extended_adjs, 'ADJ'),
            'ADV': (self.extended_advs, 'ADV'),
        }
        for word, upos in extended.items():
            bucket = buckets.get(upos)
            if bucket is not None:
                lexicon, tag = bucket
                lexicon[word] = tag

    def _build_domain_dictionaries(self):
        """Override in subclass."""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return the single best analysis of *text* for *domain*.

        Empty or whitespace-only input yields one empty AnalysisResult.
        """
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit one Morpheme per token.

        Token priority at each position: whitespace (skipped), Latin word
        (with internal hyphen/apostrophe), number, then a single PUNCT
        character.
        """
        result: List[Morpheme] = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Word.  match(text, pos) anchors at pos without copying the
            # tail of the string (the old text[pos:] slice was quadratic).
            word_match = self.WORD_PATTERN.match(text, pos)
            if word_match:
                word = word_match.group()
                result.append(self._analyze_word(word, pos, domain))
                pos += len(word)
                continue

            # Number (optionally with one decimal separator)
            num_match = self.NUMBER_PATTERN.match(text, pos)
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Anything else: single punctuation/symbol character
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Classify one Latin-script word (lookups are lowercase-keyed).

        Lookup order: user dictionary, domain dictionary, function words,
        irregular verbs, irregular nouns, extended (external) lexicons,
        then suffix-based morphology.
        """
        word_lower = word.lower()
        end = offset + len(word)

        # 1. User dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=end)

        # 2. Domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=end)

        # 3. Function words (carry their own POS tag)
        if hasattr(self, 'function_words') and word_lower in self.function_words:
            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=end)

        # 4. Irregular verbs (surface form -> lemma)
        if hasattr(self, 'irregular_verbs') and word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=end)

        # 5. Irregular nouns (surface form -> lemma)
        if hasattr(self, 'irregular_nouns') and word_lower in self.irregular_nouns:
            return Morpheme(surface=word, lemma=self.irregular_nouns[word_lower], pos='N', start=offset, end=end)

        # 6. Extended dictionary (optional external asset); verbs first,
        #    matching the original lookup order.
        if hasattr(self, 'extended_verbs') and word_lower in self.extended_verbs:
            return Morpheme(surface=word, lemma=word_lower, pos='V', start=offset, end=end)
        if hasattr(self, 'extended_nouns') and word_lower in self.extended_nouns:
            # extended_nouns stores 'N' or 'NP' per entry
            return Morpheme(surface=word, lemma=word_lower, pos=self.extended_nouns[word_lower], start=offset, end=end)
        if hasattr(self, 'extended_adjs') and word_lower in self.extended_adjs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADJ', start=offset, end=end)
        if hasattr(self, 'extended_advs') and word_lower in self.extended_advs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=end)

        # 7. Suffix-based morphology
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=end)

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Suffix-based morphological guess using the class suffix lists.

        Participles and plurals are stripped to a stem; infinitives,
        adjectives and adverbs are returned as-is.  A capitalized word
        with no matching suffix is treated as a proper noun.
        """
        word_lower = word.lower()

        # Verb infinitive is already the citation form
        for suffix in self.VERB_INFINITIVE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'V')

        # Verb participle: strip the suffix to approximate the stem
        for suffix in self.VERB_PARTICIPLE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower[:-len(suffix)], 'V')

        # Noun plural: strip the suffix to get the singular stem
        for suffix in self.NOUN_PLURAL_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower[:-len(suffix)], 'N')

        # Adjective
        for suffix in self.ADJECTIVE_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'ADJ')

        # Adverb
        for suffix in self.ADVERB_SUFFIXES:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 1:
                return (word_lower, 'ADV')

        # Proper noun (capitalized surface form, kept as-is)
        if word[0].isupper():
            return (word, 'NP')

        # Default: noun
        return (word_lower, 'N')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains, slightly
        down-scored so the primary domain's result stays preferred."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives