tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/french_advanced.py
@@ -0,0 +1,237 @@
+ """
+ French Advanced Morphological Analyzer
+ ======================================
+
+ French morphological analyzer supporting five advanced features
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Optional
+
+ from .advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+ )
+
+
+ class FrenchAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+     """French advanced morphological analyzer"""
+
+     LANG_CODE = "fr"
+     LANG_NAME = "French"
+
+     WORD_PATTERN = re.compile(r"[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+(?:-[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+)*(?:'[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+)?")
+     NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')
+
+     def __init__(self):
+         super().__init__()
+
+     def _build_base_dictionary(self):
+         """Build the base dictionary"""
+
+         # Irregular verbs (être, avoir, aller, faire, ...)
+         self.irregular_verbs = {
+             # être
+             'suis': 'être', 'es': 'être', 'est': 'être',
+             'sommes': 'être', 'êtes': 'être', 'sont': 'être',
+             'étais': 'être', 'était': 'être', 'étions': 'être',
+             'étiez': 'être', 'étaient': 'être', 'été': 'être',
+             # avoir
+             'ai': 'avoir', 'as': 'avoir', 'a': 'avoir',
+             'avons': 'avoir', 'avez': 'avoir', 'ont': 'avoir',
+             'avais': 'avoir', 'avait': 'avoir', 'avions': 'avoir',
+             'aviez': 'avoir', 'avaient': 'avoir', 'eu': 'avoir',
+             # aller
+             'vais': 'aller', 'vas': 'aller', 'va': 'aller',
+             'allons': 'aller', 'allez': 'aller', 'vont': 'aller',
+             'allais': 'aller', 'allait': 'aller', 'allé': 'aller',
+             # faire
+             'fais': 'faire', 'fait': 'faire', 'faisons': 'faire',
+             'faites': 'faire', 'font': 'faire', 'faisais': 'faire',
+             # pouvoir
+             'peux': 'pouvoir', 'peut': 'pouvoir', 'pouvons': 'pouvoir',
+             'pouvez': 'pouvoir', 'peuvent': 'pouvoir', 'pu': 'pouvoir',
+             # vouloir
+             'veux': 'vouloir', 'veut': 'vouloir', 'voulons': 'vouloir',
+             'voulez': 'vouloir', 'veulent': 'vouloir', 'voulu': 'vouloir',
+             # savoir
+             'sais': 'savoir', 'sait': 'savoir', 'savons': 'savoir',
+             'savez': 'savoir', 'savent': 'savoir', 'su': 'savoir',
+             # venir
+             'viens': 'venir', 'vient': 'venir', 'venons': 'venir',
+             'venez': 'venir', 'viennent': 'venir', 'venu': 'venir',
+             # prendre
+             'prends': 'prendre', 'prend': 'prendre', 'prenons': 'prendre',
+             'prenez': 'prendre', 'prennent': 'prendre', 'pris': 'prendre',
+         }
+
+         # Articles
+         self.articles = {
+             'le': 'DET', 'la': 'DET', 'les': 'DET', "l'": 'DET',
+             'un': 'DET', 'une': 'DET', 'des': 'DET',
+             'du': 'DET', 'de': 'DET', "d'": 'DET',
+             'au': 'DET', 'aux': 'DET',
+         }
+
+         # Pronouns
+         self.pronouns = {
+             'je': 'PRON', 'tu': 'PRON', 'il': 'PRON', 'elle': 'PRON',
+             'on': 'PRON', 'nous': 'PRON', 'vous': 'PRON', 'ils': 'PRON', 'elles': 'PRON',
+             'me': 'PRON', 'te': 'PRON', 'se': 'PRON', 'lui': 'PRON', 'leur': 'PRON',
+             'ce': 'PRON', 'cela': 'PRON', 'ça': 'PRON', 'ceci': 'PRON',
+             'qui': 'PRON', 'que': 'PRON', 'quoi': 'PRON', 'dont': 'PRON',
+         }
+
+         # Prepositions
+         self.prepositions = {
+             'à': 'PREP', 'de': 'PREP', 'en': 'PREP', 'dans': 'PREP',
+             'sur': 'PREP', 'sous': 'PREP', 'avec': 'PREP', 'sans': 'PREP',
+             'pour': 'PREP', 'par': 'PREP', 'chez': 'PREP', 'vers': 'PREP',
+             'entre': 'PREP', 'contre': 'PREP', 'depuis': 'PREP', 'pendant': 'PREP',
+             'avant': 'PREP', 'après': 'PREP', 'devant': 'PREP', 'derrière': 'PREP',
+         }
+
+         # Conjunctions
+         self.conjunctions = {
+             'et': 'CONJ', 'ou': 'CONJ', 'mais': 'CONJ', 'donc': 'CONJ',
+             'car': 'CONJ', 'ni': 'CONJ', 'or': 'CONJ',
+             'que': 'CONJ', 'si': 'CONJ', 'quand': 'CONJ', 'comme': 'CONJ',
+             'parce': 'CONJ', 'puisque': 'CONJ', 'lorsque': 'CONJ',
+         }
+
+         # Adverbs
+         self.adverbs = {
+             'très': 'ADV', 'bien': 'ADV', 'mal': 'ADV', 'peu': 'ADV',
+             'beaucoup': 'ADV', 'trop': 'ADV', 'assez': 'ADV', 'plus': 'ADV',
+             'moins': 'ADV', 'aussi': 'ADV', 'encore': 'ADV', 'toujours': 'ADV',
+             'jamais': 'ADV', 'souvent': 'ADV', 'parfois': 'ADV', 'ici': 'ADV',
+             'là': 'ADV', 'maintenant': 'ADV', 'déjà': 'ADV', 'bientôt': 'ADV',
+         }
+     def _build_domain_dictionaries(self):
+         """Build domain-specific dictionaries"""
+         self._domain_dictionaries[Domain.TECH] = {
+             'pomme': ('Apple', 'NP'),
+             'nuage': ('cloud', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FOOD] = {
+             'pomme': ('pomme', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FINANCE] = {
+             'banque': ('banque', 'NC'),
+             'action': ('action', 'NC'),
+         }
+
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+         return [result]
+
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         result = []
+         pos = 0
+
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             word_match = self.WORD_PATTERN.match(text[pos:])
+             if word_match:
+                 word = word_match.group()
+                 morpheme = self._analyze_word(word, pos, domain)
+                 result.append(morpheme)
+                 pos += len(word)
+                 continue
+
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+             pos += 1
+
+         return result
+
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+         word_lower = word.lower()
+
+         # Runtime (user) dictionary
+         if word_lower in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word_lower]
+             return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+         # Domain dictionary
+         domain_sense = self._get_domain_sense(word_lower, domain)
+         if domain_sense:
+             return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+         # Function words
+         if word_lower in self.articles:
+             return Morpheme(surface=word, lemma=word_lower, pos='DET', start=offset, end=offset + len(word))
+         if word_lower in self.pronouns:
+             return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
+         if word_lower in self.prepositions:
+             return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
+         if word_lower in self.conjunctions:
+             return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
+         if word_lower in self.adverbs:
+             return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))
+
+         # Irregular verbs
+         if word_lower in self.irregular_verbs:
+             return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))
+
+         # Morphological analysis
+         lemma, pos_tag = self._analyze_morphology(word)
+         return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+     def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+         # -er verbs (first group)
+         if word.endswith('er') and len(word) > 3:
+             return (word, 'V')
+
+         # -ir verbs (second group)
+         if word.endswith('ir') and len(word) > 3:
+             return (word, 'V')
+
+         # -re verbs (third group)
+         if word.endswith('re') and len(word) > 3:
+             return (word, 'V')
+
+         # -tion/-sion nouns
+         if word.endswith(('tion', 'sion')) and len(word) > 5:
+             return (word, 'NC')
+
+         # -ment adverbs
+         if word.endswith('ment') and len(word) > 5:
+             return (word, 'ADV')
+
+         # -eux/-euse adjectives
+         if word.endswith(('eux', 'euse')) and len(word) > 4:
+             return (word, 'ADJ')
+
+         # Capitalized (proper noun)
+         if word[0].isupper():
+             return (word, 'NP')
+
+         return (word, 'NC')
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+         return alternatives
+
+
+ FrenchAnalyzer = FrenchAdvancedAnalyzer
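
To make the flow above concrete, here is a minimal usage sketch. It is hypothetical: it instantiates the class directly and calls the internal _analyze_text method, because the public entry point lives in the AdvancedMorphologicalAnalyzer base class, which is not part of this diff; Domain.TECH is assumed from its use in _build_domain_dictionaries above.

    # Hypothetical sketch -- assumes tokmor 1.2.9 is installed and that the
    # base class populates the dictionaries during __init__, as the code implies.
    from tokmor.morphology.french_advanced import FrenchAdvancedAnalyzer
    from tokmor.morphology.advanced_base import Domain

    analyzer = FrenchAdvancedAnalyzer()
    # _analyze_text is internal; it is called directly here for illustration only.
    for m in analyzer._analyze_text("je vais au marché", Domain.TECH):
        print(m.surface, m.lemma, m.pos)
    # Per the dictionaries above: je/je/PRON, vais/aller/V, au/au/DET, marché/marché/NC
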
tokmor/morphology/german_advanced.py
@@ -0,0 +1,343 @@
+ """
+ German Advanced Morphological Analyzer
+ ======================================
+
+ German morphological analyzer supporting five advanced features
+
+ Features:
+ 1. NER Gazetteer Integration - preserves named-entity boundaries
+ 2. Real-time Dictionary Extension - extends the dictionary at runtime
+ 3. Domain Adaptation - domain-specific analysis tuning
+ 4. Code-switching - handles mixed-language text
+ 5. N-best Analysis - multiple candidates with confidence scores
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Set, Optional, Any
+
+ from .advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+ )
+
+
+ class GermanAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+     """
+     German advanced morphological analyzer
+
+     Features:
+     - Compound-noun decomposition
+     - Strong/weak inflection handling
+     - Separable-verb handling
+     """
+
+     LANG_CODE = "de"
+     LANG_NAME = "German"
+
+     WORD_PATTERN = re.compile(r'[a-zA-ZäöüÄÖÜß]+')
+     NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')
+
+     def __init__(self):
+         super().__init__()
+
+     def _build_base_dictionary(self):
+         """Build the base dictionary"""
+
+         # =================================================================
+         # Irregular verbs (strong verbs)
+         # =================================================================
+         self.irregular_verbs = {
+             # sein
+             'bin': 'sein', 'bist': 'sein', 'ist': 'sein',
+             'sind': 'sein', 'seid': 'sein', 'war': 'sein',
+             'warst': 'sein', 'waren': 'sein', 'wart': 'sein',
+             'gewesen': 'sein',
+             # haben
+             'habe': 'haben', 'hast': 'haben', 'hat': 'haben',
+             'habt': 'haben', 'hatte': 'haben', 'hattest': 'haben',
+             'hatten': 'haben', 'hattet': 'haben', 'gehabt': 'haben',
+             # werden
+             'werde': 'werden', 'wirst': 'werden', 'wird': 'werden',
+             'werdet': 'werden', 'wurde': 'werden', 'wurdest': 'werden',
+             'wurden': 'werden', 'wurdet': 'werden', 'geworden': 'werden',
+             # Other strong verbs
+             'ging': 'gehen', 'gegangen': 'gehen',
+             'kam': 'kommen', 'gekommen': 'kommen',
+             'sah': 'sehen', 'gesehen': 'sehen',
+             'nahm': 'nehmen', 'genommen': 'nehmen',
+             'gab': 'geben', 'gegeben': 'geben',
+             'fand': 'finden', 'gefunden': 'finden',
+             'sprach': 'sprechen', 'gesprochen': 'sprechen',
+             'trug': 'tragen', 'getragen': 'tragen',
+             'fuhr': 'fahren', 'gefahren': 'fahren',
+             'schlief': 'schlafen', 'geschlafen': 'schlafen',
+             'lief': 'laufen', 'gelaufen': 'laufen',
+             # Present-tense conjugations of common verbs (gehen, machen, etc.)
+             'gehe': 'gehen', 'gehst': 'gehen', 'geht': 'gehen',
+             'mache': 'machen', 'machst': 'machen', 'macht': 'machen',
+             'sage': 'sagen', 'sagst': 'sagen', 'sagt': 'sagen',
+             'arbeite': 'arbeiten', 'arbeitest': 'arbeiten', 'arbeitet': 'arbeiten',
+             'lerne': 'lernen', 'lernst': 'lernen', 'lernt': 'lernen',
+             'spiele': 'spielen', 'spielst': 'spielen', 'spielt': 'spielen',
+             'kaufe': 'kaufen', 'kaufst': 'kaufen', 'kauft': 'kaufen',
+             'frage': 'fragen', 'fragst': 'fragen', 'fragt': 'fragen',
+             'höre': 'hören', 'hörst': 'hören', 'hört': 'hören',
+             'lebe': 'leben', 'lebst': 'leben', 'lebt': 'leben',
+             'liebe': 'lieben', 'liebst': 'lieben', 'liebt': 'lieben',
+             'warte': 'warten', 'wartest': 'warten', 'wartet': 'warten',
+             'öffne': 'öffnen', 'öffnest': 'öffnen', 'öffnet': 'öffnen',
+             'zeige': 'zeigen', 'zeigst': 'zeigen', 'zeigt': 'zeigen',
+             'brauche': 'brauchen', 'brauchst': 'brauchen', 'braucht': 'brauchen',
+             'glaube': 'glauben', 'glaubst': 'glauben', 'glaubt': 'glauben',
+             'denke': 'denken', 'denkst': 'denken', 'denkt': 'denken',
+             'kenne': 'kennen', 'kennst': 'kennen', 'kennt': 'kennen',
+             'wohne': 'wohnen', 'wohnst': 'wohnen', 'wohnt': 'wohnen',
+             'suche': 'suchen', 'suchst': 'suchen', 'sucht': 'suchen',
+             'folge': 'folgen', 'folgst': 'folgen', 'folgt': 'folgen',
+             'führe': 'führen', 'führst': 'führen', 'führt': 'führen',
+             'laufe': 'laufen', 'läufst': 'laufen', 'läuft': 'laufen',
+             'fahre': 'fahren', 'fährst': 'fahren', 'fährt': 'fahren',
+             'lese': 'lesen', 'liest': 'lesen',
+             'esse': 'essen', 'isst': 'essen',
+             'schlafe': 'schlafen', 'schläfst': 'schlafen', 'schläft': 'schlafen',
+             'spreche': 'sprechen', 'sprichst': 'sprechen', 'spricht': 'sprechen',
+             'nehme': 'nehmen', 'nimmst': 'nehmen', 'nimmt': 'nehmen',
+             'gebe': 'geben', 'gibst': 'geben', 'gibt': 'geben',
+             'sehe': 'sehen', 'siehst': 'sehen', 'sieht': 'sehen',
+             'helfe': 'helfen', 'hilfst': 'helfen', 'hilft': 'helfen',
+             'treffe': 'treffen', 'triffst': 'treffen', 'trifft': 'treffen',
+             'finde': 'finden', 'findest': 'finden', 'findet': 'finden',
+             'stehe': 'stehen', 'stehst': 'stehen', 'steht': 'stehen',
+             'sitze': 'sitzen', 'sitzt': 'sitzen',
+             'liege': 'liegen', 'liegst': 'liegen', 'liegt': 'liegen',
+             'bleibe': 'bleiben', 'bleibst': 'bleiben', 'bleibt': 'bleiben',
+             'komme': 'kommen', 'kommst': 'kommen', 'kommt': 'kommen',
+             'bringe': 'bringen', 'bringst': 'bringen', 'bringt': 'bringen',
+             'trage': 'tragen', 'trägst': 'tragen', 'trägt': 'tragen',
+             'halte': 'halten', 'hältst': 'halten', 'hält': 'halten',
+             'falle': 'fallen', 'fällst': 'fallen', 'fällt': 'fallen',
+             'lasse': 'lassen', 'lässt': 'lassen',
+             'rufe': 'rufen', 'rufst': 'rufen', 'ruft': 'rufen',
+             'schreibe': 'schreiben', 'schreibst': 'schreiben', 'schreibt': 'schreiben',
+             'ziehe': 'ziehen', 'ziehst': 'ziehen', 'zieht': 'ziehen',
+             'weiß': 'wissen', 'weißt': 'wissen', 'wisst': 'wissen', 'wissen': 'wissen',
+         }
+
+         # =================================================================
+         # Articles
+         # =================================================================
+         self.articles = {
+             # Definite articles
+             'der': 'ART', 'die': 'ART', 'das': 'ART',
+             'den': 'ART', 'dem': 'ART', 'des': 'ART',
+             # Indefinite articles
+             'ein': 'ART', 'eine': 'ART', 'einer': 'ART',
+             'einem': 'ART', 'einen': 'ART', 'eines': 'ART',
+         }
+
+         # =================================================================
+         # Pronouns
+         # =================================================================
+         self.pronouns = {
+             'ich': 'PPER', 'du': 'PPER', 'er': 'PPER', 'sie': 'PPER', 'es': 'PPER',
+             'wir': 'PPER', 'ihr': 'PPER',
+             'mich': 'PPER', 'dich': 'PPER', 'ihn': 'PPER',
+             'mir': 'PPER', 'dir': 'PPER', 'ihm': 'PPER',
+             'uns': 'PPER', 'euch': 'PPER', 'ihnen': 'PPER',
+             'mein': 'PPOS', 'dein': 'PPOS', 'sein': 'PPOS',
+             'unser': 'PPOS', 'euer': 'PPOS',
+             'dieser': 'PDEM', 'diese': 'PDEM', 'dieses': 'PDEM',
+             'jener': 'PDEM', 'jene': 'PDEM', 'jenes': 'PDEM',
+         }
+
+         # =================================================================
+         # Prepositions
+         # =================================================================
+         self.prepositions = {
+             'in': 'APPR', 'an': 'APPR', 'auf': 'APPR', 'für': 'APPR',
+             'mit': 'APPR', 'von': 'APPR', 'zu': 'APPR', 'bei': 'APPR',
+             'nach': 'APPR', 'über': 'APPR', 'unter': 'APPR', 'vor': 'APPR',
+             'zwischen': 'APPR', 'durch': 'APPR', 'gegen': 'APPR',
+             'ohne': 'APPR', 'um': 'APPR', 'aus': 'APPR', 'seit': 'APPR',
+             # Contractions (preposition + article)
+             'zur': 'APPRART', 'zum': 'APPRART', 'im': 'APPRART', 'am': 'APPRART',
+             'ins': 'APPRART', 'ans': 'APPRART', 'vom': 'APPRART', 'beim': 'APPRART',
+             'aufs': 'APPRART', 'fürs': 'APPRART', 'ums': 'APPRART',
+         }
+
+         # =================================================================
+         # Conjunctions
+         # =================================================================
+         self.conjunctions = {
+             'und': 'KON', 'oder': 'KON', 'aber': 'KON', 'denn': 'KON',
+             'sondern': 'KON', 'doch': 'KON',
+             'dass': 'KOUS', 'weil': 'KOUS', 'wenn': 'KOUS', 'als': 'KOUS',
+             'ob': 'KOUS', 'obwohl': 'KOUS', 'während': 'KOUS',
+             'bevor': 'KOUS', 'nachdem': 'KOUS', 'damit': 'KOUS',
+         }
+
+         # =================================================================
+         # Modal verbs
+         # =================================================================
+         self.modal_verbs = {
+             'kann': 'können', 'kannst': 'können', 'können': 'können', 'könnt': 'können',
+             'konnte': 'können', 'konnten': 'können', 'gekonnt': 'können',
+             'muss': 'müssen', 'musst': 'müssen', 'müssen': 'müssen', 'müsst': 'müssen',
+             'musste': 'müssen', 'mussten': 'müssen', 'gemusst': 'müssen',
+             'will': 'wollen', 'willst': 'wollen', 'wollen': 'wollen', 'wollt': 'wollen',
+             'wollte': 'wollen', 'wollten': 'wollen', 'gewollt': 'wollen',
+             'soll': 'sollen', 'sollst': 'sollen', 'sollen': 'sollen', 'sollt': 'sollen',
+             'sollte': 'sollen', 'sollten': 'sollen', 'gesollt': 'sollen',
+             'darf': 'dürfen', 'darfst': 'dürfen', 'dürfen': 'dürfen', 'dürft': 'dürfen',
+             'durfte': 'dürfen', 'durften': 'dürfen', 'gedurft': 'dürfen',
+             'mag': 'mögen', 'magst': 'mögen', 'mögen': 'mögen', 'mögt': 'mögen',
+             'mochte': 'mögen', 'mochten': 'mögen', 'gemocht': 'mögen',
+         }
+
+         # =================================================================
+         # Compound-noun elements
+         # =================================================================
+         self.compound_elements = {
+             'Auto': 'NN', 'Bahn': 'NN', 'Haus': 'NN', 'Stadt': 'NN',
+             'Land': 'NN', 'Straße': 'NN', 'Platz': 'NN', 'Markt': 'NN',
+             'Arbeit': 'NN', 'Zeit': 'NN', 'Tag': 'NN', 'Jahr': 'NN',
+             'Woche': 'NN', 'Monat': 'NN', 'Geld': 'NN', 'Bank': 'NN',
+         }
+
+     def _build_domain_dictionaries(self):
+         """Build domain-specific dictionaries"""
+
+         self._domain_dictionaries[Domain.TECH] = {
+             'apfel': ('Apple', 'NE'),
+             'wolke': ('Cloud', 'NN'),
+             'netz': ('Netzwerk', 'NN'),
+         }
+
+         self._domain_dictionaries[Domain.FOOD] = {
+             'apfel': ('Apfel', 'NN'),
+         }
+
+         self._domain_dictionaries[Domain.FINANCE] = {
+             'bank': ('Bank', 'NN'),
+             'aktie': ('Aktie', 'NN'),
+         }
+
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         """Generate analysis candidates"""
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+
+         return [result]
+
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         """Analyze text"""
+         result = []
+         pos = 0
+
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             word_match = self.WORD_PATTERN.match(text[pos:])
+             if word_match:
+                 word = word_match.group()
+                 morpheme = self._analyze_word(word, pos, domain)
+                 result.append(morpheme)
+                 pos += len(word)
+                 continue
+
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='CARD', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='XY', start=pos, end=pos + 1))
+             pos += 1
+
+         return result
+
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+         """Analyze a single word"""
+         word_lower = word.lower()
+
+         # Runtime (user) dictionary
+         if word_lower in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word_lower]
+             return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+         # Domain dictionary
+         domain_sense = self._get_domain_sense(word_lower, domain)
+         if domain_sense:
+             return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+         # Function words
+         if word_lower in self.articles:
+             return Morpheme(surface=word, lemma=word_lower, pos=self.articles[word_lower], start=offset, end=offset + len(word))
+         if word_lower in self.pronouns:
+             return Morpheme(surface=word, lemma=word_lower, pos=self.pronouns[word_lower], start=offset, end=offset + len(word))
+         if word_lower in self.prepositions:
+             return Morpheme(surface=word, lemma=word_lower, pos=self.prepositions[word_lower], start=offset, end=offset + len(word))
+         if word_lower in self.conjunctions:
+             return Morpheme(surface=word, lemma=word_lower, pos=self.conjunctions[word_lower], start=offset, end=offset + len(word))
+
+         # Irregular verbs
+         if word_lower in self.irregular_verbs:
+             return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='VVFIN', start=offset, end=offset + len(word))
+
+         # Modal verbs
+         if word_lower in self.modal_verbs:
+             return Morpheme(surface=word, lemma=self.modal_verbs[word_lower], pos='VMFIN', start=offset, end=offset + len(word))
+
+         # Morphological analysis
+         lemma, pos_tag = self._analyze_morphology(word)
+         return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+     def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+         """Morphological analysis"""
+         # Capitalized words are nouns in German; check before the verb-suffix heuristics
+         if word[0].isupper():
+             return (word, 'NN')
+
+         # -en verb ending (infinitive)
+         if word.endswith('en') and len(word) > 3:
+             return (word, 'VVINF')
+
+         # -t verb ending (3rd person)
+         if word.endswith('t') and len(word) > 2:
+             return (word[:-1] + 'en', 'VVFIN')
+
+         # -ung nouns
+         if word.endswith('ung') and len(word) > 4:
+             return (word, 'NN')
+
+         # -heit/-keit nouns
+         if word.endswith(('heit', 'keit')) and len(word) > 5:
+             return (word, 'NN')
+
+         # -lich/-ig adjectives
+         if word.endswith(('lich', 'ig')) and len(word) > 4:
+             return (word, 'ADJD')
+
+         return (word, 'NN')
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         """Generate alternative analyses"""
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+
+         return alternatives
+
+
+ GermanAnalyzer = GermanAdvancedAnalyzer
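
The same kind of hypothetical sketch for the German analyzer (same caveats as the French example: internal method, base class not shown in this diff). Note how the FINANCE domain dictionary resolves "Bank" before the function-word and suffix checks run.

    # Hypothetical sketch -- same assumptions as the French example above.
    from tokmor.morphology.german_advanced import GermanAdvancedAnalyzer
    from tokmor.morphology.advanced_base import Domain

    analyzer = GermanAdvancedAnalyzer()
    for m in analyzer._analyze_text("ich gehe zur Bank", Domain.FINANCE):
        print(m.surface, m.lemma, m.pos)
    # Per the dictionaries above: ich/ich/PPER, gehe/gehen/VVFIN, zur/zur/APPRART,
    # Bank/Bank/NN (via the FINANCE domain dictionary)
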