tokmor-1.2.9-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
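
The two hunks below are the full contents of tokmor/morphology/french_advanced.py (+237 lines) and tokmor/morphology/german_advanced.py (+343 lines). Going by the module paths in the manifest, the analyzers would be imported along these lines (a sketch only; the package's public entry points live in tokmor/api.py and tokmor/factory.py, which are not reproduced in this diff):

    # Hypothetical usage sketch based on the module paths above; assumes the
    # base class in advanced_base.py invokes the _build_* hooks on construction.
    from tokmor.morphology.french_advanced import FrenchAdvancedAnalyzer
    from tokmor.morphology.german_advanced import GermanAdvancedAnalyzer

    fr = FrenchAdvancedAnalyzer()
    de = GermanAdvancedAnalyzer()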
@@ -0,0 +1,237 @@ tokmor/morphology/french_advanced.py

"""
French Advanced Morphological Analyzer
======================================

A French morphological analyzer supporting five advanced features.
"""

import re
from typing import List, Tuple, Dict, Optional

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)


class FrenchAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """French advanced morphological analyzer"""

    LANG_CODE = "fr"
    LANG_NAME = "French"

    # A word is letters, optional hyphenated parts, and at most one
    # apostrophe-joined part (elided clitics stay attached to the token).
    WORD_PATTERN = re.compile(r"[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+(?:-[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+)*(?:'[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]+)?")
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base dictionary."""

        # Irregular verbs (être, avoir, aller, faire, ...)
        self.irregular_verbs = {
            # être
            'suis': 'être', 'es': 'être', 'est': 'être',
            'sommes': 'être', 'êtes': 'être', 'sont': 'être',
            'étais': 'être', 'était': 'être', 'étions': 'être',
            'étiez': 'être', 'étaient': 'être', 'été': 'être',
            # avoir
            'ai': 'avoir', 'as': 'avoir', 'a': 'avoir',
            'avons': 'avoir', 'avez': 'avoir', 'ont': 'avoir',
            'avais': 'avoir', 'avait': 'avoir', 'avions': 'avoir',
            'aviez': 'avoir', 'avaient': 'avoir', 'eu': 'avoir',
            # aller
            'vais': 'aller', 'vas': 'aller', 'va': 'aller',
            'allons': 'aller', 'allez': 'aller', 'vont': 'aller',
            'allais': 'aller', 'allait': 'aller', 'allé': 'aller',
            # faire
            'fais': 'faire', 'fait': 'faire', 'faisons': 'faire',
            'faites': 'faire', 'font': 'faire', 'faisais': 'faire',
            # pouvoir
            'peux': 'pouvoir', 'peut': 'pouvoir', 'pouvons': 'pouvoir',
            'pouvez': 'pouvoir', 'peuvent': 'pouvoir', 'pu': 'pouvoir',
            # vouloir
            'veux': 'vouloir', 'veut': 'vouloir', 'voulons': 'vouloir',
            'voulez': 'vouloir', 'veulent': 'vouloir', 'voulu': 'vouloir',
            # savoir
            'sais': 'savoir', 'sait': 'savoir', 'savons': 'savoir',
            'savez': 'savoir', 'savent': 'savoir', 'su': 'savoir',
            # venir
            'viens': 'venir', 'vient': 'venir', 'venons': 'venir',
            'venez': 'venir', 'viennent': 'venir', 'venu': 'venir',
            # prendre
            'prends': 'prendre', 'prend': 'prendre', 'prenons': 'prendre',
            'prenez': 'prendre', 'prennent': 'prendre', 'pris': 'prendre',
        }

        # Articles
        self.articles = {
            'le': 'DET', 'la': 'DET', 'les': 'DET', "l'": 'DET',
            'un': 'DET', 'une': 'DET', 'des': 'DET',
            'du': 'DET', 'de': 'DET', "d'": 'DET',
            'au': 'DET', 'aux': 'DET',
        }

        # Pronouns
        self.pronouns = {
            'je': 'PRON', 'tu': 'PRON', 'il': 'PRON', 'elle': 'PRON',
            'on': 'PRON', 'nous': 'PRON', 'vous': 'PRON', 'ils': 'PRON', 'elles': 'PRON',
            'me': 'PRON', 'te': 'PRON', 'se': 'PRON', 'lui': 'PRON', 'leur': 'PRON',
            'ce': 'PRON', 'cela': 'PRON', 'ça': 'PRON', 'ceci': 'PRON',
            'qui': 'PRON', 'que': 'PRON', 'quoi': 'PRON', 'dont': 'PRON',
        }

        # Prepositions (note: 'de' is also listed under articles, which are
        # checked first in _analyze_word, so the DET reading always wins)
        self.prepositions = {
            'à': 'PREP', 'de': 'PREP', 'en': 'PREP', 'dans': 'PREP',
            'sur': 'PREP', 'sous': 'PREP', 'avec': 'PREP', 'sans': 'PREP',
            'pour': 'PREP', 'par': 'PREP', 'chez': 'PREP', 'vers': 'PREP',
            'entre': 'PREP', 'contre': 'PREP', 'depuis': 'PREP', 'pendant': 'PREP',
            'avant': 'PREP', 'après': 'PREP', 'devant': 'PREP', 'derrière': 'PREP',
        }

        # Conjunctions
        self.conjunctions = {
            'et': 'CONJ', 'ou': 'CONJ', 'mais': 'CONJ', 'donc': 'CONJ',
            'car': 'CONJ', 'ni': 'CONJ', 'or': 'CONJ',
            'que': 'CONJ', 'si': 'CONJ', 'quand': 'CONJ', 'comme': 'CONJ',
            'parce': 'CONJ', 'puisque': 'CONJ', 'lorsque': 'CONJ',
        }

        # Adverbs
        self.adverbs = {
            'très': 'ADV', 'bien': 'ADV', 'mal': 'ADV', 'peu': 'ADV',
            'beaucoup': 'ADV', 'trop': 'ADV', 'assez': 'ADV', 'plus': 'ADV',
            'moins': 'ADV', 'aussi': 'ADV', 'encore': 'ADV', 'toujours': 'ADV',
            'jamais': 'ADV', 'souvent': 'ADV', 'parfois': 'ADV', 'ici': 'ADV',
            'là': 'ADV', 'maintenant': 'ADV', 'déjà': 'ADV', 'bientôt': 'ADV',
        }

    def _build_domain_dictionaries(self):
        """Domain-specific dictionaries."""
        self._domain_dictionaries[Domain.TECH] = {
            'pomme': ('Apple', 'NP'),
            'nuage': ('cloud', 'NC'),
        }
        self._domain_dictionaries[Domain.FOOD] = {
            'pomme': ('pomme', 'NC'),
        }
        self._domain_dictionaries[Domain.FINANCE] = {
            'banque': ('banque', 'NC'),
            'action': ('action', 'NC'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        word_lower = word.lower()

        # Runtime (user) dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # Domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # Function words
        if word_lower in self.articles:
            return Morpheme(surface=word, lemma=word_lower, pos='DET', start=offset, end=offset + len(word))
        if word_lower in self.pronouns:
            return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
        if word_lower in self.prepositions:
            return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
        if word_lower in self.conjunctions:
            return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
        if word_lower in self.adverbs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))

        # Irregular verbs
        if word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))

        # Suffix-based morphological fallback
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        # -er verbs (first group)
        if word.endswith('er') and len(word) > 3:
            return (word, 'V')

        # -ir verbs (second group)
        if word.endswith('ir') and len(word) > 3:
            return (word, 'V')

        # -re verbs (third group)
        if word.endswith('re') and len(word) > 3:
            return (word, 'V')

        # -tion/-sion nouns
        if word.endswith(('tion', 'sion')) and len(word) > 5:
            return (word, 'NC')

        # -ment adverbs
        if word.endswith('ment') and len(word) > 5:
            return (word, 'ADV')

        # -eux/-euse adjectives
        if word.endswith(('eux', 'euse')) and len(word) > 4:
            return (word, 'ADJ')

        # Capitalized: proper noun
        if word[0].isupper():
            return (word, 'NP')

        return (word, 'NC')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives


FrenchAnalyzer = FrenchAdvancedAnalyzer
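
One subtlety in the French tokenizer: WORD_PATTERN keeps hyphenated compounds together and attaches a single elided clitic to the following word. A minimal standalone check of just the regex, independent of the package:

    import re

    # Same pattern as FrenchAdvancedAnalyzer.WORD_PATTERN: letters, optional
    # hyphenated parts, at most one apostrophe-joined part.
    LETTERS = r"[a-zA-ZàâäéèêëïîôùûüÿœæçÀÂÄÉÈÊËÏÎÔÙÛÜŸŒÆÇ]"
    WORD = re.compile(LETTERS + r"+(?:-" + LETTERS + r"+)*(?:'" + LETTERS + r"+)?")

    print(WORD.findall("Est-ce qu'il l'a vu aujourd'hui ?"))
    # -> ['Est-ce', "qu'il", "l'a", 'vu', "aujourd'hui"]

Because the elision stays attached to the token ("l'a" rather than "l'" + "a"), the "l'" and "d'" keys in self.articles appear unreachable through this tokenizer. Note also that the pattern hard-codes the ASCII apostrophe, so text using the typographic apostrophe (U+2019) would not match the elision branch.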
@@ -0,0 +1,343 @@ tokmor/morphology/german_advanced.py

"""
German Advanced Morphological Analyzer
======================================

A German morphological analyzer supporting five advanced features.

Features:
1. NER Gazetteer Integration - preserves named-entity boundaries
2. Real-time Dictionary Extension - runtime dictionary extension
3. Domain Adaptation - per-domain analysis optimization
4. Code-switching - handles mixed-language text
5. N-best Analysis - multiple candidates with confidence scores
"""

import re
from typing import List, Tuple, Dict, Set, Optional, Any

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)


class GermanAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    German advanced morphological analyzer

    Features:
    - compound-noun decomposition
    - strong/weak inflection handling
    - separable-verb handling
    """

    LANG_CODE = "de"
    LANG_NAME = "German"

    WORD_PATTERN = re.compile(r'[a-zA-ZäöüÄÖÜß]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base dictionary."""

        # =================================================================
        # Irregular verbs (strong verbs)
        # =================================================================
        self.irregular_verbs = {
            # sein
            'bin': 'sein', 'bist': 'sein', 'ist': 'sein',
            'sind': 'sein', 'seid': 'sein', 'war': 'sein',
            'warst': 'sein', 'waren': 'sein', 'wart': 'sein',
            'gewesen': 'sein',
            # haben
            'habe': 'haben', 'hast': 'haben', 'hat': 'haben',
            'habt': 'haben', 'hatte': 'haben', 'hattest': 'haben',
            'hatten': 'haben', 'hattet': 'haben', 'gehabt': 'haben',
            # werden
            'werde': 'werden', 'wirst': 'werden', 'wird': 'werden',
            'werdet': 'werden', 'wurde': 'werden', 'wurdest': 'werden',
            'wurden': 'werden', 'wurdet': 'werden', 'geworden': 'werden',
            # Other strong verbs
            'ging': 'gehen', 'gegangen': 'gehen',
            'kam': 'kommen', 'gekommen': 'kommen',
            'sah': 'sehen', 'gesehen': 'sehen',
            'nahm': 'nehmen', 'genommen': 'nehmen',
            'gab': 'geben', 'gegeben': 'geben',
            'fand': 'finden', 'gefunden': 'finden',
            'sprach': 'sprechen', 'gesprochen': 'sprechen',
            'trug': 'tragen', 'getragen': 'tragen',
            'fuhr': 'fahren', 'gefahren': 'fahren',
            'schlief': 'schlafen', 'geschlafen': 'schlafen',
            'lief': 'laufen', 'gelaufen': 'laufen',
            # Regular verb conjugations (gehen, machen, etc.)
            'gehe': 'gehen', 'gehst': 'gehen', 'geht': 'gehen',
            'mache': 'machen', 'machst': 'machen', 'macht': 'machen',
            'sage': 'sagen', 'sagst': 'sagen', 'sagt': 'sagen',
            'arbeite': 'arbeiten', 'arbeitest': 'arbeiten', 'arbeitet': 'arbeiten',
            'lerne': 'lernen', 'lernst': 'lernen', 'lernt': 'lernen',
            'spiele': 'spielen', 'spielst': 'spielen', 'spielt': 'spielen',
            'kaufe': 'kaufen', 'kaufst': 'kaufen', 'kauft': 'kaufen',
            'frage': 'fragen', 'fragst': 'fragen', 'fragt': 'fragen',
            'höre': 'hören', 'hörst': 'hören', 'hört': 'hören',
            'lebe': 'leben', 'lebst': 'leben', 'lebt': 'leben',
            'liebe': 'lieben', 'liebst': 'lieben', 'liebt': 'lieben',
            'warte': 'warten', 'wartest': 'warten', 'wartet': 'warten',
            'öffne': 'öffnen', 'öffnest': 'öffnen', 'öffnet': 'öffnen',
            'zeige': 'zeigen', 'zeigst': 'zeigen', 'zeigt': 'zeigen',
            'brauche': 'brauchen', 'brauchst': 'brauchen', 'braucht': 'brauchen',
            'glaube': 'glauben', 'glaubst': 'glauben', 'glaubt': 'glauben',
            'denke': 'denken', 'denkst': 'denken', 'denkt': 'denken',
            'kenne': 'kennen', 'kennst': 'kennen', 'kennt': 'kennen',
            'wohne': 'wohnen', 'wohnst': 'wohnen', 'wohnt': 'wohnen',
            'suche': 'suchen', 'suchst': 'suchen', 'sucht': 'suchen',
            'folge': 'folgen', 'folgst': 'folgen', 'folgt': 'folgen',
            'führe': 'führen', 'führst': 'führen', 'führt': 'führen',
            'laufe': 'laufen', 'läufst': 'laufen', 'läuft': 'laufen',
            'fahre': 'fahren', 'fährst': 'fahren', 'fährt': 'fahren',
            'lese': 'lesen', 'liest': 'lesen',
            'esse': 'essen', 'isst': 'essen',
            'schlafe': 'schlafen', 'schläfst': 'schlafen', 'schläft': 'schlafen',
            'spreche': 'sprechen', 'sprichst': 'sprechen', 'spricht': 'sprechen',
            'nehme': 'nehmen', 'nimmst': 'nehmen', 'nimmt': 'nehmen',
            'gebe': 'geben', 'gibst': 'geben', 'gibt': 'geben',
            'sehe': 'sehen', 'siehst': 'sehen', 'sieht': 'sehen',
            'helfe': 'helfen', 'hilfst': 'helfen', 'hilft': 'helfen',
            'treffe': 'treffen', 'triffst': 'treffen', 'trifft': 'treffen',
            'finde': 'finden', 'findest': 'finden', 'findet': 'finden',
            'stehe': 'stehen', 'stehst': 'stehen', 'steht': 'stehen',
            'sitze': 'sitzen', 'sitzt': 'sitzen',
            'liege': 'liegen', 'liegst': 'liegen', 'liegt': 'liegen',
            'bleibe': 'bleiben', 'bleibst': 'bleiben', 'bleibt': 'bleiben',
            'komme': 'kommen', 'kommst': 'kommen', 'kommt': 'kommen',
            'bringe': 'bringen', 'bringst': 'bringen', 'bringt': 'bringen',
            'trage': 'tragen', 'trägst': 'tragen', 'trägt': 'tragen',
            'halte': 'halten', 'hältst': 'halten', 'hält': 'halten',
            'falle': 'fallen', 'fällst': 'fallen', 'fällt': 'fallen',
            'lasse': 'lassen', 'lässt': 'lassen',
            'rufe': 'rufen', 'rufst': 'rufen', 'ruft': 'rufen',
            'schreibe': 'schreiben', 'schreibst': 'schreiben', 'schreibt': 'schreiben',
            'ziehe': 'ziehen', 'ziehst': 'ziehen', 'zieht': 'ziehen',
            'weiß': 'wissen', 'weißt': 'wissen', 'wisst': 'wissen', 'wissen': 'wissen',
        }

        # =================================================================
        # Articles
        # =================================================================
        self.articles = {
            # Definite articles
            'der': 'ART', 'die': 'ART', 'das': 'ART',
            'den': 'ART', 'dem': 'ART', 'des': 'ART',
            # Indefinite articles
            'ein': 'ART', 'eine': 'ART', 'einer': 'ART',
            'einem': 'ART', 'einen': 'ART', 'eines': 'ART',
        }

        # =================================================================
        # Pronouns
        # =================================================================
        self.pronouns = {
            'ich': 'PPER', 'du': 'PPER', 'er': 'PPER', 'sie': 'PPER', 'es': 'PPER',
            'wir': 'PPER', 'ihr': 'PPER',
            'mich': 'PPER', 'dich': 'PPER', 'ihn': 'PPER',
            'mir': 'PPER', 'dir': 'PPER', 'ihm': 'PPER',
            'uns': 'PPER', 'euch': 'PPER', 'ihnen': 'PPER',
            'mein': 'PPOS', 'dein': 'PPOS', 'sein': 'PPOS',
            'unser': 'PPOS', 'euer': 'PPOS',
            'dieser': 'PDEM', 'diese': 'PDEM', 'dieses': 'PDEM',
            'jener': 'PDEM', 'jene': 'PDEM', 'jenes': 'PDEM',
        }

        # =================================================================
        # Prepositions
        # =================================================================
        self.prepositions = {
            'in': 'APPR', 'an': 'APPR', 'auf': 'APPR', 'für': 'APPR',
            'mit': 'APPR', 'von': 'APPR', 'zu': 'APPR', 'bei': 'APPR',
            'nach': 'APPR', 'über': 'APPR', 'unter': 'APPR', 'vor': 'APPR',
            'zwischen': 'APPR', 'durch': 'APPR', 'gegen': 'APPR',
            'ohne': 'APPR', 'um': 'APPR', 'aus': 'APPR', 'seit': 'APPR',
            # Contractions (preposition + article)
            'zur': 'APPRART', 'zum': 'APPRART', 'im': 'APPRART', 'am': 'APPRART',
            'ins': 'APPRART', 'ans': 'APPRART', 'vom': 'APPRART', 'beim': 'APPRART',
            'aufs': 'APPRART', 'fürs': 'APPRART', 'ums': 'APPRART',
        }

        # =================================================================
        # Conjunctions
        # =================================================================
        self.conjunctions = {
            'und': 'KON', 'oder': 'KON', 'aber': 'KON', 'denn': 'KON',
            'sondern': 'KON', 'doch': 'KON',
            'dass': 'KOUS', 'weil': 'KOUS', 'wenn': 'KOUS', 'als': 'KOUS',
            'ob': 'KOUS', 'obwohl': 'KOUS', 'während': 'KOUS',
            'bevor': 'KOUS', 'nachdem': 'KOUS', 'damit': 'KOUS',
        }

        # =================================================================
        # Modal verbs
        # =================================================================
        self.modal_verbs = {
            'kann': 'können', 'kannst': 'können', 'können': 'können', 'könnt': 'können',
            'konnte': 'können', 'konnten': 'können', 'gekonnt': 'können',
            'muss': 'müssen', 'musst': 'müssen', 'müssen': 'müssen', 'müsst': 'müssen',
            'musste': 'müssen', 'mussten': 'müssen', 'gemusst': 'müssen',
            'will': 'wollen', 'willst': 'wollen', 'wollen': 'wollen', 'wollt': 'wollen',
            'wollte': 'wollen', 'wollten': 'wollen', 'gewollt': 'wollen',
            'soll': 'sollen', 'sollst': 'sollen', 'sollen': 'sollen', 'sollt': 'sollen',
            'sollte': 'sollen', 'sollten': 'sollen', 'gesollt': 'sollen',
            'darf': 'dürfen', 'darfst': 'dürfen', 'dürfen': 'dürfen', 'dürft': 'dürfen',
            'durfte': 'dürfen', 'durften': 'dürfen', 'gedurft': 'dürfen',
            'mag': 'mögen', 'magst': 'mögen', 'mögen': 'mögen', 'mögt': 'mögen',
            'mochte': 'mögen', 'mochten': 'mögen', 'gemocht': 'mögen',
        }

        # =================================================================
        # Compound-noun elements
        # =================================================================
        self.compound_elements = {
            'Auto': 'NN', 'Bahn': 'NN', 'Haus': 'NN', 'Stadt': 'NN',
            'Land': 'NN', 'Straße': 'NN', 'Platz': 'NN', 'Markt': 'NN',
            'Arbeit': 'NN', 'Zeit': 'NN', 'Tag': 'NN', 'Jahr': 'NN',
            'Woche': 'NN', 'Monat': 'NN', 'Geld': 'NN', 'Bank': 'NN',
        }

    def _build_domain_dictionaries(self):
        """Build domain-specific dictionaries."""

        self._domain_dictionaries[Domain.TECH] = {
            'apfel': ('Apple', 'NE'),
            'wolke': ('Cloud', 'NN'),
            'netz': ('Netzwerk', 'NN'),
        }

        self._domain_dictionaries[Domain.FOOD] = {
            'apfel': ('Apfel', 'NN'),
        }

        self._domain_dictionaries[Domain.FINANCE] = {
            'bank': ('Bank', 'NN'),
            'aktie': ('Aktie', 'NN'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Generate analysis candidates."""
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)

        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Analyze text."""
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='CARD', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='XY', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Analyze a single word."""
        word_lower = word.lower()

        # Runtime (user) dictionary
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        # Domain dictionary
        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        # Function words
        if word_lower in self.articles:
            return Morpheme(surface=word, lemma=word_lower, pos=self.articles[word_lower], start=offset, end=offset + len(word))
        if word_lower in self.pronouns:
            return Morpheme(surface=word, lemma=word_lower, pos=self.pronouns[word_lower], start=offset, end=offset + len(word))
        if word_lower in self.prepositions:
            return Morpheme(surface=word, lemma=word_lower, pos=self.prepositions[word_lower], start=offset, end=offset + len(word))
        if word_lower in self.conjunctions:
            return Morpheme(surface=word, lemma=word_lower, pos=self.conjunctions[word_lower], start=offset, end=offset + len(word))

        # Irregular verbs
        if word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='VVFIN', start=offset, end=offset + len(word))

        # Modal verbs
        if word_lower in self.modal_verbs:
            return Morpheme(surface=word, lemma=self.modal_verbs[word_lower], pos='VMFIN', start=offset, end=offset + len(word))

        # Suffix-based morphological fallback
        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        """Suffix-based morphological analysis."""
        # -en verb ending (infinitive)
        if word.endswith('en') and len(word) > 3:
            return (word, 'VVINF')

        # -t verb ending (3rd person); note this branch also matches -heit/-keit
        # words and capitalized nouns ending in -t, shadowing the checks below
        if word.endswith('t') and len(word) > 2:
            return (word[:-1] + 'en', 'VVFIN')

        # -ung nouns
        if word.endswith('ung') and len(word) > 4:
            return (word, 'NN')

        # -heit/-keit nouns
        if word.endswith(('heit', 'keit')) and len(word) > 5:
            return (word, 'NN')

        # -lich/-ig adjectives
        if word.endswith(('lich', 'ig')) and len(word) > 4:
            return (word, 'ADJD')

        # Capitalized: noun
        if word[0].isupper():
            return (word, 'NN')

        return (word, 'NN')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Generate alternatives."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]

        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)

        return alternatives


GermanAnalyzer = GermanAdvancedAnalyzer
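
The German fallback in _analyze_morphology runs its verb-suffix rules before the noun checks, and rule order matters. A standalone restatement of those heuristics (no tokmor imports, reproduced only to illustrate the ordering) shows the consequence:

    def guess_lemma_pos(word: str) -> tuple:
        # Restatement of GermanAdvancedAnalyzer._analyze_morphology.
        if word.endswith('en') and len(word) > 3:
            return (word, 'VVINF')
        if word.endswith('t') and len(word) > 2:
            return (word[:-1] + 'en', 'VVFIN')   # runs before the noun checks
        if word.endswith('ung') and len(word) > 4:
            return (word, 'NN')
        if word.endswith(('heit', 'keit')) and len(word) > 5:
            return (word, 'NN')
        if word.endswith(('lich', 'ig')) and len(word) > 4:
            return (word, 'ADJD')
        if word[0].isupper():
            return (word, 'NN')
        return (word, 'NN')

    print(guess_lemma_pos('Zeitung'))   # ('Zeitung', 'NN')
    print(guess_lemma_pos('Zeit'))      # ('Zeiten', 'VVFIN') - noun treated as verb
    print(guess_lemma_pos('Freiheit'))  # ('Freiheien', 'VVFIN') - the -heit/-keit
                                        # branch is shadowed by the -t rule

In the shipped class this fallback only applies to words that fall through every dictionary lookup; known forms such as 'geht' or 'macht' are caught earlier by self.irregular_verbs.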