tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/russian_advanced.py
@@ -0,0 +1,217 @@
"""
Russian Advanced Morphological Analyzer
=======================================

Russian morphological analyzer supporting five advanced features.
"""

import re
from typing import List, Tuple, Dict, Optional

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)


class RussianAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """Russian advanced morphological analyzer (Cyrillic script)"""

    LANG_CODE = "ru"
    LANG_NAME = "Russian"

    # Cyrillic character patterns
    WORD_PATTERN = re.compile(r'[а-яА-ЯёЁ]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base dictionaries"""

        # Irregular verbs (быть)
        self.irregular_verbs = {
            # быть (be)
            'есть': 'быть', 'был': 'быть', 'была': 'быть', 'было': 'быть',
            'были': 'быть', 'буду': 'быть', 'будешь': 'быть', 'будет': 'быть',
            'будем': 'быть', 'будете': 'быть', 'будут': 'быть',
            # идти (go)
            'иду': 'идти', 'идёшь': 'идти', 'идёт': 'идти',
            'идём': 'идти', 'идёте': 'идти', 'идут': 'идти',
            'шёл': 'идти', 'шла': 'идти', 'шло': 'идти', 'шли': 'идти',
            # хотеть (want)
            'хочу': 'хотеть', 'хочешь': 'хотеть', 'хочет': 'хотеть',
            'хотим': 'хотеть', 'хотите': 'хотеть', 'хотят': 'хотеть',
            # мочь (can)
            'могу': 'мочь', 'можешь': 'мочь', 'может': 'мочь',
            'можем': 'мочь', 'можете': 'мочь', 'могут': 'мочь',
            # есть (eat)
            'ем': 'есть', 'ешь': 'есть', 'ест': 'есть',
            'едим': 'есть', 'едите': 'есть', 'едят': 'есть',
            # давать (give)
            'даю': 'давать', 'даёшь': 'давать', 'даёт': 'давать',
        }

        # Pronouns
        self.pronouns = {
            'я': 'PRON', 'ты': 'PRON', 'он': 'PRON', 'она': 'PRON', 'оно': 'PRON',
            'мы': 'PRON', 'вы': 'PRON', 'они': 'PRON',
            'меня': 'PRON', 'тебя': 'PRON', 'его': 'PRON', 'её': 'PRON',
            'нас': 'PRON', 'вас': 'PRON', 'их': 'PRON',
            'мне': 'PRON', 'тебе': 'PRON', 'ему': 'PRON', 'ей': 'PRON',
            'нам': 'PRON', 'вам': 'PRON', 'им': 'PRON',
            'кто': 'PRON', 'что': 'PRON', 'какой': 'PRON', 'который': 'PRON',
            'этот': 'PRON', 'тот': 'PRON', 'весь': 'PRON', 'сам': 'PRON',
        }

        # Prepositions
        self.prepositions = {
            'в': 'PREP', 'на': 'PREP', 'с': 'PREP', 'к': 'PREP', 'по': 'PREP',
            'за': 'PREP', 'из': 'PREP', 'от': 'PREP', 'до': 'PREP', 'о': 'PREP',
            'об': 'PREP', 'у': 'PREP', 'при': 'PREP', 'над': 'PREP', 'под': 'PREP',
            'перед': 'PREP', 'между': 'PREP', 'без': 'PREP', 'через': 'PREP',
        }

        # Conjunctions
        self.conjunctions = {
            'и': 'CONJ', 'а': 'CONJ', 'но': 'CONJ', 'или': 'CONJ',
            'что': 'CONJ', 'чтобы': 'CONJ', 'если': 'CONJ', 'когда': 'CONJ',
            'потому': 'CONJ', 'хотя': 'CONJ', 'пока': 'CONJ', 'как': 'CONJ',
        }

        # Adverbs
        self.adverbs = {
            'очень': 'ADV', 'хорошо': 'ADV', 'плохо': 'ADV', 'быстро': 'ADV',
            'медленно': 'ADV', 'много': 'ADV', 'мало': 'ADV', 'тоже': 'ADV',
            'уже': 'ADV', 'ещё': 'ADV', 'всегда': 'ADV', 'никогда': 'ADV',
            'здесь': 'ADV', 'там': 'ADV', 'сейчас': 'ADV', 'потом': 'ADV',
            'тогда': 'ADV', 'давно': 'ADV', 'скоро': 'ADV', 'вместе': 'ADV',
        }

        # Particles
        self.particles = {
            'не': 'PART', 'ни': 'PART', 'же': 'PART', 'бы': 'PART',
            'ли': 'PART', 'да': 'PART', 'нет': 'PART', 'вот': 'PART',
        }

    def _build_domain_dictionaries(self):
        """Domain-specific dictionaries"""
        self._domain_dictionaries[Domain.TECH] = {
            'яблоко': ('Apple', 'NP'),
            'облако': ('cloud', 'NC'),
        }
        self._domain_dictionaries[Domain.FOOD] = {
            'яблоко': ('яблоко', 'NC'),
        }
        self._domain_dictionaries[Domain.FINANCE] = {
            'банк': ('банк', 'NC'),
            'акция': ('акция', 'NC'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        if not text or not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            # Latin letters (loanwords/English)
            latin_match = re.match(r'[a-zA-Z]+', text[pos:])
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        word_lower = word.lower()

        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        if word_lower in self.pronouns:
            return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
        if word_lower in self.prepositions:
            return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
        if word_lower in self.conjunctions:
            return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
        if word_lower in self.adverbs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))
        if word_lower in self.particles:
            return Morpheme(surface=word, lemma=word_lower, pos='PART', start=offset, end=offset + len(word))

        if word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))

        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        # -ть verb infinitive
        if word.endswith('ть') and len(word) > 3:
            return (word, 'V')
        # -ся reflexive verb
        if word.endswith('ся') and len(word) > 4:
            return (word[:-2], 'V')
        # -ние/-ение nouns
        if word.endswith(('ние', 'ение', 'ание')) and len(word) > 5:
            return (word, 'NC')
        # -ость/-есть nouns
        if word.endswith(('ость', 'есть')) and len(word) > 5:
            return (word, 'NC')
        # -ый/-ий/-ой adjectives
        if word.endswith(('ый', 'ий', 'ой')) and len(word) > 3:
            return (word, 'ADJ')
        # -ая/-яя adjectives (feminine)
        if word.endswith(('ая', 'яя')) and len(word) > 3:
            return (word, 'ADJ')
        # Capitalized (proper noun)
        if word[0].isupper():
            return (word, 'NP')
        return (word, 'NC')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives


RussianAnalyzer = RussianAdvancedAnalyzer
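The analyzer above resolves each token through a fixed cascade: user dictionary, domain dictionary, closed-class word lists, irregular-verb table, then suffix heuristics. A minimal usage sketch follows; it calls the internal `_generate_candidates` hook shown in this diff and assumes the `AdvancedMorphologicalAnalyzer` base constructor (not part of this diff) invokes `_build_base_dictionary` and `_build_domain_dictionaries`, as the subclass code implies.

# Hypothetical sketch -- the public entry point of the base class is not
# shown in this diff, so the internal hook is exercised directly.
from tokmor.morphology.russian_advanced import RussianAdvancedAnalyzer
from tokmor.morphology.advanced_base import Domain

analyzer = RussianAdvancedAnalyzer()
best = analyzer._generate_candidates("Я иду в банк", Domain.FINANCE)[0]
for m in best.morphemes:
    # 'Я' is a pronoun; 'иду' maps through irregular_verbs to lemma 'идти';
    # 'в' is a preposition; 'банк' hits the FINANCE domain dictionary.
    print(m.surface, m.lemma, m.pos, m.start, m.end)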
tokmor/morphology/spanish_advanced.py
@@ -0,0 +1,226 @@
"""
Spanish Advanced Morphological Analyzer
=======================================

Spanish morphological analyzer supporting five advanced features.
"""

import re
from typing import List, Tuple, Dict, Optional

from .advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
)


class SpanishAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """Spanish advanced morphological analyzer"""

    LANG_CODE = "es"
    LANG_NAME = "Spanish"

    WORD_PATTERN = re.compile(r"[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+")
    NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base dictionaries"""

        # Irregular verbs (ser, estar, ir, tener, hacer)
        self.irregular_verbs = {
            # ser
            'soy': 'ser', 'eres': 'ser', 'es': 'ser',
            'somos': 'ser', 'sois': 'ser', 'son': 'ser',
            'era': 'ser', 'eras': 'ser', 'éramos': 'ser',
            'erais': 'ser', 'eran': 'ser', 'fue': 'ser',
            'fuiste': 'ser', 'fuimos': 'ser', 'fueron': 'ser',
            # estar
            'estoy': 'estar', 'estás': 'estar', 'está': 'estar',
            'estamos': 'estar', 'estáis': 'estar', 'están': 'estar',
            'estaba': 'estar', 'estuve': 'estar', 'estuvo': 'estar',
            # ir
            'voy': 'ir', 'vas': 'ir', 'va': 'ir',
            'vamos': 'ir', 'vais': 'ir', 'van': 'ir',
            'iba': 'ir', 'ibas': 'ir', 'íbamos': 'ir',
            # tener
            'tengo': 'tener', 'tienes': 'tener', 'tiene': 'tener',
            'tenemos': 'tener', 'tenéis': 'tener', 'tienen': 'tener',
            'tenía': 'tener', 'tuve': 'tener', 'tuvo': 'tener',
            # hacer
            'hago': 'hacer', 'haces': 'hacer', 'hace': 'hacer',
            'hacemos': 'hacer', 'hacéis': 'hacer', 'hacen': 'hacer',
            'hacía': 'hacer', 'hice': 'hacer', 'hizo': 'hacer',
            # poder
            'puedo': 'poder', 'puedes': 'poder', 'puede': 'poder',
            'podemos': 'poder', 'podéis': 'poder', 'pueden': 'poder',
            'podía': 'poder', 'pude': 'poder', 'pudo': 'poder',
            # querer
            'quiero': 'querer', 'quieres': 'querer', 'quiere': 'querer',
            'queremos': 'querer', 'queréis': 'querer', 'quieren': 'querer',
            # saber
            'sé': 'saber', 'sabes': 'saber', 'sabe': 'saber',
            'sabemos': 'saber', 'sabéis': 'saber', 'saben': 'saber',
            # venir
            'vengo': 'venir', 'vienes': 'venir', 'viene': 'venir',
            'venimos': 'venir', 'venís': 'venir', 'vienen': 'venir',
            # decir
            'digo': 'decir', 'dices': 'decir', 'dice': 'decir',
            'decimos': 'decir', 'decís': 'decir', 'dicen': 'decir',
        }

        # Articles
        self.articles = {
            'el': 'DET', 'la': 'DET', 'los': 'DET', 'las': 'DET',
            'un': 'DET', 'una': 'DET', 'unos': 'DET', 'unas': 'DET',
            'al': 'DET', 'del': 'DET',
        }

        # Pronouns
        self.pronouns = {
            'yo': 'PRON', 'tú': 'PRON', 'él': 'PRON', 'ella': 'PRON',
            'nosotros': 'PRON', 'vosotros': 'PRON', 'ellos': 'PRON', 'ellas': 'PRON',
            'me': 'PRON', 'te': 'PRON', 'se': 'PRON', 'nos': 'PRON', 'os': 'PRON',
            'lo': 'PRON', 'le': 'PRON', 'les': 'PRON',
            'que': 'PRON', 'quien': 'PRON', 'cual': 'PRON', 'cuyo': 'PRON',
            'este': 'PRON', 'ese': 'PRON', 'aquel': 'PRON',
        }

        # Prepositions
        self.prepositions = {
            'a': 'PREP', 'de': 'PREP', 'en': 'PREP', 'con': 'PREP',
            'por': 'PREP', 'para': 'PREP', 'sin': 'PREP', 'sobre': 'PREP',
            'entre': 'PREP', 'hasta': 'PREP', 'desde': 'PREP', 'hacia': 'PREP',
            'bajo': 'PREP', 'contra': 'PREP', 'durante': 'PREP', 'según': 'PREP',
        }

        # Conjunctions
        self.conjunctions = {
            'y': 'CONJ', 'e': 'CONJ', 'o': 'CONJ', 'u': 'CONJ',
            'pero': 'CONJ', 'sino': 'CONJ', 'ni': 'CONJ',
            'que': 'CONJ', 'si': 'CONJ', 'cuando': 'CONJ', 'porque': 'CONJ',
            'aunque': 'CONJ', 'como': 'CONJ', 'mientras': 'CONJ',
        }

        # Adverbs
        self.adverbs = {
            'muy': 'ADV', 'bien': 'ADV', 'mal': 'ADV', 'poco': 'ADV',
            'mucho': 'ADV', 'más': 'ADV', 'menos': 'ADV', 'también': 'ADV',
            'siempre': 'ADV', 'nunca': 'ADV', 'ya': 'ADV', 'todavía': 'ADV',
            'aquí': 'ADV', 'allí': 'ADV', 'ahora': 'ADV', 'hoy': 'ADV',
        }

    def _build_domain_dictionaries(self):
        """Domain-specific dictionaries"""
        self._domain_dictionaries[Domain.TECH] = {
            'manzana': ('Apple', 'NP'),
            'nube': ('cloud', 'NC'),
        }
        self._domain_dictionaries[Domain.FOOD] = {
            'manzana': ('manzana', 'NC'),
        }
        self._domain_dictionaries[Domain.FINANCE] = {
            'banco': ('banco', 'NC'),
            'acción': ('acción', 'NC'),
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        if not text or not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue

            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        word_lower = word.lower()

        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

        domain_sense = self._get_domain_sense(word_lower, domain)
        if domain_sense:
            return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))

        if word_lower in self.articles:
            return Morpheme(surface=word, lemma=word_lower, pos='DET', start=offset, end=offset + len(word))
        if word_lower in self.pronouns:
            return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
        if word_lower in self.prepositions:
            return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
        if word_lower in self.conjunctions:
            return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
        if word_lower in self.adverbs:
            return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))

        if word_lower in self.irregular_verbs:
            return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))

        lemma, pos_tag = self._analyze_morphology(word)
        return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))

    def _analyze_morphology(self, word: str) -> Tuple[str, str]:
        # -ar verbs (first conjugation)
        if word.endswith('ar') and len(word) > 3:
            return (word, 'V')
        # -er verbs (second conjugation)
        if word.endswith('er') and len(word) > 3:
            return (word, 'V')
        # -ir verbs (third conjugation)
        if word.endswith('ir') and len(word) > 3:
            return (word, 'V')
        # -ción/-sión nouns
        if word.endswith(('ción', 'sión')) and len(word) > 5:
            return (word, 'NC')
        # -mente adverbs
        if word.endswith('mente') and len(word) > 6:
            return (word, 'ADV')
        # -oso/-osa adjectives
        if word.endswith(('oso', 'osa')) and len(word) > 4:
            return (word, 'ADJ')
        # Capitalized (proper noun)
        if word[0].isupper():
            return (word, 'NP')
        return (word, 'NC')

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives


SpanishAnalyzer = SpanishAdvancedAnalyzer
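The domain dictionaries are what give the candidate scoring something to disambiguate: the same surface form resolves to a different lemma and tag per Domain. A hedged sketch of that behavior, again calling the internal hooks visible in this diff:

# Hypothetical sketch -- assumes the base __init__ (not in this diff)
# populates the dictionaries via _build_domain_dictionaries().
from tokmor.morphology.spanish_advanced import SpanishAdvancedAnalyzer
from tokmor.morphology.advanced_base import Domain

analyzer = SpanishAdvancedAnalyzer()
tech = analyzer._analyze_word("manzana", 0, Domain.TECH)
food = analyzer._analyze_word("manzana", 0, Domain.FOOD)
print(tech.lemma, tech.pos)  # Apple NP   (tech sense: the company)
print(food.lemma, food.pos)  # manzana NC (food sense: the fruit)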
tokmor/morphology/templates/__init__.py
@@ -0,0 +1,32 @@
"""
Language Family Templates
=========================

Per-language-family template analyzers - the base classes for the individual language analyzers.
"""

from .latin_template import LatinScriptAnalyzer
from .cyrillic_template import CyrillicScriptAnalyzer
from .arabic_script_template import ArabicScriptAnalyzer
from .brahmic_template import BrahmicScriptAnalyzer
from .other_scripts_template import (
    HebrewScriptAnalyzer,
    GreekScriptAnalyzer,
    GeorgianScriptAnalyzer,
    ArmenianScriptAnalyzer,
    ThaiScriptAnalyzer,
    EthiopicScriptAnalyzer,
)

__all__ = [
    'LatinScriptAnalyzer',
    'CyrillicScriptAnalyzer',
    'ArabicScriptAnalyzer',
    'BrahmicScriptAnalyzer',
    'HebrewScriptAnalyzer',
    'GreekScriptAnalyzer',
    'GeorgianScriptAnalyzer',
    'ArmenianScriptAnalyzer',
    'ThaiScriptAnalyzer',
    'EthiopicScriptAnalyzer',
]
tokmor/morphology/templates/arabic_script_template.py
@@ -0,0 +1,162 @@
"""
Arabic Script Language Template
===============================

Template analyzer for Arabic-script languages:
Arabic, Persian, Urdu, Pashto, Kurdish, etc.
"""

import re
from typing import List, Tuple, Dict, Optional

from ..advanced_base import (
    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
)


class ArabicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """
    Template for Arabic-script languages

    Covers: Arabic, Persian/Farsi, Urdu, Pashto, Kurdish, Sindhi, etc.
    RTL (Right-to-Left) text processing
    """

    # Arabic script pattern (extended for Persian, Urdu, etc.)
    WORD_PATTERN = re.compile(
        r'[\u0600-\u06FF'  # Arabic
        r'\u0750-\u077F'   # Arabic Supplement
        r'\u08A0-\u08FF'   # Arabic Extended-A
        r'\uFB50-\uFDFF'   # Arabic Presentation Forms-A
        r'\uFE70-\uFEFF'   # Arabic Presentation Forms-B
        r'\u0671-\u06D3'   # Extended Arabic letters
        r'پچژگک'           # Persian additions
        r'ڈڑںھٹ'           # Urdu additions
        r']+'
    )
    NUMBER_PATTERN = re.compile(r'[0-9٠-٩۰-۹]+(?:[.,][0-9٠-٩۰-۹]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Override in subclass"""
        self.prefixes: Dict[str, str] = {}
        self.suffixes: Dict[str, str] = {}
        self.function_words: Dict[str, str] = {}

    def _build_domain_dictionaries(self):
        """Override in subclass"""
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        if not text or not text.strip():
            return [AnalysisResult([])]

        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        result = []
        pos = 0

        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            # Arabic script word
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morphemes = self._analyze_word(word, pos, domain)
                result.extend(morphemes)
                pos += len(word)
                continue

            # Latin (foreign words)
            latin_match = re.match(r'[a-zA-Z]+', text[pos:])
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            # Number
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            # Punctuation
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1

        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
        """Analyze word with prefix/suffix separation"""
        morphemes = []

        # User dictionary
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return [Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))]

        # Domain dictionary
        domain_sense = self._get_domain_sense(word, domain)
        if domain_sense:
            return [Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))]

        # Function words
        if hasattr(self, 'function_words') and word in self.function_words:
            return [Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))]

        # Prefix/suffix analysis
        current_offset = offset
        remaining = word

        # Check prefixes (longest first)
        if hasattr(self, 'prefixes'):
            for prefix, pos_tag in sorted(self.prefixes.items(), key=lambda x: -len(x[0])):
                if remaining.startswith(prefix) and len(remaining) > len(prefix):
                    morphemes.append(Morpheme(surface=prefix, lemma=prefix, pos=pos_tag, start=current_offset, end=current_offset + len(prefix)))
                    current_offset += len(prefix)
                    remaining = remaining[len(prefix):]
                    break

        # Check suffixes (longest first)
        stem = remaining
        suffix_morphemes = []
        if hasattr(self, 'suffixes'):
            for suffix, pos_tag in sorted(self.suffixes.items(), key=lambda x: -len(x[0])):
                if remaining.endswith(suffix) and len(remaining) > len(suffix):
                    stem = remaining[:-len(suffix)]
                    suffix_morphemes.append(Morpheme(
                        surface=suffix, lemma=suffix, pos=pos_tag,
                        start=current_offset + len(stem), end=offset + len(word)
                    ))
                    break

        # Add stem
        if stem:
            morphemes.append(Morpheme(surface=stem, lemma=stem, pos='N', start=current_offset, end=current_offset + len(stem)))

        # Add suffix morphemes
        morphemes.extend(suffix_morphemes)

        return morphemes if morphemes else [Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))]

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
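The template ships with empty `prefixes`, `suffixes`, and `function_words`; a concrete language subclass fills them in and inherits the longest-affix-first stripping walk in `_analyze_word` (at most one prefix and one suffix per word). A minimal hypothetical subclass, with affix entries that are illustrative only and not taken from the package:

# Hypothetical subclass -- affix entries are illustrative only, and the
# base __init__ is assumed to call _build_base_dictionary().
from tokmor.morphology.templates.arabic_script_template import ArabicScriptAnalyzer
from tokmor.morphology.advanced_base import Domain

class TinyArabicAnalyzer(ArabicScriptAnalyzer):
    LANG_CODE = "ar"
    LANG_NAME = "Arabic"

    def _build_base_dictionary(self):
        self.prefixes = {'ال': 'DET'}         # definite article al-
        self.suffixes = {'ات': 'SUFF'}        # plural ending -at
        self.function_words = {'في': 'PREP'}  # "in"

analyzer = TinyArabicAnalyzer()
for m in analyzer._analyze_word('الكتاب', 0, Domain.TECH):
    # splits into 'ال' (DET, offsets 0-2) and stem 'كتاب' (N, offsets 2-6)
    print(m.surface, m.pos, m.start, m.end)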