tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Arabic Advanced Morphological Analyzer
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
5가지 고급 기능을 지원하는 아랍어 형태소 분석기
|
|
6
|
+
|
|
7
|
+
특징:
|
|
8
|
+
- RTL (Right-to-Left) 문자 처리
|
|
9
|
+
- 어근-어형 시스템 (Root-Pattern)
|
|
10
|
+
- 접두사/접미사 분리
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from typing import List, Tuple, Dict, Optional
|
|
15
|
+
|
|
16
|
+
from .advanced_base import (
|
|
17
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ArabicAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
|
|
22
|
+
"""아랍어 고급 형태소 분석기"""
|
|
23
|
+
|
|
24
|
+
LANG_CODE = "ar"
|
|
25
|
+
LANG_NAME = "Arabic"
|
|
26
|
+
|
|
27
|
+
# 아랍어 문자 패턴
|
|
28
|
+
ARABIC_PATTERN = re.compile(r'[\u0600-\u06FF\u0750-\u077F]+')
|
|
29
|
+
NUMBER_PATTERN = re.compile(r'[0-9٠-٩]+(?:[.,][0-9٠-٩]+)?')
|
|
30
|
+
|
|
31
|
+
def __init__(self):
|
|
32
|
+
super().__init__()
|
|
33
|
+
|
|
34
|
+
def _build_base_dictionary(self):
|
|
35
|
+
"""기본 사전 구축"""
|
|
36
|
+
|
|
37
|
+
# 접두사 (Prefixes)
|
|
38
|
+
self.prefixes = {
|
|
39
|
+
'ال': 'DEF', # 정관사 (the)
|
|
40
|
+
'و': 'CONJ', # 그리고 (and)
|
|
41
|
+
'ف': 'CONJ', # 그러면 (so)
|
|
42
|
+
'ب': 'PREP', # ~로 (with/by)
|
|
43
|
+
'ك': 'PREP', # ~처럼 (like)
|
|
44
|
+
'ل': 'PREP', # ~에게 (to/for)
|
|
45
|
+
'س': 'FUT', # 미래 표지
|
|
46
|
+
'سوف': 'FUT', # 미래 표지
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
# 접미사 (대명사 접미사)
|
|
50
|
+
self.suffixes = {
|
|
51
|
+
'ي': 'PRON', # 나의
|
|
52
|
+
'ك': 'PRON', # 너의
|
|
53
|
+
'ه': 'PRON', # 그의
|
|
54
|
+
'ها': 'PRON', # 그녀의
|
|
55
|
+
'نا': 'PRON', # 우리의
|
|
56
|
+
'كم': 'PRON', # 너희의
|
|
57
|
+
'هم': 'PRON', # 그들의
|
|
58
|
+
'ة': 'FEM', # 여성 어미
|
|
59
|
+
'ات': 'PL', # 복수 어미 (여성)
|
|
60
|
+
'ون': 'PL', # 복수 어미 (남성)
|
|
61
|
+
'ين': 'PL', # 복수 어미 (남성)
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# 대명사
|
|
65
|
+
self.pronouns = {
|
|
66
|
+
'أنا': 'PRON', 'أنت': 'PRON', 'أنتِ': 'PRON',
|
|
67
|
+
'هو': 'PRON', 'هي': 'PRON',
|
|
68
|
+
'نحن': 'PRON', 'أنتم': 'PRON', 'هم': 'PRON', 'هن': 'PRON',
|
|
69
|
+
'هذا': 'DEM', 'هذه': 'DEM', 'ذلك': 'DEM', 'تلك': 'DEM',
|
|
70
|
+
'من': 'REL', 'ما': 'REL', 'الذي': 'REL', 'التي': 'REL',
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
# 전치사
|
|
74
|
+
self.prepositions = {
|
|
75
|
+
'في': 'PREP', 'من': 'PREP', 'إلى': 'PREP', 'على': 'PREP',
|
|
76
|
+
'عن': 'PREP', 'مع': 'PREP', 'بين': 'PREP', 'حول': 'PREP',
|
|
77
|
+
'قبل': 'PREP', 'بعد': 'PREP', 'تحت': 'PREP', 'فوق': 'PREP',
|
|
78
|
+
'عند': 'PREP', 'خلال': 'PREP', 'منذ': 'PREP', 'دون': 'PREP',
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
# 접속사
|
|
82
|
+
self.conjunctions = {
|
|
83
|
+
'و': 'CONJ', 'أو': 'CONJ', 'لكن': 'CONJ', 'بل': 'CONJ',
|
|
84
|
+
'أن': 'CONJ', 'إن': 'CONJ', 'لأن': 'CONJ', 'إذا': 'CONJ',
|
|
85
|
+
'حتى': 'CONJ', 'كما': 'CONJ', 'عندما': 'CONJ', 'بينما': 'CONJ',
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
# 부사
|
|
89
|
+
self.adverbs = {
|
|
90
|
+
'جداً': 'ADV', 'كثيراً': 'ADV', 'قليلاً': 'ADV',
|
|
91
|
+
'دائماً': 'ADV', 'أبداً': 'ADV', 'أحياناً': 'ADV',
|
|
92
|
+
'الآن': 'ADV', 'غداً': 'ADV', 'أمس': 'ADV', 'اليوم': 'ADV',
|
|
93
|
+
'هنا': 'ADV', 'هناك': 'ADV', 'فقط': 'ADV', 'أيضاً': 'ADV',
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
# 부정어
|
|
97
|
+
self.negation = {
|
|
98
|
+
'لا': 'NEG', 'لم': 'NEG', 'لن': 'NEG', 'ما': 'NEG',
|
|
99
|
+
'ليس': 'NEG', 'ليست': 'NEG', 'ليسوا': 'NEG',
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
# 일반 명사 (고빈도)
|
|
103
|
+
self.common_nouns = {
|
|
104
|
+
'الله': 'NP', # Allah
|
|
105
|
+
'رسول': 'NC', # messenger
|
|
106
|
+
'كتاب': 'NC', # book
|
|
107
|
+
'يوم': 'NC', # day
|
|
108
|
+
'سنة': 'NC', # year
|
|
109
|
+
'ناس': 'NC', # people
|
|
110
|
+
'بيت': 'NC', # house
|
|
111
|
+
'عمل': 'NC', # work
|
|
112
|
+
'دولة': 'NC', # state
|
|
113
|
+
'شركة': 'NC', # company
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
def _build_domain_dictionaries(self):
|
|
117
|
+
"""도메인별 사전"""
|
|
118
|
+
self._domain_dictionaries[Domain.TECH] = {
|
|
119
|
+
'تفاحة': ('Apple', 'NP'), # Apple (company)
|
|
120
|
+
'سحابة': ('cloud', 'NC'), # cloud computing
|
|
121
|
+
}
|
|
122
|
+
self._domain_dictionaries[Domain.FOOD] = {
|
|
123
|
+
'تفاحة': ('تفاحة', 'NC'), # apple (fruit)
|
|
124
|
+
}
|
|
125
|
+
self._domain_dictionaries[Domain.FINANCE] = {
|
|
126
|
+
'بنك': ('بنك', 'NC'), # bank
|
|
127
|
+
'سهم': ('سهم', 'NC'), # stock
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
|
|
131
|
+
if not text or not text.strip():
|
|
132
|
+
return [AnalysisResult([])]
|
|
133
|
+
morphemes = self._analyze_text(text, domain)
|
|
134
|
+
result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
|
|
135
|
+
result.score = self._score_analysis(result)
|
|
136
|
+
return [result]
|
|
137
|
+
|
|
138
|
+
def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
|
|
139
|
+
result = []
|
|
140
|
+
pos = 0
|
|
141
|
+
while pos < len(text):
|
|
142
|
+
if text[pos].isspace():
|
|
143
|
+
pos += 1
|
|
144
|
+
continue
|
|
145
|
+
|
|
146
|
+
arabic_match = self.ARABIC_PATTERN.match(text[pos:])
|
|
147
|
+
if arabic_match:
|
|
148
|
+
word = arabic_match.group()
|
|
149
|
+
morphemes = self._analyze_word(word, pos, domain)
|
|
150
|
+
result.extend(morphemes)
|
|
151
|
+
pos += len(word)
|
|
152
|
+
continue
|
|
153
|
+
|
|
154
|
+
# 라틴 문자 (외래어)
|
|
155
|
+
latin_match = re.match(r'[a-zA-Z]+', text[pos:])
|
|
156
|
+
if latin_match:
|
|
157
|
+
word = latin_match.group()
|
|
158
|
+
result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
|
|
159
|
+
pos += len(word)
|
|
160
|
+
continue
|
|
161
|
+
|
|
162
|
+
num_match = self.NUMBER_PATTERN.match(text[pos:])
|
|
163
|
+
if num_match:
|
|
164
|
+
num = num_match.group()
|
|
165
|
+
result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
|
|
166
|
+
pos += len(num)
|
|
167
|
+
continue
|
|
168
|
+
|
|
169
|
+
result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
|
|
170
|
+
pos += 1
|
|
171
|
+
return result
|
|
172
|
+
|
|
173
|
+
def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
|
|
174
|
+
"""단어 분석 (접두사/접미사 분리)"""
|
|
175
|
+
morphemes = []
|
|
176
|
+
|
|
177
|
+
# 런타임 사전
|
|
178
|
+
if word in self._user_dictionary:
|
|
179
|
+
lemma, pos_tag, _ = self._user_dictionary[word]
|
|
180
|
+
return [Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))]
|
|
181
|
+
|
|
182
|
+
# 도메인 사전
|
|
183
|
+
domain_sense = self._get_domain_sense(word, domain)
|
|
184
|
+
if domain_sense:
|
|
185
|
+
return [Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))]
|
|
186
|
+
|
|
187
|
+
# 기능어
|
|
188
|
+
if word in self.pronouns:
|
|
189
|
+
return [Morpheme(surface=word, lemma=word, pos='PRON', start=offset, end=offset + len(word))]
|
|
190
|
+
if word in self.prepositions:
|
|
191
|
+
return [Morpheme(surface=word, lemma=word, pos='PREP', start=offset, end=offset + len(word))]
|
|
192
|
+
if word in self.conjunctions:
|
|
193
|
+
return [Morpheme(surface=word, lemma=word, pos='CONJ', start=offset, end=offset + len(word))]
|
|
194
|
+
if word in self.adverbs:
|
|
195
|
+
return [Morpheme(surface=word, lemma=word, pos='ADV', start=offset, end=offset + len(word))]
|
|
196
|
+
if word in self.negation:
|
|
197
|
+
return [Morpheme(surface=word, lemma=word, pos='NEG', start=offset, end=offset + len(word))]
|
|
198
|
+
if word in self.common_nouns:
|
|
199
|
+
return [Morpheme(surface=word, lemma=word, pos=self.common_nouns[word], start=offset, end=offset + len(word))]
|
|
200
|
+
|
|
201
|
+
# 접두사 분리
|
|
202
|
+
current_offset = offset
|
|
203
|
+
remaining = word
|
|
204
|
+
|
|
205
|
+
# 정관사 ال 분리
|
|
206
|
+
if remaining.startswith('ال') and len(remaining) > 2:
|
|
207
|
+
morphemes.append(Morpheme(surface='ال', lemma='ال', pos='DEF', start=current_offset, end=current_offset + 2))
|
|
208
|
+
current_offset += 2
|
|
209
|
+
remaining = remaining[2:]
|
|
210
|
+
|
|
211
|
+
# 접속사/전치사 접두사
|
|
212
|
+
for prefix, pos_tag in self.prefixes.items():
|
|
213
|
+
if remaining.startswith(prefix) and len(remaining) > len(prefix):
|
|
214
|
+
morphemes.append(Morpheme(surface=prefix, lemma=prefix, pos=pos_tag, start=current_offset, end=current_offset + len(prefix)))
|
|
215
|
+
current_offset += len(prefix)
|
|
216
|
+
remaining = remaining[len(prefix):]
|
|
217
|
+
break
|
|
218
|
+
|
|
219
|
+
# 남은 어간
|
|
220
|
+
if remaining:
|
|
221
|
+
stem_end = current_offset + len(remaining)
|
|
222
|
+
|
|
223
|
+
# 접미사 분리 시도
|
|
224
|
+
for suffix, pos_tag in sorted(self.suffixes.items(), key=lambda x: -len(x[0])):
|
|
225
|
+
if remaining.endswith(suffix) and len(remaining) > len(suffix):
|
|
226
|
+
stem = remaining[:-len(suffix)]
|
|
227
|
+
morphemes.append(Morpheme(surface=stem, lemma=stem, pos='NC', start=current_offset, end=current_offset + len(stem)))
|
|
228
|
+
morphemes.append(Morpheme(surface=suffix, lemma=suffix, pos=pos_tag, start=current_offset + len(stem), end=stem_end))
|
|
229
|
+
return morphemes
|
|
230
|
+
|
|
231
|
+
# 접미사 없음
|
|
232
|
+
morphemes.append(Morpheme(surface=remaining, lemma=remaining, pos='NC', start=current_offset, end=stem_end))
|
|
233
|
+
|
|
234
|
+
return morphemes if morphemes else [Morpheme(surface=word, lemma=word, pos='NC', start=offset, end=offset + len(word))]
|
|
235
|
+
|
|
236
|
+
def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
|
|
237
|
+
alternatives = []
|
|
238
|
+
other_domains = [d for d in Domain if d != domain][:count]
|
|
239
|
+
for alt_domain in other_domains:
|
|
240
|
+
morphemes = self._analyze_text(text, alt_domain)
|
|
241
|
+
result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
|
|
242
|
+
result.score = self._score_analysis(result) * 0.9
|
|
243
|
+
alternatives.append(result)
|
|
244
|
+
return alternatives
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# Backward-compatible alias for callers importing the pre-"advanced" name.
ArabicAnalyzer = ArabicAdvancedAnalyzer
|