tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Other Script Language Templates
|
|
3
|
+
===============================
|
|
4
|
+
|
|
5
|
+
기타 문자 체계용 템플릿 분석기
|
|
6
|
+
Hebrew, Greek, Georgian, Armenian, Thai, Ethiopic, etc.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import List, Tuple, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from ..advanced_base import (
|
|
13
|
+
AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HebrewScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Hebrew-script languages (Hebrew, Yiddish)."""

    LANG_CODE = "he"
    LANG_NAME = "Hebrew"

    # Hebrew block plus Hebrew presentation forms.
    WORD_PATTERN = re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Single-letter clitic prefixes that attach to the following word.
        self.prefixes = {'ה': 'DEF', 'ו': 'CONJ', 'ב': 'PREP', 'ל': 'PREP', 'מ': 'PREP', 'כ': 'PREP'}
        self.function_words = {
            'אני': 'PRON', 'אתה': 'PRON', 'את': 'PRON', 'הוא': 'PRON', 'היא': 'PRON',
            'אנחנו': 'PRON', 'אתם': 'PRON', 'הם': 'PRON', 'הן': 'PRON',
            'של': 'PREP', 'על': 'PREP', 'עם': 'PREP', 'אל': 'PREP',
            'לא': 'NEG', 'אין': 'NEG', 'כן': 'ADV', 'גם': 'ADV',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting Hebrew words, Latin runs, numbers
        and single-character punctuation morphemes."""
        out: List[Morpheme] = []
        cursor = 0
        length = len(text)
        while cursor < length:
            ch = text[cursor]
            if ch.isspace():
                cursor += 1
                continue
            tail = text[cursor:]
            hit = self.WORD_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.extend(self._analyze_word(token, cursor, domain))
                cursor += len(token)
                continue
            hit = re.match(r'[a-zA-Z]+', tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='FOREIGN', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            hit = self.NUMBER_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='NUM', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            out.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=cursor, end=cursor + 1))
            cursor += 1
        return out

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
        """Tag one word: user dictionary, then function words, then peel at
        most one clitic prefix and default the remainder to noun."""
        end = offset + len(word)
        if word in self._user_dictionary:
            lemma, tag, _ = self._user_dictionary[word]
            return [Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)]
        if word in self.function_words:
            return [Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=end)]
        pieces: List[Morpheme] = []
        stem = word
        at = offset
        # Strip at most one single-letter prefix off the front.
        for clitic, tag in self.prefixes.items():
            if stem.startswith(clitic) and len(stem) > 1:
                pieces.append(Morpheme(surface=clitic, lemma=clitic, pos=tag, start=at, end=at + 1))
                at += 1
                stem = stem[1:]
                break
        if stem:
            pieces.append(Morpheme(surface=stem, lemma=stem, pos='N', start=at, end=end))
        return pieces if pieces else [Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)]

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class GreekScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Greek-script text (Greek)."""

    LANG_CODE = "el"
    LANG_NAME = "Greek"

    # Modern Greek letters plus accented/polytonic forms.
    WORD_PATTERN = re.compile(r'[α-ωΑ-Ωάέήίόύώϊϋΐΰἀ-ῼ]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        self.function_words = {
            'ο': 'DET', 'η': 'DET', 'το': 'DET', 'οι': 'DET', 'τα': 'DET',
            'ένα': 'DET', 'μια': 'DET',
            'εγώ': 'PRON', 'εσύ': 'PRON', 'αυτός': 'PRON', 'αυτή': 'PRON', 'αυτό': 'PRON',
            'εμείς': 'PRON', 'εσείς': 'PRON', 'αυτοί': 'PRON', 'αυτές': 'PRON',
            'και': 'CONJ', 'ή': 'CONJ', 'αλλά': 'CONJ', 'όμως': 'CONJ',
            'σε': 'PREP', 'από': 'PREP', 'με': 'PREP', 'για': 'PREP',
            'δεν': 'NEG', 'μην': 'NEG',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting Greek words, Latin runs, numbers
        and single-character punctuation morphemes."""
        out: List[Morpheme] = []
        cursor = 0
        length = len(text)
        while cursor < length:
            ch = text[cursor]
            if ch.isspace():
                cursor += 1
                continue
            tail = text[cursor:]
            hit = self.WORD_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(self._analyze_word(token, cursor, domain))
                cursor += len(token)
                continue
            hit = re.match(r'[a-zA-Z]+', tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='FOREIGN', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            hit = self.NUMBER_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='NUM', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            out.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=cursor, end=cursor + 1))
            cursor += 1
        return out

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: dictionaries first, then crude suffix heuristics
        (verb endings before noun endings), then capitalization."""
        end = offset + len(word)
        folded = word.lower()
        if folded in self._user_dictionary:
            lemma, tag, _ = self._user_dictionary[folded]
            return Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)
        if folded in self.function_words:
            return Morpheme(surface=word, lemma=folded, pos=self.function_words[folded], start=offset, end=end)
        if folded.endswith(('ω', 'ει', 'ουν', 'ουμε')):  # common verb endings
            return Morpheme(surface=word, lemma=folded, pos='V', start=offset, end=end)
        if folded.endswith(('ος', 'ης', 'ας', 'η', 'α', 'ο')):  # common noun endings
            return Morpheme(surface=word, lemma=folded, pos='N', start=offset, end=end)
        if word[0].isupper():
            # Capitalized and no known ending: treat as a proper noun.
            return Morpheme(surface=word, lemma=word, pos='NP', start=offset, end=end)
        return Morpheme(surface=word, lemma=folded, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class GeorgianScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Georgian-script text (Georgian)."""

    LANG_CODE = "ka"
    LANG_NAME = "Georgian"

    # Georgian block plus the Georgian Supplement range.
    WORD_PATTERN = re.compile(r'[\u10A0-\u10FF\u2D00-\u2D2F]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        self.function_words = {
            'მე': 'PRON', 'შენ': 'PRON', 'ის': 'PRON', 'ჩვენ': 'PRON', 'თქვენ': 'PRON', 'ისინი': 'PRON',
            'და': 'CONJ', 'ან': 'CONJ', 'მაგრამ': 'CONJ',
            'არ': 'NEG', 'არა': 'NEG',
            '-ში': 'PSP', '-ზე': 'PSP', '-თან': 'PSP', '-დან': 'PSP',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting Georgian words, numbers and
        single-character punctuation morphemes."""
        out: List[Morpheme] = []
        cursor = 0
        length = len(text)
        while cursor < length:
            ch = text[cursor]
            if ch.isspace():
                cursor += 1
                continue
            tail = text[cursor:]
            hit = self.WORD_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(self._analyze_word(token, cursor, domain))
                cursor += len(token)
                continue
            hit = self.NUMBER_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='NUM', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            out.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=cursor, end=cursor + 1))
            cursor += 1
        return out

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else noun."""
        end = offset + len(word)
        if word in self._user_dictionary:
            lemma, tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=end)
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
class ArmenianScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Armenian-script text (Armenian).

    Tokenizes Armenian words, Arabic numerals and punctuation, tagging
    known function words and defaulting everything else to noun.
    """

    LANG_CODE = "hy"
    LANG_NAME = "Armenian"

    # Armenian block plus Armenian ligatures in Alphabetic Presentation Forms.
    WORD_PATTERN = re.compile(r'[\u0530-\u058F\uFB00-\uFB17]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # BUGFIX: the shipped table contained mojibake entries ('դdelays',
        # 'նdelays') and a duplicated literal key 'delays' — the 'NEG' entry
        # silently overwrote the 'CONJ' one, so no real Armenian word could
        # ever match. Replaced with actual Armenian function words.
        self.function_words = {
            'ես': 'PRON',     # I
            'դու': 'PRON',    # you (singular)
            'նա': 'PRON',     # he / she
            'մենք': 'PRON',   # we
            'դուք': 'PRON',   # you (plural)
            'նրանք': 'PRON',  # they
            'և': 'CONJ',      # and
            'կամ': 'CONJ',    # or
            'բայց': 'CONJ',   # but
            'ոչ': 'NEG',      # no / not
            'մի': 'NEG',      # prohibitive negation
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        return [AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting Armenian words, numbers and
        single-character punctuation morphemes."""
        result: List[Morpheme] = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                result.append(self._analyze_word(word, pos, domain))
                pos += len(word)
                continue
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else noun.

        CONSISTENCY FIX: the original never consulted ``self.function_words``
        here, unlike every sibling template class, leaving the table dead data.
        """
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
class ThaiScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Thai script — written without word spaces,
    so Thai runs are segmented by greedy longest match."""

    LANG_CODE = "th"
    LANG_NAME = "Thai"

    WORD_PATTERN = re.compile(r'[\u0E00-\u0E7F]+')
    # Arabic digits plus Thai digits.
    NUMBER_PATTERN = re.compile(r'[0-9๐-๙]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Thai function words
        self.function_words = {
            'ฉัน': 'PRON', 'คุณ': 'PRON', 'เขา': 'PRON', 'เรา': 'PRON', 'พวกเขา': 'PRON',
            'ที่': 'REL', 'ของ': 'PREP', 'ใน': 'PREP', 'บน': 'PREP', 'กับ': 'PREP',
            'และ': 'CONJ', 'หรือ': 'CONJ', 'แต่': 'CONJ',
            'ไม่': 'NEG', 'ไหม': 'Q',
            'มาก': 'ADV', 'น้อย': 'ADV', 'ดี': 'ADJ',
        }
        # Known-word list driving the segmenter.
        self.common_words = set(self.function_words.keys())

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Thai text analysis: segment each Thai run, then handle numbers
        and single-character punctuation."""
        out: List[Morpheme] = []
        cursor = 0
        length = len(text)
        while cursor < length:
            ch = text[cursor]
            if ch.isspace():
                cursor += 1
                continue
            tail = text[cursor:]
            hit = self.WORD_PATTERN.match(tail)
            if hit:
                run = hit.group()
                # Segment the unspaced Thai run into words.
                for token, rel in self._segment_thai(run):
                    tag = self.function_words.get(token, 'N')
                    begin = cursor + rel
                    out.append(Morpheme(surface=token, lemma=token, pos=tag, start=begin, end=begin + len(token)))
                cursor += len(run)
                continue
            hit = self.NUMBER_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='NUM', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            out.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=cursor, end=cursor + 1))
            cursor += 1
        return out

    def _segment_thai(self, text: str) -> List[Tuple[str, int]]:
        """Greedy longest-match segmentation (max window 10) over the known
        word list; unknown characters fall back to single-char tokens."""
        segments: List[Tuple[str, int]] = []
        i = 0
        n = len(text)
        while i < n:
            for size in range(min(10, n - i), 0, -1):
                candidate = text[i:i + size]
                if candidate in self.common_words or candidate in self._user_dictionary:
                    segments.append((candidate, i))
                    i += size
                    break
            else:
                # No dictionary hit at any length: emit one character.
                segments.append((text[i], i))
                i += 1
        return segments

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else noun."""
        end = offset + len(word)
        if word in self._user_dictionary:
            lemma, tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=end)
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
class EthiopicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Ethiopic-script languages (Amharic, Tigrinya, etc.)."""

    LANG_CODE = "am"
    LANG_NAME = "Amharic"

    # Ethiopic, Ethiopic Supplement and Ethiopic Extended ranges.
    WORD_PATTERN = re.compile(r'[\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF]+')
    # Arabic digits plus Ethiopic numerals.
    NUMBER_PATTERN = re.compile(r'[0-9፩-፼]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        self.function_words = {
            'እኔ': 'PRON', 'አንተ': 'PRON', 'እሱ': 'PRON', 'እሷ': 'PRON',
            'እኛ': 'PRON', 'እናንተ': 'PRON', 'እነሱ': 'PRON',
            'እና': 'CONJ', 'ወይም': 'CONJ', 'ግን': 'CONJ',
            'አይ': 'NEG', 'የለም': 'NEG',
            'ውስጥ': 'PREP', 'ላይ': 'PREP', 'ከ': 'PREP', 'ወደ': 'PREP',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best-effort analysis for *text*."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting Ethiopic words, numbers and
        single-character punctuation morphemes."""
        out: List[Morpheme] = []
        cursor = 0
        length = len(text)
        while cursor < length:
            ch = text[cursor]
            if ch.isspace():
                cursor += 1
                continue
            tail = text[cursor:]
            hit = self.WORD_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(self._analyze_word(token, cursor, domain))
                cursor += len(token)
                continue
            hit = self.NUMBER_PATTERN.match(tail)
            if hit:
                token = hit.group()
                out.append(Morpheme(surface=token, lemma=token, pos='NUM', start=cursor, end=cursor + len(token)))
                cursor += len(token)
                continue
            out.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=cursor, end=cursor + 1))
            cursor += 1
        return out

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else noun."""
        end = offset + len(word)
        if word in self._user_dictionary:
            lemma, tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=end)
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
|