tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,475 @@
1
+ """
2
+ Other Script Language Templates
3
+ ===============================
4
+
5
+ 기타 문자 체계용 템플릿 분석기
6
+ Hebrew, Greek, Georgian, Armenian, Thai, Ethiopic, etc.
7
+ """
8
+
9
+ import re
10
+ from typing import List, Tuple, Dict, Optional
11
+
12
+ from ..advanced_base import (
13
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
14
+ )
15
+
16
+
17
class HebrewScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Hebrew-script languages (Hebrew, Yiddish).

    Tokenizes text into Hebrew words, Latin runs, ASCII digit runs, and
    single punctuation characters, then peels at most one single-letter
    clitic prefix off each Hebrew word.
    """

    LANG_CODE = "he"
    LANG_NAME = "Hebrew"

    # Hebrew block plus the Hebrew presentation forms.
    WORD_PATTERN = re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Single-letter clitics that may attach to the front of a host word.
        self.prefixes = {'ה': 'DEF', 'ו': 'CONJ', 'ב': 'PREP', 'ל': 'PREP', 'מ': 'PREP', 'כ': 'PREP'}
        # Closed-class words, looked up before any prefix splitting.
        self.function_words = {
            'אני': 'PRON', 'אתה': 'PRON', 'את': 'PRON', 'הוא': 'PRON', 'היא': 'PRON',
            'אנחנו': 'PRON', 'אתם': 'PRON', 'הם': 'PRON', 'הן': 'PRON',
            'של': 'PREP', 'על': 'PREP', 'עם': 'PREP', 'אל': 'PREP',
            'לא': 'NEG', 'אין': 'NEG', 'כן': 'ADV', 'גם': 'ADV',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit morphemes with character offsets."""
        tokens: List[Morpheme] = []
        idx = 0
        size = len(text)
        while idx < size:
            if text[idx].isspace():
                idx += 1
                continue
            rest = text[idx:]
            hit = self.WORD_PATTERN.match(rest)
            if hit:
                word = hit.group()
                tokens.extend(self._analyze_word(word, idx, domain))
                idx += len(word)
                continue
            hit = re.match(r'[a-zA-Z]+', rest)
            if hit:
                run = hit.group()
                tokens.append(Morpheme(surface=run, lemma=run, pos='FOREIGN', start=idx, end=idx + len(run)))
                idx += len(run)
                continue
            hit = self.NUMBER_PATTERN.match(rest)
            if hit:
                run = hit.group()
                tokens.append(Morpheme(surface=run, lemma=run, pos='NUM', start=idx, end=idx + len(run)))
                idx += len(run)
                continue
            ch = text[idx]
            tokens.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=idx, end=idx + 1))
            idx += 1
        return tokens

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
        """Analyze one Hebrew word; may yield a clitic prefix plus a stem."""
        end = offset + len(word)
        entry = self._user_dictionary.get(word)
        if entry is not None:
            lemma, tag, _ = entry
            return [Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)]
        tag = self.function_words.get(word)
        if tag is not None:
            return [Morpheme(surface=word, lemma=word, pos=tag, start=offset, end=end)]
        pieces: List[Morpheme] = []
        stem = word
        cursor = offset
        # Peel at most one single-letter prefix, never consuming the whole word.
        # (All prefix keys are length 1, so a first-character lookup is
        # equivalent to the longest-prefix scan.)
        if len(stem) > 1:
            head_tag = self.prefixes.get(stem[0])
            if head_tag is not None:
                head = stem[0]
                pieces.append(Morpheme(surface=head, lemma=head, pos=head_tag, start=cursor, end=cursor + 1))
                cursor += 1
                stem = stem[1:]
        if stem:
            pieces.append(Morpheme(surface=stem, lemma=stem, pos='N', start=cursor, end=end))
        if pieces:
            return pieces
        return [Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)]

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # This template produces a single deterministic analysis.
        return []
100
+
101
+
102
class GreekScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Greek (Greek script).

    Tokenizes text into Greek words, Latin runs, ASCII digit runs, and
    single punctuation characters, then tags each word via the user
    dictionary, the function-word table, or light suffix heuristics.
    """

    LANG_CODE = "el"
    LANG_NAME = "Greek"

    # Bug fix: the previous character class lacked the uppercase accented
    # vowels (Ά Έ Ή Ί Ό Ύ Ώ Ϊ Ϋ, U+0386/U+0388-U+038F/U+03AA-U+03AB), so a
    # word such as 'Έλληνας' was split after its first letter. They are now
    # included alongside the base ranges and Greek Extended (ἀ-ῼ).
    WORD_PATTERN = re.compile(r'[α-ωΑ-ΩάέήίόύώϊϋΐΰΆΈΉΊΌΎΏΪΫἀ-ῼ]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Closed-class vocabulary (lowercase keys; lookups are case-folded).
        self.function_words = {
            'ο': 'DET', 'η': 'DET', 'το': 'DET', 'οι': 'DET', 'τα': 'DET',
            'ένα': 'DET', 'μια': 'DET',
            'εγώ': 'PRON', 'εσύ': 'PRON', 'αυτός': 'PRON', 'αυτή': 'PRON', 'αυτό': 'PRON',
            'εμείς': 'PRON', 'εσείς': 'PRON', 'αυτοί': 'PRON', 'αυτές': 'PRON',
            'και': 'CONJ', 'ή': 'CONJ', 'αλλά': 'CONJ', 'όμως': 'CONJ',
            'σε': 'PREP', 'από': 'PREP', 'με': 'PREP', 'για': 'PREP',
            'δεν': 'NEG', 'μην': 'NEG',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit one morpheme per token."""
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue
            latin_match = re.match(r'[a-zA-Z]+', text[pos:])
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: dictionaries first, then suffix heuristics.

        NOTE(review): the capitalization check runs AFTER the suffix
        heuristics, so a capitalized word with a common noun/verb ending is
        tagged N/V rather than NP — preserved as-is; confirm intent.
        """
        word_lower = word.lower()
        if word_lower in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word_lower]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
        if word_lower in self.function_words:
            return Morpheme(surface=word, lemma=word_lower, pos=self.function_words[word_lower], start=offset, end=offset + len(word))
        # Heuristic POS from common inflectional endings.
        if word_lower.endswith(('ω', 'ει', 'ουν', 'ουμε')):  # verb endings
            return Morpheme(surface=word, lemma=word_lower, pos='V', start=offset, end=offset + len(word))
        if word_lower.endswith(('ος', 'ης', 'ας', 'η', 'α', 'ο')):  # noun endings
            return Morpheme(surface=word, lemma=word_lower, pos='N', start=offset, end=offset + len(word))
        if word[0].isupper():
            return Morpheme(surface=word, lemma=word, pos='NP', start=offset, end=offset + len(word))
        return Morpheme(surface=word, lemma=word_lower, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # Single deterministic analysis; no alternatives.
        return []
183
+
184
+
185
class GeorgianScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Georgian-script text (Georgian).

    Splits input into Georgian words, ASCII digit runs, and single
    punctuation characters; each word is tagged from the user dictionary,
    the function-word table, or defaults to 'N'.
    """

    LANG_CODE = "ka"
    LANG_NAME = "Georgian"

    # Georgian (Mkhedruli/Asomtavruli) plus Georgian Supplement (Nuskhuri).
    WORD_PATTERN = re.compile(r'[\u10A0-\u10FF\u2D00-\u2D2F]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Closed-class vocabulary.
        # NOTE(review): the hyphen-prefixed postposition keys can never match
        # a WORD_PATTERN token (the pattern excludes '-') -- confirm intent.
        self.function_words = {
            'მე': 'PRON', 'შენ': 'PRON', 'ის': 'PRON', 'ჩვენ': 'PRON', 'თქვენ': 'PRON', 'ისინი': 'PRON',
            'და': 'CONJ', 'ან': 'CONJ', 'მაგრამ': 'CONJ',
            'არ': 'NEG', 'არა': 'NEG',
            '-ში': 'PSP', '-ზე': 'PSP', '-თან': 'PSP', '-დან': 'PSP',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        return [AnalysisResult(morphemes=self._analyze_text(text, domain), score=1.0, domain=domain)]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan left to right, emitting one Morpheme per word/number/char."""
        tokens: List[Morpheme] = []
        idx = 0
        size = len(text)
        while idx < size:
            if text[idx].isspace():
                idx += 1
                continue
            rest = text[idx:]
            hit = self.WORD_PATTERN.match(rest)
            if hit:
                word = hit.group()
                tokens.append(self._analyze_word(word, idx, domain))
                idx += len(word)
                continue
            hit = self.NUMBER_PATTERN.match(rest)
            if hit:
                run = hit.group()
                tokens.append(Morpheme(surface=run, lemma=run, pos='NUM', start=idx, end=idx + len(run)))
                idx += len(run)
                continue
            ch = text[idx]
            tokens.append(Morpheme(surface=ch, lemma=ch, pos='PUNCT', start=idx, end=idx + 1))
            idx += 1
        return tokens

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else 'N'."""
        end = offset + len(word)
        entry = self._user_dictionary.get(word)
        if entry is not None:
            lemma, tag, _ = entry
            return Morpheme(surface=word, lemma=lemma, pos=tag, start=offset, end=end)
        tag = self.function_words.get(word)
        if tag is not None:
            return Morpheme(surface=word, lemma=word, pos=tag, start=offset, end=end)
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=end)

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # Single deterministic analysis; no alternatives.
        return []
249
+
250
+
251
class ArmenianScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Armenian-script text (Armenian).

    Splits text into Armenian words, ASCII digit runs, and single
    punctuation characters; words are tagged via the user dictionary, the
    closed-class function-word table, or default to 'N'.
    """

    LANG_CODE = "hy"
    LANG_NAME = "Armenian"

    # Armenian block plus the Armenian ligatures in Alphabetic Presentation Forms.
    WORD_PATTERN = re.compile(r'[\u0530-\u058F\uFB00-\uFB17]+')
    NUMBER_PATTERN = re.compile(r'[0-9]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Bug fix: the previous table contained mojibake entries
        # ('դdelays', 'delays', ...) including a duplicate 'delays' key that
        # silently dropped the CONJ entry. Restored to real Armenian forms.
        self.function_words = {
            'ես': 'PRON',   # I
            'դու': 'PRON',  # you (sg.)
            'նա': 'PRON',   # he/she
            'և': 'CONJ',    # and
            'կամ': 'CONJ',  # or
            'ոչ': 'NEG',    # no / not
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit morphemes with character offsets."""
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary first, then function words, else 'N'."""
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
        # Bug fix: the function-word table was built but never consulted here,
        # unlike every sibling template in this module.
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # Single deterministic analysis; no alternatives.
        return []
312
+
313
+
314
class ThaiScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Thai (a script written without word spaces).

    Runs of Thai letters are segmented by greedy longest-match against the
    known-word lists; digits (ASCII and Thai) become NUM tokens, Latin runs
    become FOREIGN, and anything else is emitted as single-character PUNCT
    tokens.
    """

    LANG_CODE = "th"
    LANG_NAME = "Thai"

    # Bug fix: the previous range \u0E00-\u0E7F swallowed the Thai digits
    # \u0E50-\u0E59 (๐-๙), so Thai numerals could never reach the NUM branch
    # and were shredded into per-character 'N' tokens by the segmenter. The
    # digit range is excluded so NUMBER_PATTERN can claim them.
    WORD_PATTERN = re.compile(r'[\u0E01-\u0E4F\u0E5A-\u0E7F]+')
    NUMBER_PATTERN = re.compile(r'[0-9๐-๙]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Thai closed-class words.
        self.function_words = {
            'ฉัน': 'PRON', 'คุณ': 'PRON', 'เขา': 'PRON', 'เรา': 'PRON', 'พวกเขา': 'PRON',
            'ที่': 'REL', 'ของ': 'PREP', 'ใน': 'PREP', 'บน': 'PREP', 'กับ': 'PREP',
            'และ': 'CONJ', 'หรือ': 'CONJ', 'แต่': 'CONJ',
            'ไม่': 'NEG', 'ไหม': 'Q',
            'มาก': 'ADV', 'น้อย': 'ADV', 'ดี': 'ADJ',
        }
        # Known words used by the greedy segmenter.
        self.common_words = set(self.function_words.keys())

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Thai text analysis — simplified word segmentation with offsets."""
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue
            # Run of Thai letters: segment it, then tag each piece.
            thai_match = self.WORD_PATTERN.match(text[pos:])
            if thai_match:
                chunk = thai_match.group()
                words = self._segment_thai(chunk)
                for word, wpos in words:
                    if word in self.function_words:
                        result.append(Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=pos + wpos, end=pos + wpos + len(word)))
                    else:
                        result.append(Morpheme(surface=word, lemma=word, pos='N', start=pos + wpos, end=pos + wpos + len(word)))
                pos += len(chunk)
                continue
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _segment_thai(self, text: str) -> List[Tuple[str, int]]:
        """Greedy longest-match segmentation against the known-word lists.

        Returns (word, start_offset_within_text) pairs; unknown material
        falls back to single characters.
        """
        words = []
        pos = 0
        while pos < len(text):
            matched = False
            # Longest candidate first, capped at 10 characters.
            for length in range(min(10, len(text) - pos), 0, -1):
                substr = text[pos:pos + length]
                if substr in self.common_words or substr in self._user_dictionary:
                    words.append((substr, pos))
                    pos += length
                    matched = True
                    break
            if not matched:
                words.append((text[pos], pos))
                pos += 1
        return words

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else 'N'."""
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # Single deterministic analysis; no alternatives.
        return []
409
+
410
+
411
class EthiopicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
    """Template analyzer for Ethiopic-script languages (Amharic, Tigrinya, ...).

    Splits text into Ethiopic words, number runs (ASCII and Ethiopic
    numerals), and single punctuation characters; words are tagged via the
    user dictionary, the function-word table, or default to 'N'.
    """

    LANG_CODE = "am"
    LANG_NAME = "Amharic"

    # Bug fix: the previous range \u1200-\u137F included the Ethiopic
    # punctuation (\u1360-\u1368, e.g. ።) and the Ethiopic numerals
    # (\u1369-\u137C, ፩-፼), so numerals could never reach the NUM branch and
    # punctuation was glued into words. Only letters and combining marks
    # are matched here; Ethiopic Supplement and Extended ranges are kept.
    WORD_PATTERN = re.compile(r'[\u1200-\u135F\u1380-\u139F\u2D80-\u2DDF]+')
    NUMBER_PATTERN = re.compile(r'[0-9፩-፼]+')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        # Amharic closed-class words.
        self.function_words = {
            'እኔ': 'PRON', 'አንተ': 'PRON', 'እሱ': 'PRON', 'እሷ': 'PRON',
            'እኛ': 'PRON', 'እናንተ': 'PRON', 'እነሱ': 'PRON',
            'እና': 'CONJ', 'ወይም': 'CONJ', 'ግን': 'CONJ',
            'አይ': 'NEG', 'የለም': 'NEG',
            'ውስጥ': 'PREP', 'ላይ': 'PREP', 'ከ': 'PREP', 'ወደ': 'PREP',
        }

    def _build_domain_dictionaries(self):
        # No domain-specific vocabulary for this template.
        pass

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Return a single best analysis for *text* (no ambiguity handling)."""
        if not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit morphemes with character offsets."""
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue
            word_match = self.WORD_PATTERN.match(text[pos:])
            if word_match:
                word = word_match.group()
                morpheme = self._analyze_word(word, pos, domain)
                result.append(morpheme)
                pos += len(word)
                continue
            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue
            # Ethiopic punctuation (e.g. ።) lands here as single PUNCT tokens.
            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
        """Tag one word: user dictionary, then function words, else 'N'."""
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
        if word in self.function_words:
            return Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))
        return Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        # Single deterministic analysis; no alternatives.
        return []