tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/morphology/russian_advanced.py
@@ -0,0 +1,217 @@
+ """
+ Russian Advanced Morphological Analyzer
+ =======================================
+
+ Russian morphological analyzer supporting the five advanced features
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Optional
+
+ from .advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+ )
+
+
+ class RussianAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+     """Russian advanced morphological analyzer (Cyrillic script)"""
+
+     LANG_CODE = "ru"
+     LANG_NAME = "Russian"
+
+     # Cyrillic letter patterns
+     WORD_PATTERN = re.compile(r'[а-яА-ЯёЁ]+')
+     NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')
+
+     def __init__(self):
+         super().__init__()
+
+     def _build_base_dictionary(self):
+         """Build the base dictionaries"""
+
+         # Irregular verbs
+         self.irregular_verbs = {
+             # быть (be)
+             'есть': 'быть', 'был': 'быть', 'была': 'быть', 'было': 'быть',
+             'были': 'быть', 'буду': 'быть', 'будешь': 'быть', 'будет': 'быть',
+             'будем': 'быть', 'будете': 'быть', 'будут': 'быть',
+             # идти (go)
+             'иду': 'идти', 'идёшь': 'идти', 'идёт': 'идти',
+             'идём': 'идти', 'идёте': 'идти', 'идут': 'идти',
+             'шёл': 'идти', 'шла': 'идти', 'шло': 'идти', 'шли': 'идти',
+             # хотеть (want)
+             'хочу': 'хотеть', 'хочешь': 'хотеть', 'хочет': 'хотеть',
+             'хотим': 'хотеть', 'хотите': 'хотеть', 'хотят': 'хотеть',
+             # мочь (can)
+             'могу': 'мочь', 'можешь': 'мочь', 'может': 'мочь',
+             'можем': 'мочь', 'можете': 'мочь', 'могут': 'мочь',
+             # есть (eat)
+             'ем': 'есть', 'ешь': 'есть', 'ест': 'есть',
+             'едим': 'есть', 'едите': 'есть', 'едят': 'есть',
+             # давать (give)
+             'даю': 'давать', 'даёшь': 'давать', 'даёт': 'давать',
+         }
+
+         # Pronouns
+         self.pronouns = {
+             'я': 'PRON', 'ты': 'PRON', 'он': 'PRON', 'она': 'PRON', 'оно': 'PRON',
+             'мы': 'PRON', 'вы': 'PRON', 'они': 'PRON',
+             'меня': 'PRON', 'тебя': 'PRON', 'его': 'PRON', 'её': 'PRON',
+             'нас': 'PRON', 'вас': 'PRON', 'их': 'PRON',
+             'мне': 'PRON', 'тебе': 'PRON', 'ему': 'PRON', 'ей': 'PRON',
+             'нам': 'PRON', 'вам': 'PRON', 'им': 'PRON',
+             'кто': 'PRON', 'что': 'PRON', 'какой': 'PRON', 'который': 'PRON',
+             'этот': 'PRON', 'тот': 'PRON', 'весь': 'PRON', 'сам': 'PRON',
+         }
+
+         # Prepositions
+         self.prepositions = {
+             'в': 'PREP', 'на': 'PREP', 'с': 'PREP', 'к': 'PREP', 'по': 'PREP',
+             'за': 'PREP', 'из': 'PREP', 'от': 'PREP', 'до': 'PREP', 'о': 'PREP',
+             'об': 'PREP', 'у': 'PREP', 'при': 'PREP', 'над': 'PREP', 'под': 'PREP',
+             'перед': 'PREP', 'между': 'PREP', 'без': 'PREP', 'через': 'PREP',
+         }
+
+         # Conjunctions
+         self.conjunctions = {
+             'и': 'CONJ', 'а': 'CONJ', 'но': 'CONJ', 'или': 'CONJ',
+             'что': 'CONJ', 'чтобы': 'CONJ', 'если': 'CONJ', 'когда': 'CONJ',
+             'потому': 'CONJ', 'хотя': 'CONJ', 'пока': 'CONJ', 'как': 'CONJ',
+         }
+
+         # Adverbs
+         self.adverbs = {
+             'очень': 'ADV', 'хорошо': 'ADV', 'плохо': 'ADV', 'быстро': 'ADV',
+             'медленно': 'ADV', 'много': 'ADV', 'мало': 'ADV', 'тоже': 'ADV',
+             'уже': 'ADV', 'ещё': 'ADV', 'всегда': 'ADV', 'никогда': 'ADV',
+             'здесь': 'ADV', 'там': 'ADV', 'сейчас': 'ADV', 'потом': 'ADV',
+             'тогда': 'ADV', 'давно': 'ADV', 'скоро': 'ADV', 'вместе': 'ADV',
+         }
+
+         # Particles
+         self.particles = {
+             'не': 'PART', 'ни': 'PART', 'же': 'PART', 'бы': 'PART',
+             'ли': 'PART', 'да': 'PART', 'нет': 'PART', 'вот': 'PART',
+         }
+
+     def _build_domain_dictionaries(self):
+         """Domain-specific dictionaries"""
+         self._domain_dictionaries[Domain.TECH] = {
+             'яблоко': ('Apple', 'NP'),
+             'облако': ('cloud', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FOOD] = {
+             'яблоко': ('яблоко', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FINANCE] = {
+             'банк': ('банк', 'NC'),
+             'акция': ('акция', 'NC'),
+         }
+
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+         return [result]
+
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         result = []
+         pos = 0
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             word_match = self.WORD_PATTERN.match(text[pos:])
+             if word_match:
+                 word = word_match.group()
+                 morpheme = self._analyze_word(word, pos, domain)
+                 result.append(morpheme)
+                 pos += len(word)
+                 continue
+
+             # Latin letters (loanwords/English)
+             latin_match = re.match(r'[a-zA-Z]+', text[pos:])
+             if latin_match:
+                 word = latin_match.group()
+                 result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
+                 pos += len(word)
+                 continue
+
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+             pos += 1
+         return result
+
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+         word_lower = word.lower()
+
+         if word_lower in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word_lower]
+             return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+         domain_sense = self._get_domain_sense(word_lower, domain)
+         if domain_sense:
+             return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+         if word_lower in self.pronouns:
+             return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
+         if word_lower in self.prepositions:
+             return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
+         if word_lower in self.conjunctions:
+             return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
+         if word_lower in self.adverbs:
+             return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))
+         if word_lower in self.particles:
+             return Morpheme(surface=word, lemma=word_lower, pos='PART', start=offset, end=offset + len(word))
+
+         if word_lower in self.irregular_verbs:
+             return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))
+
+         lemma, pos_tag = self._analyze_morphology(word)
+         return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+     def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+         # -ть verb infinitive
+         if word.endswith('ть') and len(word) > 3:
+             return (word, 'V')
+         # -ся reflexive verb
+         if word.endswith('ся') and len(word) > 4:
+             return (word[:-2], 'V')
+         # -ние/-ение nouns
+         if word.endswith(('ние', 'ение', 'ание')) and len(word) > 5:
+             return (word, 'NC')
+         # -ость/-есть nouns
+         if word.endswith(('ость', 'есть')) and len(word) > 5:
+             return (word, 'NC')
+         # -ый/-ий/-ой adjectives
+         if word.endswith(('ый', 'ий', 'ой')) and len(word) > 3:
+             return (word, 'ADJ')
+         # -ая/-яя adjectives (feminine)
+         if word.endswith(('ая', 'яя')) and len(word) > 3:
+             return (word, 'ADJ')
+         # Capitalized (proper noun)
+         if word[0].isupper():
+             return (word, 'NP')
+         return (word, 'NC')
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+         return alternatives
+
+
+ RussianAnalyzer = RussianAdvancedAnalyzer
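
A minimal usage sketch (editor's illustration, not part of the wheel): it assumes the AdvancedMorphologicalAnalyzer base class populates the dictionaries during __init__, and since no public entry point is visible in this excerpt it calls the _generate_candidates hook directly.

    # Hypothetical sketch; Domain.TECH is one of the members visible above.
    from tokmor.morphology.russian_advanced import RussianAdvancedAnalyzer
    from tokmor.morphology.advanced_base import Domain

    analyzer = RussianAdvancedAnalyzer()
    best = analyzer._generate_candidates("я иду быстро", Domain.TECH)[0]
    for m in best.morphemes:
        print(m.surface, m.lemma, m.pos)
    # Expected: 'я' is tagged PRON, 'иду' hits the irregular-verb table and
    # lemmatizes to 'идти' (V), and 'быстро' is found in the adverb list (ADV).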
tokmor/morphology/spanish_advanced.py
@@ -0,0 +1,226 @@
+ """
+ Spanish Advanced Morphological Analyzer
+ =======================================
+
+ Spanish morphological analyzer supporting the five advanced features
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Optional
+
+ from .advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
+ )
+
+
+ class SpanishAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
+     """Spanish advanced morphological analyzer"""
+
+     LANG_CODE = "es"
+     LANG_NAME = "Spanish"
+
+     WORD_PATTERN = re.compile(r"[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+")
+     NUMBER_PATTERN = re.compile(r'[0-9]+(?:[.,][0-9]+)?')
+
+     def __init__(self):
+         super().__init__()
+
+     def _build_base_dictionary(self):
+         """Build the base dictionaries"""
+
+         # Irregular verbs (ser, estar, ir, tener, hacer)
+         self.irregular_verbs = {
+             # ser
+             'soy': 'ser', 'eres': 'ser', 'es': 'ser',
+             'somos': 'ser', 'sois': 'ser', 'son': 'ser',
+             'era': 'ser', 'eras': 'ser', 'éramos': 'ser',
+             'erais': 'ser', 'eran': 'ser', 'fue': 'ser',
+             'fuiste': 'ser', 'fuimos': 'ser', 'fueron': 'ser',
+             # estar
+             'estoy': 'estar', 'estás': 'estar', 'está': 'estar',
+             'estamos': 'estar', 'estáis': 'estar', 'están': 'estar',
+             'estaba': 'estar', 'estuve': 'estar', 'estuvo': 'estar',
+             # ir
+             'voy': 'ir', 'vas': 'ir', 'va': 'ir',
+             'vamos': 'ir', 'vais': 'ir', 'van': 'ir',
+             'iba': 'ir', 'ibas': 'ir', 'íbamos': 'ir',
+             # tener
+             'tengo': 'tener', 'tienes': 'tener', 'tiene': 'tener',
+             'tenemos': 'tener', 'tenéis': 'tener', 'tienen': 'tener',
+             'tenía': 'tener', 'tuve': 'tener', 'tuvo': 'tener',
+             # hacer
+             'hago': 'hacer', 'haces': 'hacer', 'hace': 'hacer',
+             'hacemos': 'hacer', 'hacéis': 'hacer', 'hacen': 'hacer',
+             'hacía': 'hacer', 'hice': 'hacer', 'hizo': 'hacer',
+             # poder
+             'puedo': 'poder', 'puedes': 'poder', 'puede': 'poder',
+             'podemos': 'poder', 'podéis': 'poder', 'pueden': 'poder',
+             'podía': 'poder', 'pude': 'poder', 'pudo': 'poder',
+             # querer
+             'quiero': 'querer', 'quieres': 'querer', 'quiere': 'querer',
+             'queremos': 'querer', 'queréis': 'querer', 'quieren': 'querer',
+             # saber
+             'sé': 'saber', 'sabes': 'saber', 'sabe': 'saber',
+             'sabemos': 'saber', 'sabéis': 'saber', 'saben': 'saber',
+             # venir
+             'vengo': 'venir', 'vienes': 'venir', 'viene': 'venir',
+             'venimos': 'venir', 'venís': 'venir', 'vienen': 'venir',
+             # decir
+             'digo': 'decir', 'dices': 'decir', 'dice': 'decir',
+             'decimos': 'decir', 'decís': 'decir', 'dicen': 'decir',
+         }
+
+         # Articles
+         self.articles = {
+             'el': 'DET', 'la': 'DET', 'los': 'DET', 'las': 'DET',
+             'un': 'DET', 'una': 'DET', 'unos': 'DET', 'unas': 'DET',
+             'al': 'DET', 'del': 'DET',
+         }
+
+         # Pronouns
+         self.pronouns = {
+             'yo': 'PRON', 'tú': 'PRON', 'él': 'PRON', 'ella': 'PRON',
+             'nosotros': 'PRON', 'vosotros': 'PRON', 'ellos': 'PRON', 'ellas': 'PRON',
+             'me': 'PRON', 'te': 'PRON', 'se': 'PRON', 'nos': 'PRON', 'os': 'PRON',
+             'lo': 'PRON', 'le': 'PRON', 'les': 'PRON',
+             'que': 'PRON', 'quien': 'PRON', 'cual': 'PRON', 'cuyo': 'PRON',
+             'este': 'PRON', 'ese': 'PRON', 'aquel': 'PRON',
+         }
+
+         # Prepositions
+         self.prepositions = {
+             'a': 'PREP', 'de': 'PREP', 'en': 'PREP', 'con': 'PREP',
+             'por': 'PREP', 'para': 'PREP', 'sin': 'PREP', 'sobre': 'PREP',
+             'entre': 'PREP', 'hasta': 'PREP', 'desde': 'PREP', 'hacia': 'PREP',
+             'bajo': 'PREP', 'contra': 'PREP', 'durante': 'PREP', 'según': 'PREP',
+         }
+
+         # Conjunctions
+         self.conjunctions = {
+             'y': 'CONJ', 'e': 'CONJ', 'o': 'CONJ', 'u': 'CONJ',
+             'pero': 'CONJ', 'sino': 'CONJ', 'ni': 'CONJ',
+             'que': 'CONJ', 'si': 'CONJ', 'cuando': 'CONJ', 'porque': 'CONJ',
+             'aunque': 'CONJ', 'como': 'CONJ', 'mientras': 'CONJ',
+         }
+
+         # Adverbs
+         self.adverbs = {
+             'muy': 'ADV', 'bien': 'ADV', 'mal': 'ADV', 'poco': 'ADV',
+             'mucho': 'ADV', 'más': 'ADV', 'menos': 'ADV', 'también': 'ADV',
+             'siempre': 'ADV', 'nunca': 'ADV', 'ya': 'ADV', 'todavía': 'ADV',
+             'aquí': 'ADV', 'allí': 'ADV', 'ahora': 'ADV', 'hoy': 'ADV',
+         }
+
+     def _build_domain_dictionaries(self):
+         """Domain-specific dictionaries"""
+         self._domain_dictionaries[Domain.TECH] = {
+             'manzana': ('Apple', 'NP'),
+             'nube': ('cloud', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FOOD] = {
+             'manzana': ('manzana', 'NC'),
+         }
+         self._domain_dictionaries[Domain.FINANCE] = {
+             'banco': ('banco', 'NC'),
+             'acción': ('acción', 'NC'),
+         }
+
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+         return [result]
+
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         result = []
+         pos = 0
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             word_match = self.WORD_PATTERN.match(text[pos:])
+             if word_match:
+                 word = word_match.group()
+                 morpheme = self._analyze_word(word, pos, domain)
+                 result.append(morpheme)
+                 pos += len(word)
+                 continue
+
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+             pos += 1
+         return result
+
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+         word_lower = word.lower()
+
+         if word_lower in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word_lower]
+             return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+         domain_sense = self._get_domain_sense(word_lower, domain)
+         if domain_sense:
+             return Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))
+
+         if word_lower in self.articles:
+             return Morpheme(surface=word, lemma=word_lower, pos='DET', start=offset, end=offset + len(word))
+         if word_lower in self.pronouns:
+             return Morpheme(surface=word, lemma=word_lower, pos='PRON', start=offset, end=offset + len(word))
+         if word_lower in self.prepositions:
+             return Morpheme(surface=word, lemma=word_lower, pos='PREP', start=offset, end=offset + len(word))
+         if word_lower in self.conjunctions:
+             return Morpheme(surface=word, lemma=word_lower, pos='CONJ', start=offset, end=offset + len(word))
+         if word_lower in self.adverbs:
+             return Morpheme(surface=word, lemma=word_lower, pos='ADV', start=offset, end=offset + len(word))
+
+         if word_lower in self.irregular_verbs:
+             return Morpheme(surface=word, lemma=self.irregular_verbs[word_lower], pos='V', start=offset, end=offset + len(word))
+
+         lemma, pos_tag = self._analyze_morphology(word)
+         return Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))
+
+     def _analyze_morphology(self, word: str) -> Tuple[str, str]:
+         # -ar verbs (first conjugation)
+         if word.endswith('ar') and len(word) > 3:
+             return (word, 'V')
+         # -er verbs (second conjugation)
+         if word.endswith('er') and len(word) > 3:
+             return (word, 'V')
+         # -ir verbs (third conjugation)
+         if word.endswith('ir') and len(word) > 3:
+             return (word, 'V')
+         # -ción/-sión nouns
+         if word.endswith(('ción', 'sión')) and len(word) > 5:
+             return (word, 'NC')
+         # -mente adverbs
+         if word.endswith('mente') and len(word) > 6:
+             return (word, 'ADV')
+         # -oso/-osa adjectives
+         if word.endswith(('oso', 'osa')) and len(word) > 4:
+             return (word, 'ADJ')
+         # Capitalized (proper noun)
+         if word[0].isupper():
+             return (word, 'NP')
+         return (word, 'NC')
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+         return alternatives
+
+
+ SpanishAnalyzer = SpanishAdvancedAnalyzer
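
A companion sketch for the Spanish analyzer showing the domain-sense tables disambiguating 'manzana' (again a hedged illustration, assuming _get_domain_sense reads the _domain_dictionaries built above):

    # Hypothetical sketch of domain-dependent lemmatization.
    from tokmor.morphology.spanish_advanced import SpanishAdvancedAnalyzer
    from tokmor.morphology.advanced_base import Domain

    analyzer = SpanishAdvancedAnalyzer()
    for domain in (Domain.TECH, Domain.FOOD):
        m = analyzer._generate_candidates("manzana", domain)[0].morphemes[0]
        print(domain, m.lemma, m.pos)
    # TECH maps 'manzana' to the proper noun 'Apple' (NP);
    # FOOD keeps the common noun 'manzana' (NC).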
tokmor/morphology/templates/__init__.py
@@ -0,0 +1,32 @@
+ """
+ Language Family Templates
+ =========================
+
+ Template analyzers by language family - the bases for the individual language analyzers
+ """
+
+ from .latin_template import LatinScriptAnalyzer
+ from .cyrillic_template import CyrillicScriptAnalyzer
+ from .arabic_script_template import ArabicScriptAnalyzer
+ from .brahmic_template import BrahmicScriptAnalyzer
+ from .other_scripts_template import (
+     HebrewScriptAnalyzer,
+     GreekScriptAnalyzer,
+     GeorgianScriptAnalyzer,
+     ArmenianScriptAnalyzer,
+     ThaiScriptAnalyzer,
+     EthiopicScriptAnalyzer,
+ )
+
+ __all__ = [
+     'LatinScriptAnalyzer',
+     'CyrillicScriptAnalyzer',
+     'ArabicScriptAnalyzer',
+     'BrahmicScriptAnalyzer',
+     'HebrewScriptAnalyzer',
+     'GreekScriptAnalyzer',
+     'GeorgianScriptAnalyzer',
+     'ArmenianScriptAnalyzer',
+     'ThaiScriptAnalyzer',
+     'EthiopicScriptAnalyzer',
+ ]
tokmor/morphology/templates/arabic_script_template.py
@@ -0,0 +1,162 @@
+ """
+ Arabic Script Language Template
+ ===============================
+
+ Template analyzer for languages written in the Arabic script:
+ Arabic, Persian, Urdu, Pashto, Kurdish, etc.
+ """
+
+ import re
+ from typing import List, Tuple, Dict, Optional
+
+ from ..advanced_base import (
+     AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
+ )
+
+
+ class ArabicScriptAnalyzer(AdvancedMorphologicalAnalyzer):
+     """
+     Template for Arabic-script languages
+
+     Covers: Arabic, Persian/Farsi, Urdu, Pashto, Kurdish, Sindhi, etc.
+     RTL (Right-to-Left) text processing
+     """
+
+     # Arabic script pattern (extended for Persian, Urdu, etc.)
+     WORD_PATTERN = re.compile(
+         r'[\u0600-\u06FF'   # Arabic
+         r'\u0750-\u077F'    # Arabic Supplement
+         r'\u08A0-\u08FF'    # Arabic Extended-A
+         r'\uFB50-\uFDFF'    # Arabic Presentation Forms-A
+         r'\uFE70-\uFEFF'    # Arabic Presentation Forms-B
+         r'\u0671-\u06D3'    # Extended Arabic letters
+         r'پچژگک'  # Persian additions
+         r'ڈڑںھٹ'  # Urdu additions
+         r']+'
+     )
+     NUMBER_PATTERN = re.compile(r'[0-9٠-٩۰-۹]+(?:[.,][0-9٠-٩۰-۹]+)?')
+
+     def __init__(self):
+         super().__init__()
+
+     def _build_base_dictionary(self):
+         """Override in subclass"""
+         self.prefixes: Dict[str, str] = {}
+         self.suffixes: Dict[str, str] = {}
+         self.function_words: Dict[str, str] = {}
+
+     def _build_domain_dictionaries(self):
+         """Override in subclass"""
+         pass
+
+     def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+         if not text or not text.strip():
+             return [AnalysisResult([])]
+
+         morphemes = self._analyze_text(text, domain)
+         result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+         result.score = self._score_analysis(result)
+         return [result]
+
+     def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+         result = []
+         pos = 0
+
+         while pos < len(text):
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             # Arabic script word
+             word_match = self.WORD_PATTERN.match(text[pos:])
+             if word_match:
+                 word = word_match.group()
+                 morphemes = self._analyze_word(word, pos, domain)
+                 result.extend(morphemes)
+                 pos += len(word)
+                 continue
+
+             # Latin (foreign words)
+             latin_match = re.match(r'[a-zA-Z]+', text[pos:])
+             if latin_match:
+                 word = latin_match.group()
+                 result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
+                 pos += len(word)
+                 continue
+
+             # Number
+             num_match = self.NUMBER_PATTERN.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
+                 pos += len(num)
+                 continue
+
+             # Punctuation
+             result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
+             pos += 1
+
+         return result
+
+     def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
+         """Analyze word with prefix/suffix separation"""
+         morphemes = []
+
+         # User dictionary
+         if word in self._user_dictionary:
+             lemma, pos_tag, _ = self._user_dictionary[word]
+             return [Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))]
+
+         # Domain dictionary
+         domain_sense = self._get_domain_sense(word, domain)
+         if domain_sense:
+             return [Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))]
+
+         # Function words
+         if hasattr(self, 'function_words') and word in self.function_words:
+             return [Morpheme(surface=word, lemma=word, pos=self.function_words[word], start=offset, end=offset + len(word))]
+
+         # Prefix/suffix analysis
+         current_offset = offset
+         remaining = word
+
+         # Check prefixes (longest first)
+         if hasattr(self, 'prefixes'):
+             for prefix, pos_tag in sorted(self.prefixes.items(), key=lambda x: -len(x[0])):
+                 if remaining.startswith(prefix) and len(remaining) > len(prefix):
+                     morphemes.append(Morpheme(surface=prefix, lemma=prefix, pos=pos_tag, start=current_offset, end=current_offset + len(prefix)))
+                     current_offset += len(prefix)
+                     remaining = remaining[len(prefix):]
+                     break
+
+         # Check suffixes (longest first)
+         stem = remaining
+         suffix_morphemes = []
+         if hasattr(self, 'suffixes'):
+             for suffix, pos_tag in sorted(self.suffixes.items(), key=lambda x: -len(x[0])):
+                 if remaining.endswith(suffix) and len(remaining) > len(suffix):
+                     stem = remaining[:-len(suffix)]
+                     suffix_morphemes.append(Morpheme(
+                         surface=suffix, lemma=suffix, pos=pos_tag,
+                         start=current_offset + len(stem), end=offset + len(word)
+                     ))
+                     break
+
+         # Add stem
+         if stem:
+             morphemes.append(Morpheme(surface=stem, lemma=stem, pos='N', start=current_offset, end=current_offset + len(stem)))
+
+         # Add suffix morphemes
+         morphemes.extend(suffix_morphemes)
+
+         return morphemes if morphemes else [Morpheme(surface=word, lemma=word, pos='N', start=offset, end=offset + len(word))]
+
+     def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+         alternatives = []
+         other_domains = [d for d in Domain if d != domain][:count]
+         for alt_domain in other_domains:
+             morphemes = self._analyze_text(text, alt_domain)
+             result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+             result.score = self._score_analysis(result) * 0.9
+             alternatives.append(result)
+         return alternatives
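
To make the template contract concrete, a hypothetical minimal subclass follows; the Persian affix entries are illustrative placeholders, not data shipped in tokmor, and the sketch assumes the base class calls _build_base_dictionary from __init__.

    # Hypothetical subclass of the Arabic-script template.
    from tokmor.morphology.templates.arabic_script_template import ArabicScriptAnalyzer
    from tokmor.morphology.advanced_base import Domain

    class PersianDemoAnalyzer(ArabicScriptAnalyzer):
        LANG_CODE = "fa"
        LANG_NAME = "Persian"

        def _build_base_dictionary(self):
            # One affix each is enough for _analyze_word to start splitting.
            self.prefixes = {'می': 'PREF'}   # imperfective verb prefix (placeholder tag)
            self.suffixes = {'ها': 'SUF'}    # plural suffix (placeholder tag)
            self.function_words = {'و': 'CONJ', 'در': 'PREP'}

    analyzer = PersianDemoAnalyzer()
    # 'کتابها' ("books") splits into stem 'کتاب' (N) + plural suffix 'ها' (SUF).
    print(analyzer._analyze_word("کتابها", 0, Domain.TECH))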