tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,247 @@
1
+ """
2
+ Arabic Advanced Morphological Analyzer
3
+ ======================================
4
+
5
+ 5가지 고급 기능을 지원하는 아랍어 형태소 분석기
6
+
7
+ 특징:
8
+ - RTL (Right-to-Left) 문자 처리
9
+ - 어근-어형 시스템 (Root-Pattern)
10
+ - 접두사/접미사 분리
11
+ """
12
+
13
+ import re
14
+ from typing import List, Tuple, Dict, Optional
15
+
16
+ from .advanced_base import (
17
+ AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, NBestResult, Domain
18
+ )
19
+
20
+
21
class ArabicAdvancedAnalyzer(AdvancedMorphologicalAnalyzer):
    """Advanced Arabic morphological analyzer.

    Supports RTL (right-to-left) text, root-pattern style clitic handling,
    and prefix/suffix segmentation (definite article, conjunction and
    preposition proclitics, pronoun/number/gender enclitics) on top of
    the generic ``AdvancedMorphologicalAnalyzer`` pipeline.
    """

    LANG_CODE = "ar"
    LANG_NAME = "Arabic"

    # Token patterns: Arabic script (basic block + Arabic Supplement),
    # Latin loanwords, and numbers in either Western or Arabic-Indic digits.
    ARABIC_PATTERN = re.compile(r'[\u0600-\u06FF\u0750-\u077F]+')
    LATIN_PATTERN = re.compile(r'[a-zA-Z]+')  # hoisted: compiled once, not per character
    NUMBER_PATTERN = re.compile(r'[0-9٠-٩]+(?:[.,][0-9٠-٩]+)?')

    def __init__(self):
        super().__init__()

    def _build_base_dictionary(self):
        """Build the base closed-class dictionaries."""

        # Proclitics (prefixes)
        self.prefixes = {
            'ال': 'DEF',    # definite article (the)
            'و': 'CONJ',    # and
            'ف': 'CONJ',    # so / then
            'ب': 'PREP',    # with / by
            'ك': 'PREP',    # like / as
            'ل': 'PREP',    # to / for
            'س': 'FUT',     # future marker
            'سوف': 'FUT',   # future marker (long form)
        }

        # Enclitics (pronoun / gender / number suffixes)
        self.suffixes = {
            'ي': 'PRON',    # my
            'ك': 'PRON',    # your (sg.)
            'ه': 'PRON',    # his
            'ها': 'PRON',   # her
            'نا': 'PRON',   # our
            'كم': 'PRON',   # your (pl.)
            'هم': 'PRON',   # their
            'ة': 'FEM',     # feminine ending
            'ات': 'PL',     # plural ending (feminine)
            'ون': 'PL',     # plural ending (masculine)
            'ين': 'PL',     # plural ending (masculine)
        }

        # Pronouns / demonstratives / relatives
        self.pronouns = {
            'أنا': 'PRON', 'أنت': 'PRON', 'أنتِ': 'PRON',
            'هو': 'PRON', 'هي': 'PRON',
            'نحن': 'PRON', 'أنتم': 'PRON', 'هم': 'PRON', 'هن': 'PRON',
            'هذا': 'DEM', 'هذه': 'DEM', 'ذلك': 'DEM', 'تلك': 'DEM',
            'من': 'REL', 'ما': 'REL', 'الذي': 'REL', 'التي': 'REL',
        }

        # Prepositions
        self.prepositions = {
            'في': 'PREP', 'من': 'PREP', 'إلى': 'PREP', 'على': 'PREP',
            'عن': 'PREP', 'مع': 'PREP', 'بين': 'PREP', 'حول': 'PREP',
            'قبل': 'PREP', 'بعد': 'PREP', 'تحت': 'PREP', 'فوق': 'PREP',
            'عند': 'PREP', 'خلال': 'PREP', 'منذ': 'PREP', 'دون': 'PREP',
        }

        # Conjunctions
        self.conjunctions = {
            'و': 'CONJ', 'أو': 'CONJ', 'لكن': 'CONJ', 'بل': 'CONJ',
            'أن': 'CONJ', 'إن': 'CONJ', 'لأن': 'CONJ', 'إذا': 'CONJ',
            'حتى': 'CONJ', 'كما': 'CONJ', 'عندما': 'CONJ', 'بينما': 'CONJ',
        }

        # Adverbs
        self.adverbs = {
            'جداً': 'ADV', 'كثيراً': 'ADV', 'قليلاً': 'ADV',
            'دائماً': 'ADV', 'أبداً': 'ADV', 'أحياناً': 'ADV',
            'الآن': 'ADV', 'غداً': 'ADV', 'أمس': 'ADV', 'اليوم': 'ADV',
            'هنا': 'ADV', 'هناك': 'ADV', 'فقط': 'ADV', 'أيضاً': 'ADV',
        }

        # Negation particles
        self.negation = {
            'لا': 'NEG', 'لم': 'NEG', 'لن': 'NEG', 'ما': 'NEG',
            'ليس': 'NEG', 'ليست': 'NEG', 'ليسوا': 'NEG',
        }

        # High-frequency common nouns
        self.common_nouns = {
            'الله': 'NP',   # Allah
            'رسول': 'NC',   # messenger
            'كتاب': 'NC',   # book
            'يوم': 'NC',    # day
            'سنة': 'NC',    # year
            'ناس': 'NC',    # people
            'بيت': 'NC',    # house
            'عمل': 'NC',    # work
            'دولة': 'NC',   # state
            'شركة': 'NC',   # company
        }

    def _build_domain_dictionaries(self):
        """Build per-domain word-sense dictionaries."""
        self._domain_dictionaries[Domain.TECH] = {
            'تفاحة': ('Apple', 'NP'),       # Apple (company)
            'سحابة': ('cloud', 'NC'),       # cloud computing
        }
        self._domain_dictionaries[Domain.FOOD] = {
            'تفاحة': ('تفاحة', 'NC'),       # apple (fruit)
        }
        self._domain_dictionaries[Domain.FINANCE] = {
            'بنك': ('بنك', 'NC'),           # bank
            'سهم': ('سهم', 'NC'),           # stock
        }

    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
        """Produce the single best analysis candidate for *text*."""
        if not text or not text.strip():
            return [AnalysisResult([])]
        morphemes = self._analyze_text(text, domain)
        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
        result.score = self._score_analysis(result)
        return [result]

    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
        """Scan *text* left to right and emit morphemes for each token run.

        Arabic runs go through ``_analyze_word``; Latin runs are tagged
        FOREIGN, digit runs NUM, and everything else PUNCT char-by-char.
        """
        result = []
        pos = 0
        while pos < len(text):
            if text[pos].isspace():
                pos += 1
                continue

            arabic_match = self.ARABIC_PATTERN.match(text[pos:])
            if arabic_match:
                word = arabic_match.group()
                morphemes = self._analyze_word(word, pos, domain)
                result.extend(morphemes)
                pos += len(word)
                continue

            # Latin-script loanwords
            latin_match = self.LATIN_PATTERN.match(text[pos:])
            if latin_match:
                word = latin_match.group()
                result.append(Morpheme(surface=word, lemma=word, pos='FOREIGN', start=pos, end=pos + len(word)))
                pos += len(word)
                continue

            num_match = self.NUMBER_PATTERN.match(text[pos:])
            if num_match:
                num = num_match.group()
                result.append(Morpheme(surface=num, lemma=num, pos='NUM', start=pos, end=pos + len(num)))
                pos += len(num)
                continue

            result.append(Morpheme(surface=text[pos], lemma=text[pos], pos='PUNCT', start=pos, end=pos + 1))
            pos += 1
        return result

    def _analyze_word(self, word: str, offset: int, domain: Domain) -> List[Morpheme]:
        """Analyze a single Arabic word (prefix/suffix segmentation).

        Lookup order: user dictionary, domain dictionary, closed-class
        function words, then clitic stripping (definite article, one
        proclitic, one enclitic) around an NC stem.
        """
        morphemes = []

        # Runtime (user) dictionary takes absolute priority.
        if word in self._user_dictionary:
            lemma, pos_tag, _ = self._user_dictionary[word]
            return [Morpheme(surface=word, lemma=lemma, pos=pos_tag, start=offset, end=offset + len(word))]

        # Domain-specific sense.
        domain_sense = self._get_domain_sense(word, domain)
        if domain_sense:
            return [Morpheme(surface=word, lemma=domain_sense[0], pos=domain_sense[1], start=offset, end=offset + len(word))]

        # Closed-class function words.
        if word in self.pronouns:
            return [Morpheme(surface=word, lemma=word, pos='PRON', start=offset, end=offset + len(word))]
        if word in self.prepositions:
            return [Morpheme(surface=word, lemma=word, pos='PREP', start=offset, end=offset + len(word))]
        if word in self.conjunctions:
            return [Morpheme(surface=word, lemma=word, pos='CONJ', start=offset, end=offset + len(word))]
        if word in self.adverbs:
            return [Morpheme(surface=word, lemma=word, pos='ADV', start=offset, end=offset + len(word))]
        if word in self.negation:
            return [Morpheme(surface=word, lemma=word, pos='NEG', start=offset, end=offset + len(word))]
        if word in self.common_nouns:
            return [Morpheme(surface=word, lemma=word, pos=self.common_nouns[word], start=offset, end=offset + len(word))]

        # Prefix stripping.
        current_offset = offset
        remaining = word

        # Definite article ال first (never leaves an empty stem).
        if remaining.startswith('ال') and len(remaining) > 2:
            morphemes.append(Morpheme(surface='ال', lemma='ال', pos='DEF', start=current_offset, end=current_offset + 2))
            current_offset += 2
            remaining = remaining[2:]

        # One conjunction/preposition/future proclitic, longest first.
        # BUGFIX: dict order used to try the 1-char future marker 'س'
        # before the 3-char 'سوف', so 'سوف' was never matched whole.
        for prefix, pos_tag in sorted(self.prefixes.items(), key=lambda x: -len(x[0])):
            if remaining.startswith(prefix) and len(remaining) > len(prefix):
                morphemes.append(Morpheme(surface=prefix, lemma=prefix, pos=pos_tag, start=current_offset, end=current_offset + len(prefix)))
                current_offset += len(prefix)
                remaining = remaining[len(prefix):]
                break

        # Remaining stem.
        if remaining:
            stem_end = current_offset + len(remaining)

            # Try to split one enclitic, longest first.
            for suffix, pos_tag in sorted(self.suffixes.items(), key=lambda x: -len(x[0])):
                if remaining.endswith(suffix) and len(remaining) > len(suffix):
                    stem = remaining[:-len(suffix)]
                    morphemes.append(Morpheme(surface=stem, lemma=stem, pos='NC', start=current_offset, end=current_offset + len(stem)))
                    morphemes.append(Morpheme(surface=suffix, lemma=suffix, pos=pos_tag, start=current_offset + len(stem), end=stem_end))
                    return morphemes

            # No enclitic found: whole remainder is the stem.
            morphemes.append(Morpheme(surface=remaining, lemma=remaining, pos='NC', start=current_offset, end=stem_end))

        return morphemes if morphemes else [Morpheme(surface=word, lemma=word, pos='NC', start=offset, end=offset + len(word))]

    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
        """Re-analyze *text* under up to *count* other domains as alternatives."""
        alternatives = []
        other_domains = [d for d in Domain if d != domain][:count]
        for alt_domain in other_domains:
            morphemes = self._analyze_text(text, alt_domain)
            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
            result.score = self._score_analysis(result) * 0.9
            alternatives.append(result)
        return alternatives
# Backward-compatible alias: older callers import ``ArabicAnalyzer``.
ArabicAnalyzer = ArabicAdvancedAnalyzer