tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
+++ tokmor/morphology/universal_fallback.py
@@ -0,0 +1,398 @@
+"""
+Universal Fallback Morphological Analyzer
+=========================================
+
+A general-purpose morphological analyzer covering all languages.
+Used automatically when no language-specific analyzer is available.
+
+Features:
+- Unicode script detection
+- Automatic word boundary detection
+- RTL/LTR handling
+- Any language support
+"""
+
+import re
+import unicodedata
+from typing import Dict, List, Optional
+
+from .advanced_base import (
+    AdvancedMorphologicalAnalyzer, Morpheme, AnalysisResult, Domain
+)
+
+
+class UniversalFallbackAnalyzer(AdvancedMorphologicalAnalyzer):
+    """
+    Universal fallback analyzer for any language
+
+    Handles:
+    - All Unicode scripts automatically
+    - Mixed-script texts
+    - Unknown languages
+    - Automatic script detection
+    """
+
+    LANG_CODE = "xx"  # Universal code
+    LANG_NAME = "Universal"
+
+    # Unicode script categories (simplified)
+    SCRIPT_PATTERNS = {
+        # ASCII letters plus Latin-1 Supplement/Extended/Extended Additional
+        'latin': re.compile(r'[A-Za-z\u00C0-\u024F\u1E00-\u1EFF]+'),
+        'cyrillic': re.compile(r'[\u0400-\u04FF\u0500-\u052F]+'),
+        'greek': re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]+'),
+        'arabic': re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+'),
+        'hebrew': re.compile(r'[\u0590-\u05FF]+'),
+        'devanagari': re.compile(r'[\u0900-\u097F]+'),
+        'bengali': re.compile(r'[\u0980-\u09FF]+'),
+        'tamil': re.compile(r'[\u0B80-\u0BFF]+'),
+        'telugu': re.compile(r'[\u0C00-\u0C7F]+'),
+        'kannada': re.compile(r'[\u0C80-\u0CFF]+'),
+        'malayalam': re.compile(r'[\u0D00-\u0D7F]+'),
+        'thai': re.compile(r'[\u0E00-\u0E7F]+'),
+        'lao': re.compile(r'[\u0E80-\u0EFF]+'),
+        'myanmar': re.compile(r'[\u1000-\u109F]+'),
+        'khmer': re.compile(r'[\u1780-\u17FF]+'),
+        'tibetan': re.compile(r'[\u0F00-\u0FFF]+'),
+        'georgian': re.compile(r'[\u10A0-\u10FF\u2D00-\u2D2F]+'),
+        'armenian': re.compile(r'[\u0530-\u058F]+'),
+        'hangul': re.compile(r'[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]+'),
+        'hiragana': re.compile(r'[\u3040-\u309F]+'),
+        'katakana': re.compile(r'[\u30A0-\u30FF]+'),
+        'cjk': re.compile(r'[\u4E00-\u9FFF\u3400-\u4DBF]+'),
+        'ethiopic': re.compile(r'[\u1200-\u137F]+'),
+        'sinhala': re.compile(r'[\u0D80-\u0DFF]+'),
+        'gujarati': re.compile(r'[\u0A80-\u0AFF]+'),
+        'gurmukhi': re.compile(r'[\u0A00-\u0A7F]+'),
+        'oriya': re.compile(r'[\u0B00-\u0B7F]+'),
+    }
+
+    # Combined word pattern for any script
+    WORD_PATTERN = re.compile(
+        r'[A-Za-z\u00C0-\u024F\u1E00-\u1EFF'  # Latin
+        r'\u0400-\u052F'  # Cyrillic
+        r'\u0370-\u03FF\u1F00-\u1FFF'  # Greek
+        r'\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF'  # Arabic
+        r'\u0590-\u05FF'  # Hebrew
+        r'\u0900-\u097F'  # Devanagari
+        r'\u0980-\u09FF'  # Bengali
+        r'\u0A00-\u0A7F'  # Gurmukhi
+        r'\u0A80-\u0AFF'  # Gujarati
+        r'\u0B00-\u0B7F'  # Oriya
+        r'\u0B80-\u0BFF'  # Tamil
+        r'\u0C00-\u0C7F'  # Telugu
+        r'\u0C80-\u0CFF'  # Kannada
+        r'\u0D00-\u0DFF'  # Malayalam/Sinhala
+        r'\u0E00-\u0E7F'  # Thai
+        r'\u0E80-\u0EFF'  # Lao
+        r'\u0F00-\u0FFF'  # Tibetan
+        r'\u1000-\u109F'  # Myanmar
+        r'\u1200-\u137F'  # Ethiopic
+        r'\u1780-\u17FF'  # Khmer
+        r'\u10A0-\u10FF\u2D00-\u2D2F'  # Georgian
+        r'\u0530-\u058F'  # Armenian
+        r'\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F'  # Hangul
+        r'\u3040-\u309F'  # Hiragana
+        r'\u30A0-\u30FF'  # Katakana
+        r'\u4E00-\u9FFF\u3400-\u4DBF'  # CJK
+        r']+',
+        re.UNICODE
+    )
+
+    # Number patterns (various scripts)
+    NUMBER_PATTERN = re.compile(
+        r'[0-9'
+        r'\u0660-\u0669'  # Arabic-Indic
+        r'\u06F0-\u06F9'  # Extended Arabic-Indic
+        r'\u0966-\u096F'  # Devanagari
+        r'\u09E6-\u09EF'  # Bengali
+        r'\u0A66-\u0A6F'  # Gurmukhi
+        r'\u0AE6-\u0AEF'  # Gujarati
+        r'\u0B66-\u0B6F'  # Oriya
+        r'\u0BE6-\u0BEF'  # Tamil
+        r'\u0C66-\u0C6F'  # Telugu
+        r'\u0CE6-\u0CEF'  # Kannada
+        r'\u0D66-\u0D6F'  # Malayalam
+        r'\u0E50-\u0E59'  # Thai
+        r'\u0ED0-\u0ED9'  # Lao
+        r'\u0F20-\u0F29'  # Tibetan
+        r'\u1040-\u1049'  # Myanmar
+        r'\u17E0-\u17E9'  # Khmer
+        r']+'
+    )
+
+    def __init__(self):
+        super().__init__()
+        self._detected_script: Optional[str] = None
+
+    def _build_base_dictionary(self):
+        """Universal basic dictionary"""
+        self.function_words: Dict[str, str] = {}
+
+    def _build_domain_dictionaries(self):
+        """No domain dictionaries for the universal analyzer"""
+        pass
+
+    def detect_script(self, text: str) -> str:
+        """Detect the primary script of the text"""
+        script_counts: Dict[str, int] = {}
+
+        for script_name, pattern in self.SCRIPT_PATTERNS.items():
+            matches = pattern.findall(text)
+            if matches:
+                script_counts[script_name] = sum(len(m) for m in matches)
+
+        if not script_counts:
+            return 'unknown'
+
+        return max(script_counts, key=script_counts.get)
+
+    def detect_language_hint(self, text: str) -> str:
+        """Try to infer a language code from the dominant script"""
+        script = self.detect_script(text)
+
+        # Script to most-likely-language mapping
+        script_lang_map = {
+            'hangul': 'ko',
+            'hiragana': 'ja',
+            'katakana': 'ja',
+            'cjk': 'zh',  # Could be zh/ja/ko
+            'devanagari': 'hi',
+            'bengali': 'bn',
+            'tamil': 'ta',
+            'telugu': 'te',
+            'kannada': 'kn',
+            'malayalam': 'ml',
+            'gujarati': 'gu',
+            'gurmukhi': 'pa',
+            'oriya': 'or',
+            'thai': 'th',
+            'lao': 'lo',
+            'myanmar': 'my',
+            'khmer': 'km',
+            'tibetan': 'bo',
+            'georgian': 'ka',
+            'armenian': 'hy',
+            'ethiopic': 'am',
+            'sinhala': 'si',
+            'hebrew': 'he',
+            'arabic': 'ar',
+            'greek': 'el',
+            'cyrillic': 'ru',
+            'latin': 'en',
+        }
+
+        return script_lang_map.get(script, 'xx')
+
+    def _generate_candidates(self, text: str, domain: Domain) -> List[AnalysisResult]:
+        if not text or not text.strip():
+            return [AnalysisResult([])]
+
+        self._detected_script = self.detect_script(text)
+        morphemes = self._analyze_text(text, domain)
+        result = AnalysisResult(morphemes=morphemes, score=1.0, domain=domain)
+        result.score = self._score_analysis(result)
+        return [result]
+
+    def _analyze_text(self, text: str, domain: Domain) -> List[Morpheme]:
+        """Universal tokenization using Unicode properties"""
+        result = []
+        pos = 0
+
+        while pos < len(text):
+            # Skip whitespace
+            if text[pos].isspace():
+                pos += 1
+                continue
+
+            # Try numbers first
+            num_match = self.NUMBER_PATTERN.match(text[pos:])
+            if num_match:
+                num = num_match.group()
+                result.append(Morpheme(
+                    surface=num, lemma=num, pos='NUM',
+                    start=pos, end=pos + len(num)
+                ))
+                pos += len(num)
+                continue
+
+            # Try the word pattern
+            word_match = self.WORD_PATTERN.match(text[pos:])
+            if word_match:
+                word = word_match.group()
+                morpheme = self._analyze_word(word, pos, domain)
+                result.append(morpheme)
+                pos += len(word)
+                continue
+
+            # CJK characters outside WORD_PATTERN's ranges (extension blocks,
+            # compatibility ideographs, Kangxi radicals): group the whole run
+            if self._is_cjk(text[pos]):
+                end = pos
+                while end < len(text) and self._is_cjk(text[end]):
+                    end += 1
+                word = text[pos:end]
+                result.append(Morpheme(
+                    surface=word, lemma=word, pos='N',
+                    start=pos, end=end
+                ))
+                pos = end
+                continue
+
+            # Punctuation or unknown
+            char = text[pos]
+            if unicodedata.category(char).startswith('P'):
+                result.append(Morpheme(
+                    surface=char, lemma=char, pos='PUNCT',
+                    start=pos, end=pos + 1
+                ))
+            else:
+                result.append(Morpheme(
+                    surface=char, lemma=char, pos='X',
+                    start=pos, end=pos + 1
+                ))
+            pos += 1
+
+        return result
+
+    def _is_cjk(self, char: str) -> bool:
+        """Check whether a character is CJK"""
+        if not char:
+            return False
+        code = ord(char)
+        return (
+            0x4E00 <= code <= 0x9FFF or  # CJK Unified
+            0x3400 <= code <= 0x4DBF or  # CJK Extension A
+            0x20000 <= code <= 0x2A6DF or  # CJK Extension B
+            0x2A700 <= code <= 0x2B73F or  # CJK Extension C
+            0x2B740 <= code <= 0x2B81F or  # CJK Extension D
+            0xF900 <= code <= 0xFAFF or  # CJK Compatibility
+            0x2F00 <= code <= 0x2FDF  # Kangxi Radicals
+        )
+
+    def _analyze_word(self, word: str, offset: int, domain: Domain) -> Morpheme:
+        """Analyze a single word"""
+
+        # Check the user dictionary
+        if word in self._user_dictionary:
+            lemma, pos_tag, _ = self._user_dictionary[word]
+            return Morpheme(
+                surface=word, lemma=lemma, pos=pos_tag,
+                start=offset, end=offset + len(word)
+            )
+
+        # Check the domain dictionary
+        domain_sense = self._get_domain_sense(word, domain)
+        if domain_sense:
+            return Morpheme(
+                surface=word, lemma=domain_sense[0], pos=domain_sense[1],
+                start=offset, end=offset + len(word)
+            )
+
+        # Heuristic POS tagging based on Unicode category
+        pos = self._guess_pos(word)
+
+        return Morpheme(
+            surface=word, lemma=word.lower() if word.isascii() else word,
+            pos=pos, start=offset, end=offset + len(word)
+        )
+
+    def _guess_pos(self, word: str) -> str:
+        """Heuristic POS guess based on character properties"""
+        if not word:
+            return 'X'
+
+        # All uppercase -> proper noun or acronym
+        if word.isupper() and len(word) > 1:
+            return 'NNP'
+
+        # Title case -> proper noun
+        if word.istitle():
+            return 'NNP'
+
+        # Contains digits -> number-like
+        if any(c.isdigit() for c in word):
+            return 'NUM'
+
+        # Default to noun
+        return 'N'
+
+    def _generate_alternatives(self, text: str, domain: Domain, count: int) -> List[AnalysisResult]:
+        """Generate alternative analyses"""
+        alternatives = []
+        other_domains = [d for d in Domain if d != domain][:count]
+
+        for alt_domain in other_domains:
+            morphemes = self._analyze_text(text, alt_domain)
+            result = AnalysisResult(morphemes=morphemes, score=0.8, domain=alt_domain)
+            result.score = self._score_analysis(result) * 0.9
+            alternatives.append(result)
+
+        return alternatives
+
+
+# Language registry with fallback
+class AnalyzerRegistry:
+    """
+    Global registry for morphological analyzers.
+    Auto-selects the appropriate analyzer based on language code or script detection.
+    """
+
+    _analyzers: Dict[str, type] = {}
+    _instances: Dict[str, AdvancedMorphologicalAnalyzer] = {}
+    _fallback = UniversalFallbackAnalyzer
+
+    @classmethod
+    def register(cls, lang_code: str, analyzer_class: type):
+        """Register an analyzer for a language code"""
+        cls._analyzers[lang_code.lower()] = analyzer_class
+
+    @classmethod
+    def get(cls, lang_code: str) -> AdvancedMorphologicalAnalyzer:
+        """Get the analyzer instance for a language code"""
+        lang_code = lang_code.lower()
+
+        if lang_code not in cls._instances:
+            if lang_code in cls._analyzers:
+                cls._instances[lang_code] = cls._analyzers[lang_code]()
+            else:
+                # Use the universal fallback
+                cls._instances[lang_code] = cls._fallback()
+
+        return cls._instances[lang_code]
+
+    @classmethod
+    def get_for_text(cls, text: str) -> AdvancedMorphologicalAnalyzer:
+        """Auto-detect the language and get the appropriate analyzer"""
+        fallback = cls._fallback()
+        detected_lang = fallback.detect_language_hint(text)
+        return cls.get(detected_lang)
+
+    @classmethod
+    def supported_languages(cls) -> List[str]:
+        """List all registered language codes"""
+        return list(cls._analyzers.keys())
+
+    @classmethod
+    def clear_cache(cls):
+        """Clear the instance cache"""
+        cls._instances.clear()
+
+
+# Convenience function
+def get_analyzer(lang_code: Optional[str] = None, text: Optional[str] = None) -> AdvancedMorphologicalAnalyzer:
+    """
+    Get a morphological analyzer.
+
+    Args:
+        lang_code: Language code (e.g., 'ko', 'en', 'ja')
+        text: Text to analyze (for auto-detection)
+
+    Returns:
+        Appropriate analyzer instance
+    """
+    if lang_code:
+        return AnalyzerRegistry.get(lang_code)
+    elif text:
+        return AnalyzerRegistry.get_for_text(text)
+    else:
+        return UniversalFallbackAnalyzer()
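
A minimal usage sketch of the names this file defines (get_analyzer, AnalyzerRegistry, and the detect_script/detect_language_hint methods). The analyzer's main entry points inherited from advanced_base are not shown in this diff, so the snippet sticks to what is visible above; the Korean analyzer class named in the comment is hypothetical.

    from tokmor.morphology.universal_fallback import (
        AnalyzerRegistry,
        UniversalFallbackAnalyzer,
        get_analyzer,
    )

    fallback = UniversalFallbackAnalyzer()

    # Script detection counts matched characters per script and picks the max.
    print(fallback.detect_script("Привет"))        # 'cyrillic'
    print(fallback.detect_language_hint("สวัสดี"))  # 'th' via the script -> language map

    # The registry returns a language-specific analyzer when one was
    # registered, and silently falls back to the universal analyzer otherwise.
    # AnalyzerRegistry.register("ko", KoreanAnalyzer)  # hypothetical analyzer class
    analyzer = AnalyzerRegistry.get("zz")  # unregistered code -> universal fallback
    assert isinstance(analyzer, UniversalFallbackAnalyzer)

    # get_analyzer() routes on whichever argument is given: lang_code wins,
    # then text (script-based auto-detection), then a bare fallback instance.
    analyzer = get_analyzer(text="こんにちは")  # hiragana -> 'ja'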