tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,395 @@
1
+ """
2
+ TokMor Morphology - 통합 형태소 분석기
3
+ ======================================
4
+
5
+ 350+ 언어 지원 형태소 분석기
6
+ - 특화 분석기 (74개 언어): 규칙 기반 고급 분석
7
+ - 형태소 사전/규칙 모델 (333개 언어): 사전 기반 lemmatization (온라인 호출 없음)
8
+
9
+ 간편 사용:
10
+ from tokmor.morphology import lemmatize, analyze
11
+
12
+ # Lemmatization
13
+ lemmatize('running', 'en') # 'run'
14
+ lemmatize('했다', 'ko') # '하다'
15
+
16
+ # 상세 분석
17
+ results = analyze('The cats are running', 'en')
18
+ for r in results:
19
+ print(f'{r.word} -> {r.lemma}')
20
+
21
+ 지원 언어:
22
+ - Tier 1 (10): ko, ja, zh, en, de, fr, es, ru, ar, hi
23
+ - Tier 2 (15): pt, it, nl, pl, tr, vi, th, id, he, fa, uk, el, cs, ro, sv
24
+ - Tier 3 (21): no, da, fi, hu, bg, hr, sr, sk, sl, lt, lv, et, ca, eu, gl, cy, ga, is, mt, sq, mk
25
+ - Tier 4 (28): bn, ta, te, mr, gu, kn, ml, pa, ur, ne, si, my, km, lo, tl, ms, sw, am, yo, ha, zu, af, ka, hy, az, kk, uz, mn
26
+ - 형태소 사전/규칙 모델 (333): 추가 언어 지원
27
+ """
28
+
29
+ # Base classes
30
+ from .advanced_base import (
31
+ AdvancedMorphologicalAnalyzer,
32
+ Morpheme,
33
+ AnalysisResult,
34
+ NBestResult,
35
+ Domain
36
+ )
37
+
38
+ # Legacy analyzers (backward compatibility)
39
+ from .korean import KoreanAnalyzer as KoreanAnalyzerLegacy
40
+ from .japanese import JapaneseAnalyzer as JapaneseAnalyzerLegacy
41
+ from .chinese import ChineseAnalyzer as ChineseAnalyzerLegacy
42
+
43
+ # =============================================================================
44
+ # Tier 1 - Advanced analyzers (10 languages)
45
+ # =============================================================================
46
+ from .korean_advanced import KoreanAdvancedAnalyzer, KoreanAnalyzer
47
+ from .japanese_advanced import JapaneseAdvancedAnalyzer, JapaneseAnalyzer
48
+ from .chinese_advanced import ChineseAdvancedAnalyzer, ChineseAnalyzer
49
+ from .english_advanced import EnglishAdvancedAnalyzer, EnglishAnalyzer
50
+ from .german_advanced import GermanAdvancedAnalyzer, GermanAnalyzer
51
+ from .french_advanced import FrenchAdvancedAnalyzer, FrenchAnalyzer
52
+ from .spanish_advanced import SpanishAdvancedAnalyzer, SpanishAnalyzer
53
+ from .russian_advanced import RussianAdvancedAnalyzer, RussianAnalyzer
54
+ from .arabic_advanced import ArabicAdvancedAnalyzer, ArabicAnalyzer
55
+ from .hindi_advanced import HindiAdvancedAnalyzer, HindiAnalyzer
56
+
57
+ # =============================================================================
58
+ # Tier 2 - Major regional languages (15 languages)
59
+ # =============================================================================
60
+ from .tier2 import (
61
+ PortugueseAnalyzer,
62
+ ItalianAnalyzer,
63
+ DutchAnalyzer,
64
+ PolishAnalyzer,
65
+ TurkishAnalyzer,
66
+ VietnameseAnalyzer,
67
+ ThaiAnalyzer,
68
+ IndonesianAnalyzer,
69
+ HebrewAnalyzer,
70
+ PersianAnalyzer,
71
+ UkrainianAnalyzer,
72
+ GreekAnalyzer,
73
+ CzechAnalyzer,
74
+ RomanianAnalyzer,
75
+ SwedishAnalyzer,
76
+ )
77
+
78
+ # =============================================================================
79
+ # Tier 3 - Regional languages (21 languages)
80
+ # =============================================================================
81
+ from .tier3 import (
82
+ NorwegianAnalyzer,
83
+ DanishAnalyzer,
84
+ FinnishAnalyzer,
85
+ HungarianAnalyzer,
86
+ BulgarianAnalyzer,
87
+ CroatianAnalyzer,
88
+ SerbianAnalyzer,
89
+ SlovakAnalyzer,
90
+ SlovenianAnalyzer,
91
+ LithuanianAnalyzer,
92
+ LatvianAnalyzer,
93
+ EstonianAnalyzer,
94
+ CatalanAnalyzer,
95
+ BasqueAnalyzer,
96
+ GalicianAnalyzer,
97
+ WelshAnalyzer,
98
+ IrishAnalyzer,
99
+ IcelandicAnalyzer,
100
+ MalteseAnalyzer,
101
+ AlbanianAnalyzer,
102
+ MacedonianAnalyzer,
103
+ )
104
+
105
+ # =============================================================================
106
+ # Tier 4 - Extended global coverage (26+ languages)
107
+ # =============================================================================
108
+ from .tier4 import (
109
+ # South Asian
110
+ BengaliAnalyzer,
111
+ TamilAnalyzer,
112
+ TeluguAnalyzer,
113
+ MarathiAnalyzer,
114
+ GujaratiAnalyzer,
115
+ KannadaAnalyzer,
116
+ MalayalamAnalyzer,
117
+ PunjabiAnalyzer,
118
+ UrduAnalyzer,
119
+ NepaliAnalyzer,
120
+ SinhalaAnalyzer,
121
+ # Southeast Asian
122
+ MyanmarAnalyzer,
123
+ KhmerAnalyzer,
124
+ LaoAnalyzer,
125
+ TagalogAnalyzer,
126
+ MalayAnalyzer,
127
+ # African
128
+ SwahiliAnalyzer,
129
+ AmharicAnalyzer,
130
+ YorubaAnalyzer,
131
+ HausaAnalyzer,
132
+ ZuluAnalyzer,
133
+ AfrikaansAnalyzer,
134
+ # Caucasian
135
+ GeorgianAnalyzer,
136
+ ArmenianAnalyzer,
137
+ AzerbaijaniAnalyzer,
138
+ # Central Asian
139
+ KazakhAnalyzer,
140
+ UzbekAnalyzer,
141
+ MongolianAnalyzer,
142
+ )
143
+
144
+ # =============================================================================
145
+ # Universal Fallback
146
+ # =============================================================================
147
+ from .universal_fallback import (
148
+ UniversalFallbackAnalyzer,
149
+ AnalyzerRegistry,
150
+ get_analyzer as get_analyzer_auto,
151
+ )
152
+
153
# =============================================================================
# Analyzer registry: language code -> analyzer class
# (74 language codes plus the 'xx' universal-fallback entry)
# =============================================================================
# Dict insertion order is significant: it is the order in which the classes
# are registered below and the order in which codes are listed to callers.
ANALYZERS = {
    # --- Tier 1: 10 codes, "Advanced" analyzers ---
    'ko': KoreanAdvancedAnalyzer,
    'ja': JapaneseAdvancedAnalyzer,
    'zh': ChineseAdvancedAnalyzer,
    'en': EnglishAdvancedAnalyzer,
    'de': GermanAdvancedAnalyzer,
    'fr': FrenchAdvancedAnalyzer,
    'es': SpanishAdvancedAnalyzer,
    'ru': RussianAdvancedAnalyzer,
    'ar': ArabicAdvancedAnalyzer,
    'hi': HindiAdvancedAnalyzer,

    # --- Tier 2: 15 codes ---
    'pt': PortugueseAnalyzer,
    'it': ItalianAnalyzer,
    'nl': DutchAnalyzer,
    'pl': PolishAnalyzer,
    'tr': TurkishAnalyzer,
    'vi': VietnameseAnalyzer,
    'th': ThaiAnalyzer,
    'id': IndonesianAnalyzer,
    'he': HebrewAnalyzer,
    'fa': PersianAnalyzer,
    'uk': UkrainianAnalyzer,
    'el': GreekAnalyzer,
    'cs': CzechAnalyzer,
    'ro': RomanianAnalyzer,
    'sv': SwedishAnalyzer,

    # --- Tier 3: 21 codes ---
    'no': NorwegianAnalyzer,
    'da': DanishAnalyzer,
    'fi': FinnishAnalyzer,
    'hu': HungarianAnalyzer,
    'bg': BulgarianAnalyzer,
    'hr': CroatianAnalyzer,
    'sr': SerbianAnalyzer,
    'sk': SlovakAnalyzer,
    'sl': SlovenianAnalyzer,
    'lt': LithuanianAnalyzer,
    'lv': LatvianAnalyzer,
    'et': EstonianAnalyzer,
    'ca': CatalanAnalyzer,
    'eu': BasqueAnalyzer,
    'gl': GalicianAnalyzer,
    'cy': WelshAnalyzer,
    'ga': IrishAnalyzer,
    'is': IcelandicAnalyzer,
    'mt': MalteseAnalyzer,
    'sq': AlbanianAnalyzer,
    'mk': MacedonianAnalyzer,

    # --- Tier 4, South Asian: 11 codes ---
    'bn': BengaliAnalyzer,
    'ta': TamilAnalyzer,
    'te': TeluguAnalyzer,
    'mr': MarathiAnalyzer,
    'gu': GujaratiAnalyzer,
    'kn': KannadaAnalyzer,
    'ml': MalayalamAnalyzer,
    'pa': PunjabiAnalyzer,
    'ur': UrduAnalyzer,
    'ne': NepaliAnalyzer,
    'si': SinhalaAnalyzer,

    # --- Tier 4, Southeast Asian: 5 codes ---
    'my': MyanmarAnalyzer,
    'km': KhmerAnalyzer,
    'lo': LaoAnalyzer,
    'tl': TagalogAnalyzer,
    'ms': MalayAnalyzer,

    # --- Tier 4, African: 6 codes ---
    'sw': SwahiliAnalyzer,
    'am': AmharicAnalyzer,
    'yo': YorubaAnalyzer,
    'ha': HausaAnalyzer,
    'zu': ZuluAnalyzer,
    'af': AfrikaansAnalyzer,

    # --- Tier 4, Caucasian: 3 codes ---
    'ka': GeorgianAnalyzer,
    'hy': ArmenianAnalyzer,
    'az': AzerbaijaniAnalyzer,

    # --- Tier 4, Central Asian: 3 codes ---
    'kk': KazakhAnalyzer,
    'uz': UzbekAnalyzer,
    'mn': MongolianAnalyzer,

    # --- Universal fallback pseudo-code ---
    'xx': UniversalFallbackAnalyzer,
}

# Hand every (code, class) pair, including 'xx', to AnalyzerRegistry at import
# time. The loop variable names are left as they were because they remain
# bound as module attributes once the loop finishes.
for code, analyzer_class in ANALYZERS.items():
    AnalyzerRegistry.register(code, analyzer_class)
254
+
255
+
256
def get_analyzer(lang: str = None, text: str = None, fallback: bool = True) -> AdvancedMorphologicalAnalyzer:
    """
    Create an analyzer instance for a language code.

    Args:
        lang: Language code looked up in ANALYZERS (ko, ja, zh, en, de, fr,
            es, ru, ar, hi, ...). Lower-cased before the lookup.
        text: Text to analyze. Only consulted when ``lang`` is falsy, in
            which case the choice is delegated to ``get_analyzer_auto``.
        fallback: If True, a ``lang`` that is not in ANALYZERS yields a
            UniversalFallbackAnalyzer instead of an error.

    Returns:
        An instance of the class registered for ``lang``; the result of
        ``get_analyzer_auto(text=text)`` when only ``text`` is given; or a
        UniversalFallbackAnalyzer when neither argument is given or when the
        code is unknown and ``fallback`` is True.

    Raises:
        ValueError: ``lang`` is not in ANALYZERS and ``fallback`` is False.
    """
    if not lang:
        # No explicit code: detect from the text when there is some,
        # otherwise there is nothing to go on and the fallback is returned.
        if text:
            return get_analyzer_auto(text=text)
        return UniversalFallbackAnalyzer()

    lang = lang.lower()
    if lang not in ANALYZERS:
        if not fallback:
            raise ValueError(f"Unsupported language: {lang}. Supported: {list(ANALYZERS.keys())}")
        return UniversalFallbackAnalyzer()

    return ANALYZERS[lang]()
285
+
286
+
287
def supported_languages() -> list:
    """Return the keys of ANALYZERS, in registry order, without the 'xx' fallback entry."""
    codes = []
    for code in ANALYZERS.keys():
        if code == 'xx':
            # 'xx' is the universal-fallback slot, not a language code.
            continue
        codes.append(code)
    return codes
290
+
291
+
292
def language_info() -> dict:
    """
    Return the registered language codes grouped by tier.

    Returns:
        A dict whose ``tier*`` keys map to lists of language codes and whose
        ``'total'`` key holds the number of entries in ANALYZERS minus the
        'xx' fallback entry.

    Note:
        'ms' (Malay) is listed under ``tier4_southeast_asian`` because
        ANALYZERS registers MalayAnalyzer under that code; without it the
        per-tier lists would hold one code fewer than ``'total'`` reports.
    """
    return {
        'tier1': ['ko', 'ja', 'zh', 'en', 'de', 'fr', 'es', 'ru', 'ar', 'hi'],
        'tier2': ['pt', 'it', 'nl', 'pl', 'tr', 'vi', 'th', 'id', 'he', 'fa', 'uk', 'el', 'cs', 'ro', 'sv'],
        'tier3': ['no', 'da', 'fi', 'hu', 'bg', 'hr', 'sr', 'sk', 'sl', 'lt', 'lv', 'et', 'ca', 'eu', 'gl', 'cy', 'ga', 'is', 'mt', 'sq', 'mk'],
        'tier4_south_asian': ['bn', 'ta', 'te', 'mr', 'gu', 'kn', 'ml', 'pa', 'ur', 'ne', 'si'],
        'tier4_southeast_asian': ['my', 'km', 'lo', 'tl', 'ms'],
        'tier4_african': ['sw', 'am', 'yo', 'ha', 'zu', 'af'],
        'tier4_caucasian': ['ka', 'hy', 'az'],
        'tier4_central_asian': ['kk', 'uz', 'mn'],
        'total': len(ANALYZERS) - 1,  # Exclude 'xx'
    }
305
+
306
+
307
+ # =============================================================================
308
+ # Unified API (recommended)
309
+ # =============================================================================
310
+ from .unified import (
311
+ UnifiedMorphAnalyzer,
312
+ LemmaResult,
313
+ get_unified_analyzer,
314
+ unified_supported_languages,
315
+ unified_language_info,
316
+ lemmatize,
317
+ analyze,
318
+ )
319
+
320
# Public names exported by ``from tokmor.morphology import *``.
__all__ = [
    # --- Unified API (recommended) ---
    'lemmatize',  # e.g. lemmatize('running', 'en')
    'analyze',  # e.g. analyze('text', 'en')
    'get_unified_analyzer',
    'unified_supported_languages',
    'unified_language_info',
    'UnifiedMorphAnalyzer',
    'LemmaResult',

    # --- Base classes ---
    'AdvancedMorphologicalAnalyzer',
    'Morpheme',
    'AnalysisResult',
    'NBestResult',
    'Domain',

    # --- Factory (legacy) ---
    'get_analyzer',
    'supported_languages',
    'language_info',
    'ANALYZERS',
    'AnalyzerRegistry',

    # --- Universal fallback ---
    'UniversalFallbackAnalyzer',

    # --- Tier 1: both names imported from each *_advanced module ---
    'KoreanAdvancedAnalyzer', 'KoreanAnalyzer',
    'JapaneseAdvancedAnalyzer', 'JapaneseAnalyzer',
    'ChineseAdvancedAnalyzer', 'ChineseAnalyzer',
    'EnglishAdvancedAnalyzer', 'EnglishAnalyzer',
    'GermanAdvancedAnalyzer', 'GermanAnalyzer',
    'FrenchAdvancedAnalyzer', 'FrenchAnalyzer',
    'SpanishAdvancedAnalyzer', 'SpanishAnalyzer',
    'RussianAdvancedAnalyzer', 'RussianAnalyzer',
    'ArabicAdvancedAnalyzer', 'ArabicAnalyzer',
    'HindiAdvancedAnalyzer', 'HindiAnalyzer',

    # --- Tier 2 ---
    'PortugueseAnalyzer', 'ItalianAnalyzer', 'DutchAnalyzer', 'PolishAnalyzer',
    'TurkishAnalyzer', 'VietnameseAnalyzer', 'ThaiAnalyzer', 'IndonesianAnalyzer',
    'HebrewAnalyzer', 'PersianAnalyzer', 'UkrainianAnalyzer', 'GreekAnalyzer',
    'CzechAnalyzer', 'RomanianAnalyzer', 'SwedishAnalyzer',

    # --- Tier 3 ---
    'NorwegianAnalyzer', 'DanishAnalyzer', 'FinnishAnalyzer', 'HungarianAnalyzer',
    'BulgarianAnalyzer', 'CroatianAnalyzer', 'SerbianAnalyzer', 'SlovakAnalyzer',
    'SlovenianAnalyzer', 'LithuanianAnalyzer', 'LatvianAnalyzer', 'EstonianAnalyzer',
    'CatalanAnalyzer', 'BasqueAnalyzer', 'GalicianAnalyzer', 'WelshAnalyzer',
    'IrishAnalyzer', 'IcelandicAnalyzer', 'MalteseAnalyzer', 'AlbanianAnalyzer',
    'MacedonianAnalyzer',

    # --- Tier 4: South Asian ---
    'BengaliAnalyzer', 'TamilAnalyzer', 'TeluguAnalyzer', 'MarathiAnalyzer',
    'GujaratiAnalyzer', 'KannadaAnalyzer', 'MalayalamAnalyzer', 'PunjabiAnalyzer',
    'UrduAnalyzer', 'NepaliAnalyzer', 'SinhalaAnalyzer',

    # --- Tier 4: Southeast Asian ---
    'MyanmarAnalyzer', 'KhmerAnalyzer', 'LaoAnalyzer', 'TagalogAnalyzer',
    'MalayAnalyzer',

    # --- Tier 4: African ---
    'SwahiliAnalyzer', 'AmharicAnalyzer', 'YorubaAnalyzer', 'HausaAnalyzer',
    'ZuluAnalyzer', 'AfrikaansAnalyzer',

    # --- Tier 4: Caucasian ---
    'GeorgianAnalyzer', 'ArmenianAnalyzer', 'AzerbaijaniAnalyzer',

    # --- Tier 4: Central Asian ---
    'KazakhAnalyzer', 'UzbekAnalyzer', 'MongolianAnalyzer',
]

__version__ = '3.0.0'