tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/factory.py ADDED
@@ -0,0 +1,394 @@
+ """
+ Tokenizer Factory
+ =================
+
+ Factory and utility functions for creating tokenizers.
+ """
+
+ import re
+ import os
+ from typing import List, Optional, Union, Type, Tuple
+ from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
+ from .space_based import SpaceBasedTokenizer, EnglishTokenizer, GermanTokenizer, RussianTokenizer
+ from .cjk import (
+     CJKTokenizer, KoreanTokenizer, JapaneseTokenizer, ChineseTokenizer,
+     KoreanMorphologyAnalyzer, JapaneseMorphologyAnalyzer, ChineseMorphologyAnalyzer
+ )
+ from .rtl import RTLTokenizer, ArabicTokenizer, HebrewTokenizer, PersianTokenizer
+ from .brahmic import BrahmicTokenizer, ThaiTokenizer, LaoTokenizer, MyanmarTokenizer, KhmerTokenizer
+ from .indic import (
+     IndicTokenizer, HindiTokenizer, BengaliTokenizer, TamilTokenizer,
+     TeluguTokenizer, MarathiTokenizer, GujaratiTokenizer, KannadaTokenizer,
+     MalayalamTokenizer, PunjabiTokenizer, HindiMorphologyAnalyzer
+ )
+
+
+ # Language code → tokenizer class mapping
+ TOKENIZER_MAP: dict[str, Type[BaseTokenizer]] = {
+     # CJK
+     'ko': KoreanTokenizer,
+     'ja': JapaneseTokenizer,
+     'zh': ChineseTokenizer,
+     'zh-cn': ChineseTokenizer,
+     'zh-tw': ChineseTokenizer,
+
+     # RTL
+     'ar': ArabicTokenizer,
+     'he': HebrewTokenizer,
+     'fa': PersianTokenizer,
+     'ur': RTLTokenizer,
+
+     # Brahmic (no-space)
+     'th': ThaiTokenizer,
+     'lo': LaoTokenizer,
+     'my': MyanmarTokenizer,
+     'km': KhmerTokenizer,
+
+     # Indic (space-based with special script handling)
+     'hi': HindiTokenizer,
+     'bn': BengaliTokenizer,
+     'ta': TamilTokenizer,
+     'te': TeluguTokenizer,
+     'mr': MarathiTokenizer,
+     'gu': GujaratiTokenizer,
+     'kn': KannadaTokenizer,
+     'ml': MalayalamTokenizer,
+     'pa': PunjabiTokenizer,
+
+     # Space-based special
+     'en': EnglishTokenizer,
+     'de': GermanTokenizer,
+     'ru': RussianTokenizer,
+ }
+
+ # Cache for tokenizer instances (creation can be expensive: regex compile, lexicon load, analyzer init)
+ _tokenizers: dict[tuple[str, bool, Optional[bool]], BaseTokenizer] = {}
+
+
+ def get_tokenizer(
+     lang: str,
+     use_morphology: Optional[bool] = None,
+     morph_backend: str = 'auto',
+     *,
+     zh_join_dates: Optional[bool] = None,
+ ) -> BaseTokenizer:
+     """
+     Return a tokenizer for the given language.
+
+     Args:
+         lang: Language code (ISO 639-1)
+         use_morphology: Whether to use morphological analysis (None selects the
+             per-language default: True for CJK (ko, ja, zh), False otherwise)
+         morph_backend: Morphology backend (kept for compatibility; the analyzers
+             built into tokmor do not currently use a backend)
+
+     Returns:
+         Tokenizer instance for the language
+
+     Example:
+         >>> tok = get_tokenizer('ko')  # CJK uses morphological analysis by default
+         >>> result = tok.tokenize("삼성전자가 서울에서 발표했다")
+         >>> print(result.texts())
+         ['삼성전자', '가', '서울', '에서', '발표', '했다']
+     """
+     # NOTE: morph_backend is accepted for CLI compatibility (option parsing),
+     # but tokmor's built-in morphological analyzers do not select a backend.
+     _ = morph_backend
+
+     lang = lang.lower().replace('_', '-')
+
+     # CJK language set
+     cjk_languages = {'ko', 'ja', 'zh', 'zh-cn', 'zh-tw'}
+
+     # Resolve the default for use_morphology
+     if use_morphology is None:
+         use_morphology = lang in cjk_languages  # True by default for CJK
+
+     # Tokenizer cache key:
+     # - For zh*, include zh_join_dates (None/True/False)
+     # - For non-zh, ignore zh_join_dates (always None in key)
+     cache_key = (lang, bool(use_morphology), zh_join_dates if lang.startswith("zh") else None)
+     cached = _tokenizers.get(cache_key)
+     if cached is not None:
+         return cached
+
+     # Check for a specialized tokenizer
+     if lang in TOKENIZER_MAP:
+         tokenizer_class = TOKENIZER_MAP[lang]
+
+         # CJK tokenizers
+         if lang in ('ko', 'ja', 'zh', 'zh-cn', 'zh-tw'):
+             # Pass zh-specific options (other languages ignore them)
+             if lang.startswith("zh"):
+                 try:
+                     tok = tokenizer_class(use_morphology=use_morphology, zh_join_dates=zh_join_dates)
+                     _tokenizers[cache_key] = tok
+                     return tok
+                 except TypeError:
+                     tok = tokenizer_class(use_morphology=use_morphology)
+                     _tokenizers[cache_key] = tok
+                     return tok
+             tok = tokenizer_class(use_morphology=use_morphology)
+             _tokenizers[cache_key] = tok
+             return tok
+         elif hasattr(tokenizer_class, '__init__'):
+             try:
+                 tok = tokenizer_class(use_morphology=use_morphology)
+                 _tokenizers[cache_key] = tok
+                 return tok
+             except TypeError:
+                 tok = tokenizer_class(lang, use_morphology)
+                 _tokenizers[cache_key] = tok
+                 return tok
+
+     # Default tokenizer by language family
+     if lang in CJKTokenizer.SUPPORTED_LANGUAGES:
+         tok = CJKTokenizer(lang, use_morphology, zh_join_dates=zh_join_dates)
+         _tokenizers[cache_key] = tok
+         return tok
+     elif lang in RTLTokenizer.SUPPORTED_LANGUAGES:
+         tok = RTLTokenizer(lang, use_morphology)
+         _tokenizers[cache_key] = tok
+         return tok
+     elif lang in BrahmicTokenizer.SUPPORTED_LANGUAGES:
+         tok = BrahmicTokenizer(lang, use_morphology)
+         _tokenizers[cache_key] = tok
+         return tok
+     elif lang in IndicTokenizer.SUPPORTED_LANGUAGES:
+         tok = IndicTokenizer(lang, use_morphology)
+         _tokenizers[cache_key] = tok
+         return tok
+     else:
+         tok = SpaceBasedTokenizer(lang, use_morphology)
+         _tokenizers[cache_key] = tok
+         return tok
+
+
+ def clear_tokenizer_cache() -> None:
+     """Clear cached tokenizer instances (mainly for tests/benchmarks)."""
+     _tokenizers.clear()
+
+
+ def tokenize(text: str, lang: str = 'en', use_morphology: Optional[bool] = None) -> List[str]:
+     """
+     Split text into tokens (convenience function).
+
+     Args:
+         text: Input text
+         lang: Language code
+         use_morphology: Whether to use morphological analysis (None selects the per-language default)
+
+     Returns:
+         List of token strings
+
+     Example:
+         >>> tokenize("Hello world", lang="en")
+         ['Hello', 'world']
+         >>> tokenize("삼성전자가 발표했다", lang="ko")  # CJK uses morphology by default
+         ['삼성전자', '가', '발표', '했', '다']
+     """
+     tok = get_tokenizer(lang, use_morphology)
+     return tok.tokenize_simple(text)
+
+
+ def detect_language(text: str) -> str:
+     """
+     Automatically detect the language of a text.
+
+     Args:
+         text: Input text
+
+     Returns:
+         Detected language code (ISO 639-1)
+
+     Example:
+         >>> detect_language("こんにちは世界")
+         'ja'
+         >>> detect_language("Hello world")
+         'en'
+     """
+     if not text or not text.strip():
+         return 'en'
+
+     # Unicode script detection
+     scripts = {
+         'hangul': len(re.findall(r'[\uac00-\ud7af]', text)),
+         'hiragana': len(re.findall(r'[\u3040-\u309f]', text)),
+         'katakana': len(re.findall(r'[\u30a0-\u30ff]', text)),
+         'cjk': len(re.findall(r'[\u4e00-\u9fff]', text)),
+         'arabic': len(re.findall(r'[\u0600-\u06ff]', text)),
+         'hebrew': len(re.findall(r'[\u0590-\u05ff]', text)),
+         'thai': len(re.findall(r'[\u0e00-\u0e7f]', text)),
+         'devanagari': len(re.findall(r'[\u0900-\u097f]', text)),
+         'cyrillic': len(re.findall(r'[\u0400-\u04ff]', text)),
+         'latin': len(re.findall(r'[a-zA-Z]', text)),
+     }
+
+     total = sum(scripts.values())
+     if total == 0:
+         return 'en'
+
+     # Determine language by dominant script
+     max_script = max(scripts, key=scripts.get)
+     ratio = scripts[max_script] / total
+
+     if ratio < 0.3:
+         return 'en'  # Mixed or unclear
+
+     if max_script == 'hangul':
+         return 'ko'
+     elif max_script in ('hiragana', 'katakana'):
+         return 'ja'
+     elif max_script == 'cjk':
+         # Han-dominant: Japanese if any kana is present, Korean if any Hangul, otherwise Chinese
+         if scripts['hiragana'] + scripts['katakana'] > 0:
+             return 'ja'
+         elif scripts['hangul'] > 0:
+             return 'ko'
+         return 'zh'
+     elif max_script == 'arabic':
+         return 'ar'
+     elif max_script == 'hebrew':
+         return 'he'
+     elif max_script == 'thai':
+         return 'th'
+     elif max_script == 'devanagari':
+         return 'hi'
+     elif max_script == 'cyrillic':
+         return 'ru'
+     else:
+         return 'en'
+
+
+ def get_morphological_analyzer(
+     lang: str,
+     backend: str = 'auto'
+ ) -> Optional[MorphologicalAnalyzer]:
+     """
+     Return a morphological analyzer for the given language.
+
+     Args:
+         lang: Language code
+         backend: Analyzer backend
+
+     Returns:
+         Morphological analyzer instance, or None
+
+     Example:
+         >>> analyzer = get_morphological_analyzer('ko')
+         >>> if analyzer and analyzer.is_available():
+         ...     tokens = analyzer.analyze("삼성전자가 발표했다")
+     """
+     lang = lang.lower()
+
+     if lang == 'ko':
+         return KoreanMorphologyAnalyzer()
+     elif lang == 'ja':
+         return JapaneseMorphologyAnalyzer()
+     elif lang == 'zh':
+         # NOTE: Chinese join-dates behavior is controlled by tokenizer/API options.
+         # The factory-level analyzer keeps the backward-compatible env-var behavior.
+         return ChineseMorphologyAnalyzer()
+     elif lang == 'hi':
+         return HindiMorphologyAnalyzer()
+
+     return None
+
+
+ def supported_languages() -> List[str]:
+     """
+     Return the list of supported languages.
+
+     Returns:
+         List of language codes (358 Wikipedia languages)
+     """
+     # All Wikipedia languages are supported (358).
+     # Languages with specialized tokenizers are listed; everything else falls back to SpaceBasedTokenizer.
+     ALL_WIKI_LANGS = {
+         # CJK
+         'zh', 'ja', 'ko', 'zh-classical', 'zh-yue', 'zh-min-nan', 'lzh',
+         # RTL
+         'ar', 'he', 'fa', 'ur', 'ps', 'yi', 'arz', 'azb', 'ckb', 'mzn', 'pnb', 'sd', 'ug',
+         # Indic
+         'hi', 'bn', 'ta', 'te', 'ml', 'mr', 'gu', 'kn', 'pa', 'or', 'as', 'ne', 'si', 'sa', 'bh', 'new',
+         # Brahmic (no-space)
+         'th', 'lo', 'my', 'km', 'bo',
+         # Germanic
+         'en', 'de', 'nl', 'sv', 'da', 'no', 'nb', 'nn', 'is', 'af', 'fy', 'fo', 'lb', 'li', 'nds', 'als', 'bar', 'gsw', 'pdc', 'stq', 'yi', 'ang', 'got', 'frr', 'ksh', 'vls', 'zea',
+         # Romance
+         'fr', 'es', 'pt', 'it', 'ro', 'ca', 'gl', 'oc', 'an', 'ast', 'co', 'ext', 'fur', 'lad', 'lij', 'lmo', 'nap', 'pms', 'rm', 'roa-rup', 'roa-tara', 'sc', 'scn', 'vec', 'wa', 'frp', 'eml', 'mwl', 'fro',
+         # Slavic
+         'ru', 'uk', 'pl', 'cs', 'sk', 'hr', 'sr', 'bg', 'sl', 'mk', 'be', 'be-tarask', 'sh', 'bs', 'dsb', 'hsb', 'cu', 'csb', 'rue', 'szl',
+         # Baltic
+         'lt', 'lv', 'sgs', 'bat-smg', 'ltg',
+         # Celtic
+         'cy', 'ga', 'gd', 'br', 'gv', 'kw', 'pcd',
+         # Finno-Ugric
+         'fi', 'et', 'hu', 'sme', 'kv', 'mhr', 'mrj', 'myv', 'mdf', 'udm', 'koi', 'vep', 'olo', 'se',
+         # Turkic
+         'tr', 'az', 'kk', 'uz', 'ky', 'tk', 'tt', 'ba', 'cv', 'sah', 'crh', 'gag', 'kaa', 'ug',
+         # Caucasian
+         'ka', 'hy', 'ab', 'av', 'ce', 'kbd', 'lbe', 'lez', 'os', 'xal',
+         # Greek
+         'el', 'grc', 'pnt',
+         # Other European
+         'sq', 'mt', 'eu', 'la', 'vo', 'eo', 'ia', 'ie', 'io', 'jbo', 'nov',
+         # Austronesian
+         'id', 'ms', 'tl', 'jv', 'su', 'ceb', 'bcl', 'ilo', 'pam', 'war', 'ban', 'ace', 'bug', 'cbk-zam', 'map-bms', 'min', 'bjn', 'haw', 'mi', 'sm', 'to', 'ty', 'ch', 'mg', 'pag', 'nah',
+         # Vietnamese (space-based)
+         'vi',
+         # African
+         'sw', 'ha', 'yo', 'ig', 'zu', 'xh', 'sn', 'so', 'rw', 'lg', 'ln', 'st', 'ts', 'tn', 've', 'ss', 'rn', 'ny', 'wo', 'om', 'am', 'ti', 'tw', 'bm', 'ff', 'ee', 'fj', 'kg', 'ki', 'luo', 'tum', 'ak',
+         # Mongolian
+         'mn', 'bxr',
+         # Iranian
+         'ku', 'ckb', 'glk', 'tg', 'os', 'zza', 'lrc', 'mzn',
+         # Other Asian
+         'dz', 'dv', 'ks', 'pi',
+         # Other
+         'ht', 'qu', 'ay', 'gn', 'srn', 'to', 'za', 'ab', 'av', 'kl', 'ik', 'chr', 'nv', 'cr', 'iu', 'ii', 'chy', 'arc', 'got', 'xmf', 'sco', 'pap', 'kab', 'loz', 'din', 'tpi', 'bi', 'hif', 'pdc', 'wuu', 'gan', 'hak', 'cdo', 'bpy', 'nso', 'pih', 'tet', 'nrm', 'pih', 'nov', 'ie', 'lez', 'diq', 'gor', 'jam', 'szy', 'skr', 'mad', 'mni', 'trv', 'inh', 'awa', 'ban', 'dag', 'fat', 'guw', 'shi', 'nia', 'blk', 'gur', 'gpe', 'nqo', 'alt', 'tay', 'pwn', 'sat', 'lld', 'gcr', 'smn', 'ary', 'avk', 'kbp', 'pcm',
+         # Additional Wikipedia languages (missing 66)
+         'aa', 'ady', 'ami', 'ann', 'anp', 'atj', 'bbc', 'bcl', 'bdr', 'be-x-old', 'bew', 'btm', 'bug', 'cho', 'dga', 'dtp', 'dty', 'fiu-vro', 'fon', 'gom', 'guc', 'ho', 'hyw', 'hz', 'iba', 'igl', 'kcg', 'kge', 'kj', 'knc', 'kr', 'krc', 'kus', 'lfn', 'mai', 'map-bms', 'mh', 'mnw', 'mo', 'mos', 'mus', 'na', 'nan', 'nds-nl', 'ng', 'nr', 'nup', 'pfl', 'rki', 'rmy', 'rsk', 'rup', 'sg', 'shn', 'shy', 'simple', 'syl', 'tcy', 'tdd', 'tig', 'tly', 'tok', 'tyv', 'vro', 'yue', 'zgh',
+     }
+
+     # Add languages covered by specialized tokenizers
+     all_langs = set(ALL_WIKI_LANGS)
+     all_langs.update(SpaceBasedTokenizer.SUPPORTED_LANGUAGES)
+     all_langs.update(CJKTokenizer.SUPPORTED_LANGUAGES)
+     all_langs.update(RTLTokenizer.SUPPORTED_LANGUAGES)
+     all_langs.update(BrahmicTokenizer.SUPPORTED_LANGUAGES)
+     all_langs.update(IndicTokenizer.SUPPORTED_LANGUAGES)
+
+     return sorted(all_langs)
+
+
+ def morphology_available(lang: str) -> bool:
+     """
+     Whether morphological analysis is supported.
+
+     Args:
+         lang: Language code
+
+     Returns:
+         True if a morphological analyzer is available for the language
+     """
+     analyzer = get_morphological_analyzer(lang)
+     return analyzer is not None and analyzer.is_available()
+
+
+ def morph_supported_languages() -> List[str]:
+     """
+     List of languages with morphology support (unified: specialized analyzers
+     plus morphological lexicon/rule models; no online calls).
+
+     Returns:
+         List of language codes (350+)
+     """
+     from .morphology.unified import unified_supported_languages
+     return unified_supported_languages()
+
+
+ #
+ # NOTE:
+ # Lemma-focused helpers (lemmatize/morph_analyze) are intentionally excluded from the
+ # "tokenizer + morphology only" distribution surface.
+ # - Tokenization is exposed via get_tokenizer()/tokenize()
+ # - Morphology remains available via get_morphological_analyzer() and via tokmor.api.segment(..., include_morphemes=True)
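
Taken together, factory.py is the package's dispatch layer: get_tokenizer() resolves a language code to a cached tokenizer instance, tokenize() wraps that for one-off calls, and detect_language() gives a script-frequency guess when no code is supplied. As a quick orientation, a minimal usage sketch against the public names defined above; the tokens shown are taken from the module's own docstrings, and actual output depends on the bundled lexicons:

from tokmor.factory import (
    get_tokenizer, tokenize, detect_language,
    morphology_available, clear_tokenizer_cache,
)

lang = detect_language("こんにちは世界")       # 'ja' (kana-dominant text)
tok = get_tokenizer(lang)                     # built once, then served from the cache
assert tok is get_tokenizer(lang)             # same (lang, morphology, zh-option) key -> same instance

print(tokenize("Hello world", lang="en"))     # ['Hello', 'world']
print(morphology_available('ko'))             # True when the Korean analyzer loads

clear_tokenizer_cache()                       # mainly for tests/benchmarks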
tokmor/indic.py ADDED
@@ -0,0 +1,289 @@
+ """
+ Indic Tokenizer
+ ===============
+
+ Tokenizer for the Indic languages (Hindi, Bengali, Tamil, etc.).
+ Dedicated handling for Devanagari and the other Indian scripts.
+ """
+
+ import re
+ from typing import List
+ from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
+
+
+ class IndicTokenizer(BaseTokenizer):
+     """
+     Base class for Indic-language tokenizers.
+
+     Supports the Indian scripts (Devanagari, Bengali, Tamil, ...).
+     Space-based, but needs script-specific handling.
+     """
+
+     SUPPORTED_LANGUAGES = {
+         'hi', 'bn', 'gu', 'pa', 'mr', 'ne', 'si',
+         'ta', 'te', 'kn', 'ml', 'or', 'as', 'sa'
+     }
+
+     # Unicode ranges for Indian scripts
+     DEVANAGARI = '\u0900-\u097F'  # Hindi, Marathi, Sanskrit, Nepali
+     BENGALI = '\u0980-\u09FF'     # Bengali, Assamese
+     GUJARATI = '\u0A80-\u0AFF'    # Gujarati
+     GURMUKHI = '\u0A00-\u0A7F'    # Punjabi
+     TAMIL = '\u0B80-\u0BFF'       # Tamil
+     TELUGU = '\u0C00-\u0C7F'      # Telugu
+     KANNADA = '\u0C80-\u0CFF'     # Kannada
+     MALAYALAM = '\u0D00-\u0D7F'   # Malayalam
+     ORIYA = '\u0B00-\u0B7F'       # Oriya
+     SINHALA = '\u0D80-\u0DFF'     # Sinhala
+
+     # Per-language script mapping
+     LANG_SCRIPTS = {
+         'hi': DEVANAGARI,
+         'mr': DEVANAGARI,
+         'ne': DEVANAGARI,
+         'sa': DEVANAGARI,
+         'bn': BENGALI,
+         'as': BENGALI,
+         'gu': GUJARATI,
+         'pa': GURMUKHI,
+         'ta': TAMIL,
+         'te': TELUGU,
+         'kn': KANNADA,
+         'ml': MALAYALAM,
+         'or': ORIYA,
+         'si': SINHALA,
+     }
+
+     def __init__(self, lang: str, use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+         self._setup_patterns()
+
+     def _setup_patterns(self):
+         """Set up per-language script patterns."""
+         script_range = self.LANG_SCRIPTS.get(self.lang, self.DEVANAGARI)
+
+         # Patterns for the language's script plus Latin letters/digits.
+         # Include common format/joiner chars used in Indic scripts (e.g., Sinhala conjuncts use ZWJ).
+         joiners = "\u200c\u200d\u200b\u2060"  # ZWNJ, ZWJ, ZWSP, WORD JOINER
+         self._script_pattern = re.compile(f'[{script_range}{joiners}]+')
+         self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
+         self._number_pattern = re.compile(r'[0-9०-९]+(?:[.,][0-9०-९]+)?')
+
+     def tokenize(self, text: str) -> TokenizerResult:
+         """Tokenize Indic text."""
+         text = self.clean_text(text)
+         if not text:
+             return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+         # Use the morphological analyzer if enabled and available
+         if self.use_morphology and self._morphology_analyzer:
+             if self._morphology_analyzer.is_available():
+                 tokens = self._morphology_analyzer.analyze(text)
+                 return TokenizerResult(
+                     tokens=tokens,
+                     text=text,
+                     lang=self.lang,
+                     morphology_used=True
+                 )
+
+         tokens = []
+         pos = 0
+
+         while pos < len(text):
+             # Skip whitespace
+             if text[pos].isspace():
+                 pos += 1
+                 continue
+
+             # Match script characters
+             script_match = self._script_pattern.match(text[pos:])
+             if script_match:
+                 word = script_match.group()
+                 tokens.append(Token(
+                     text=word,
+                     start=pos,
+                     end=pos + len(word),
+                 ))
+                 pos += len(word)
+                 continue
+
+             # Match numbers
+             num_match = self._number_pattern.match(text[pos:])
+             if num_match:
+                 num = num_match.group()
+                 tokens.append(Token(
+                     text=num,
+                     start=pos,
+                     end=pos + len(num),
+                 ))
+                 pos += len(num)
+                 continue
+
+             # Match Latin characters
+             latin_match = self._latin_pattern.match(text[pos:])
+             if latin_match:
+                 word = latin_match.group()
+                 tokens.append(Token(
+                     text=word,
+                     start=pos,
+                     end=pos + len(word),
+                 ))
+                 pos += len(word)
+                 continue
+
+             # Other characters (punctuation/emoji/etc.): preserve them.
+             # (These used to be skipped, which could cause reconstruction mismatches and loss of SNS features.)
+             tokens.append(Token(text=text[pos], start=pos, end=pos + 1))
+             pos += 1
+
+         return TokenizerResult(
+             tokens=tokens,
+             text=text,
+             lang=self.lang,
+             morphology_used=False
+         )
+
+
+ class HindiMorphologyAnalyzer(MorphologicalAnalyzer):
+     """Hindi morphological analyzer."""
+
+     def __init__(self):
+         self._analyzer = None
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         try:
+             from .morphology.hindi_advanced import HindiAdvancedAnalyzer
+             self._analyzer = HindiAdvancedAnalyzer()
+         except ImportError:
+             self._analyzer = None
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Run morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+         result = self._analyzer.analyze(text)
+
+         # HindiAdvancedAnalyzer may return:
+         # - AnalysisResult (has .morphemes)
+         # - NBestResult (has .best.morphemes)
+         # - List[Morpheme]
+         morphemes = None
+         try:
+             if hasattr(result, "best") and hasattr(result.best, "morphemes"):
+                 morphemes = result.best.morphemes
+             elif hasattr(result, "morphemes"):
+                 morphemes = result.morphemes
+             elif isinstance(result, list):
+                 morphemes = result
+         except Exception:
+             morphemes = None
+
+         if not morphemes:
+             return []
+
+         for morph in morphemes:
+             # Be defensive about attribute names
+             surface = getattr(morph, "surface", getattr(morph, "form", str(morph)))
+             start = getattr(morph, "start", 0)
+             end = getattr(morph, "end", start + len(surface))
+             lemma = getattr(morph, "lemma", surface)
+             pos = getattr(morph, "pos", None)
+             tokens.append(Token(text=surface, start=start, end=end, lemma=lemma, pos=pos))
+
+         return tokens
+
+
+ class HindiTokenizer(IndicTokenizer):
+     """Hindi-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'hi'}
+
+     def __init__(self, use_morphology: bool = True):
+         """
+         Args:
+             use_morphology: Use morphological analysis (default True)
+         """
+         super().__init__('hi', use_morphology)
+
+     def _init_morphology(self):
+         """Initialize the morphological analyzer."""
+         self._morphology_analyzer = HindiMorphologyAnalyzer()
+
+
+ class BengaliTokenizer(IndicTokenizer):
+     """Bengali-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'bn'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('bn', use_morphology)
+
+
+ class TamilTokenizer(IndicTokenizer):
+     """Tamil-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'ta'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('ta', use_morphology)
+
+
+ class TeluguTokenizer(IndicTokenizer):
+     """Telugu-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'te'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('te', use_morphology)
+
+
+ class MarathiTokenizer(IndicTokenizer):
+     """Marathi-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'mr'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('mr', use_morphology)
+
+
+ class GujaratiTokenizer(IndicTokenizer):
+     """Gujarati-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'gu'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('gu', use_morphology)
+
+
+ class KannadaTokenizer(IndicTokenizer):
+     """Kannada-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'kn'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('kn', use_morphology)
+
+
+ class MalayalamTokenizer(IndicTokenizer):
+     """Malayalam-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'ml'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('ml', use_morphology)
+
+
+ class PunjabiTokenizer(IndicTokenizer):
+     """Punjabi-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'pa'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('pa', use_morphology)
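
IndicTokenizer's fallback path (used when morphology is disabled or unavailable) scans the string left to right and emits script runs, numbers, Latin runs, and single preserved characters, so every token's (start, end) span tiles the cleaned text. A short sketch of that invariant, assuming only the classes defined above and that clean_text() keeps offsets aligned with the text stored on the result:

from tokmor.indic import HindiTokenizer

tok = HindiTokenizer(use_morphology=False)   # force the pattern-based path
result = tok.tokenize("भारत में 100 startups!")

for t in result.tokens:
    # Each token's span indexes directly into result.text
    assert result.text[t.start:t.end] == t.text
    print(t.text, t.start, t.end)

# Punctuation such as '!' comes back as its own single-character token
# instead of being dropped, so the original text can be reconstructed.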