tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/factory.py
ADDED
@@ -0,0 +1,394 @@
"""
Tokenizer Factory
=================

Tokenizer creation factory and utility functions.
"""

import re
import os
from typing import List, Optional, Union, Type, Tuple
from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
from .space_based import SpaceBasedTokenizer, EnglishTokenizer, GermanTokenizer, RussianTokenizer
from .cjk import (
    CJKTokenizer, KoreanTokenizer, JapaneseTokenizer, ChineseTokenizer,
    KoreanMorphologyAnalyzer, JapaneseMorphologyAnalyzer, ChineseMorphologyAnalyzer
)
from .rtl import RTLTokenizer, ArabicTokenizer, HebrewTokenizer, PersianTokenizer
from .brahmic import BrahmicTokenizer, ThaiTokenizer, LaoTokenizer, MyanmarTokenizer, KhmerTokenizer
from .indic import (
    IndicTokenizer, HindiTokenizer, BengaliTokenizer, TamilTokenizer,
    TeluguTokenizer, MarathiTokenizer, GujaratiTokenizer, KannadaTokenizer,
    MalayalamTokenizer, PunjabiTokenizer, HindiMorphologyAnalyzer
)


# Language code -> tokenizer class mapping
TOKENIZER_MAP: dict[str, Type[BaseTokenizer]] = {
    # CJK
    'ko': KoreanTokenizer,
    'ja': JapaneseTokenizer,
    'zh': ChineseTokenizer,
    'zh-cn': ChineseTokenizer,
    'zh-tw': ChineseTokenizer,

    # RTL
    'ar': ArabicTokenizer,
    'he': HebrewTokenizer,
    'fa': PersianTokenizer,
    'ur': RTLTokenizer,

    # Brahmic (no-space)
    'th': ThaiTokenizer,
    'lo': LaoTokenizer,
    'my': MyanmarTokenizer,
    'km': KhmerTokenizer,

    # Indic (space-based with special script handling)
    'hi': HindiTokenizer,
    'bn': BengaliTokenizer,
    'ta': TamilTokenizer,
    'te': TeluguTokenizer,
    'mr': MarathiTokenizer,
    'gu': GujaratiTokenizer,
    'kn': KannadaTokenizer,
    'ml': MalayalamTokenizer,
    'pa': PunjabiTokenizer,

    # Space-based special
    'en': EnglishTokenizer,
    'de': GermanTokenizer,
    'ru': RussianTokenizer,
}

# Cache for tokenizer instances (creation can be expensive: regex compile, lexicon load, analyzer init)
_tokenizers: dict[tuple[str, bool, Optional[bool]], BaseTokenizer] = {}


def get_tokenizer(
    lang: str,
    use_morphology: Optional[bool] = None,
    morph_backend: str = 'auto',
    *,
    zh_join_dates: Optional[bool] = None,
) -> BaseTokenizer:
    """
    Return the tokenizer for a language.

    Args:
        lang: Language code (ISO 639-1)
        use_morphology: Whether to use morphological analysis (None selects the
            per-language default: True for CJK (ko, ja, zh), False otherwise)
        morph_backend: Morphology backend (kept for compatibility; tokmor's
            built-in analyzers currently do not use a backend)

    Returns:
        A tokenizer instance for the language

    Example:
        >>> tok = get_tokenizer('ko')  # CJK uses morphology by default
        >>> result = tok.tokenize("삼성전자가 서울에서 발표했다")
        >>> print(result.texts())
        ['삼성전자', '가', '서울', '에서', '발표', '했다']
    """
    # NOTE: morph_backend is accepted for CLI compatibility (option parsing),
    # but tokmor's built-in morphological analyzers do not use a backend selection.
    _ = morph_backend

    lang = lang.lower().replace('_', '-')

    # CJK language set
    cjk_languages = {'ko', 'ja', 'zh', 'zh-cn', 'zh-tw'}

    # Resolve the use_morphology default
    if use_morphology is None:
        use_morphology = lang in cjk_languages  # True for CJK

    # Tokenizer cache key:
    # - For zh*, include zh_join_dates (None/True/False)
    # - For non-zh, ignore zh_join_dates (always None in key)
    cache_key = (lang, bool(use_morphology), zh_join_dates if lang.startswith("zh") else None)
    cached = _tokenizers.get(cache_key)
    if cached is not None:
        return cached

    # Check for a specialized tokenizer
    if lang in TOKENIZER_MAP:
        tokenizer_class = TOKENIZER_MAP[lang]

        # CJK tokenizers
        if lang in ('ko', 'ja', 'zh', 'zh-cn', 'zh-tw'):
            # Pass zh-specific options (ignored for other languages)
            if lang.startswith("zh"):
                try:
                    tok = tokenizer_class(use_morphology=use_morphology, zh_join_dates=zh_join_dates)
                    _tokenizers[cache_key] = tok
                    return tok
                except TypeError:
                    tok = tokenizer_class(use_morphology=use_morphology)
                    _tokenizers[cache_key] = tok
                    return tok
            tok = tokenizer_class(use_morphology=use_morphology)
            _tokenizers[cache_key] = tok
            return tok
        elif hasattr(tokenizer_class, '__init__'):
            try:
                tok = tokenizer_class(use_morphology=use_morphology)
                _tokenizers[cache_key] = tok
                return tok
            except TypeError:
                tok = tokenizer_class(lang, use_morphology)
                _tokenizers[cache_key] = tok
                return tok

    # Default tokenizer by language family
    if lang in CJKTokenizer.SUPPORTED_LANGUAGES:
        tok = CJKTokenizer(lang, use_morphology, zh_join_dates=zh_join_dates)
        _tokenizers[cache_key] = tok
        return tok
    elif lang in RTLTokenizer.SUPPORTED_LANGUAGES:
        tok = RTLTokenizer(lang, use_morphology)
        _tokenizers[cache_key] = tok
        return tok
    elif lang in BrahmicTokenizer.SUPPORTED_LANGUAGES:
        tok = BrahmicTokenizer(lang, use_morphology)
        _tokenizers[cache_key] = tok
        return tok
    elif lang in IndicTokenizer.SUPPORTED_LANGUAGES:
        tok = IndicTokenizer(lang, use_morphology)
        _tokenizers[cache_key] = tok
        return tok
    else:
        tok = SpaceBasedTokenizer(lang, use_morphology)
        _tokenizers[cache_key] = tok
        return tok


def clear_tokenizer_cache() -> None:
    """Clear cached tokenizer instances (mainly for tests/bench)."""
    _tokenizers.clear()


def tokenize(text: str, lang: str = 'en', use_morphology: Optional[bool] = None) -> List[str]:
    """
    Split text into tokens (convenience function).

    Args:
        text: Input text
        lang: Language code
        use_morphology: Whether to use morphological analysis (None selects the
            per-language default)

    Returns:
        List of token strings

    Example:
        >>> tokenize("Hello world", lang="en")
        ['Hello', 'world']
        >>> tokenize("삼성전자가 발표했다", lang="ko")  # CJK uses morphology by default
        ['삼성전자', '가', '발표', '했', '다']
    """
    tok = get_tokenizer(lang, use_morphology)
    return tok.tokenize_simple(text)


def detect_language(text: str) -> str:
    """
    Automatically detect the language of a text.

    Args:
        text: Input text

    Returns:
        Detected language code (ISO 639-1)

    Example:
        >>> detect_language("こんにちは世界")
        'ja'
        >>> detect_language("Hello world")
        'en'
    """
    if not text or not text.strip():
        return 'en'

    # Unicode script detection
    scripts = {
        'hangul': len(re.findall(r'[\uac00-\ud7af]', text)),
        'hiragana': len(re.findall(r'[\u3040-\u309f]', text)),
        'katakana': len(re.findall(r'[\u30a0-\u30ff]', text)),
        'cjk': len(re.findall(r'[\u4e00-\u9fff]', text)),
        'arabic': len(re.findall(r'[\u0600-\u06ff]', text)),
        'hebrew': len(re.findall(r'[\u0590-\u05ff]', text)),
        'thai': len(re.findall(r'[\u0e00-\u0e7f]', text)),
        'devanagari': len(re.findall(r'[\u0900-\u097f]', text)),
        'cyrillic': len(re.findall(r'[\u0400-\u04ff]', text)),
        'latin': len(re.findall(r'[a-zA-Z]', text)),
    }

    total = sum(scripts.values())
    if total == 0:
        return 'en'

    # Determine language by dominant script
    max_script = max(scripts, key=scripts.get)
    ratio = scripts[max_script] / total

    if ratio < 0.3:
        return 'en'  # Mixed or unclear

    if max_script == 'hangul':
        return 'ko'
    elif max_script in ('hiragana', 'katakana'):
        return 'ja'
    elif max_script == 'cjk':
        # CJK ideographs: Japanese if any hiragana/katakana, Korean if any hangul, otherwise Chinese
        if scripts['hiragana'] + scripts['katakana'] > 0:
            return 'ja'
        elif scripts['hangul'] > 0:
            return 'ko'
        return 'zh'
    elif max_script == 'arabic':
        return 'ar'
    elif max_script == 'hebrew':
        return 'he'
    elif max_script == 'thai':
        return 'th'
    elif max_script == 'devanagari':
        return 'hi'
    elif max_script == 'cyrillic':
        return 'ru'
    else:
        return 'en'


def get_morphological_analyzer(
    lang: str,
    backend: str = 'auto'
) -> Optional[MorphologicalAnalyzer]:
    """
    Return the morphological analyzer for a language.

    Args:
        lang: Language code
        backend: Analyzer backend

    Returns:
        Morphological analyzer instance, or None

    Example:
        >>> analyzer = get_morphological_analyzer('ko')
        >>> if analyzer and analyzer.is_available():
        ...     tokens = analyzer.analyze("삼성전자가 발표했다")
    """
    lang = lang.lower()

    if lang == 'ko':
        return KoreanMorphologyAnalyzer()
    elif lang == 'ja':
        return JapaneseMorphologyAnalyzer()
    elif lang == 'zh':
        # NOTE: Chinese join-dates behavior is controlled by tokenizer/API options.
        # Factory-level analyzer keeps backward-compatible env-var behavior.
        return ChineseMorphologyAnalyzer()
    elif lang == 'hi':
        return HindiMorphologyAnalyzer()

    return None


def supported_languages() -> List[str]:
    """
    Return the list of supported languages.

    Returns:
        List of language codes (358 Wikipedia languages)
    """
    # All Wikipedia languages are supported (358):
    # languages with specialized tokenizers, plus everything else via SpaceBasedTokenizer.
    ALL_WIKI_LANGS = {
        # CJK
        'zh', 'ja', 'ko', 'zh-classical', 'zh-yue', 'zh-min-nan', 'lzh',
        # RTL
        'ar', 'he', 'fa', 'ur', 'ps', 'yi', 'arz', 'azb', 'ckb', 'mzn', 'pnb', 'sd', 'ug',
        # Indic
        'hi', 'bn', 'ta', 'te', 'ml', 'mr', 'gu', 'kn', 'pa', 'or', 'as', 'ne', 'si', 'sa', 'bh', 'new',
        # Brahmic (no-space)
        'th', 'lo', 'my', 'km', 'bo',
        # Germanic
        'en', 'de', 'nl', 'sv', 'da', 'no', 'nb', 'nn', 'is', 'af', 'fy', 'fo', 'lb', 'li', 'nds', 'als', 'bar', 'gsw', 'pdc', 'stq', 'yi', 'ang', 'got', 'frr', 'ksh', 'vls', 'zea',
        # Romance
        'fr', 'es', 'pt', 'it', 'ro', 'ca', 'gl', 'oc', 'an', 'ast', 'co', 'ext', 'fur', 'lad', 'lij', 'lmo', 'nap', 'pms', 'rm', 'roa-rup', 'roa-tara', 'sc', 'scn', 'vec', 'wa', 'frp', 'eml', 'mwl', 'fro',
        # Slavic
        'ru', 'uk', 'pl', 'cs', 'sk', 'hr', 'sr', 'bg', 'sl', 'mk', 'be', 'be-tarask', 'sh', 'bs', 'dsb', 'hsb', 'cu', 'csb', 'rue', 'szl',
        # Baltic
        'lt', 'lv', 'sgs', 'bat-smg', 'ltg',
        # Celtic
        'cy', 'ga', 'gd', 'br', 'gv', 'kw', 'pcd',
        # Finno-Ugric
        'fi', 'et', 'hu', 'sme', 'kv', 'mhr', 'mrj', 'myv', 'mdf', 'udm', 'koi', 'vep', 'olo', 'se',
        # Turkic
        'tr', 'az', 'kk', 'uz', 'ky', 'tk', 'tt', 'ba', 'cv', 'sah', 'crh', 'gag', 'kaa', 'ug',
        # Caucasian
        'ka', 'hy', 'ab', 'av', 'ce', 'kbd', 'lbe', 'lez', 'os', 'xal',
        # Greek
        'el', 'grc', 'pnt',
        # Other European
        'sq', 'mt', 'eu', 'la', 'vo', 'eo', 'ia', 'ie', 'io', 'jbo', 'nov',
        # Austronesian
        'id', 'ms', 'tl', 'jv', 'su', 'ceb', 'bcl', 'ilo', 'pam', 'war', 'ban', 'ace', 'bug', 'cbk-zam', 'map-bms', 'min', 'bjn', 'haw', 'mi', 'sm', 'to', 'ty', 'ch', 'mg', 'pag', 'nah',
        # Vietnamese (space-based)
        'vi',
        # African
        'sw', 'ha', 'yo', 'ig', 'zu', 'xh', 'sn', 'so', 'rw', 'lg', 'ln', 'st', 'ts', 'tn', 've', 'ss', 'rn', 'ny', 'wo', 'om', 'am', 'ti', 'tw', 'bm', 'ff', 'ee', 'fj', 'kg', 'ki', 'luo', 'tum', 'ak',
        # Mongolian
        'mn', 'bxr',
        # Iranian
        'ku', 'ckb', 'glk', 'tg', 'os', 'zza', 'lrc', 'mzn',
        # Other Asian
        'dz', 'dv', 'ks', 'pi',
        # Other
        'ht', 'qu', 'ay', 'gn', 'srn', 'to', 'za', 'ab', 'av', 'kl', 'ik', 'chr', 'nv', 'cr', 'iu', 'ii', 'chy', 'arc', 'got', 'xmf', 'sco', 'pap', 'kab', 'loz', 'din', 'tpi', 'bi', 'hif', 'pdc', 'wuu', 'gan', 'hak', 'cdo', 'bpy', 'nso', 'pih', 'tet', 'nrm', 'pih', 'nov', 'ie', 'lez', 'diq', 'gor', 'jam', 'szy', 'skr', 'mad', 'mni', 'trv', 'inh', 'awa', 'ban', 'dag', 'fat', 'guw', 'shi', 'nia', 'blk', 'gur', 'gpe', 'nqo', 'alt', 'tay', 'pwn', 'sat', 'lld', 'gcr', 'smn', 'ary', 'avk', 'kbp', 'pcm',
        # Additional Wikipedia languages (missing 66)
        'aa', 'ady', 'ami', 'ann', 'anp', 'atj', 'bbc', 'bcl', 'bdr', 'be-x-old', 'bew', 'btm', 'bug', 'cho', 'dga', 'dtp', 'dty', 'fiu-vro', 'fon', 'gom', 'guc', 'ho', 'hyw', 'hz', 'iba', 'igl', 'kcg', 'kge', 'kj', 'knc', 'kr', 'krc', 'kus', 'lfn', 'mai', 'map-bms', 'mh', 'mnw', 'mo', 'mos', 'mus', 'na', 'nan', 'nds-nl', 'ng', 'nr', 'nup', 'pfl', 'rki', 'rmy', 'rsk', 'rup', 'sg', 'shn', 'shy', 'simple', 'syl', 'tcy', 'tdd', 'tig', 'tly', 'tok', 'tyv', 'vro', 'yue', 'zgh',
    }

    # Add languages covered by the specialized tokenizers
    all_langs = set(ALL_WIKI_LANGS)
    all_langs.update(SpaceBasedTokenizer.SUPPORTED_LANGUAGES)
    all_langs.update(CJKTokenizer.SUPPORTED_LANGUAGES)
    all_langs.update(RTLTokenizer.SUPPORTED_LANGUAGES)
    all_langs.update(BrahmicTokenizer.SUPPORTED_LANGUAGES)
    all_langs.update(IndicTokenizer.SUPPORTED_LANGUAGES)

    return sorted(all_langs)


def morphology_available(lang: str) -> bool:
    """
    Whether morphological analysis is supported.

    Args:
        lang: Language code

    Returns:
        True if morphological analysis is supported
    """
    analyzer = get_morphological_analyzer(lang)
    return analyzer is not None and analyzer.is_available()


def morph_supported_languages() -> List[str]:
    """
    List of languages with morphology support (unified: specialized analyzers
    plus morphological lexicon/rule models; no online calls).

    Returns:
        List of language codes (350+)
    """
    from .morphology.unified import unified_supported_languages
    return unified_supported_languages()


#
# NOTE:
# Lemma-focused helpers (lemmatize/morph_analyze) are intentionally excluded from the
# "tokenizer + morphology only" distribution surface.
# - Tokenization is exposed via get_tokenizer()/tokenize()
# - Morphology remains available via get_morphological_analyzer() and via tokmor.api.segment(..., include_morphemes=True)
tokmor/indic.py
ADDED
@@ -0,0 +1,289 @@
"""
Indic Tokenizer
===============

Tokenizer for Indic languages (Hindi, Bengali, Tamil, etc.),
with dedicated handling for Devanagari and other Indian scripts.
"""

import re
from typing import List
from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer


class IndicTokenizer(BaseTokenizer):
    """
    Base class for Indic tokenizers.

    Supports Devanagari, Bengali, Tamil, and other Indian scripts.
    Space-delimited, but each script needs special handling.
    """

    SUPPORTED_LANGUAGES = {
        'hi', 'bn', 'gu', 'pa', 'mr', 'ne', 'si',
        'ta', 'te', 'kn', 'ml', 'or', 'as', 'sa'
    }

    # Unicode ranges for Indian scripts
    DEVANAGARI = '\u0900-\u097F'  # Hindi, Marathi, Sanskrit, Nepali
    BENGALI = '\u0980-\u09FF'     # Bengali, Assamese
    GUJARATI = '\u0A80-\u0AFF'    # Gujarati
    GURMUKHI = '\u0A00-\u0A7F'    # Punjabi
    TAMIL = '\u0B80-\u0BFF'       # Tamil
    TELUGU = '\u0C00-\u0C7F'      # Telugu
    KANNADA = '\u0C80-\u0CFF'     # Kannada
    MALAYALAM = '\u0D00-\u0D7F'   # Malayalam
    ORIYA = '\u0B00-\u0B7F'       # Oriya
    SINHALA = '\u0D80-\u0DFF'     # Sinhala

    # Per-language script mapping
    LANG_SCRIPTS = {
        'hi': DEVANAGARI,
        'mr': DEVANAGARI,
        'ne': DEVANAGARI,
        'sa': DEVANAGARI,
        'bn': BENGALI,
        'as': BENGALI,
        'gu': GUJARATI,
        'pa': GURMUKHI,
        'ta': TAMIL,
        'te': TELUGU,
        'kn': KANNADA,
        'ml': MALAYALAM,
        'or': ORIYA,
        'si': SINHALA,
    }

    def __init__(self, lang: str, use_morphology: bool = False):
        super().__init__(lang, use_morphology)
        self._setup_patterns()

    def _setup_patterns(self):
        """Set up per-language script patterns."""
        script_range = self.LANG_SCRIPTS.get(self.lang, self.DEVANAGARI)

        # Patterns for the target script plus Latin letters/digits.
        # Include common format/joiner chars used in Indic scripts (e.g., Sinhala conjuncts use ZWJ).
        joiners = "\u200c\u200d\u200b\u2060"  # ZWNJ, ZWJ, ZWSP, WORD JOINER
        self._script_pattern = re.compile(f'[{script_range}{joiners}]+')
        self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
        self._number_pattern = re.compile(r'[0-9०-९]+(?:[.,][0-9०-९]+)?')

    def tokenize(self, text: str) -> TokenizerResult:
        """Tokenize Indic text."""
        text = self.clean_text(text)
        if not text:
            return TokenizerResult(tokens=[], text=text, lang=self.lang)

        # Use the morphological analyzer if enabled and available
        if self.use_morphology and self._morphology_analyzer:
            if self._morphology_analyzer.is_available():
                tokens = self._morphology_analyzer.analyze(text)
                return TokenizerResult(
                    tokens=tokens,
                    text=text,
                    lang=self.lang,
                    morphology_used=True
                )

        tokens = []
        pos = 0

        while pos < len(text):
            # Skip whitespace
            if text[pos].isspace():
                pos += 1
                continue

            # Match a run of script characters
            script_match = self._script_pattern.match(text[pos:])
            if script_match:
                word = script_match.group()
                tokens.append(Token(
                    text=word,
                    start=pos,
                    end=pos + len(word),
                ))
                pos += len(word)
                continue

            # Match a number
            num_match = self._number_pattern.match(text[pos:])
            if num_match:
                num = num_match.group()
                tokens.append(Token(
                    text=num,
                    start=pos,
                    end=pos + len(num),
                ))
                pos += len(num)
                continue

            # Match a run of Latin characters
            latin_match = self._latin_pattern.match(text[pos:])
            if latin_match:
                word = latin_match.group()
                tokens.append(Token(
                    text=word,
                    start=pos,
                    end=pos + len(word),
                ))
                pos += len(word)
                continue

            # Other characters (punctuation, emoji, etc.): preserve them.
            # (These used to be skipped, which could cause reconstruct mismatches
            # and loss of SNS features.)
            tokens.append(Token(text=text[pos], start=pos, end=pos + 1))
            pos += 1

        return TokenizerResult(
            tokens=tokens,
            text=text,
            lang=self.lang,
            morphology_used=False
        )


class HindiMorphologyAnalyzer(MorphologicalAnalyzer):
    """Hindi morphological analyzer."""

    def __init__(self):
        self._analyzer = None
        self._init_analyzer()

    def _init_analyzer(self):
        """Initialize the analyzer."""
        try:
            from .morphology.hindi_advanced import HindiAdvancedAnalyzer
            self._analyzer = HindiAdvancedAnalyzer()
        except ImportError:
            self._analyzer = None

    def is_available(self) -> bool:
        return self._analyzer is not None

    def analyze(self, text: str) -> List[Token]:
        """Morphological analysis."""
        if not self._analyzer:
            return []

        tokens = []
        result = self._analyzer.analyze(text)

        # HindiAdvancedAnalyzer may return:
        # - AnalysisResult (has .morphemes)
        # - NBestResult (has .best.morphemes)
        # - List[Morpheme]
        morphemes = None
        try:
            if hasattr(result, "best") and hasattr(result.best, "morphemes"):
                morphemes = result.best.morphemes
            elif hasattr(result, "morphemes"):
                morphemes = result.morphemes
            elif isinstance(result, list):
                morphemes = result
        except Exception:
            morphemes = None

        if not morphemes:
            return []

        for morph in morphemes:
            # Be defensive about attribute names
            surface = getattr(morph, "surface", getattr(morph, "form", str(morph)))
            start = getattr(morph, "start", 0)
            end = getattr(morph, "end", start + len(surface))
            lemma = getattr(morph, "lemma", surface)
            pos = getattr(morph, "pos", None)
            tokens.append(Token(text=surface, start=start, end=end, lemma=lemma, pos=pos))

        return tokens


class HindiTokenizer(IndicTokenizer):
    """Hindi-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'hi'}

    def __init__(self, use_morphology: bool = True):
        """
        Args:
            use_morphology: Use morphological analysis (default True)
        """
        super().__init__('hi', use_morphology)

    def _init_morphology(self):
        """Initialize the morphological analyzer."""
        self._morphology_analyzer = HindiMorphologyAnalyzer()


class BengaliTokenizer(IndicTokenizer):
    """Bengali-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'bn'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('bn', use_morphology)


class TamilTokenizer(IndicTokenizer):
    """Tamil-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'ta'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('ta', use_morphology)


class TeluguTokenizer(IndicTokenizer):
    """Telugu-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'te'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('te', use_morphology)


class MarathiTokenizer(IndicTokenizer):
    """Marathi-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'mr'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('mr', use_morphology)


class GujaratiTokenizer(IndicTokenizer):
    """Gujarati-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'gu'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('gu', use_morphology)


class KannadaTokenizer(IndicTokenizer):
    """Kannada-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'kn'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('kn', use_morphology)


class MalayalamTokenizer(IndicTokenizer):
    """Malayalam-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'ml'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('ml', use_morphology)


class PunjabiTokenizer(IndicTokenizer):
    """Punjabi-specialized tokenizer."""

    SUPPORTED_LANGUAGES = {'pa'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('pa', use_morphology)