tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/space_based.py
ADDED
@@ -0,0 +1,272 @@
"""
Space-Based Tokenizer
=====================

Tokenizer for space-delimited languages (English, European languages, etc.).
"""

import re
import unicodedata
from typing import List
from .base import BaseTokenizer, Token, TokenizerResult


class SpaceBasedTokenizer(BaseTokenizer):
    """
    Whitespace-based tokenizer.

    Covers most space-delimited languages (English, German, French, Spanish, etc.).
    """

    SUPPORTED_LANGUAGES = {
        # Germanic
        'en', 'de', 'nl', 'sv', 'da', 'no', 'nb', 'nn', 'is', 'af',
        # Romance
        'fr', 'es', 'pt', 'it', 'ro', 'ca', 'gl', 'oc',
        # Slavic
        'ru', 'uk', 'pl', 'cs', 'sk', 'hr', 'sr', 'bg', 'sl', 'mk', 'be',
        # Baltic
        'lv', 'lt',
        # Finno-Ugric
        'fi', 'et', 'hu',
        # Other European
        'el', 'sq', 'mt', 'eu', 'cy', 'ga',
        # Turkic
        'tr', 'az', 'kk', 'uz', 'ky', 'tk', 'ug',
        # Other
        'id', 'ms', 'tl', 'vi', 'sw', 'ha', 'yo', 'ig', 'zu', 'am',
        'mn', 'ka', 'hy',
        # Indic (space-based but special handling)
        'hi', 'bn', 'gu', 'pa', 'mr', 'ne', 'si', 'ta', 'te', 'kn', 'ml',
    }

    # Language-specific patterns
    LANGUAGE_PATTERNS = {
        # German: compound words
        'de': {
            'compound_split': True,
            'preserve_case': True,
        },
        # Turkish: suffixes
        'tr': {
            'agglutinative': True,
        },
        # Finnish: suffixes
        'fi': {
            'agglutinative': True,
        },
        # Hungarian: suffixes
        'hu': {
            'agglutinative': True,
        },
    }

    def __init__(self, lang: str, use_morphology: bool = False):
        super().__init__(lang, use_morphology)
        self._word_pattern = re.compile(r'\b[\w\-\']+\b', re.UNICODE)
        # Pre-scan special tokens that should stay intact for downstream normalization/value extraction.
        self._special_pattern = re.compile(
            r"(https?://\S+|www\.\S+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
            r"|[$€£¥₩]\s*[0-9][0-9,._]*(?:\.[0-9]+)?|[0-9][0-9,._]*(?:\.[0-9]+)?\s*(?:%|%)"
            r"|\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})",
            re.UNICODE,
        )
        self._config = self.LANGUAGE_PATTERNS.get(lang, {})

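    # Editorial note (not in the original source): examples of inputs the pre-scan
    # pattern above keeps intact as single tokens include "https://example.com/a?b=1",
    # "user@example.com", "$1,299.99", "15%", and "2024-01-31".
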
    @staticmethod
    def _is_word_char(ch: str) -> bool:
        """
        Unicode-aware word character definition:
        - letters (L*)
        - marks (M*) (important for Indic/Brahmic scripts)
        - numbers (N*)
        - joiners (ZWJ/ZWNJ) used in some scripts (e.g., Sinhala conjuncts)
        """
        if not ch:
            return False
        # Keep joiners inside tokens to avoid breaking conjuncts (e.g., Sinhala: ක්රී)
        if ch in {"\u200d", "\u200c"}:  # ZWJ / ZWNJ
            return True
        try:
            cat0 = unicodedata.category(ch)[0]
            return cat0 in {"L", "M", "N"}
        except Exception:
            return False

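    # Editorial note (not in the original source): under the category rules above,
    # letters ("a", "ก"), combining marks such as the Devanagari virama (U+094D), and
    # digits ("9") count as word characters; punctuation and whitespace do not, and
    # ZWJ/ZWNJ are accepted so conjunct clusters stay inside a single token.
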
    def _iter_word_spans(self, text: str, covered: List[bool]) -> List[Token]:
        """
        Find word-like spans using a unicode category scan.
        This is more robust than regex word-boundaries (\b) for Indic/Brahmic scripts.
        """
        tokens: List[Token] = []
        i = 0
        n = len(text)
        while i < n:
            if i < n and covered[i]:
                i += 1
                continue
            ch = text[i]
            if ch.isspace():
                i += 1
                continue
            if not self._is_word_char(ch):
                i += 1
                continue

            s = i
            i += 1
            while i < n:
                if covered[i]:
                    break
                c = text[i]
                if self._is_word_char(c):
                    i += 1
                    continue
                # allow in-word apostrophes/hyphens when surrounded by word chars
                if c in {"'", "’", "-"} and (i + 1) < n:
                    if self._is_word_char(text[i - 1]) and self._is_word_char(text[i + 1]):
                        i += 1
                        continue
                break
            e = i
            if e > s:
                tokens.append(Token(text=text[s:e], start=s, end=e))
            else:
                i += 1
        return tokens

    def tokenize(self, text: str) -> TokenizerResult:
        """Space-based tokenization."""
        text = self.clean_text(text)
        if not text:
            return TokenizerResult(tokens=[], text=text, lang=self.lang)

        tokens = []

        # 0) Special tokens (URLs/emails/money/percent/dates) first
        covered = [False] * (len(text) + 1)
        for m in self._special_pattern.finditer(text):
            s, e = m.start(), m.end()
            if s < 0 or e <= s:
                continue
            tokens.append(Token(text=m.group(), start=s, end=e))
            for i in range(s, min(e, len(text))):
                covered[i] = True

        # 1) Unicode-aware word scan (robust for Indic scripts and generally safe)
        word_tokens = self._iter_word_spans(text, covered)
        tokens.extend(word_tokens)
        for t in word_tokens:
            for i in range(int(t.start), min(int(t.end), len(text))):
                covered[i] = True

        # 2) Keep remaining visible symbols as single-char tokens.
        # This improves SNS/user-generated text handling (emoji, hashtag symbols, punctuation),
        # while remaining deterministic and easy to filter downstream.
        for i, ch in enumerate(text):
            if covered[i]:
                continue
            if ch.isspace():
                continue
            tokens.append(Token(text=ch, start=i, end=i + 1))

        # Apply morphological analysis
        if self.use_morphology and self._morphology_analyzer:
            tokens = self._apply_morphology(text, tokens)

        tokens.sort(key=lambda t: t.start)
        return TokenizerResult(
            tokens=tokens,
            text=text,
            lang=self.lang,
            morphology_used=self.use_morphology and self._morphology_analyzer is not None
        )

    def _apply_morphology(self, text: str, tokens: List[Token]) -> List[Token]:
        """Apply morphological analysis."""
        if not self._morphology_analyzer:
            return tokens

        analyzed = self._morphology_analyzer.analyze(text)

        # Match analyzed results with tokens
        # (simplified - assumes same tokenization)
        for i, token in enumerate(tokens):
            if i < len(analyzed):
                token.lemma = analyzed[i].lemma
                token.pos = analyzed[i].pos

        return tokens


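# Editorial sketch (not in the original source): for an input such as
# "Visit https://example.com now for $9.99!" the passes above yield, after sorting
# by position: "Visit", "https://example.com", "now", "for", "$9.99", "!". The URL
# and the amount come from the special-token pre-scan, the plain words from the
# Unicode word scan, and "!" survives as a single-character symbol token.
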
class EnglishTokenizer(SpaceBasedTokenizer):
    """English-specific tokenizer."""

    SUPPORTED_LANGUAGES = {'en'}

    # Contraction patterns
    CONTRACTIONS = {
        "n't": " not",
        "'re": " are",
        "'ve": " have",
        "'ll": " will",
        "'d": " would",
        "'m": " am",
        "'s": " is",  # or possessive
    }

    def __init__(self, lang: str = 'en', use_morphology: bool = False):
        super().__init__(lang, use_morphology)
        self._contraction_pattern = re.compile(
            r"(\w+)(n't|'re|'ve|'ll|'d|'m|'s)",
            re.IGNORECASE
        )

    def tokenize(self, text: str, expand_contractions: bool = False) -> TokenizerResult:
        """English tokenization."""
        text = self.clean_text(text)

        if expand_contractions:
            text = self._expand_contractions(text)

        return super().tokenize(text)

    def _expand_contractions(self, text: str) -> str:
        """Expand contractions."""
        def replace(match):
            word = match.group(1)
            contraction = match.group(2).lower()
            expansion = self.CONTRACTIONS.get(contraction, contraction)
            return word + expansion

        return self._contraction_pattern.sub(replace, text)


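# Editorial sketch (not in the original source): with expand_contractions=True the
# tokenizer above rewrites "don't" to "do not" and "they're" to "they are" before
# tokenizing; note that "'s" is always expanded to " is", so possessives such as
# "John's" become "John is".
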
class GermanTokenizer(SpaceBasedTokenizer):
    """German-specific tokenizer."""

    SUPPORTED_LANGUAGES = {'de'}

    def __init__(self, lang: str = 'de', use_morphology: bool = False):
        super().__init__(lang, use_morphology)

    def split_compound(self, word: str) -> List[str]:
        """
        Split German compound words (heuristic).

        Note: full decompounding requires a morphological analyzer.
        """
        # Basic compound patterns
        # In practice, dictionary-based or ML-based splitting is needed
        return [word]  # the default implementation does not split


class RussianTokenizer(SpaceBasedTokenizer):
    """Russian-specific tokenizer."""

    SUPPORTED_LANGUAGES = {'ru'}

    def __init__(self, lang: str = 'ru', use_morphology: bool = False):
        super().__init__(lang, use_morphology)
        # Word pattern that also covers Cyrillic characters
        self._word_pattern = re.compile(r'[а-яА-ЯёЁa-zA-Z0-9]+', re.UNICODE)
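
A minimal usage sketch (editorial addition, not part of the package diff): assuming the wheel installs as the tokmor package and that BaseTokenizer.clean_text in tokmor/base.py (not shown here) only performs light text normalization, the tokenizers above could be exercised roughly as follows.

    from tokmor.space_based import SpaceBasedTokenizer, EnglishTokenizer

    # Generic space-based tokenization: URLs, amounts and dates are pre-scanned and
    # kept intact, words come from the Unicode scan, leftover symbols become
    # single-character tokens.
    tok = SpaceBasedTokenizer('en')
    result = tok.tokenize("Order #42 ships 2024-01-31 for $9.99, see https://example.com")
    for t in result.tokens:
        print(t.start, t.end, repr(t.text))

    # English-specific behaviour: optional contraction expansion before tokenizing.
    en = EnglishTokenizer()
    expanded = en.tokenize("They don't ship sooner.", expand_contractions=True)
    print([t.text for t in expanded.tokens])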