tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/cjk.py
ADDED
@@ -0,0 +1,497 @@
"""
CJK Tokenizer
=============

Tokenizers for Chinese, Japanese, and Korean.
Uses only the built-in implementations, with no external libraries.
"""

import re
import unicodedata
from typing import List, Optional
from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer


class CJKTokenizer(BaseTokenizer):
    """
    Base CJK tokenizer.

    Based on contiguous chunks of Han/Hangul/kana characters.
    Uses the built-in morphological analyzers (no external libraries required).
    """

    SUPPORTED_LANGUAGES = {'zh', 'ja', 'ko'}

    # Unicode ranges
    CJK_UNIFIED = '\u4e00-\u9fff'            # CJK Unified Ideographs
    CJK_EXT_A = '\u3400-\u4dbf'              # CJK Extension A
    CJK_EXT_B = '\U00020000-\U0002a6df'      # CJK Extension B
    HIRAGANA = '\u3040-\u309f'
    KATAKANA = '\u30a0-\u30ff'
    HANGUL_SYLLABLES = '\uac00-\ud7af'
    HANGUL_JAMO = '\u1100-\u11ff'
    HANGUL_COMPAT = '\u3130-\u318f'

    def __init__(self, lang: str, use_morphology: bool = True, *, zh_join_dates: Optional[bool] = None):
        """
        Args:
            lang: language code (ko, ja, zh)
            use_morphology: enable morphological analysis (default True)
        """
        # IMPORTANT: BaseTokenizer.__init__ may call _init_morphology() which uses _zh_join_dates.
        # So set this BEFORE calling super().__init__().
        self._zh_join_dates: Optional[bool] = zh_join_dates if lang == "zh" else None
        super().__init__(lang, use_morphology)
        self._setup_patterns()
        self._native_analyzer = None
        self._init_native_analyzer()

    def _setup_patterns(self):
        """Set up per-language script patterns."""
        if self.lang == 'ko':
            # Korean: Hangul + Han characters
            self._script_pattern = re.compile(
                f'[{self.HANGUL_SYLLABLES}{self.HANGUL_JAMO}{self.HANGUL_COMPAT}'
                f'{self.CJK_UNIFIED}{self.CJK_EXT_A}]+'
            )
        elif self.lang == 'ja':
            # Japanese: Han characters + hiragana + katakana
            self._script_pattern = re.compile(
                f'[{self.CJK_UNIFIED}{self.CJK_EXT_A}'
                f'{self.HIRAGANA}{self.KATAKANA}]+'
            )
        else:  # zh
            # Chinese: Han characters
            self._script_pattern = re.compile(
                f'[{self.CJK_UNIFIED}{self.CJK_EXT_A}]+'
            )

        # Latin/digit pattern
        self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
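
For reference, a minimal standalone sketch of what the Korean script pattern above matches, using the same Unicode ranges outside the class (illustrative only, not part of the wheel):

```python
import re

# Same ranges as HANGUL_SYLLABLES/HANGUL_JAMO/HANGUL_COMPAT and CJK_UNIFIED/CJK_EXT_A above.
HANGUL = '\uac00-\ud7af\u1100-\u11ff\u3130-\u318f'
HAN = '\u4e00-\u9fff\u3400-\u4dbf'
ko_script = re.compile(f'[{HANGUL}{HAN}]+')
latin = re.compile(r'[a-zA-Z0-9]+')

text = "한국어 tokenizer 테스트 123"
print(ko_script.findall(text))  # ['한국어', '테스트']
print(latin.findall(text))      # ['tokenizer', '123']
```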

    def _init_native_analyzer(self):
        """Initialize the built-in morphological analyzer."""
        try:
            if self.lang == 'ko':
                from .morphology.korean import KoreanAnalyzer
                self._native_analyzer = KoreanAnalyzer()
            elif self.lang == 'ja':
                from .morphology.japanese import JapaneseAnalyzer
                self._native_analyzer = JapaneseAnalyzer()
            elif self.lang == 'zh':
                from .morphology.chinese import ChineseAnalyzer
                self._native_analyzer = ChineseAnalyzer(join_dates=self._zh_join_dates)
        except ImportError:
            self._native_analyzer = None

    def _init_morphology(self):
        """Initialize the morphology analyzer."""
        if self.lang == 'ko':
            self._morphology_analyzer = KoreanMorphologyAnalyzer()
        elif self.lang == 'ja':
            self._morphology_analyzer = JapaneseMorphologyAnalyzer()
        elif self.lang == 'zh':
            self._morphology_analyzer = ChineseMorphologyAnalyzer(join_dates=self._zh_join_dates)

    def tokenize(self, text: str) -> TokenizerResult:
        """Tokenize CJK text."""
        text = self.clean_text(text)
        if not text:
            return TokenizerResult(tokens=[], text=text, lang=self.lang)

        # 1. Built-in morphological analyzer.
        # NOTE:
        # - For Korean, native analyzer tends to produce morpheme-level splits.
        #   When use_morphology=False we should keep surface tokenization (eojeol-like).
        # - For Japanese/Chinese, native analyzer is effectively the word segmenter,
        #   so we keep using it regardless.
        # Only use the native analyzer when the text contains the target script.
        # This avoids pathological per-character output on mislabeled corpora (e.g., Arabic lines in ja/zh files).
        has_target = bool(self._script_pattern.search(text))
        use_native = bool(self._native_analyzer) and has_target and (self.lang != "ko" or self.use_morphology)
        if use_native:
            try:
                morphemes = self._native_analyzer.analyze(text)
                tokens = []
                for m in morphemes:
                    tokens.append(Token(
                        text=m.surface,
                        start=m.start,
                        end=m.end,
                        lemma=m.lemma,
                        pos=m.pos,
                    ))
                tokens.sort(key=lambda t: t.start)
                tokens = self._postprocess_marks_and_numbers(tokens)
                # Apply token-quality rules early for CJK (defensive).
                # These same rules are also applied globally in TokenizerResult.__post_init__,
                # but doing it here avoids edge cases where upstream CJK segmentation
                # emits an over-merged chunk that should be split before returning.
                try:
                    from .token_quality import apply_token_quality
                    tokens = apply_token_quality(tokens, lang=self.lang, text=text)  # type: ignore[assignment]
                except Exception:
                    pass
                return TokenizerResult(
                    tokens=tokens,
                    text=text,
                    lang=self.lang,
                    morphology_used=self.use_morphology
                )
            except Exception:
                pass  # fallback to chunk-based

        # 2. Default: script-chunk based (fallback)
        tokens = []

        # CJK chunks
        for match in self._script_pattern.finditer(text):
            tokens.append(Token(
                text=match.group(),
                start=match.start(),
                end=match.end(),
            ))

        # Latin/digits
        for match in self._latin_pattern.finditer(text):
            # Skip ranges that overlap tokens already extracted.
            overlaps = any(
                t.start <= match.start() < t.end or t.start < match.end() <= t.end
                for t in tokens
            )
            if not overlaps:
                tokens.append(Token(
                    text=match.group(),
                    start=match.start(),
                    end=match.end(),
                ))

        # Sort by position
        tokens.sort(key=lambda t: t.start)

        tokens = self._postprocess_marks_and_numbers(tokens)
        try:
            from .token_quality import apply_token_quality
            tokens = apply_token_quality(tokens, lang=self.lang, text=text)  # type: ignore[assignment]
        except Exception:
            pass

        # Safety: never return empty tokens for non-empty input.
        # This can happen when the input contains neither CJK script chunks nor latin/digits
        # (e.g., mislabeled corpus lines). Fall back to whitespace tokens.
        if not tokens:
            for m in re.finditer(r"\S+", text):
                tokens.append(Token(text=m.group(), start=m.start(), end=m.end()))

        return TokenizerResult(
            tokens=tokens,
            text=text,
            lang=self.lang,
            morphology_used=False
        )
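
A hypothetical usage sketch of the tokenize() flow above, assuming the Token/TokenizerResult fields used in this file; actual output depends on whether the bundled analyzers import successfully:

```python
from tokmor.cjk import CJKTokenizer

tok = CJKTokenizer("ja")                  # uses the bundled JapaneseAnalyzer when importable,
result = tok.tokenize("東京で10%値上げ")    # otherwise falls back to script-chunk tokens
for t in result.tokens:
    # lemma/pos are only populated on the native-analyzer path
    print(t.text, t.start, t.end, t.lemma, t.pos)
print(result.morphology_used)
```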
    def _postprocess_marks_and_numbers(self, tokens: List[Token]) -> List[Token]:
        """
        Postprocess for robustness / quality:
        - Merge standalone combining marks into previous token when contiguous (e.g., variation selectors in JA).
        - Merge contiguous digit runs (ASCII/fullwidth/etc) into a single token (esp. JA dates/numbers).
        - Merge contiguous ASCII alnum runs (e.g., "F" "I" "R" "E" -> "FIRE") when contiguous.
        """
        if not tokens:
            return tokens

        def _is_mark_only(s: str) -> bool:
            return bool(s) and all(unicodedata.category(ch) in {"Mn", "Mc", "Me"} for ch in s)

        def _starts_with_mark(s: str) -> bool:
            return bool(s) and unicodedata.category(s[0]) in {"Mn", "Mc", "Me"}

        def _is_all_digits(s: str) -> bool:
            return bool(s) and all(ch.isdigit() for ch in s)

        def _is_ascii_alnum(s: str) -> bool:
            return bool(s) and s.isascii() and all(ch.isalnum() for ch in s)

        # pass 1: simple adjacent merges (marks/digits/ascii)
        out: List[Token] = []
        for t in tokens:
            if not out:
                out.append(t)
                continue
            prev = out[-1]

            # merge contiguous combining marks (variation selectors, dakuten marks, etc.)
            if (t.start == prev.end) and (t.text and (_is_mark_only(t.text) or _starts_with_mark(t.text))):
                prev.text += t.text
                prev.end = t.end
                continue

            # merge contiguous digit runs (including full-width digits)
            if (t.start == prev.end) and _is_all_digits(prev.text) and _is_all_digits(t.text):
                prev.text += t.text
                prev.end = t.end
                continue

            # merge contiguous ASCII alnum runs (common in JA/KO/ZH corpora)
            if (t.start == prev.end) and _is_ascii_alnum(prev.text) and _is_ascii_alnum(t.text):
                prev.text += t.text
                prev.end = t.end
                continue

            out.append(t)

        # pass 2 (JA): merge common numeric patterns to avoid token explosions in statistics/news lines
        if self.lang == "ja" and len(out) >= 3:
            merged: List[Token] = []
            i = 0
            JA_NUM_UNITS = {"年", "月", "日", "代", "人", "件", "話", "歳"}
            DEC_SEPS = {".", ".", "・"}
            PERCENTS = {"%", "%"}
            while i < len(out):
                t = out[i]
                # digits + unit (e.g., 20 + 代 -> 20代)
                if i + 1 < len(out) and (out[i].end == out[i + 1].start) and _is_all_digits(out[i].text) and (out[i + 1].text in JA_NUM_UNITS):
                    merged.append(Token(text=out[i].text + out[i + 1].text, start=out[i].start, end=out[i + 1].end))
                    i += 2
                    continue
                # decimal: digits + sep + digits (+ percent)
                if i + 2 < len(out):
                    a, b, c = out[i], out[i + 1], out[i + 2]
                    if (a.end == b.start) and (b.end == c.start) and _is_all_digits(a.text) and (b.text in DEC_SEPS) and _is_all_digits(c.text):
                        txt = a.text + b.text + c.text
                        end = c.end
                        j = i + 3
                        if j < len(out) and (out[j].start == end) and (out[j].text in PERCENTS):
                            txt += out[j].text
                            end = out[j].end
                            j += 1
                        merged.append(Token(text=txt, start=a.start, end=end))
                        i = j
                        continue
                # digits + percent (e.g., 10 + % -> 10%)
                if i + 1 < len(out) and (out[i].end == out[i + 1].start) and _is_all_digits(out[i].text) and (out[i + 1].text in PERCENTS):
                    merged.append(Token(text=out[i].text + out[i + 1].text, start=out[i].start, end=out[i + 1].end))
                    i += 2
                    continue
                merged.append(t)
                i += 1
            out = merged

        # pass 3 (ZH): merge contiguous ASCII digit token + common CJK unit char
        # e.g., 10 + 亿 -> 10亿, 2025 + 年 -> 2025年
        if self.lang == "zh" and len(out) >= 2:
            merged2: List[Token] = []
            i = 0
            ZH_NUM_UNITS = {"年", "月", "日", "号", "亿", "万", "元", "%", "%", "度", "岁"}
            while i < len(out):
                t = out[i]
                if i + 1 < len(out):
                    a, b = out[i], out[i + 1]
                    if (a.end == b.start) and _is_all_digits(a.text) and (b.text in ZH_NUM_UNITS):
                        merged2.append(Token(text=a.text + b.text, start=a.start, end=b.end))
                        i += 2
                        continue
                    # If the next token starts with a unit char (e.g., '亿人'), split it:
                    # 10 + 亿人 -> 10亿 + 人
                    if (a.end == b.start) and _is_all_digits(a.text) and b.text and (b.text[0] in ZH_NUM_UNITS) and len(b.text) > 1:
                        unit = b.text[0]
                        rest = b.text[1:]
                        merged2.append(Token(text=a.text + unit, start=a.start, end=b.start + 1))
                        merged2.append(Token(text=rest, start=b.start + 1, end=b.end))
                        i += 2
                        continue
                merged2.append(t)
                i += 1
            out = merged2

        return out
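
The pass-3 rule above can be illustrated with a simplified standalone sketch that merges a digit token with a following unit character (plain strings instead of Token objects, and without the offset-adjacency check):

```python
ZH_NUM_UNITS = {"年", "月", "日", "号", "亿", "万", "元", "%", "度", "岁"}

def merge_digit_units(parts):
    out, i = [], 0
    while i < len(parts):
        if i + 1 < len(parts) and parts[i].isdigit() and parts[i + 1] in ZH_NUM_UNITS:
            out.append(parts[i] + parts[i + 1])   # e.g. "2025" + "年" -> "2025年"
            i += 2
        else:
            out.append(parts[i])
            i += 1
    return out

print(merge_digit_units(["2025", "年", "营收", "10", "亿"]))  # ['2025年', '营收', '10亿']
```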
    def extract_ngrams(self, text: str, min_n: int = 2, max_n: int = 8) -> List[str]:
        """Extract character n-grams."""
        ngrams = []

        for match in self._script_pattern.finditer(text):
            chunk = match.group()
            for n in range(min_n, min(max_n + 1, len(chunk) + 1)):
                for i in range(len(chunk) - n + 1):
                    ngrams.append(chunk[i:i+n])

        return ngrams
|
+
|
|
321
|
+
|
|
322
|
+
# =============================================================================
|
|
323
|
+
# 형태소 분석기 (자체 구현만 사용)
|
|
324
|
+
# =============================================================================
|
|
325
|
+
|
|
326
|
+
class KoreanMorphologyAnalyzer(MorphologicalAnalyzer):
|
|
327
|
+
"""한국어 형태소 분석기 (자체 구현)"""
|
|
328
|
+
|
|
329
|
+
def __init__(self):
|
|
330
|
+
self._analyzer = None
|
|
331
|
+
self._init_analyzer()
|
|
332
|
+
|
|
333
|
+
def _init_analyzer(self):
|
|
334
|
+
"""분석기 초기화"""
|
|
335
|
+
try:
|
|
336
|
+
from .morphology.korean import KoreanAnalyzer
|
|
337
|
+
self._analyzer = KoreanAnalyzer()
|
|
338
|
+
except ImportError:
|
|
339
|
+
self._analyzer = None
|
|
340
|
+
|
|
341
|
+
def is_available(self) -> bool:
|
|
342
|
+
return self._analyzer is not None
|
|
343
|
+
|
|
344
|
+
def analyze(self, text: str) -> List[Token]:
|
|
345
|
+
"""형태소 분석"""
|
|
346
|
+
if not self._analyzer:
|
|
347
|
+
return []
|
|
348
|
+
|
|
349
|
+
tokens = []
|
|
350
|
+
result = self._analyzer.analyze(text)
|
|
351
|
+
for morph in result:
|
|
352
|
+
tokens.append(Token(
|
|
353
|
+
text=morph.surface,
|
|
354
|
+
start=morph.start,
|
|
355
|
+
end=morph.end,
|
|
356
|
+
lemma=morph.lemma,
|
|
357
|
+
pos=morph.pos,
|
|
358
|
+
))
|
|
359
|
+
return tokens
|
|
360
|
+
|
|
361
|
+
def nouns(self, text: str) -> List[str]:
|
|
362
|
+
"""명사 추출"""
|
|
363
|
+
if not self._analyzer:
|
|
364
|
+
return []
|
|
365
|
+
result = self._analyzer.analyze(text)
|
|
366
|
+
return [m.surface for m in result if m.pos.startswith('N')]
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
class JapaneseMorphologyAnalyzer(MorphologicalAnalyzer):
|
|
370
|
+
"""일본어 형태소 분석기 (자체 구현)"""
|
|
371
|
+
|
|
372
|
+
def __init__(self):
|
|
373
|
+
self._analyzer = None
|
|
374
|
+
self._init_analyzer()
|
|
375
|
+
|
|
376
|
+
def _init_analyzer(self):
|
|
377
|
+
"""분석기 초기화"""
|
|
378
|
+
try:
|
|
379
|
+
from .morphology.japanese import JapaneseAnalyzer
|
|
380
|
+
self._analyzer = JapaneseAnalyzer()
|
|
381
|
+
except ImportError:
|
|
382
|
+
self._analyzer = None
|
|
383
|
+
|
|
384
|
+
def is_available(self) -> bool:
|
|
385
|
+
return self._analyzer is not None
|
|
386
|
+
|
|
387
|
+
def analyze(self, text: str) -> List[Token]:
|
|
388
|
+
"""형태소 분석"""
|
|
389
|
+
if not self._analyzer:
|
|
390
|
+
return []
|
|
391
|
+
|
|
392
|
+
tokens = []
|
|
393
|
+
result = self._analyzer.analyze(text)
|
|
394
|
+
for morph in result:
|
|
395
|
+
tokens.append(Token(
|
|
396
|
+
text=morph.surface,
|
|
397
|
+
start=morph.start,
|
|
398
|
+
end=morph.end,
|
|
399
|
+
lemma=morph.lemma,
|
|
400
|
+
pos=morph.pos,
|
|
401
|
+
))
|
|
402
|
+
return tokens
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
class ChineseMorphologyAnalyzer(MorphologicalAnalyzer):
|
|
406
|
+
"""중국어 형태소 분석기 (자체 구현)"""
|
|
407
|
+
|
|
408
|
+
def __init__(self, join_dates: Optional[bool] = None):
|
|
409
|
+
self._analyzer = None
|
|
410
|
+
self._join_dates = join_dates
|
|
411
|
+
self._init_analyzer()
|
|
412
|
+
|
|
413
|
+
def _init_analyzer(self):
|
|
414
|
+
"""분석기 초기화"""
|
|
415
|
+
try:
|
|
416
|
+
from .morphology.chinese import ChineseAnalyzer
|
|
417
|
+
self._analyzer = ChineseAnalyzer(join_dates=self._join_dates)
|
|
418
|
+
except ImportError:
|
|
419
|
+
self._analyzer = None
|
|
420
|
+
|
|
421
|
+
def is_available(self) -> bool:
|
|
422
|
+
return self._analyzer is not None
|
|
423
|
+
|
|
424
|
+
def analyze(self, text: str) -> List[Token]:
|
|
425
|
+
"""형태소 분석"""
|
|
426
|
+
if not self._analyzer:
|
|
427
|
+
return []
|
|
428
|
+
|
|
429
|
+
tokens = []
|
|
430
|
+
result = self._analyzer.analyze(text)
|
|
431
|
+
for morph in result:
|
|
432
|
+
tokens.append(Token(
|
|
433
|
+
text=morph.surface,
|
|
434
|
+
start=morph.start,
|
|
435
|
+
end=morph.end,
|
|
436
|
+
lemma=morph.lemma,
|
|
437
|
+
pos=morph.pos,
|
|
438
|
+
))
|
|
439
|
+
return tokens
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
# =============================================================================
|
|
443
|
+
# 언어별 특화 토크나이저
|
|
444
|
+
# =============================================================================
|
|
445
|
+
|
|
446
|
+
class KoreanTokenizer(CJKTokenizer):
|
|
447
|
+
"""한국어 특화 토크나이저"""
|
|
448
|
+
|
|
449
|
+
SUPPORTED_LANGUAGES = {'ko'}
|
|
450
|
+
|
|
451
|
+
def __init__(self, use_morphology: bool = True):
|
|
452
|
+
"""
|
|
453
|
+
Args:
|
|
454
|
+
use_morphology: 형태소 분석 사용 (기본 True)
|
|
455
|
+
"""
|
|
456
|
+
super().__init__('ko', use_morphology)
|
|
457
|
+
|
|
458
|
+
def _init_morphology(self):
|
|
459
|
+
self._morphology_analyzer = KoreanMorphologyAnalyzer()
|
|
460
|
+
|
|
461
|
+
def nouns(self, text: str) -> List[str]:
|
|
462
|
+
"""명사 추출"""
|
|
463
|
+
if self._morphology_analyzer and self._morphology_analyzer.is_available():
|
|
464
|
+
return self._morphology_analyzer.nouns(text)
|
|
465
|
+
return []
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class JapaneseTokenizer(CJKTokenizer):
|
|
469
|
+
"""일본어 특화 토크나이저"""
|
|
470
|
+
|
|
471
|
+
SUPPORTED_LANGUAGES = {'ja'}
|
|
472
|
+
|
|
473
|
+
def __init__(self, use_morphology: bool = True):
|
|
474
|
+
"""
|
|
475
|
+
Args:
|
|
476
|
+
use_morphology: 형태소 분석 사용 (기본 True)
|
|
477
|
+
"""
|
|
478
|
+
super().__init__('ja', use_morphology)
|
|
479
|
+
|
|
480
|
+
def _init_morphology(self):
|
|
481
|
+
self._morphology_analyzer = JapaneseMorphologyAnalyzer()
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
class ChineseTokenizer(CJKTokenizer):
|
|
485
|
+
"""중국어 특화 토크나이저"""
|
|
486
|
+
|
|
487
|
+
SUPPORTED_LANGUAGES = {'zh'}
|
|
488
|
+
|
|
489
|
+
def __init__(self, use_morphology: bool = True, *, zh_join_dates: Optional[bool] = None):
|
|
490
|
+
"""
|
|
491
|
+
Args:
|
|
492
|
+
use_morphology: 형태소 분석 사용 (기본 True)
|
|
493
|
+
"""
|
|
494
|
+
super().__init__('zh', use_morphology, zh_join_dates=zh_join_dates)
|
|
495
|
+
|
|
496
|
+
def _init_morphology(self):
|
|
497
|
+
self._morphology_analyzer = ChineseMorphologyAnalyzer(join_dates=self._zh_join_dates)
|
|
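
A hypothetical usage sketch of the language-specific wrappers defined above (results depend on the bundled analyzers; zh_join_dates is simply forwarded to ChineseAnalyzer):

```python
from tokmor.cjk import ChineseTokenizer, KoreanTokenizer

zh = ChineseTokenizer(zh_join_dates=True)    # forwarded as ChineseAnalyzer(join_dates=True)
print([t.text for t in zh.tokenize("2025年10月发布").tokens])

ko = KoreanTokenizer(use_morphology=False)   # keeps eojeol-like surface tokens
print(ko.nouns("한국어 형태소 분석"))            # [] when the bundled KoreanAnalyzer is unavailable
```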
tokmor/domain/__init__.py
ADDED

@@ -0,0 +1,11 @@
"""
Domain lexicons (small, optional)
================================

TokMor core focuses on tokenization/morphology. Domain lexicons are small,
optional add-ons that can be shipped in a data pack (`TOKMOR_DATA_DIR`) or as
tiny bundled assets.
"""

from .sentiment import load_sentiment_lexicon, sentiment_hint  # noqa: F401
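
The docstring above mentions the `TOKMOR_DATA_DIR` data pack; a minimal sketch of where an external sentiment lexicon would live, assuming the domain/sentiment/<lang>.json layout used by the loader in the next file (the default directory shown is purely illustrative):

```python
import os
from pathlib import Path

data_dir = Path(os.environ.get("TOKMOR_DATA_DIR", "/opt/tokmor-data"))  # illustrative default
lexicon_path = data_dir / "domain" / "sentiment" / "ko.json"
print(lexicon_path)  # external file checked before the bundled tokmor/models/domain/sentiment/ko.json
```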
tokmor/domain/sentiment.py
ADDED

@@ -0,0 +1,198 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Tuple

from ..factory import detect_language
from ..preprocess import normalize_text
from ..resources import data_dir


Polarity = Literal["pos", "neg", "neu"]


def _domain_sentiment_path(lang: str) -> Path:
    # Prefer external assets under TOKMOR_DATA_DIR, fallback to bundled models.
    #
    # IMPORTANT:
    # - `resources.data_dir()` returns TOKMOR_DATA_DIR when set, which would hide bundled
    #   seed lexicons if we only looked there.
    # - So we explicitly check external first, then the package-bundled models dir.
    from .. import resources

    l = (lang or "").lower().replace("_", "-")
    rel = Path("domain") / "sentiment" / f"{l}.json"

    env = resources.data_dir() / rel
    bundled = Path(__file__).resolve().parents[1] / "models" / rel  # tokmor/models/...

    if env.exists():
        return env
    return bundled


@dataclass(frozen=True)
class SentimentLexicon:
    lang: str
    pos: Set[str]
    neg: Set[str]
    negators: Set[str]
    intensifiers: Set[str]
    diminishers: Set[str]


def _as_set(xs: Any) -> Set[str]:
    if not xs:
        return set()
    out: Set[str] = set()
    for x in xs if isinstance(xs, list) else []:
        s = str(x).strip()
        if s:
            out.add(s)
    return out


@lru_cache(maxsize=32)
def load_sentiment_lexicon(lang: str) -> Optional[SentimentLexicon]:
    """
    Load a small sentiment lexicon for a language.

    Returns None if the lexicon does not exist.
    """
    # Allow disabling domain lexicons entirely (e.g., minimal deployments).
    try:
        import os

        v = (os.getenv("TOKMOR_DISABLE_DOMAIN_LEXICONS", "") or "").strip().lower()
        if v in {"1", "true", "yes", "y", "on"}:
            return None
    except Exception:
        pass
    l = (lang or "").lower().replace("_", "-")
    if not l:
        return None
    p = _domain_sentiment_path(l)
    if not p.exists():
        return None

    obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
    return SentimentLexicon(
        lang=str(obj.get("lang") or l),
        pos=_as_set(obj.get("pos")),
        neg=_as_set(obj.get("neg")),
        negators=_as_set(obj.get("negators")),
        intensifiers=_as_set(obj.get("intensifiers")),
        diminishers=_as_set(obj.get("diminishers")),
    )
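
For reference, the shape of lexicon file the loader above expects, with the keys taken from its obj.get(...) calls (the word lists here are illustrative, not the contents of the bundled en.json):

```python
example_lexicon = {
    "lang": "en",
    "pos": ["good", "great"],
    "neg": ["bad", "terrible"],
    "negators": ["not"],
    "intensifiers": ["very"],
    "diminishers": ["slightly"],
}
# Stored as <data dir>/domain/sentiment/en.json or bundled under tokmor/models/domain/sentiment/.
```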

def _normalize_token_for_match(lang: str, tok: str) -> str:
    s = (tok or "").strip()
    if not s:
        return ""
    if lang.startswith("en"):
        return s.lower()
    return s


def _iter_surface_tokens_for_sentiment(text: str, *, lang: str, sns: bool) -> List[str]:
    # Use ner_preprocess surfaces to avoid morpheme splits where possible.
    from ..ner_prep import ner_preprocess as _ner_preprocess

    out = _ner_preprocess(
        text,
        lang=lang,
        sns=bool(sns),
        morphology=None,
        include_token_hints=False,
        include_function_word_hints=False,
        drop_function_words=False,  # keep negators like "not", "안"
        include_pos4_hints=False,
        use_surfaces=True,
    )
    return [str(x) for x in (out.get("ner_input_tokens") or []) if str(x).strip()]


def sentiment_hint(
    text: str,
    *,
    lang: str = "auto",
    sns: bool = True,
    window_negate: int = 1,
) -> Dict[str, Any]:
    """
    Best-effort sentiment hint (ko/en seed).

    This is intentionally simple and deterministic:
    - lexicon match on surface tokens
    - optional 1-token negation inversion ("not good", "안 좋아")
    - optional intensifier/diminisher multiplier
    """
    if lang == "auto":
        text_norm = normalize_text(text, sns=bool(sns))
        lang = detect_language(text_norm)

    lex = load_sentiment_lexicon(lang)
    if lex is None:
        return {
            "lang": lang,
            "supported": False,
            "polarity": "neu",
            "score": 0.0,
            "hits": [],
        }

    toks = _iter_surface_tokens_for_sentiment(text, lang=lang, sns=bool(sns))
    norm = [_normalize_token_for_match(lex.lang, t) for t in toks]

    hits: List[Dict[str, Any]] = []
    score = 0.0

    def _mult(i: int) -> float:
        # Look one token back for degree modifiers.
        if i - 1 >= 0 and norm[i - 1] in lex.intensifiers:
            return 1.5
        if i - 1 >= 0 and norm[i - 1] in lex.diminishers:
            return 0.5
        return 1.0

    def _is_negated(i: int) -> bool:
        for j in range(max(0, i - int(window_negate)), i):
            if norm[j] in lex.negators:
                return True
        return False

    for i, t in enumerate(norm):
        if not t:
            continue
        w = _mult(i)
        if t in lex.pos:
            s = (1.0 * w)
            if _is_negated(i):
                s = -s
            score += s
            hits.append({"token": toks[i], "match": "pos", "weight": s})
        elif t in lex.neg:
            s = (-1.0 * w)
            if _is_negated(i):
                s = -s
            score += s
            hits.append({"token": toks[i], "match": "neg", "weight": s})

    pol: Polarity = "neu"
    if score > 0.25:
        pol = "pos"
    elif score < -0.25:
        pol = "neg"

    return {
        "lang": lex.lang,
        "supported": True,
        "polarity": pol,
        "score": float(score),
        "hits": hits,
    }
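
A hypothetical usage sketch of sentiment_hint; the returned keys match the dict built above, and the polarity shown assumes "good" is listed as positive and "not" as a negator in the bundled en lexicon:

```python
from tokmor.domain import sentiment_hint

hint = sentiment_hint("not good at all", lang="en")
print(hint["supported"], hint["polarity"], hint["score"])
for h in hint["hits"]:
    print(h["token"], h["match"], h["weight"])  # e.g. good pos -1.0 after the 1-token negation inversion
```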