tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/cjk.py ADDED
@@ -0,0 +1,497 @@
+ """
+ CJK Tokenizer
+ =============
+
+ Tokenizers for Chinese, Japanese, and Korean.
+ Uses only the built-in implementations; no external libraries.
+ """
+
+ import re
+ import unicodedata
+ from typing import List, Optional
+ from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
+
+
+ class CJKTokenizer(BaseTokenizer):
+     """
+     Base CJK tokenizer.
+
+     Segments contiguous chunks of Han/Hangul/Kana script.
+     Uses the built-in morphological analyzers (no external libraries required).
+     """
+
+     SUPPORTED_LANGUAGES = {'zh', 'ja', 'ko'}
+
+     # Unicode ranges
+     CJK_UNIFIED = '\u4e00-\u9fff'  # CJK Unified Ideographs
+     CJK_EXT_A = '\u3400-\u4dbf'  # CJK Extension A
+     CJK_EXT_B = '\U00020000-\U0002a6df'  # CJK Extension B
+     HIRAGANA = '\u3040-\u309f'
+     KATAKANA = '\u30a0-\u30ff'
+     HANGUL_SYLLABLES = '\uac00-\ud7af'
+     HANGUL_JAMO = '\u1100-\u11ff'
+     HANGUL_COMPAT = '\u3130-\u318f'
+
+     def __init__(self, lang: str, use_morphology: bool = True, *, zh_join_dates: Optional[bool] = None):
+         """
+         Args:
+             lang: language code (ko, ja, zh)
+             use_morphology: enable morphological analysis (default True)
+         """
+         # IMPORTANT: BaseTokenizer.__init__ may call _init_morphology(), which uses _zh_join_dates,
+         # so set this attribute BEFORE calling super().__init__().
+         self._zh_join_dates: Optional[bool] = zh_join_dates if lang == "zh" else None
+         super().__init__(lang, use_morphology)
+         self._setup_patterns()
+         self._native_analyzer = None
+         self._init_native_analyzer()
+
+     def _setup_patterns(self):
+         """Set up per-language script patterns."""
+         if self.lang == 'ko':
+             # Korean: Hangul + Han characters
+             self._script_pattern = re.compile(
+                 f'[{self.HANGUL_SYLLABLES}{self.HANGUL_JAMO}{self.HANGUL_COMPAT}'
+                 f'{self.CJK_UNIFIED}{self.CJK_EXT_A}]+'
+             )
+         elif self.lang == 'ja':
+             # Japanese: Han + Hiragana + Katakana
+             self._script_pattern = re.compile(
+                 f'[{self.CJK_UNIFIED}{self.CJK_EXT_A}'
+                 f'{self.HIRAGANA}{self.KATAKANA}]+'
+             )
+         else:  # zh
+             # Chinese: Han characters
+             self._script_pattern = re.compile(
+                 f'[{self.CJK_UNIFIED}{self.CJK_EXT_A}]+'
+             )
+
+         # Latin/digit pattern
+         self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
+
+     def _init_native_analyzer(self):
+         """Initialize the built-in morphological analyzer."""
+         try:
+             if self.lang == 'ko':
+                 from .morphology.korean import KoreanAnalyzer
+                 self._native_analyzer = KoreanAnalyzer()
+             elif self.lang == 'ja':
+                 from .morphology.japanese import JapaneseAnalyzer
+                 self._native_analyzer = JapaneseAnalyzer()
+             elif self.lang == 'zh':
+                 from .morphology.chinese import ChineseAnalyzer
+                 self._native_analyzer = ChineseAnalyzer(join_dates=self._zh_join_dates)
+         except ImportError:
+             self._native_analyzer = None
+
+     def _init_morphology(self):
+         """Initialize the morphology analyzer."""
+         if self.lang == 'ko':
+             self._morphology_analyzer = KoreanMorphologyAnalyzer()
+         elif self.lang == 'ja':
+             self._morphology_analyzer = JapaneseMorphologyAnalyzer()
+         elif self.lang == 'zh':
+             self._morphology_analyzer = ChineseMorphologyAnalyzer(join_dates=self._zh_join_dates)
+
+     def tokenize(self, text: str) -> TokenizerResult:
+         """Tokenize CJK text."""
+         text = self.clean_text(text)
+         if not text:
+             return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+         # 1. Built-in morphological analyzer
+         # NOTE:
+         # - For Korean, the native analyzer tends to produce morpheme-level splits.
+         #   When use_morphology=False we should keep surface tokenization (eojeol-like).
+         # - For Japanese/Chinese, the native analyzer is effectively the word segmenter,
+         #   so we keep using it regardless.
+         # Only use the native analyzer when the text contains the target script.
+         # This avoids pathological per-character output on mislabeled corpora (e.g., Arabic lines in ja/zh files).
+         has_target = bool(self._script_pattern.search(text))
+         use_native = bool(self._native_analyzer) and has_target and (self.lang != "ko" or self.use_morphology)
+         if use_native:
+             try:
+                 morphemes = self._native_analyzer.analyze(text)
+                 tokens = []
+                 for m in morphemes:
+                     tokens.append(Token(
+                         text=m.surface,
+                         start=m.start,
+                         end=m.end,
+                         lemma=m.lemma,
+                         pos=m.pos,
+                     ))
+                 tokens.sort(key=lambda t: t.start)
+                 tokens = self._postprocess_marks_and_numbers(tokens)
+                 # Apply token-quality rules early for CJK (defensive).
+                 # The same rules are also applied globally in TokenizerResult.__post_init__,
+                 # but doing it here avoids edge cases where upstream CJK segmentation
+                 # emits an over-merged chunk that should be split before returning.
+                 try:
+                     from .token_quality import apply_token_quality
+                     tokens = apply_token_quality(tokens, lang=self.lang, text=text)  # type: ignore[assignment]
+                 except Exception:
+                     pass
+                 return TokenizerResult(
+                     tokens=tokens,
+                     text=text,
+                     lang=self.lang,
+                     morphology_used=self.use_morphology
+                 )
+             except Exception:
+                 pass  # fall back to chunk-based tokenization
+
+         # 2. Default: script-chunk based (fallback)
+         tokens = []
+
+         # CJK chunks
+         for match in self._script_pattern.finditer(text):
+             tokens.append(Token(
+                 text=match.group(),
+                 start=match.start(),
+                 end=match.end(),
+             ))
+
+         # Latin/digits
+         for match in self._latin_pattern.finditer(text):
+             # Skip matches that overlap an already extracted span
+             overlaps = any(
+                 t.start <= match.start() < t.end or t.start < match.end() <= t.end
+                 for t in tokens
+             )
+             if not overlaps:
+                 tokens.append(Token(
+                     text=match.group(),
+                     start=match.start(),
+                     end=match.end(),
+                 ))
+
+         # Sort by position
+         tokens.sort(key=lambda t: t.start)
+
+         tokens = self._postprocess_marks_and_numbers(tokens)
+         try:
+             from .token_quality import apply_token_quality
+             tokens = apply_token_quality(tokens, lang=self.lang, text=text)  # type: ignore[assignment]
+         except Exception:
+             pass
+
+         # Safety: never return empty tokens for non-empty input.
+         # This can happen when the input contains neither CJK script chunks nor Latin/digits
+         # (e.g., mislabeled corpus lines). Fall back to whitespace tokens.
+         if not tokens:
+             for m in re.finditer(r"\S+", text):
+                 tokens.append(Token(text=m.group(), start=m.start(), end=m.end()))
+
+         return TokenizerResult(
+             tokens=tokens,
+             text=text,
+             lang=self.lang,
+             morphology_used=False
+         )
+
+     def _postprocess_marks_and_numbers(self, tokens: List[Token]) -> List[Token]:
+         """
+         Postprocess for robustness / quality:
+         - Merge standalone combining marks into the previous token when contiguous (e.g., variation selectors in JA).
+         - Merge contiguous digit runs (ASCII/fullwidth/etc.) into a single token (esp. JA dates/numbers).
+         - Merge contiguous ASCII alnum runs (e.g., "F" "I" "R" "E" -> "FIRE") when contiguous.
+         """
+         if not tokens:
+             return tokens
+
+         def _is_mark_only(s: str) -> bool:
+             return bool(s) and all(unicodedata.category(ch) in {"Mn", "Mc", "Me"} for ch in s)
+
+         def _starts_with_mark(s: str) -> bool:
+             return bool(s) and unicodedata.category(s[0]) in {"Mn", "Mc", "Me"}
+
+         def _is_all_digits(s: str) -> bool:
+             return bool(s) and all(ch.isdigit() for ch in s)
+
+         def _is_ascii_alnum(s: str) -> bool:
+             return bool(s) and s.isascii() and all(ch.isalnum() for ch in s)
+
+         # pass 1: simple adjacent merges (marks/digits/ASCII)
+         out: List[Token] = []
+         for t in tokens:
+             if not out:
+                 out.append(t)
+                 continue
+             prev = out[-1]
+
+             # merge contiguous combining marks (variation selectors, dakuten marks, etc.)
+             if (t.start == prev.end) and (t.text and (_is_mark_only(t.text) or _starts_with_mark(t.text))):
+                 prev.text += t.text
+                 prev.end = t.end
+                 continue
+
+             # merge contiguous digit runs (including full-width digits)
+             if (t.start == prev.end) and _is_all_digits(prev.text) and _is_all_digits(t.text):
+                 prev.text += t.text
+                 prev.end = t.end
+                 continue
+
+             # merge contiguous ASCII alnum runs (common in JA/KO/ZH corpora)
+             if (t.start == prev.end) and _is_ascii_alnum(prev.text) and _is_ascii_alnum(t.text):
+                 prev.text += t.text
+                 prev.end = t.end
+                 continue
+
+             out.append(t)
+
+         # pass 2 (JA): merge common numeric patterns to avoid token explosions in statistics/news lines
+         if self.lang == "ja" and len(out) >= 3:
+             merged: List[Token] = []
+             i = 0
+             JA_NUM_UNITS = {"年", "月", "日", "代", "人", "件", "話", "歳"}
+             DEC_SEPS = {".", ".", "・"}
+             PERCENTS = {"%", "%"}
+             while i < len(out):
+                 t = out[i]
+                 # digits + unit (e.g., 20 + 代 -> 20代)
+                 if i + 1 < len(out) and (out[i].end == out[i + 1].start) and _is_all_digits(out[i].text) and (out[i + 1].text in JA_NUM_UNITS):
+                     merged.append(Token(text=out[i].text + out[i + 1].text, start=out[i].start, end=out[i + 1].end))
+                     i += 2
+                     continue
+                 # decimal: digits + sep + digits (+ percent)
+                 if i + 2 < len(out):
+                     a, b, c = out[i], out[i + 1], out[i + 2]
+                     if (a.end == b.start) and (b.end == c.start) and _is_all_digits(a.text) and (b.text in DEC_SEPS) and _is_all_digits(c.text):
+                         txt = a.text + b.text + c.text
+                         end = c.end
+                         j = i + 3
+                         if j < len(out) and (out[j].start == end) and (out[j].text in PERCENTS):
+                             txt += out[j].text
+                             end = out[j].end
+                             j += 1
+                         merged.append(Token(text=txt, start=a.start, end=end))
+                         i = j
+                         continue
+                 # digits + percent (e.g., 10 + % -> 10%)
+                 if i + 1 < len(out) and (out[i].end == out[i + 1].start) and _is_all_digits(out[i].text) and (out[i + 1].text in PERCENTS):
+                     merged.append(Token(text=out[i].text + out[i + 1].text, start=out[i].start, end=out[i + 1].end))
+                     i += 2
+                     continue
+                 merged.append(t)
+                 i += 1
+             out = merged
+
+         # pass 3 (ZH): merge a contiguous ASCII digit token + common CJK unit char
+         # e.g., 10 + 亿 -> 10亿, 2025 + 年 -> 2025年
+         if self.lang == "zh" and len(out) >= 2:
+             merged2: List[Token] = []
+             i = 0
+             ZH_NUM_UNITS = {"年", "月", "日", "号", "亿", "万", "元", "%", "%", "度", "岁"}
+             while i < len(out):
+                 t = out[i]
+                 if i + 1 < len(out):
+                     a, b = out[i], out[i + 1]
+                     if (a.end == b.start) and _is_all_digits(a.text) and (b.text in ZH_NUM_UNITS):
+                         merged2.append(Token(text=a.text + b.text, start=a.start, end=b.end))
+                         i += 2
+                         continue
+                     # If the next token starts with a unit char (e.g., '亿人'), split it:
+                     # 10 + 亿人 -> 10亿 + 人
+                     if (a.end == b.start) and _is_all_digits(a.text) and b.text and (b.text[0] in ZH_NUM_UNITS) and len(b.text) > 1:
+                         unit = b.text[0]
+                         rest = b.text[1:]
+                         merged2.append(Token(text=a.text + unit, start=a.start, end=b.start + 1))
+                         merged2.append(Token(text=rest, start=b.start + 1, end=b.end))
+                         i += 2
+                         continue
+                 merged2.append(t)
+                 i += 1
+             out = merged2
+
+         return out
+
+     def extract_ngrams(self, text: str, min_n: int = 2, max_n: int = 8) -> List[str]:
+         """Extract character n-grams from script chunks."""
+         ngrams = []
+
+         for match in self._script_pattern.finditer(text):
+             chunk = match.group()
+             for n in range(min_n, min(max_n + 1, len(chunk) + 1)):
+                 for i in range(len(chunk) - n + 1):
+                     ngrams.append(chunk[i:i+n])
+
+         return ngrams
+
+
+ # =============================================================================
+ # Morphological analyzers (built-in implementations only)
+ # =============================================================================
+
+ class KoreanMorphologyAnalyzer(MorphologicalAnalyzer):
+     """Korean morphological analyzer (built-in)."""
+
+     def __init__(self):
+         self._analyzer = None
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         try:
+             from .morphology.korean import KoreanAnalyzer
+             self._analyzer = KoreanAnalyzer()
+         except ImportError:
+             self._analyzer = None
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Run morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+         result = self._analyzer.analyze(text)
+         for morph in result:
+             tokens.append(Token(
+                 text=morph.surface,
+                 start=morph.start,
+                 end=morph.end,
+                 lemma=morph.lemma,
+                 pos=morph.pos,
+             ))
+         return tokens
+
+     def nouns(self, text: str) -> List[str]:
+         """Extract nouns."""
+         if not self._analyzer:
+             return []
+         result = self._analyzer.analyze(text)
+         return [m.surface for m in result if m.pos.startswith('N')]
+
+
+ class JapaneseMorphologyAnalyzer(MorphologicalAnalyzer):
+     """Japanese morphological analyzer (built-in)."""
+
+     def __init__(self):
+         self._analyzer = None
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         try:
+             from .morphology.japanese import JapaneseAnalyzer
+             self._analyzer = JapaneseAnalyzer()
+         except ImportError:
+             self._analyzer = None
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Run morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+         result = self._analyzer.analyze(text)
+         for morph in result:
+             tokens.append(Token(
+                 text=morph.surface,
+                 start=morph.start,
+                 end=morph.end,
+                 lemma=morph.lemma,
+                 pos=morph.pos,
+             ))
+         return tokens
+
+
+ class ChineseMorphologyAnalyzer(MorphologicalAnalyzer):
+     """Chinese morphological analyzer (built-in)."""
+
+     def __init__(self, join_dates: Optional[bool] = None):
+         self._analyzer = None
+         self._join_dates = join_dates
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         try:
+             from .morphology.chinese import ChineseAnalyzer
+             self._analyzer = ChineseAnalyzer(join_dates=self._join_dates)
+         except ImportError:
+             self._analyzer = None
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Run morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+         result = self._analyzer.analyze(text)
+         for morph in result:
+             tokens.append(Token(
+                 text=morph.surface,
+                 start=morph.start,
+                 end=morph.end,
+                 lemma=morph.lemma,
+                 pos=morph.pos,
+             ))
+         return tokens
+
+
+ # =============================================================================
+ # Language-specific tokenizers
+ # =============================================================================
+
+ class KoreanTokenizer(CJKTokenizer):
+     """Korean-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'ko'}
+
+     def __init__(self, use_morphology: bool = True):
+         """
+         Args:
+             use_morphology: enable morphological analysis (default True)
+         """
+         super().__init__('ko', use_morphology)
+
+     def _init_morphology(self):
+         self._morphology_analyzer = KoreanMorphologyAnalyzer()
+
+     def nouns(self, text: str) -> List[str]:
+         """Extract nouns."""
+         if self._morphology_analyzer and self._morphology_analyzer.is_available():
+             return self._morphology_analyzer.nouns(text)
+         return []
+
+
+ class JapaneseTokenizer(CJKTokenizer):
+     """Japanese-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'ja'}
+
+     def __init__(self, use_morphology: bool = True):
+         """
+         Args:
+             use_morphology: enable morphological analysis (default True)
+         """
+         super().__init__('ja', use_morphology)
+
+     def _init_morphology(self):
+         self._morphology_analyzer = JapaneseMorphologyAnalyzer()
+
+
+ class ChineseTokenizer(CJKTokenizer):
+     """Chinese-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'zh'}
+
+     def __init__(self, use_morphology: bool = True, *, zh_join_dates: Optional[bool] = None):
+         """
+         Args:
+             use_morphology: enable morphological analysis (default True)
+         """
+         super().__init__('zh', use_morphology, zh_join_dates=zh_join_dates)
+
+     def _init_morphology(self):
+         self._morphology_analyzer = ChineseMorphologyAnalyzer(join_dates=self._zh_join_dates)
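
Usage sketch (editorial, not part of the package): the classes above are constructed per language and called via tokenize(), which prefers the built-in morphological analyzer and falls back to script-chunk segmentation. The sample texts are arbitrary; the merge behavior noted in the comments follows _postprocess_marks_and_numbers, and the exact splits depend on the bundled analyzers.

    # Illustrative sketch based on tokmor/cjk.py as shown above; output depends on the bundled analyzers.
    from tokmor.cjk import ChineseTokenizer, JapaneseTokenizer, KoreanTokenizer

    ja = JapaneseTokenizer(use_morphology=True)
    result = ja.tokenize("支持率は3.5%下がり、20代で最も低い")
    # Adjacent numeric tokens such as 3 + . + 5 + % or 20 + 代 are merged by
    # _postprocess_marks_and_numbers when the analyzer emits them contiguously.
    print([t.text for t in result.tokens])

    ko = KoreanTokenizer(use_morphology=True)
    print(ko.nouns("한국어 형태소 분석기"))  # noun surfaces; [] if the Korean analyzer is unavailable

    zh = ChineseTokenizer(use_morphology=True, zh_join_dates=True)
    # digit + unit merges such as 2025年 or 5% apply when the pieces arrive as adjacent tokens
    print([t.text for t in zh.tokenize("2025年GDP增长5%").tokens])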
tokmor/domain/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ Domain lexicons (small, optional)
+ =================================
+
+ TokMor core focuses on tokenization/morphology. Domain lexicons are small,
+ optional add-ons that can be shipped in a data pack (`TOKMOR_DATA_DIR`) or as
+ tiny bundled assets.
+ """
+
+ from .sentiment import load_sentiment_lexicon, sentiment_hint  # noqa: F401
+
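
The docstring above describes domain lexicons as optional add-ons. As a concrete illustration (a deployment-layout assumption, not something this diff documents), a data pack can override or extend the bundled seed lexicons by placing JSON files where _domain_sentiment_path() in sentiment.py below looks for them:

    # Hypothetical data-pack layout; the path shape follows _domain_sentiment_path() below:
    #   $TOKMOR_DATA_DIR/domain/sentiment/en.json   # takes precedence over the bundled en seed
    #   $TOKMOR_DATA_DIR/domain/sentiment/ja.json   # hypothetical extra language (no bundled seed)
    #
    # Each file is a JSON object with the keys read by load_sentiment_lexicon();
    # the entries below are illustrative, not the bundled contents.
    example_lexicon = {
        "lang": "en",
        "pos": ["good", "great"],
        "neg": ["bad", "awful"],
        "negators": ["not"],
        "intensifiers": ["very"],
        "diminishers": ["slightly"],
    }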
tokmor/domain/sentiment.py ADDED
@@ -0,0 +1,198 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Tuple
+
+ from ..factory import detect_language
+ from ..preprocess import normalize_text
+ from ..resources import data_dir
+
+
+ Polarity = Literal["pos", "neg", "neu"]
+
+
+ def _domain_sentiment_path(lang: str) -> Path:
+     # Prefer external assets under TOKMOR_DATA_DIR, fallback to bundled models.
+     #
+     # IMPORTANT:
+     # - `resources.data_dir()` returns TOKMOR_DATA_DIR when set, which would hide bundled
+     #   seed lexicons if we only looked there.
+     # - So we explicitly check external first, then the package-bundled models dir.
+     from .. import resources
+
+     l = (lang or "").lower().replace("_", "-")
+     rel = Path("domain") / "sentiment" / f"{l}.json"
+
+     env = resources.data_dir() / rel
+     bundled = Path(__file__).resolve().parents[1] / "models" / rel  # tokmor/models/...
+
+     if env.exists():
+         return env
+     return bundled
+
+
+ @dataclass(frozen=True)
+ class SentimentLexicon:
+     lang: str
+     pos: Set[str]
+     neg: Set[str]
+     negators: Set[str]
+     intensifiers: Set[str]
+     diminishers: Set[str]
+
+
+ def _as_set(xs: Any) -> Set[str]:
+     if not xs:
+         return set()
+     out: Set[str] = set()
+     for x in xs if isinstance(xs, list) else []:
+         s = str(x).strip()
+         if s:
+             out.add(s)
+     return out
+
+
+ @lru_cache(maxsize=32)
+ def load_sentiment_lexicon(lang: str) -> Optional[SentimentLexicon]:
+     """
+     Load a small sentiment lexicon for a language.
+
+     Returns None if the lexicon does not exist.
+     """
+     # Allow disabling domain lexicons entirely (e.g., minimal deployments).
+     try:
+         import os
+
+         v = (os.getenv("TOKMOR_DISABLE_DOMAIN_LEXICONS", "") or "").strip().lower()
+         if v in {"1", "true", "yes", "y", "on"}:
+             return None
+     except Exception:
+         pass
+     l = (lang or "").lower().replace("_", "-")
+     if not l:
+         return None
+     p = _domain_sentiment_path(l)
+     if not p.exists():
+         return None
+
+     obj = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
+     return SentimentLexicon(
+         lang=str(obj.get("lang") or l),
+         pos=_as_set(obj.get("pos")),
+         neg=_as_set(obj.get("neg")),
+         negators=_as_set(obj.get("negators")),
+         intensifiers=_as_set(obj.get("intensifiers")),
+         diminishers=_as_set(obj.get("diminishers")),
+     )
+
+
+ def _normalize_token_for_match(lang: str, tok: str) -> str:
+     s = (tok or "").strip()
+     if not s:
+         return ""
+     if lang.startswith("en"):
+         return s.lower()
+     return s
+
+
+ def _iter_surface_tokens_for_sentiment(text: str, *, lang: str, sns: bool) -> List[str]:
+     # Use ner_preprocess surfaces to avoid morpheme splits where possible.
+     from ..ner_prep import ner_preprocess as _ner_preprocess
+
+     out = _ner_preprocess(
+         text,
+         lang=lang,
+         sns=bool(sns),
+         morphology=None,
+         include_token_hints=False,
+         include_function_word_hints=False,
+         drop_function_words=False,  # keep negators like "not", "안"
+         include_pos4_hints=False,
+         use_surfaces=True,
+     )
+     return [str(x) for x in (out.get("ner_input_tokens") or []) if str(x).strip()]
+
+
+ def sentiment_hint(
+     text: str,
+     *,
+     lang: str = "auto",
+     sns: bool = True,
+     window_negate: int = 1,
+ ) -> Dict[str, Any]:
+     """
+     Best-effort sentiment hint (ko/en seed).
+
+     This is intentionally simple and deterministic:
+     - lexicon match on surface tokens
+     - optional 1-token negation inversion ("not good", "안 좋아")
+     - optional intensifier/diminisher multiplier
+     """
+     if lang == "auto":
+         text_norm = normalize_text(text, sns=bool(sns))
+         lang = detect_language(text_norm)
+
+     lex = load_sentiment_lexicon(lang)
+     if lex is None:
+         return {
+             "lang": lang,
+             "supported": False,
+             "polarity": "neu",
+             "score": 0.0,
+             "hits": [],
+         }
+
+     toks = _iter_surface_tokens_for_sentiment(text, lang=lang, sns=bool(sns))
+     norm = [_normalize_token_for_match(lex.lang, t) for t in toks]
+
+     hits: List[Dict[str, Any]] = []
+     score = 0.0
+
+     def _mult(i: int) -> float:
+         # Look one token back for degree modifiers.
+         if i - 1 >= 0 and norm[i - 1] in lex.intensifiers:
+             return 1.5
+         if i - 1 >= 0 and norm[i - 1] in lex.diminishers:
+             return 0.5
+         return 1.0
+
+     def _is_negated(i: int) -> bool:
+         for j in range(max(0, i - int(window_negate)), i):
+             if norm[j] in lex.negators:
+                 return True
+         return False
+
+     for i, t in enumerate(norm):
+         if not t:
+             continue
+         w = _mult(i)
+         if t in lex.pos:
+             s = (1.0 * w)
+             if _is_negated(i):
+                 s = -s
+             score += s
+             hits.append({"token": toks[i], "match": "pos", "weight": s})
+         elif t in lex.neg:
+             s = (-1.0 * w)
+             if _is_negated(i):
+                 s = -s
+             score += s
+             hits.append({"token": toks[i], "match": "neg", "weight": s})
+
+     pol: Polarity = "neu"
+     if score > 0.25:
+         pol = "pos"
+     elif score < -0.25:
+         pol = "neg"
+
+     return {
+         "lang": lex.lang,
+         "supported": True,
+         "polarity": pol,
+         "score": float(score),
+         "hits": hits,
+     }
+
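
A minimal usage sketch for the module above (editorial, not part of the package). The returned keys are the ones built in sentiment_hint(); the actual polarity and score depend on the bundled ko/en seed lexicons, so the comments avoid asserting concrete values.

    from tokmor.domain.sentiment import load_sentiment_lexicon, sentiment_hint

    # None when no lexicon exists for the language, or when
    # TOKMOR_DISABLE_DOMAIN_LEXICONS=1 disables domain lexicons entirely.
    lex = load_sentiment_lexicon("en")
    if lex is not None:
        print(len(lex.pos), len(lex.neg), sorted(lex.negators))

    hint = sentiment_hint("this is not good", lang="en", sns=False)
    # keys: lang, supported, polarity ("pos" / "neg" / "neu"), score, hits
    print(hint["supported"], hint["polarity"], hint["score"], hint["hits"])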