tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/space_based.py ADDED
@@ -0,0 +1,272 @@
+ """
+ Space-Based Tokenizer
+ =====================
+
+ Tokenizer for space-delimited languages (English, European languages, etc.)
+ """
+
+ import re
+ import unicodedata
+ from typing import List
+ from .base import BaseTokenizer, Token, TokenizerResult
+
+
+ class SpaceBasedTokenizer(BaseTokenizer):
+     """
+     Space-based tokenizer.
+
+     Covers most space-delimited languages (English, German, French, Spanish, etc.)
+     """
+
+     SUPPORTED_LANGUAGES = {
+         # Germanic
+         'en', 'de', 'nl', 'sv', 'da', 'no', 'nb', 'nn', 'is', 'af',
+         # Romance
+         'fr', 'es', 'pt', 'it', 'ro', 'ca', 'gl', 'oc',
+         # Slavic
+         'ru', 'uk', 'pl', 'cs', 'sk', 'hr', 'sr', 'bg', 'sl', 'mk', 'be',
+         # Baltic
+         'lv', 'lt',
+         # Finno-Ugric
+         'fi', 'et', 'hu',
+         # Other European
+         'el', 'sq', 'mt', 'eu', 'cy', 'ga',
+         # Turkic
+         'tr', 'az', 'kk', 'uz', 'ky', 'tk', 'ug',
+         # Other
+         'id', 'ms', 'tl', 'vi', 'sw', 'ha', 'yo', 'ig', 'zu', 'am',
+         'mn', 'ka', 'hy',
+         # Indic (space-based but with special handling)
+         'hi', 'bn', 'gu', 'pa', 'mr', 'ne', 'si', 'ta', 'te', 'kn', 'ml',
+     }
+
+     # Language-specific settings
+     LANGUAGE_PATTERNS = {
+         # German: compound words
+         'de': {
+             'compound_split': True,
+             'preserve_case': True,
+         },
+         # Turkish: agglutinative suffixes
+         'tr': {
+             'agglutinative': True,
+         },
+         # Finnish: agglutinative suffixes
+         'fi': {
+             'agglutinative': True,
+         },
+         # Hungarian: agglutinative suffixes
+         'hu': {
+             'agglutinative': True,
+         },
+     }
+
+     def __init__(self, lang: str, use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+         self._word_pattern = re.compile(r'\b[\w\-\']+\b', re.UNICODE)
+         # Pre-scan special tokens that should stay intact for downstream normalization/value extraction.
+         self._special_pattern = re.compile(
+             r"(https?://\S+|www\.\S+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
+             r"|[$€£¥₩]\s*[0-9][0-9,._]*(?:\.[0-9]+)?|[0-9][0-9,._]*(?:\.[0-9]+)?\s*(?:%|%)"
+             r"|\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})",
+             re.UNICODE,
+         )
+         self._config = self.LANGUAGE_PATTERNS.get(lang, {})
+
+     @staticmethod
+     def _is_word_char(ch: str) -> bool:
+         """
+         Unicode-aware word character definition:
+         - letters (L*)
+         - marks (M*) (important for Indic/Brahmic scripts)
+         - numbers (N*)
+         - joiners (ZWJ/ZWNJ) used in some scripts (e.g., Sinhala conjuncts)
+         """
+         if not ch:
+             return False
+         # Keep joiners inside tokens to avoid breaking conjuncts (e.g., Sinhala: ක්‍රී)
+         if ch in {"\u200d", "\u200c"}:  # ZWJ / ZWNJ
+             return True
+         try:
+             cat0 = unicodedata.category(ch)[0]
+             return cat0 in {"L", "M", "N"}
+         except Exception:
+             return False
+
+     def _iter_word_spans(self, text: str, covered: List[bool]) -> List[Token]:
+         """
+         Find word-like spans using a Unicode category scan.
+         This is more robust than regex word boundaries (\\b) for Indic/Brahmic scripts.
+         """
+         tokens: List[Token] = []
+         i = 0
+         n = len(text)
+         while i < n:
+             if covered[i]:
+                 i += 1
+                 continue
+             ch = text[i]
+             if ch.isspace():
+                 i += 1
+                 continue
+             if not self._is_word_char(ch):
+                 i += 1
+                 continue
+
+             s = i
+             i += 1
+             while i < n:
+                 if covered[i]:
+                     break
+                 c = text[i]
+                 if self._is_word_char(c):
+                     i += 1
+                     continue
+                 # allow in-word apostrophes/hyphens when surrounded by word chars
+                 if c in {"'", "’", "-"} and (i + 1) < n:
+                     if self._is_word_char(text[i - 1]) and self._is_word_char(text[i + 1]):
+                         i += 1
+                         continue
+                 break
+             e = i
+             if e > s:
+                 tokens.append(Token(text=text[s:e], start=s, end=e))
+             else:
+                 i += 1
+         return tokens
+
+     def tokenize(self, text: str) -> TokenizerResult:
+         """Space-based tokenization."""
+         text = self.clean_text(text)
+         if not text:
+             return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+         tokens = []
+
+         # 0) Special tokens (URLs/emails/money/percent/dates) first
+         covered = [False] * (len(text) + 1)
+         for m in self._special_pattern.finditer(text):
+             s, e = m.start(), m.end()
+             if s < 0 or e <= s:
+                 continue
+             tokens.append(Token(text=m.group(), start=s, end=e))
+             for i in range(s, min(e, len(text))):
+                 covered[i] = True
+
+         # 1) Unicode-aware word scan (robust for Indic scripts and generally safe)
+         word_tokens = self._iter_word_spans(text, covered)
+         tokens.extend(word_tokens)
+         for t in word_tokens:
+             for i in range(int(t.start), min(int(t.end), len(text))):
+                 covered[i] = True
+
+         # 2) Keep remaining visible symbols as single-char tokens.
+         #    This improves SNS/user-generated text handling (emoji, hashtag symbols, punctuation),
+         #    while remaining deterministic and easy to filter downstream.
+         for i, ch in enumerate(text):
+             if covered[i]:
+                 continue
+             if ch.isspace():
+                 continue
+             tokens.append(Token(text=ch, start=i, end=i + 1))
+
+         # Apply morphological analysis
+         if self.use_morphology and self._morphology_analyzer:
+             tokens = self._apply_morphology(text, tokens)
+
+         tokens.sort(key=lambda t: t.start)
+         return TokenizerResult(
+             tokens=tokens,
+             text=text,
+             lang=self.lang,
+             morphology_used=self.use_morphology and self._morphology_analyzer is not None,
+         )
+
+     def _apply_morphology(self, text: str, tokens: List[Token]) -> List[Token]:
+         """Apply morphological analysis."""
+         if not self._morphology_analyzer:
+             return tokens
+
+         analyzed = self._morphology_analyzer.analyze(text)
+
+         # Match analyzed results with tokens
+         # (simplified - assumes the same tokenization)
+         for i, token in enumerate(tokens):
+             if i < len(analyzed):
+                 token.lemma = analyzed[i].lemma
+                 token.pos = analyzed[i].pos
+
+         return tokens
+
+
+ class EnglishTokenizer(SpaceBasedTokenizer):
+     """English-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'en'}
+
+     # Contraction patterns
+     CONTRACTIONS = {
+         "n't": " not",
+         "'re": " are",
+         "'ve": " have",
+         "'ll": " will",
+         "'d": " would",
+         "'m": " am",
+         "'s": " is",  # or possessive
+     }
+
+     def __init__(self, lang: str = 'en', use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+         self._contraction_pattern = re.compile(
+             r"(\w+)(n't|'re|'ve|'ll|'d|'m|'s)",
+             re.IGNORECASE
+         )
+
+     def tokenize(self, text: str, expand_contractions: bool = False) -> TokenizerResult:
+         """English tokenization."""
+         text = self.clean_text(text)
+
+         if expand_contractions:
+             text = self._expand_contractions(text)
+
+         return super().tokenize(text)
+
+     def _expand_contractions(self, text: str) -> str:
+         """Expand contractions."""
+         def replace(match):
+             word = match.group(1)
+             contraction = match.group(2).lower()
+             expansion = self.CONTRACTIONS.get(contraction, contraction)
+             return word + expansion
+
+         return self._contraction_pattern.sub(replace, text)
+
+
+ class GermanTokenizer(SpaceBasedTokenizer):
+     """German-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'de'}
+
+     def __init__(self, lang: str = 'de', use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+
+     def split_compound(self, word: str) -> List[str]:
+         """
+         Split German compound words (heuristic).
+
+         Note: full decompounding requires a morphological analyzer.
+         """
+         # Basic compound patterns only.
+         # A real implementation needs dictionary-based or ML-based splitting.
+         return [word]  # default implementation does not split
+
+
+ class RussianTokenizer(SpaceBasedTokenizer):
+     """Russian-specific tokenizer."""
+
+     SUPPORTED_LANGUAGES = {'ru'}
+
+     def __init__(self, lang: str = 'ru', use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+         # Pattern covering Cyrillic and Latin characters plus digits
+         self._word_pattern = re.compile(r'[а-яА-ЯёЁa-zA-Z0-9]+', re.UNICODE)
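Usage note (editorial, not part of the package diff): the sketch below is a minimal example based only on the constructors and tokenize() signatures shown above. The Token and TokenizerResult field names (tokens, text, start, end) are taken from how this module itself uses them; their definitions live in tokmor/base.py, which this diff does not show, so treat the printed attributes as assumptions.

from tokmor.space_based import SpaceBasedTokenizer, EnglishTokenizer

# Generic space-based tokenization: special tokens (URLs/emails/money/dates)
# are kept intact first, then Unicode-aware word spans, then any leftover
# visible symbols as single-character tokens.
tok = SpaceBasedTokenizer("de")
result = tok.tokenize("Preis: €12,50 am 2024-01-15, siehe https://example.com!")
for t in result.tokens:
    print(t.text, t.start, t.end)

# The English subclass can optionally expand contractions before tokenizing,
# e.g. "Don't" -> "Do not".
en = EnglishTokenizer()
print([t.text for t in en.tokenize("Don't stop", expand_contractions=True).tokens])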