tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/unified_tokens.py
ADDED
@@ -0,0 +1,228 @@
"""
Unified token output for downstream pipelines
=============================================

TokMor core intentionally avoids shipping a full learned POS tagger.
However, some downstream systems expect token objects that include:
- offsets
- a best-effort POS label (language-specific if available)
- a confidence-like score
- a particle/function-word flag (for boundary / hard-block)

This module provides a deterministic, lightweight adapter on top of TokMor tokenizers.
"""

from __future__ import annotations

from dataclasses import asdict
from typing import Any, Dict, List, Optional


def _is_particle(lang: str, *, pos: Optional[str], text: str) -> bool:
    base = (lang or "").split("-", 1)[0].lower()
    p = (pos or "").strip()

    # Prefer language-specific tokenizer POS when present.
    if base == "ko":
        # Sejong-ish: J* particles, JC conjunction particle, etc.
        if p.startswith("J"):
            return True
        return False

    if base == "ja":
        # Our JA analyzers may return string tags like 助詞/助動詞 (tokenizer compatibility).
        if p in {"助詞", "助動詞"}:
            return True
        return False

    if base.startswith("zh"):
        # Chinese analyzer tags: u (particle), p (prep), c (conj)
        if p in {"u"}:
            return True
        return False

    # Fallback: use function-word hints when available (closed class).
    try:
        from .ner_prep import function_word_tag

        fw = function_word_tag(lang, text) or ""
        return fw in {"PART"}
    except Exception:
        return False


def _pos_coarse(lang: str, *, pos: Optional[str], text: str) -> str:
    """
    Best-effort mapping from native POS tags to a small universal-style coarse tag set.
    Returns empty string when unknown.
    """
    base = (lang or "").split("-", 1)[0].lower()
    p = (pos or "").strip()
    if not p:
        return ""

    if base == "ko":
        # Sejong-ish (token-level): NNG/NNP/NNB/NP/NR, VV/VA/VX/VCP/VCN, MAG/MAJ, J*, E*, X*, SN, S*
        if p.startswith("NNP"):
            return "PROPN"
        if p.startswith("NN") or p in {"NP", "NR"}:
            return "NOUN"
        if p in {"VV", "VX"}:
            return "VERB"
        if p in {"VCP", "VCN"}:
            return "AUX"
        if p == "VA":
            return "ADJ"
        if p in {"MAG", "MAJ"}:
            return "ADV"
        if p.startswith("J"):
            return "PART"
        if p.startswith("E") or p.startswith("X"):
            return "PART"
        if p == "SN":
            return "NUM"
        if p.startswith("S"):
            return "PUNCT"
        return ""

    if base == "ja":
        # Our JA tokens may carry string tags.
        if p in {"助詞", "助動詞"}:
            return "PART"
        # Check 固有名詞 before the generic 名詞 substring: "固有名詞"
        # contains "名詞", so the broader check must come second or
        # PROPN is unreachable.
        if "固有名詞" in p:
            return "PROPN"
        if "名詞" in p:
            return "NOUN"
        if "動詞" in p:
            return "VERB"
        if "形容詞" in p:
            return "ADJ"
        if "副詞" in p:
            return "ADV"
        return ""

    if base.startswith("zh"):
        # Chinese analyzer tags: n/ns/nr/nt/nrt, v, a, d, p, u, c, r...
        if p.startswith("n"):
            return "NOUN"
        if p.startswith("v"):
            return "VERB"
        if p.startswith("a"):
            return "ADJ"
        if p.startswith("d"):
            return "ADV"
        if p in {"p"}:
            return "ADP"
        if p in {"u"}:
            return "PART"
        if p in {"c"}:
            return "CCONJ"
        if p in {"m"}:
            return "NUM"
        if p in {"w"}:
            return "PUNCT"
        return ""

    return ""


def _pos_conf(lang: str, *, pos: Optional[str], is_particle: bool) -> float:
    """
    Deterministic confidence-like value (NOT calibrated).
    Useful as a downstream heuristic weight / debug signal.
    """
    if not pos:
        return 0.0
    # Match the style seen in downstream logs: particles high, content tokens moderate.
    if is_particle:
        return 0.95
    base = (lang or "").split("-", 1)[0].lower()
    if base == "ko":
        return 0.60
    return 0.70


def unified_tokenize(
    text: str,
    *,
    lang: str,
    sns: bool = True,
    morphology: Optional[bool] = None,
    include_sns_tags: bool = False,
    include_pos4: bool = True,
) -> Dict[str, Any]:
    """
    Return a token list with offsets + best-effort POS/particle hints.

    Notes:
    - `pos` comes from TokMor tokenizer morphology when available (language-specific tags).
    - `pos_conf` is a deterministic heuristic (not ML confidence).
    - `pos4` is a coarse hint in {N,V,ADJ,ADV,UNK} (optional).
    """
    from .api import detect_language
    from .factory import get_tokenizer
    from .preprocess import normalize_text

    text_norm = normalize_text(text, sns=bool(sns))
    if lang == "auto":
        lang = detect_language(text_norm)

    # Match segment() defaults for quality.
    # Enable morphology by default for languages that need segmentation for NER.
    if morphology is None:
        # CJK: Chinese, Japanese, Korean
        # SEA: Thai, Myanmar, Khmer, Lao (no spaces between words)
        if lang in {"zh", "ja", "ko", "th", "my", "km", "lo"}:
            morphology = True

    tok = get_tokenizer(lang, use_morphology=bool(morphology))
    res = tok.tokenize(text_norm)

    out_tokens: List[Dict[str, Any]] = []
    for t in res.tokens:
        d = asdict(t)
        # ensure minimal keys for external systems
        d = {
            "text": d.get("text"),
            "start": int(d.get("start") or 0),
            "end": int(d.get("end") or 0),
            "pos": d.get("pos") or "",
        }
        d["is_particle"] = _is_particle(lang, pos=d.get("pos"), text=str(d.get("text") or ""))
        d["pos_conf"] = float(_pos_conf(lang, pos=d.get("pos"), is_particle=bool(d["is_particle"])))
        if include_pos4:
            try:
                from .ner_prep import pos4_hint

                # POS4 is a *content* hint for NER: N/V/ADJ/ADV/UNK.
                # Prefer tokenizer POS mapping when available; it is more reliable than surface heuristics.
                coarse = _pos_coarse(lang, pos=d.get("pos"), text=str(d.get("text") or ""))
                if coarse in {"PART", "PUNCT", "SYM", "X"} or bool(d.get("is_particle")):
                    d["pos4"] = "UNK"
                elif coarse in {"VERB", "AUX"}:
                    d["pos4"] = "V"
                elif coarse == "ADJ":
                    d["pos4"] = "ADJ"
                elif coarse == "ADV":
                    d["pos4"] = "ADV"
                elif coarse in {"NOUN", "PROPN"}:
                    d["pos4"] = "N"
                else:
                    d["pos4"] = pos4_hint(lang, str(d.get("text") or ""))
            except Exception:
                d["pos4"] = "UNK"

        if include_sns_tags:
            from .sns_tags import classify_sns_token

            d["sns"] = classify_sns_token(str(d.get("text") or ""), lang=lang)

        out_tokens.append(d)

    return {
        "lang": lang,
        "text_norm": text_norm,
        "morphology_used": bool(getattr(res, "morphology_used", False)),
        "tokens": out_tokens,
    }
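A minimal usage sketch for the adapter above. The field names follow the dict assembled in `unified_tokenize`; the sample Korean string is arbitrary and the printed values are illustrative, not captured output:

```python
# Sketch: consuming unified_tokenize() output (field names from the module above).
from tokmor.unified_tokens import unified_tokenize

out = unified_tokenize("서울에 갔다", lang="ko")  # "ko" -> morphology defaults to True
for tok in out["tokens"]:
    # Each token dict carries: text, start, end, pos (native tag or ""),
    # is_particle, pos_conf (deterministic heuristic), pos4 (N/V/ADJ/ADV/UNK).
    print(tok["start"], tok["end"], tok["text"], tok["pos"], tok["pos4"])
```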
tokmor-1.2.9.dist-info/METADATA
ADDED
@@ -0,0 +1,103 @@
Metadata-Version: 2.4
Name: tokmor
Version: 1.2.9
Summary: Fast multilingual tokenizer and morphological analyzer (core)
Author: TokMor Team
License: MIT License

        Copyright (c) 2026 TokMor Contributors

        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be included in all
        copies or substantial portions of the Software.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
        SOFTWARE.

Project-URL: Homepage, https://github.com/tokmorlab/tokmor
Project-URL: Documentation, https://github.com/tokmorlab/tokmor#readme
Project-URL: Source, https://github.com/tokmorlab/tokmor
Project-URL: Issues, https://github.com/tokmorlab/tokmor/issues
Keywords: nlp,preprocessing,multilingual,tokenizer,tokenization,segmentation,morphology,lemmatization,ner,rag,information-extraction,offline
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == "dev"
Requires-Dist: pytest-cov>=4.0; extra == "dev"
Dynamic: license-file

# TokMor (Core)

TokMor is a **small, deterministic multilingual preprocessing library** (no training/inference).
It focuses on **stable tokenization/segmentation**, **NER-friendly preprocessing**, and **optional offline data packs**.

TokMor is **not** a linguistic POS tagger and does **not** run ML models at inference time.

## Install

```bash
pip install tokmor
```

## Quick start (Python)

```python
import tokmor

out = tokmor.unified_tokenize("We visited Seoul on 2025-01-10.", lang="en", sns=False)
print(out["tokens"][:5])
```
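
The returned mapping mirrors what `tokmor/unified_tokens.py` (shown above in this diff) assembles; a sketch of the shape, with illustrative rather than real values:

```python
# Shape of the unified_tokenize() result; the concrete values are illustrative.
out = {
    "lang": "en",
    "text_norm": "We visited Seoul on 2025-01-10.",
    "morphology_used": False,
    "tokens": [
        {
            "text": "We", "start": 0, "end": 2,
            "pos": "",             # native tag when morphology provides one
            "is_particle": False,
            "pos_conf": 0.0,       # deterministic heuristic, not ML confidence
            "pos4": "UNK",         # coarse content hint: N/V/ADJ/ADV/UNK
        },
        # ...
    ],
}
```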

## NER preprocessing helper

```python
import tokmor

prep = tokmor.ner_preprocess("LOL!!! Apple announced new products in Seoul...", lang="en")
print(prep["tokens"][:8])
```

## Optional offline data packs

Large lemma/POS-hint packs are not bundled in the wheel. Provide them via:

```bash
export TOKMOR_DATA_DIR=/path/to/tokmor_data_pack
```
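
The same setting can be applied from Python; a sketch assuming the packs are resolved after the environment variable is set (the path is a placeholder):

```python
# Point TOKMOR_DATA_DIR at a data pack from Python instead of the shell.
# Set it before tokmor needs the packs; the path below is a placeholder.
import os

os.environ["TOKMOR_DATA_DIR"] = "/path/to/tokmor_data_pack"

import tokmor
```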

## Docs / repo

See the repo docs for data pack layout, licenses, and policies:
- `docs/PACKAGING.md`
- `docs/DATA_SOURCES_AND_LICENSES.md`
- `docs/DOMAIN_LEXICONS.md`

## License

MIT (see `LICENSE`)
tokmor-1.2.9.dist-info/RECORD
ADDED
@@ -0,0 +1,70 @@
tokmor/__init__.py,sha256=fBxOs4tO8YesWXMeNzUQp6_cOs1HiSurW6nDSJMRBXg,1910
tokmor/api.py,sha256=B8W_Q03fImwB47ZwXO1zAJcgqT1JpgGoSJl6_SAkXiQ,5549
tokmor/assets.py,sha256=oB3AtUvX9U4oP2RGRFj53_iujS4-bM9jvVlWM0BU7JQ,12088
tokmor/base.py,sha256=ttthd9aNY11VFyO0QPwVI96nTlIMhVzfUPupNVUiq5c,7434
tokmor/brahmic.py,sha256=uyRyuafjd8mombR9oVg1qpGdOSIqR9Jqy9NkmvyEnKA,20436
tokmor/cjk.py,sha256=TJg6SBWfVMSkOaOxyrbMJfJsFGmZE759-EZyhOaS7pA,18913
tokmor/factory.py,sha256=xg7lUZcam5mIt-bwlhfx2uGojgryVqn-OTP2XqaM4cI,14608
tokmor/indic.py,sha256=EW4lyc9GNdnB3XlMUb46mwlpWJ-L_m-iPq04mvSX5wU,8844
tokmor/inventory.py,sha256=TC-epjATIn-FaA0U-g1kY_cqguKKcG9J4U31XH9tsBI,1531
tokmor/legacy_api.py,sha256=yb1AXi6jXxYWrrsLUJU2EydC80Guwzsh1PPpM3KnuYk,4190
tokmor/lemma_store.py,sha256=l44veiwNUZdGgcIO5tIl5E-I7ppVyQGhQuBuyw5v_Hs,2614
tokmor/lookup_keys.py,sha256=t8TRBX2FLuCUSxwAf2blzFbGwqILLx2IIs7iDYRhR8w,4399
tokmor/ner_prep.py,sha256=2GOWUDJdBHUn3nLScGgLxCY37zXI1VWXx97Bp8AFoT0,24046
tokmor/offline.py,sha256=YwsadRjPPD_tWUGnUWbb3xMfr63p1eaukc18fCqlwE0,2362
tokmor/preprocess.py,sha256=MmkhaQvNYOjaxSm6iX6wP2pvBZhq7PfeVxDVV8LZFxA,2392
tokmor/resources.py,sha256=ylbbcLjJI-2QH_b3s3zpYTqy_VnBuENfKe1yv7UX4PA,8763
tokmor/routing.py,sha256=YQb0v05Ue1ct9SEFaKFT2R-VQi6LoN6wvDPImRYlSxY,4944
tokmor/rtl.py,sha256=6azqZhxVDZKINlhAdqzetk0zmQYARORTXkPcznbVkag,10039
tokmor/schema.py,sha256=nr3NxgNNTXh5A4Vb5MFIJCgj5tvmzejORC7U_5D9ngo,341
tokmor/sns_tags.py,sha256=uveJfgB1oX7X0Sck7Y041unOHGbu8bcF2zCHhaSDx-Q,13377
tokmor/space_based.py,sha256=8g6gJZzgB_NMfgkc7XYUypSEzgsuNOTItMtLgKQ32tA,8920
tokmor/token_quality.py,sha256=WrsFnVx2cfmvgsIz5qcfUv-0AVBWUpt2VlamhA9CWSU,40188
tokmor/unified_tokens.py,sha256=Yu1WiA6cylpbt18g1zs0uAXxQH05YFV767y7SFUBFao,7246
tokmor/domain/__init__.py,sha256=V_8vaiZE3MuNSj4_6XWaf9B6PpgUAqFh6jJmDG3_jpM,328
tokmor/domain/sentiment.py,sha256=uvu1NIy2L0O3Go-qHeNR33r8h10GBwIq3UL1FtOfBHA,5493
tokmor/models/domain/sentiment/en.json,sha256=OQwYyKHCbLxR6YZ4wwlLGf4ebUDq_3x6avyw3BnUd4M,662
tokmor/models/domain/sentiment/ko.json,sha256=IWWPiB0SwBxzkDseqBsMDJlNIVx7TumsQRE22tz-jHU,660
tokmor/models/seg_lexicon/km_wordfreq.pkl,sha256=lidHd1PXS9ucg4nLjo4lgNaAnnwkgBBFUmtJT-4vTyE,1263734
tokmor/models/seg_lexicon/km_wordlist.pkl,sha256=2djvMrVu6YCaj-GqJRWCNVAU0D7mpzZAou1c-4B5TYs,417451
tokmor/models/seg_lexicon/lo_wordfreq.pkl,sha256=tn9miUY8Yybxwwg1Tv0LHVIofq6XAz8fqpBN9Wm5Pos,1530855
tokmor/models/seg_lexicon/lo_wordlist.pkl,sha256=aQEunKNue9zNY8y8Ea-q5xw7oxuprHUf8OeAFQ9SLG4,464660
tokmor/models/seg_lexicon/my_wordfreq.pkl,sha256=34njTN4HngBa757r-GkKW55BdJBx3b23NimDco3boOs,1522085
tokmor/models/seg_lexicon/my_wordlist.pkl,sha256=EdS1iySSP3ivZFWXC9rsWkVeV0vDjPfKR04vmoyQMSU,582850
tokmor/models/seg_lexicon/th_wordfreq.pkl,sha256=lQ01Unz1N7VYDkf1X1EK13XTJdCIQ6F1Jo4t7kz7Zu4,2469349
tokmor/models/seg_lexicon/th_wordlist.pkl,sha256=rJXCPEIZepHCBykAFzJYkMmUJLBQE5VWoOoRGrdPXSE,749702
tokmor/models/seg_lexicon/zh_extra_dict.json,sha256=7d7mUX6Nj5JSiAFaiJ5TOmMV44wMpBvuORzEq7c04UI,833
tokmor/models/seg_lexicon/zh_wordfreq.pkl,sha256=UJES6ezc2SsQmeHXyY4xX9tXmmoLy_gHs6s4ASjGQO4,2717916
tokmor/morphology/__init__.py,sha256=kURcONW5r9Wnw9vOYDiI3IcUwhIenFr4aeQnBYo2388,12428
tokmor/morphology/advanced_base.py,sha256=LCVS3gmiJz_AUXXGGQ-mw8PDdJenHdWpfGL4_AE-lAc,15671
tokmor/morphology/arabic_advanced.py,sha256=olq6lAjNJO-UVQWCNsTQkSCjNSCrs9QWYlEFzpNU3FU,10139
tokmor/morphology/chinese.py,sha256=tOHc57yEvkj4X1YoW2jh5xaK87Q9mHB7Gg9Lg3gsYOg,31626
tokmor/morphology/chinese_advanced.py,sha256=JJSfXgkXSaJGsJq99I_rKdH8-v41oKQ8iLQwG4epHlc,16769
tokmor/morphology/english.py,sha256=AcVb_7Ek6ATlmSzZgT05oDNubwmlt5qhp7xS9_QN8aA,13540
tokmor/morphology/english_advanced.py,sha256=fOjTSFnMw1Rb43R76Wk3KcT051q5eDjcxKB4jTNE9SA,23090
tokmor/morphology/french_advanced.py,sha256=yNcv2LYRbX0wtMQ_wFUdPX30GiZklZU_agn7Y6jTbJk,9748
tokmor/morphology/german_advanced.py,sha256=v2U4xd3IIvDSWIjVRfqXG5mpqdmtfKiSSTDvlrKSo8g,15766
tokmor/morphology/hindi_advanced.py,sha256=rB0m-vJy-dhWToNNnc7RkjcDSRkdefiCRNr1EnzVpOA,10722
tokmor/morphology/japanese.py,sha256=5RV_qzf1yDenqJU2rYLtens7mjUdEzOMPoU9VWXvHl4,16014
tokmor/morphology/japanese_advanced.py,sha256=YMCbAOsfZacvwpyY0mOW0O51oTlKJIdTTzzbUpChCyo,23342
tokmor/morphology/korean.py,sha256=GCgdKLx51Z_tObfH-tXwWuJ2bcO-Ap8lVZmuqWWWGgs,21868
tokmor/morphology/korean_advanced.py,sha256=oM3F5pmIy1ULWoS7I_n278U8fzR4OhIeUN9irHfOguE,25292
tokmor/morphology/russian_advanced.py,sha256=5WXwwf2zDtT7f1ChXBugHbehrhuajOZUaWHQzK564-A,9861
tokmor/morphology/spanish_advanced.py,sha256=o9L-iOrH6coyHVYfZMCtLdaxa08ae9Hkj3JgAIzd1BA,9520
tokmor/morphology/thai_native.py,sha256=06PzmhWYgCAvPignaMbRmphavQW1G6vp6Q9yjiCTjPI,17289
tokmor/morphology/tier2.py,sha256=I6mNushn2MYpPXo-L-IqPPcYp63c6y3quCCB30lTfEY,24360
tokmor/morphology/tier3.py,sha256=RIcdHv80VYN0P9qmg9ZZRvzrf9xJ-YAWZkfMhbTbHlY,19945
tokmor/morphology/tier4.py,sha256=PG_UCu48fRATCiTC7rWW_J9zT9JJZJwl11vKRRqesq8,17245
tokmor/morphology/unified.py,sha256=MndJLdy7LXvu26VfB7Mhp1GaK-GoE8ij7BscinNQXOg,32245
tokmor/morphology/universal_fallback.py,sha256=QKgMxOYBmWrTm-Uw71Q2iWIcRKuvK-ElyTO48oFsAKw,13406
tokmor/morphology/templates/__init__.py,sha256=TR8m-pWsEPPMUjr1G7YT4jwt_lUPGJr0L15CeC_PK88,844
tokmor/morphology/templates/arabic_script_template.py,sha256=xpByjeOvsmdQg6AlEuDtVRXuYZKmQgCfP_cabMvEGGQ,6224
tokmor/morphology/templates/brahmic_template.py,sha256=JHSR6TsjZBv7s6BhQgt2V7PSD9bibVhwmZPajWv2VLo,6640
tokmor/morphology/templates/cyrillic_template.py,sha256=usNMeWRAbEbactywDczORKB05utjM1ytreOMco1hpPo,6397
tokmor/morphology/templates/latin_template.py,sha256=DjVd_6IYbMLIjO5DkNO84ZPUcetelYILl9ifaUvj6nM,9577
tokmor/morphology/templates/other_scripts_template.py,sha256=HNsf4T-BT5evuDbhMcSI6nSnSYgSrIyFlPcwUEBNWJU,20326
tokmor-1.2.9.dist-info/licenses/LICENSE,sha256=OfjtVmlC7qrOAU3U7_NErSYR19dT8Zb0RBD-vNjq16E,1077
tokmor-1.2.9.dist-info/METADATA,sha256=tDKWR_UqHM5f7rOwVcttdXwlTiinLcWTLjofCH-6tyE,3763
tokmor-1.2.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
tokmor-1.2.9.dist-info/top_level.txt,sha256=Su9FBpMkpwogpvXhgjDVBKWrXIS9NSWbDCUVi1BNEx0,7
tokmor-1.2.9.dist-info/RECORD,,
tokmor-1.2.9.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2026 TokMor Contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
tokmor-1.2.9.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
tokmor