tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/offline.py
ADDED
@@ -0,0 +1,89 @@
"""
Offline enforcement
===================

TokMor is designed to run fully offline at runtime.

This module hard-blocks any attempt to opt into online / remote behaviors via
TokMor-related environment variables.
"""

from __future__ import annotations

import os
from typing import Iterable


_TRUTHY = {"1", "true", "yes", "y", "on"}


def _is_truthy(v: str | None) -> bool:
    if v is None:
        return False
    return v.strip().lower() in _TRUTHY


_ALLOWED_TOKMOR_ENV = {
    # resource routing
    "TOKMOR_DATA_DIR",
    "TOKMOR_MODELS_DIR",
    "TOKMOR_LEMMA_DICT_DIR",
    "TOKMOR_INFLECT_DIR",
    "TOKMOR_INFLECT_KAIKKI_DIR",
    "TOKMOR_INFLECT_UNIMORPH_DIR",
    "TOKMOR_COARSE_MODELS_DIR",
    # runtime toggles (only disabling things is allowed)
    "TOKMOR_DISABLE_ML",
    "TOKMOR_DISABLE_LEMMA_PACK",
    "TOKMOR_DISABLE_EXTENDED_DICT",
    "TOKMOR_DISABLE_DOMAIN_LEXICONS",
    # quality knobs (offline-safe; local-only computation)
    "TOKMOR_COARSE_HYBRID",
    "TOKMOR_COARSE_HYBRID_MIN_PROB",
    # language-specific behavior knobs
    "TOKMOR_KO_LEMMA_STYLE",
    "TOKMOR_ZH_JOIN_DATES",
}


def _iter_blocked_env_names() -> Iterable[str]:
    """
    Block any TokMor-scoped env vars that are not explicitly allowed.
    This prevents any accidental opt-in flags from being introduced via env vars.
    """
    for k in os.environ.keys():
        ku = k.upper()
        if not ku.startswith("TOKMOR_"):
            continue
        if ku not in _ALLOWED_TOKMOR_ENV:
            yield k


def enforce_offline() -> None:
    """
    Enforce offline-only runtime.

    Behavior:
    - If any non-allowed TOKMOR_* env var is set (non-empty and not an explicit false) -> raise RuntimeError.
    - This is intentionally strict to prevent accidental online/remote fallbacks.
    """
    offenders = []
    for k in _iter_blocked_env_names():
        v = os.getenv(k)
        if v is None:
            continue
        vs = v.strip().lower()
        if not vs:
            continue
        if vs in ("0", "false", "no", "n", "off"):
            continue
        if _is_truthy(v) or vs:
            offenders.append(k)
    if offenders:
        offenders = sorted(set(offenders))
        raise RuntimeError(
            "TokMor is offline-only. Remove/disable non-allowed TOKMOR_* env vars: "
            + ", ".join(offenders)
        )
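A minimal usage sketch (not part of the package diff; the variable name TOKMOR_ENABLE_REMOTE and the path /opt/tokmor-assets are hypothetical) showing how enforce_offline() treats allowed versus non-allowed TOKMOR_* variables:

import os

from tokmor.offline import enforce_offline

# Allowed variable: resource routing only, so the guard passes silently.
os.environ["TOKMOR_DATA_DIR"] = "/opt/tokmor-assets"
enforce_offline()

# Any other TOKMOR_* variable set to a non-empty, non-false value is rejected.
os.environ["TOKMOR_ENABLE_REMOTE"] = "1"  # hypothetical opt-in flag, not in _ALLOWED_TOKMOR_ENV
try:
    enforce_offline()
except RuntimeError as err:
    print(err)  # message lists the offending variable name(s)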
tokmor/preprocess.py
ADDED
@@ -0,0 +1,80 @@
"""
Preprocessing utilities (POS-free)
==================================

TokMor's core product direction: fast preprocessing utilities that are useful even
without POS tagging or PPMI.
"""

from __future__ import annotations

import re
import unicodedata
from typing import List

from .base import BaseTokenizer


def normalize_text(text: str, *, sns: bool = False) -> str:
    """
    Conservative normalization:
    - Unicode NFC
    - Strip control chars
    - Normalize newlines
    - Collapse excessive whitespace
    """
    if text is None:
        return ""
    text = BaseTokenizer.clean_text(str(text))

    # SNS-friendly normalization (still deterministic, no language assumptions):
    # - normalize common fullwidth ASCII variants (#@$% etc.) so downstream patterns work.
    if sns:
        # Convert fullwidth ASCII range FF01-FF5E to ASCII 21-7E.
        # This is a common SNS normalization and improves URL/@/# detection.
        def _fw_to_ascii(ch: str) -> str:
            o = ord(ch)
            if 0xFF01 <= o <= 0xFF5E:
                return chr(o - 0xFEE0)
            return ch

        text = "".join(_fw_to_ascii(ch) for ch in text)
        # Normalize a few compatibility spaces often seen in SNS copy/paste.
        text = text.replace("\u3000", " ")  # IDEOGRAPHIC SPACE
    # Keep NFC, but avoid NFKC here to remain conservative.
    text = unicodedata.normalize("NFC", text)
    # Normalize newlines
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse spaces/tabs but keep newlines
    text = re.sub(r"[ \t\f\v]+", " ", text)
    # Trim trailing spaces per line
    text = "\n".join([ln.strip() for ln in text.split("\n")])
    # Collapse multiple blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def chunk_tokens(tokens: List[str], *, max_tokens: int, overlap: int = 0) -> List[List[str]]:
    """
    Chunk a token sequence into overlapping windows.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be > 0")
    if overlap < 0:
        raise ValueError("overlap must be >= 0")
    if overlap >= max_tokens:
        raise ValueError("overlap must be < max_tokens")

    out: List[List[str]] = []
    i = 0
    n = len(tokens)
    step = max_tokens - overlap
    while i < n:
        out.append(tokens[i : i + max_tokens])
        i += step
    return out
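An illustrative call sequence for the two helpers above (the input string is invented, and only the whitespace/fullwidth behavior shown in this module is assumed; BaseTokenizer.clean_text is defined elsewhere in the package):

from tokmor.preprocess import normalize_text, chunk_tokens

raw = "Ｈｅｌｌｏ　＠ｕｓｅｒ！\r\nsee  https://example.com\r\n\r\n\r\nbye"
clean = normalize_text(raw, sns=True)
# sns=True folds fullwidth ASCII to ASCII and replaces the ideographic space;
# CRLF becomes LF, runs of spaces/tabs collapse, and blank lines are capped at one.

tokens = clean.split()
# Windows of 4 tokens that advance by 3 (max_tokens - overlap) tokens each step.
windows = chunk_tokens(tokens, max_tokens=4, overlap=1)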
tokmor/resources.py
ADDED
@@ -0,0 +1,288 @@
"""
TokMor Resource Manager
=======================

OSS Core essentials:
- Separate code (the wheel) from data (assets).
- In production, resources are replaced/extended via an external directory (TOKMOR_DATA_DIR).

Directory layout (default):
  {data_dir}/
    lemma_dict/   # product lemma lexicon: {lang}_lemma.pkl or {lang}.pkl
    seg_lexicon/  # segmentation lexicons (zh/SEA, etc.)
    domain/       # optional domain lexicons (e.g., sentiment)
"""

from __future__ import annotations

import os
from functools import lru_cache
from pathlib import Path
from typing import Optional


def _package_models_dir() -> Path:
    # NOTE: OSS core does not bundle large assets in the wheel/sdist.
    # This path is used only if a downstream package/vendor bundles assets alongside code.
    return Path(__file__).parent / "models"


def normalize_lang_for_models(lang: str) -> str:
    """
    Map Wikipedia-style codes (top100 list) to the closest available model code.
    This is a pragmatic compatibility shim to make '100 languages' run end-to-end.
    """
    l = (lang or "").lower().replace("_", "-")
    # direct aliases
    alias = {
        "simple": "en",
        # ISO / wiki variants
        "zh-cn": "zh",
        "zh-tw": "zh",
        # top100 mismatches / approximations
        "als": "gsw",  # Alsatian Wikipedia ≈ Swiss German model
        "li": "nl",    # Limburgish ≈ Dutch
        "fy": "nl",    # West Frisian ≈ Dutch (best available)
        "bs": "hr",    # Bosnian ≈ Croatian (best available)
        "ast": "es",   # Asturian ≈ Spanish
        "an": "es",    # Aragonese ≈ Spanish
        "ckb": "kmr",  # Central Kurdish ≈ Kurmanji Kurdish
        "ku": "kmr",
        "ps": "ur",    # Pashto ≈ Urdu (Arabic script)
        "sd": "ur",    # Sindhi ≈ Urdu
        "yi": "de",    # Yiddish ≈ German
        "tg": "ru",    # Tajik ≈ Russian (Cyrillic)
        "uz": "tr",    # Uzbek ≈ Turkish
        "mn": "bxr",   # Mongolian ≈ Buryat (Cyrillic)
        # Missing in a smaller model snapshot: choose closest script/family
        "ms": "id",    # Malay ≈ Indonesian
        "su": "id",    # Sundanese ≈ Indonesian
        "war": "tl",   # Waray ≈ Tagalog
        "min": "id",   # Minangkabau ≈ Indonesian
        "eo": "en",    # Esperanto ≈ English (fallback)
        "oc": "fr",    # Occitan ≈ French
        "ne": "hi",    # Nepali (Devanagari) ≈ Hindi
        "kn": "hi",    # Kannada ≈ Hindi (fallback)
        "pa": "hi",    # Punjabi ≈ Hindi (fallback)
        "sw": "swl",   # Swahili model code in our snapshot
        # SEA no-space: use Thai model as fallback if missing
        "my": "th",
        "km": "th",
        "lo": "th",
    }
    return alias.get(l, l)


def normalize_lang_for_lemma(lang: str) -> str:
    """
    Lemma store aliasing: prefer a close high-resource lemma dict when missing.
    (If no dict exists, UnifiedMorphAnalyzer still falls back to identity lemma.)
    """
    l = (lang or "").lower().replace("_", "-")
    alias = {
        "simple": "en",
        "als": "de",
        "li": "nl",
        "fy": "nl",
        "bs": "hr",
        "ast": "es",
        "an": "es",
        "ckb": "fa",
        "ku": "tr",
        "ps": "ur",
        "sd": "ur",
        "yi": "de",
        "tg": "ru",
        "uz": "tr",
        "mn": "ru",
        "ms": "id",
        "su": "id",
        "war": "tl",
        "ceb": "tl",
        "min": "id",
        "eo": "en",
        "oc": "fr",
        "bar": "de",
        "nds": "de",
        "lb": "de",
        "fo": "da",
        "mt": "it",
        "am": "ar",
        "ne": "hi",
        "kn": "hi",
        "pa": "hi",
        "my": "th",
        "km": "th",
        "lo": "th",
    }
    return alias.get(l, l)


@lru_cache(maxsize=1)
def data_dir() -> Path:
    """
    Resource root directory.
    Priority:
    - TOKMOR_DATA_DIR
    - packaged tokmor/models bundle
    """
    env = os.getenv("TOKMOR_DATA_DIR")
    if env:
        p = Path(env).expanduser()
        return p
    return _package_models_dir()


def _first_existing(*candidates: Path) -> Path:
    for c in candidates:
        if c.exists():
            return c
    # return first even if missing (callers often check exists())
    return candidates[0]


def _first_existing_nonempty_dir(candidates: list[Path], *, glob_pat: str) -> Path:
    """
    Like _first_existing, but prefers directories that contain at least one matching file.
    Used to avoid selecting empty packaged dirs (often only contain .gitkeep) during dev.
    """
    for c in candidates:
        try:
            if c.exists() and c.is_dir() and any(c.glob(glob_pat)):
                return c
        except Exception:
            continue
    # fallback: first existing dir
    for c in candidates:
        if c.exists():
            return c
    return candidates[0]


def multilingual_dir() -> Path:
    env = os.getenv("TOKMOR_MODELS_DIR")  # optional override for legacy naming
    if env:
        return Path(env).expanduser()
    base = data_dir()
    return _first_existing(base / "multilingual", _package_models_dir() / "multilingual")


def pmi_crf_dir() -> Path:
    """
    (Deprecated) PMI/CRF resources were removed from OSS core.

    This function remains for source compatibility but always returns the data_dir-based path.
    """
    return data_dir() / "pmi_crf"


def lemma_dict_dir() -> Path:
    env = os.getenv("TOKMOR_LEMMA_DICT_DIR")
    if env:
        return Path(env).expanduser()
    base = data_dir()
    # OSS core: lemma dictionaries are optional external assets.
    return _first_existing(base / "lemma_dict", _package_models_dir() / "lemma_dict")


def resolve_lemma_dict_path(lang: str) -> Optional[Path]:
    """
    Locate the product lemma lexicon path.
    Priority:
    - lemma_dict/{lang}.sqlite (or .db/.sqlite3)
    - lemma_dict/{lang}.pkl
    - lemma_dict/{lang}_lemma.pkl
    """
    lang = (lang or "").lower()
    if not lang:
        return None

    # Allow forcing pack-less runtime (use only specialized analyzers + fallback rules).
    # This is useful for "lite" deployments or experiments where lemma assets are unavailable.
    v = os.getenv("TOKMOR_DISABLE_LEMMA_PACK", "").strip().lower()
    if v in {"1", "true", "yes", "y", "on"}:
        return None

    ld = lemma_dict_dir()
    for name in (f"{lang}.sqlite", f"{lang}.db", f"{lang}.sqlite3", f"{lang}.pkl", f"{lang}_lemma.pkl"):
        p = ld / name
        if p.exists():
            return p

    return None


def _removed_feature(*_args, **_kwargs):
    raise RuntimeError("TokMor OSS core does not ship inflection packs.")


def seg_lexicon_dir() -> Path:
    """
    Segmentation lexicon directory (e.g., zh wordfreq for internal segmenters).
    """
    base = data_dir()
    # IMPORTANT:
    # - Users often set TOKMOR_DATA_DIR to a lemma pack that may NOT contain seg_lexicon/.
    # - We still want bundled starter lexicons (zh/th/lo/km/my) to work by default.
    # So: prefer a directory that actually contains lexicon files.
    return _first_existing_nonempty_dir(
        [base / "seg_lexicon", _package_models_dir() / "seg_lexicon"],
        glob_pat="*.pkl",
    )


def sea_wordlist_dir() -> Path:
    """
    SEA tokenizer wordlist directory (offline).
    Files:
      seg_lexicon/{lang}_wordlist.pkl (pickled set[str])
      or seg_lexicon/{lang}_wordlist.txt (one token per line)
    """
    # Reuse seg_lexicon location (keeps artifacts simple)
    return seg_lexicon_dir()


def resolve_sea_wordlist_path(lang: str) -> Optional[Path]:
    lang = (lang or "").lower()
    if not lang:
        return None
    d = sea_wordlist_dir()
    for name in (f"{lang}_wordlist.pkl", f"{lang}_wordlist.txt"):
        p = d / name
        if p.exists():
            return p
    return None


def resolve_seg_lexicon_path(lang: str) -> Optional[Path]:
    """
    Resolve path to segmentation lexicon for a given language.
    Currently used for zh internal segmenter improvements.
    """
    lang = (lang or "").lower()
    if not lang:
        return None
    sd = seg_lexicon_dir()
    for name in (f"{lang}_wordfreq.pkl", f"{lang}_seg_lexicon.pkl"):
        p = sd / name
        if p.exists():
            return p
    return None


def resolve_extra_dict_path(lang: str) -> Optional[Path]:
    """
    Optional extra lexicon for improving segmentation/merges.
    Example:
      seg_lexicon/zh_extra_dict.json (token -> pos)
    """
    lang = (lang or "").lower()
    if not lang:
        return None
    sd = seg_lexicon_dir()
    for name in (f"{lang}_extra_dict.json", f"{lang}_extra_lexicon.json"):
        p = sd / name
        if p.exists():
            return p
    return None
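A sketch of the resolution flow (the asset root path is hypothetical, and the lemma pack is an optional external asset, so the lookup may return None on a bare install):

import os

from tokmor.resources import normalize_lang_for_lemma, resolve_lemma_dict_path

# External asset root (code/data separation); read on the first data_dir() call.
os.environ["TOKMOR_DATA_DIR"] = "/opt/tokmor-assets"

lang = normalize_lang_for_lemma("ms")  # Malay is aliased to "id"
path = resolve_lemma_dict_path(lang)
# Checks lemma_dict/id.sqlite, id.db, id.sqlite3, id.pkl, id_lemma.pkl under the
# asset root, in that order, and returns the first existing path or None.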
tokmor/routing.py
ADDED
@@ -0,0 +1,147 @@
from __future__ import annotations

import re
from typing import Any, Dict, List, Tuple


# Script buckets we care about for preprocessing/NER routing.
_SCRIPT_RX: Dict[str, re.Pattern] = {
    "hangul": re.compile(r"[\uac00-\ud7af]"),
    "hiragana": re.compile(r"[\u3040-\u309f]"),
    "katakana": re.compile(r"[\u30a0-\u30ff]"),
    "han": re.compile(r"[\u4e00-\u9fff]"),
    "arabic": re.compile(r"[\u0600-\u06ff]"),
    "hebrew": re.compile(r"[\u0590-\u05ff]"),
    "thai": re.compile(r"[\u0e00-\u0e7f]"),
    "devanagari": re.compile(r"[\u0900-\u097f]"),
    "cyrillic": re.compile(r"[\u0400-\u04ff]"),
    "latin": re.compile(r"[A-Za-z]"),
}


def script_counts(text: str) -> Dict[str, int]:
    t = text or ""
    return {k: len(rx.findall(t)) for k, rx in _SCRIPT_RX.items()}


def script_profile(text: str) -> Dict[str, Any]:
    """
    Return an explainable script profile for routing.

    This is intentionally *not* a language detector. It's a script detector.
    """
    counts = script_counts(text)
    total = int(sum(counts.values()))
    if total <= 0:
        return {
            "total": 0,
            "counts": counts,
            "ratios": {k: 0.0 for k in counts},
            "dominant": None,
            "dominant_ratio": 0.0,
            "is_mixed": False,
            "top": [],
        }

    ratios = {k: (v / total) for k, v in counts.items()}
    dominant = max(ratios, key=ratios.get)
    dominant_ratio = float(ratios[dominant])
    top = sorted(((k, float(ratios[k]), int(counts[k])) for k in ratios if counts[k] > 0), key=lambda x: x[1], reverse=True)

    # Mixed script heuristic: dominant script not strong enough.
    is_mixed = bool(dominant_ratio < 0.30)

    return {
        "total": total,
        "counts": counts,
        "ratios": {k: round(float(v), 4) for k, v in ratios.items()},
        "dominant": dominant,
        "dominant_ratio": round(float(dominant_ratio), 4),
        "is_mixed": is_mixed,
        "top": [{"script": s, "ratio": round(r, 4), "count": c} for (s, r, c) in top],
        "has_kana": bool(counts["hiragana"] + counts["katakana"]),
        "has_cjk": bool(counts["han"]),
    }


def structure_profile(lang: str, sp: Dict[str, Any]) -> Dict[str, Any]:
    """
    Structure routing hints for downstream (e.g., NER adapter selection).

    IMPORTANT:
    - This does NOT promise full morphology for a language.
    - It only recommends which *type* of adapter tends to matter for NER.
    """
    l = (lang or "").lower()
    dom = str(sp.get("dominant") or "")

    # Default: space-delimited text, unknown morphology complexity.
    stype = "space_delimited"
    needs_segmentation = False
    needs_morpheme_split = False

    # High-signal languages (known by tokmor's lightweight language detector)
    if l == "ko":
        stype = "agglutinative_hangul"
        needs_morpheme_split = True
    elif l == "ja":
        stype = "agglutinative_japanese"
        needs_segmentation = True
        needs_morpheme_split = True
    elif l == "zh":
        stype = "unsegmented_han"
        needs_segmentation = True
    elif l in {"fi", "hu", "tr", "et"}:
        # Rich suffixing languages written in Latin script.
        # For NER, a conservative suffix-strip/suffix-split adapter is often helpful.
        stype = "suffixing_latin"
        needs_morpheme_split = True
    elif l in {"ar", "he"}:
        stype = "semitic_rtl"
        needs_morpheme_split = True
    elif l == "th":
        stype = "unsegmented_thai"
        needs_segmentation = True
    elif l == "hi":
        stype = "indic_devanagari"
    elif l == "ru":
        stype = "slavic_cyrillic"

    # Script-only hints when lang is unknown/mixed
    if not l or l == "auto":
        if dom in {"han", "hiragana", "katakana"} or bool(sp.get("has_kana")):
            stype = "unsegmented_cjk"
            needs_segmentation = True
        elif dom == "thai":
            stype = "unsegmented_thai"
            needs_segmentation = True
        elif dom in {"arabic", "hebrew"}:
            stype = "rtl_script"
            needs_morpheme_split = True

    # Recommendations for NER candidate generation (core-neutral)
    rec = {
        "prefer_token_quality": True,
        # For candidate generation, morpheme splitting is especially useful for agglutinative & semitic scripts.
        "ner_prefer_morpheme_split": bool(needs_morpheme_split),
        # For no-space scripts, segmentation quality is key.
        "ner_needs_segmentation": bool(needs_segmentation),
    }

    return {
        "type": stype,
        "needs_segmentation": bool(needs_segmentation),
        "needs_morpheme_split": bool(needs_morpheme_split),
        "recommendations": rec,
    }


def route(text: str, *, lang: str) -> Dict[str, Any]:
    """
    Combined routing payload (script + structure).
    """
    sp = script_profile(text)
    st = structure_profile(lang, sp)
    return {"script": sp, "structure": st}
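An example of the combined routing payload (the sample sentences are invented; the keys follow the functions above, and exact ratios depend on the input text):

from tokmor.routing import route

payload = route("東京タワーは綺麗です", lang="auto")
# payload["script"]    -> per-script counts/ratios, dominant script, has_kana/has_cjk
# payload["structure"] -> {"type": "unsegmented_cjk", "needs_segmentation": True, ...}

# With an explicit language hint, the structure type becomes language-specific:
route("서울에서 만나요", lang="ko")["structure"]["type"]  # "agglutinative_hangul"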