tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/offline.py ADDED
@@ -0,0 +1,89 @@
+ """
+ Offline enforcement
+ ===================
+
+ TokMor is designed to run fully offline at runtime.
+
+ This module hard-blocks any attempt to opt into online / remote behaviors via
+ TokMor-related environment variables.
+ """
+
+ from __future__ import annotations
+
+ import os
+ from typing import Iterable
+
+
+ _TRUTHY = {"1", "true", "yes", "y", "on"}
+
+
+ def _is_truthy(v: str | None) -> bool:
+     if v is None:
+         return False
+     return v.strip().lower() in _TRUTHY
+
+
+ _ALLOWED_TOKMOR_ENV = {
+     # resource routing
+     "TOKMOR_DATA_DIR",
+     "TOKMOR_MODELS_DIR",
+     "TOKMOR_LEMMA_DICT_DIR",
+     "TOKMOR_INFLECT_DIR",
+     "TOKMOR_INFLECT_KAIKKI_DIR",
+     "TOKMOR_INFLECT_UNIMORPH_DIR",
+     "TOKMOR_COARSE_MODELS_DIR",
+     # runtime toggles (only disabling things is allowed)
+     "TOKMOR_DISABLE_ML",
+     "TOKMOR_DISABLE_LEMMA_PACK",
+     "TOKMOR_DISABLE_EXTENDED_DICT",
+     "TOKMOR_DISABLE_DOMAIN_LEXICONS",
+     # quality knobs (offline-safe; local-only computation)
+     "TOKMOR_COARSE_HYBRID",
+     "TOKMOR_COARSE_HYBRID_MIN_PROB",
+     # language-specific behavior knobs
+     "TOKMOR_KO_LEMMA_STYLE",
+     "TOKMOR_ZH_JOIN_DATES",
+ }
+
+
+ def _iter_blocked_env_names() -> Iterable[str]:
+     """
+     Yield any TokMor-scoped env vars that are not explicitly allowed.
+     This prevents accidental opt-in flags from being introduced via env vars.
+     """
+     for k in os.environ.keys():
+         ku = k.upper()
+         if not ku.startswith("TOKMOR_"):
+             continue
+         if ku not in _ALLOWED_TOKMOR_ENV:
+             yield k
+
+
+ def enforce_offline() -> None:
+     """
+     Enforce an offline-only runtime.
+
+     Behavior:
+     - If any non-allowed TOKMOR_* env var is set (non-empty and not an explicit false) -> raise RuntimeError.
+     - This is intentionally strict to prevent accidental online/remote fallbacks.
+     """
+     offenders = []
+     for k in _iter_blocked_env_names():
+         v = os.getenv(k)
+         if v is None:
+             continue
+         vs = v.strip().lower()
+         if not vs:
+             continue
+         if vs in ("0", "false", "no", "n", "off"):
+             continue
+         # Any other non-empty, non-false value counts as an offender.
+         offenders.append(k)
+     if offenders:
+         offenders = sorted(set(offenders))
+         raise RuntimeError(
+             "TokMor is offline-only. Remove/disable non-allowed TOKMOR_* env vars: "
+             + ", ".join(offenders)
+         )
+
+
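A minimal usage sketch for the allowlist enforcement above; the failing variable name TOKMOR_ENABLE_REMOTE is hypothetical, chosen only to illustrate a key that is not on the allowlist:

    import os
    from tokmor.offline import enforce_offline

    os.environ["TOKMOR_DATA_DIR"] = "/srv/tokmor-assets"  # allowed: resource routing
    enforce_offline()  # passes

    os.environ["TOKMOR_ENABLE_REMOTE"] = "1"  # hypothetical key, not on the allowlist
    try:
        enforce_offline()
    except RuntimeError as err:
        print(err)  # TokMor is offline-only. Remove/disable non-allowed TOKMOR_* env vars: TOKMOR_ENABLE_REMOTE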
tokmor/preprocess.py ADDED
@@ -0,0 +1,80 @@
+ """
+ Preprocessing utilities (POS-free)
+ ==================================
+
+ TokMor's core product direction: fast preprocessing utilities that are useful even
+ without POS tagging or PPMI.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from typing import List
+
+ from .base import BaseTokenizer
+
+
+ def normalize_text(text: str, *, sns: bool = False) -> str:
+     """
+     Conservative normalization:
+     - Unicode NFC
+     - Strip control chars
+     - Normalize newlines
+     - Collapse excessive whitespace
+     """
+     if text is None:
+         return ""
+     text = BaseTokenizer.clean_text(str(text))
+
+     # SNS-friendly normalization (still deterministic, no language assumptions):
+     # - normalize common fullwidth ASCII variants (#@$% etc.) so downstream patterns work.
+     if sns:
+         # Convert fullwidth ASCII range FF01-FF5E to ASCII 21-7E.
+         # This is a common SNS normalization and improves URL/@/# detection.
+         def _fw_to_ascii(ch: str) -> str:
+             o = ord(ch)
+             if 0xFF01 <= o <= 0xFF5E:
+                 return chr(o - 0xFEE0)
+             return ch
+
+         text = "".join(_fw_to_ascii(ch) for ch in text)
+         # Normalize a few compatibility spaces often seen in SNS copy/paste.
+         text = text.replace("\u3000", " ")  # IDEOGRAPHIC SPACE
+     # Keep NFC, but avoid NFKC here to remain conservative.
+     text = unicodedata.normalize("NFC", text)
+     # Normalize newlines
+     text = text.replace("\r\n", "\n").replace("\r", "\n")
+     # Collapse spaces/tabs but keep newlines
+     text = re.sub(r"[ \t\f\v]+", " ", text)
+     # Trim trailing spaces per line
+     text = "\n".join([ln.strip() for ln in text.split("\n")])
+     # Collapse multiple blank lines
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     return text.strip()
+
+
+ def chunk_tokens(tokens: List[str], *, max_tokens: int, overlap: int = 0) -> List[List[str]]:
+     """
+     Chunk a token sequence into overlapping windows.
+     """
+     if max_tokens <= 0:
+         raise ValueError("max_tokens must be > 0")
+     if overlap < 0:
+         raise ValueError("overlap must be >= 0")
+     if overlap >= max_tokens:
+         raise ValueError("overlap must be < max_tokens")
+
+     out: List[List[str]] = []
+     i = 0
+     n = len(tokens)
+     step = max_tokens - overlap
+     while i < n:
+         out.append(tokens[i : i + max_tokens])
+         i += step
+     return out
+
+
+
+
+
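A short sketch of normalize_text under the code above, assuming BaseTokenizer.clean_text leaves these sample characters intact (the sample string is illustrative):

    from tokmor.preprocess import normalize_text

    raw = "＃tokmor　check   this\r\nout\n\n\n\nbye"
    # sns=True maps fullwidth ASCII (＃ -> #) and the ideographic space to plain forms,
    # then the conservative steps collapse whitespace and blank lines.
    print(normalize_text(raw, sns=True))   # '#tokmor check this\nout\n\nbye'
    print(normalize_text(raw))             # fullwidth ＃ is preserved without sns=True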
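And a worked example of the windowing arithmetic in chunk_tokens: the step between window starts is max_tokens - overlap, so consecutive windows share exactly overlap tokens:

    from tokmor.preprocess import chunk_tokens

    tokens = ["t0", "t1", "t2", "t3", "t4", "t5", "t6"]
    # step = 4 - 1 = 3, so windows start at indices 0, 3, 6
    print(chunk_tokens(tokens, max_tokens=4, overlap=1))
    # [['t0', 't1', 't2', 't3'], ['t3', 't4', 't5', 't6'], ['t6']]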
tokmor/resources.py ADDED
@@ -0,0 +1,288 @@
+ """
+ TokMor Resource Manager
+ =======================
+
+ OSS Core essentials:
+ - Keep code (the wheel) and data (assets) separate.
+ - In production, resources are swapped/extended via an external directory (TOKMOR_DATA_DIR).
+
+ Directory layout (default):
+     {data_dir}/
+         lemma_dict/     # production lemma lexicon: {lang}_lemma.pkl or {lang}.pkl
+         seg_lexicon/    # segmentation lexicons (zh/SEA, etc.)
+         domain/         # optional domain lexicons (e.g., sentiment)
+ """
+
+ from __future__ import annotations
+
+ import os
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Optional
+
+
+ def _package_models_dir() -> Path:
+     # NOTE: OSS core does not bundle large assets in the wheel/sdist.
+     # This path is used only if a downstream package/vendor bundles assets alongside code.
+     return Path(__file__).parent / "models"
+
+
+ def normalize_lang_for_models(lang: str) -> str:
+     """
+     Map Wikipedia-style codes (top100 list) to the closest available model code.
+     This is a pragmatic compatibility shim to make '100 languages' run end-to-end.
+     """
+     l = (lang or "").lower().replace("_", "-")
+     # direct aliases
+     alias = {
+         "simple": "en",
+         # ISO / wiki variants
+         "zh-cn": "zh",
+         "zh-tw": "zh",
+         # top100 mismatches / approximations
+         "als": "gsw",  # Alsatian Wikipedia ≈ Swiss German model
+         "li": "nl",  # Limburgish ≈ Dutch
+         "fy": "nl",  # West Frisian ≈ Dutch (best available)
+         "bs": "hr",  # Bosnian ≈ Croatian (best available)
+         "ast": "es",  # Asturian ≈ Spanish
+         "an": "es",  # Aragonese ≈ Spanish
+         "ckb": "kmr",  # Central Kurdish ≈ Kurmanji Kurdish
+         "ku": "kmr",
+         "ps": "ur",  # Pashto ≈ Urdu (Arabic script)
+         "sd": "ur",  # Sindhi ≈ Urdu
+         "yi": "de",  # Yiddish ≈ German
+         "tg": "ru",  # Tajik ≈ Russian (Cyrillic)
+         "uz": "tr",  # Uzbek ≈ Turkish
+         "mn": "bxr",  # Mongolian ≈ Buryat (Cyrillic)
+         # Missing in a smaller model snapshot: choose closest script/family
+         "ms": "id",  # Malay ≈ Indonesian
+         "su": "id",  # Sundanese ≈ Indonesian
+         "war": "tl",  # Waray ≈ Tagalog
+         "min": "id",  # Minangkabau ≈ Indonesian
+         "eo": "en",  # Esperanto ≈ English (fallback)
+         "oc": "fr",  # Occitan ≈ French
+         "ne": "hi",  # Nepali (Devanagari) ≈ Hindi
+         "kn": "hi",  # Kannada ≈ Hindi (fallback)
+         "pa": "hi",  # Punjabi ≈ Hindi (fallback)
+         "sw": "swl",  # Swahili model code in our snapshot
+         # SEA no-space: use Thai model as fallback if missing
+         "my": "th",
+         "km": "th",
+         "lo": "th",
+     }
+     return alias.get(l, l)
+
+
+ def normalize_lang_for_lemma(lang: str) -> str:
+     """
+     Lemma store aliasing: prefer a close high-resource lemma dict when missing.
+     (If no dict exists, UnifiedMorphAnalyzer still falls back to identity lemma.)
+     """
+     l = (lang or "").lower().replace("_", "-")
+     alias = {
+         "simple": "en",
+         "als": "de",
+         "li": "nl",
+         "fy": "nl",
+         "bs": "hr",
+         "ast": "es",
+         "an": "es",
+         "ckb": "fa",
+         "ku": "tr",
+         "ps": "ur",
+         "sd": "ur",
+         "yi": "de",
+         "tg": "ru",
+         "uz": "tr",
+         "mn": "ru",
+         "ms": "id",
+         "su": "id",
+         "war": "tl",
+         "ceb": "tl",
+         "min": "id",
+         "eo": "en",
+         "oc": "fr",
+         "bar": "de",
+         "nds": "de",
+         "lb": "de",
+         "fo": "da",
+         "mt": "it",
+         "am": "ar",
+         "ne": "hi",
+         "kn": "hi",
+         "pa": "hi",
+         "my": "th",
+         "km": "th",
+         "lo": "th",
+     }
+     return alias.get(l, l)
+
+
+ @lru_cache(maxsize=1)
+ def data_dir() -> Path:
+     """
+     Resource root directory.
+     Priority:
+     - TOKMOR_DATA_DIR
+     - bundled package directory tokmor/models
+     """
+     env = os.getenv("TOKMOR_DATA_DIR")
+     if env:
+         p = Path(env).expanduser()
+         return p
+     return _package_models_dir()
+
+
+ def _first_existing(*candidates: Path) -> Path:
+     for c in candidates:
+         if c.exists():
+             return c
+     # return first even if missing (callers often check exists())
+     return candidates[0]
+
+ def _first_existing_nonempty_dir(candidates: list[Path], *, glob_pat: str) -> Path:
+     """
+     Like _first_existing, but prefers directories that contain at least one matching file.
+     Used to avoid selecting empty packaged dirs (often containing only a .gitkeep) during dev.
+     """
+     for c in candidates:
+         try:
+             if c.exists() and c.is_dir() and any(c.glob(glob_pat)):
+                 return c
+         except Exception:
+             continue
+     # fallback: first existing dir
+     for c in candidates:
+         if c.exists():
+             return c
+     return candidates[0]
+
+
+ def multilingual_dir() -> Path:
+     env = os.getenv("TOKMOR_MODELS_DIR")  # optional override for legacy naming
+     if env:
+         return Path(env).expanduser()
+     base = data_dir()
+     return _first_existing(base / "multilingual", _package_models_dir() / "multilingual")
+
+
+ def pmi_crf_dir() -> Path:
+     """
+     (Deprecated) PMI/CRF resources were removed from OSS core.
+
+     This function remains for source compatibility but always returns the data_dir-based path.
+     """
+     return data_dir() / "pmi_crf"
+
+
+ def lemma_dict_dir() -> Path:
+     env = os.getenv("TOKMOR_LEMMA_DICT_DIR")
+     if env:
+         return Path(env).expanduser()
+     base = data_dir()
+     # OSS core: lemma dictionaries are optional external assets.
+     return _first_existing(base / "lemma_dict", _package_models_dir() / "lemma_dict")
+
+
+ def resolve_lemma_dict_path(lang: str) -> Optional[Path]:
+     """
+     Locate the production lemma lexicon path.
+     Priority:
+     - lemma_dict/{lang}.sqlite (or .db/.sqlite3)
+     - lemma_dict/{lang}.pkl
+     - lemma_dict/{lang}_lemma.pkl
+     """
+     lang = (lang or "").lower()
+     if not lang:
+         return None
+
+     # Allow forcing pack-less runtime (use only specialized analyzers + fallback rules).
+     # This is useful for "lite" deployments or experiments where lemma assets are unavailable.
+     v = os.getenv("TOKMOR_DISABLE_LEMMA_PACK", "").strip().lower()
+     if v in {"1", "true", "yes", "y", "on"}:
+         return None
+
+     ld = lemma_dict_dir()
+     for name in (f"{lang}.sqlite", f"{lang}.db", f"{lang}.sqlite3", f"{lang}.pkl", f"{lang}_lemma.pkl"):
+         p = ld / name
+         if p.exists():
+             return p
+
+     return None
+
+
+ def _removed_feature(*_args, **_kwargs):
+     raise RuntimeError("TokMor OSS core does not ship inflection packs.")
+
+
+ def seg_lexicon_dir() -> Path:
+     """
+     Segmentation lexicon directory (e.g., zh wordfreq for internal segmenters).
+     """
+     base = data_dir()
+     # IMPORTANT:
+     # - Users often set TOKMOR_DATA_DIR to a lemma pack that may NOT contain seg_lexicon/.
+     # - We still want bundled starter lexicons (zh/th/lo/km/my) to work by default.
+     # So: prefer a directory that actually contains lexicon files.
+     return _first_existing_nonempty_dir(
+         [base / "seg_lexicon", _package_models_dir() / "seg_lexicon"],
+         glob_pat="*.pkl",
+     )
+
+
+ def sea_wordlist_dir() -> Path:
+     """
+     SEA tokenizer wordlist directory (offline).
+     Files:
+         seg_lexicon/{lang}_wordlist.pkl (pickled set[str])
+         or seg_lexicon/{lang}_wordlist.txt (one token per line)
+     """
+     # Reuse seg_lexicon location (keeps artifacts simple)
+     return seg_lexicon_dir()
+
+
+ def resolve_sea_wordlist_path(lang: str) -> Optional[Path]:
+     lang = (lang or "").lower()
+     if not lang:
+         return None
+     d = sea_wordlist_dir()
+     for name in (f"{lang}_wordlist.pkl", f"{lang}_wordlist.txt"):
+         p = d / name
+         if p.exists():
+             return p
+     return None
+
+
+ def resolve_seg_lexicon_path(lang: str) -> Optional[Path]:
+     """
+     Resolve path to segmentation lexicon for a given language.
+     Currently used for zh internal segmenter improvements.
+     """
+     lang = (lang or "").lower()
+     if not lang:
+         return None
+     sd = seg_lexicon_dir()
+     for name in (f"{lang}_wordfreq.pkl", f"{lang}_seg_lexicon.pkl"):
+         p = sd / name
+         if p.exists():
+             return p
+     return None
+
+
+ def resolve_extra_dict_path(lang: str) -> Optional[Path]:
+     """
+     Optional extra lexicon for improving segmentation/merges.
+     Example:
+         seg_lexicon/zh_extra_dict.json (token -> pos)
+     """
+     lang = (lang or "").lower()
+     if not lang:
+         return None
+     sd = seg_lexicon_dir()
+     for name in (f"{lang}_extra_dict.json", f"{lang}_extra_lexicon.json"):
+         p = sd / name
+         if p.exists():
+             return p
+     return None
+
+
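A sketch of the language aliasing above; the return values follow the alias tables in this file, and the two functions intentionally map some codes differently:

    from tokmor.resources import normalize_lang_for_models, normalize_lang_for_lemma

    print(normalize_lang_for_models("zh_CN"))  # 'zh'  ('zh_CN' is normalized to 'zh-cn' first)
    print(normalize_lang_for_models("ckb"))    # 'kmr' (model alias)
    print(normalize_lang_for_lemma("ckb"))     # 'fa'  (lemma alias)
    print(normalize_lang_for_models("en"))     # 'en'  (unaliased codes pass through)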
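And a sketch of wiring up an external asset directory as described in the module docstring; the path and file names are illustrative, and resolution order follows resolve_lemma_dict_path:

    import os

    # data_dir() is lru_cache'd, so set TOKMOR_DATA_DIR before the first call.
    os.environ["TOKMOR_DATA_DIR"] = "/srv/tokmor-assets"

    from tokmor import resources

    # Assumed layout:
    #   /srv/tokmor-assets/lemma_dict/en.sqlite
    #   /srv/tokmor-assets/seg_lexicon/zh_wordfreq.pkl
    print(resources.data_dir())                      # /srv/tokmor-assets
    print(resources.resolve_lemma_dict_path("en"))   # .../lemma_dict/en.sqlite if present, else None
    print(resources.resolve_seg_lexicon_path("zh"))  # .../seg_lexicon/zh_wordfreq.pkl if present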
tokmor/routing.py ADDED
@@ -0,0 +1,147 @@
+ from __future__ import annotations
+
+ import re
+ from typing import Any, Dict, List, Tuple
+
+
+ # Script buckets we care about for preprocessing/NER routing.
+ _SCRIPT_RX: Dict[str, re.Pattern] = {
+     "hangul": re.compile(r"[\uac00-\ud7af]"),
+     "hiragana": re.compile(r"[\u3040-\u309f]"),
+     "katakana": re.compile(r"[\u30a0-\u30ff]"),
+     "han": re.compile(r"[\u4e00-\u9fff]"),
+     "arabic": re.compile(r"[\u0600-\u06ff]"),
+     "hebrew": re.compile(r"[\u0590-\u05ff]"),
+     "thai": re.compile(r"[\u0e00-\u0e7f]"),
+     "devanagari": re.compile(r"[\u0900-\u097f]"),
+     "cyrillic": re.compile(r"[\u0400-\u04ff]"),
+     "latin": re.compile(r"[A-Za-z]"),
+ }
+
+
+ def script_counts(text: str) -> Dict[str, int]:
+     t = text or ""
+     return {k: len(rx.findall(t)) for k, rx in _SCRIPT_RX.items()}
+
+
+ def script_profile(text: str) -> Dict[str, Any]:
+     """
+     Return an explainable script profile for routing.
+
+     This is intentionally *not* a language detector. It's a script detector.
+     """
+     counts = script_counts(text)
+     total = int(sum(counts.values()))
+     if total <= 0:
+         return {
+             "total": 0,
+             "counts": counts,
+             "ratios": {k: 0.0 for k in counts},
+             "dominant": None,
+             "dominant_ratio": 0.0,
+             "is_mixed": False,
+             "top": [],
+         }
+
+     ratios = {k: (v / total) for k, v in counts.items()}
+     dominant = max(ratios, key=ratios.get)
+     dominant_ratio = float(ratios[dominant])
+     top = sorted(((k, float(ratios[k]), int(counts[k])) for k in ratios if counts[k] > 0), key=lambda x: x[1], reverse=True)
+
+     # Mixed script heuristic: dominant script not strong enough.
+     is_mixed = bool(dominant_ratio < 0.30)
+
+     return {
+         "total": total,
+         "counts": counts,
+         "ratios": {k: round(float(v), 4) for k, v in ratios.items()},
+         "dominant": dominant,
+         "dominant_ratio": round(float(dominant_ratio), 4),
+         "is_mixed": is_mixed,
+         "top": [{"script": s, "ratio": round(r, 4), "count": c} for (s, r, c) in top],
+         "has_kana": bool(counts["hiragana"] + counts["katakana"]),
+         "has_cjk": bool(counts["han"]),
+     }
+
+
+ def structure_profile(lang: str, sp: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Structure routing hints for downstream (e.g., NER adapter selection).
+
+     IMPORTANT:
+     - This does NOT promise full morphology for a language.
+     - It only recommends which *type* of adapter tends to matter for NER.
+     """
+     l = (lang or "").lower()
+     dom = str(sp.get("dominant") or "")
+
+     # Default: space-delimited text, unknown morphology complexity.
+     stype = "space_delimited"
+     needs_segmentation = False
+     needs_morpheme_split = False
+
+     # High-signal languages (known by tokmor's lightweight language detector)
+     if l == "ko":
+         stype = "agglutinative_hangul"
+         needs_morpheme_split = True
+     elif l == "ja":
+         stype = "agglutinative_japanese"
+         needs_segmentation = True
+         needs_morpheme_split = True
+     elif l == "zh":
+         stype = "unsegmented_han"
+         needs_segmentation = True
+     elif l in {"fi", "hu", "tr", "et"}:
+         # Rich suffixing languages written in Latin script.
+         # For NER, a conservative suffix-strip/suffix-split adapter is often helpful.
+         stype = "suffixing_latin"
+         needs_morpheme_split = True
+     elif l in {"ar", "he"}:
+         stype = "semitic_rtl"
+         needs_morpheme_split = True
+     elif l == "th":
+         stype = "unsegmented_thai"
+         needs_segmentation = True
+     elif l == "hi":
+         stype = "indic_devanagari"
+     elif l == "ru":
+         stype = "slavic_cyrillic"
+
+     # Script-only hints when lang is unknown/mixed
+     if not l or l == "auto":
+         if dom in {"han", "hiragana", "katakana"} or bool(sp.get("has_kana")):
+             stype = "unsegmented_cjk"
+             needs_segmentation = True
+         elif dom == "thai":
+             stype = "unsegmented_thai"
+             needs_segmentation = True
+         elif dom in {"arabic", "hebrew"}:
+             stype = "rtl_script"
+             needs_morpheme_split = True
+
+     # Recommendations for NER candidate generation (core-neutral)
+     rec = {
+         "prefer_token_quality": True,
+         # For candidate generation, morpheme splitting is especially useful for agglutinative & semitic scripts.
+         "ner_prefer_morpheme_split": bool(needs_morpheme_split),
+         # For no-space scripts, segmentation quality is key.
+         "ner_needs_segmentation": bool(needs_segmentation),
+     }
+
+     return {
+         "type": stype,
+         "needs_segmentation": bool(needs_segmentation),
+         "needs_morpheme_split": bool(needs_morpheme_split),
+         "recommendations": rec,
+     }
+
+
+ def route(text: str, *, lang: str) -> Dict[str, Any]:
+     """
+     Combined routing payload (script + structure).
+     """
+     sp = script_profile(text)
+     st = structure_profile(lang, sp)
+     return {"script": sp, "structure": st}
+
+
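A sketch of the combined routing payload produced by route(); the keys follow the dictionaries built above, and the example strings are illustrative:

    from tokmor.routing import route

    r = route("쿠팡에서 노트북을 샀다", lang="ko")
    print(r["script"]["dominant"])                 # 'hangul'
    print(r["structure"]["type"])                  # 'agglutinative_hangul'
    print(r["structure"]["needs_morpheme_split"])  # True

    # With lang='auto', structure hints fall back to script-only routing.
    print(route("東京タワーに行った", lang="auto")["structure"]["type"])  # 'unsegmented_cjk'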