controlmt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- controlmt/__init__.py +25 -0
- controlmt/_version.py +1 -0
- controlmt/batching.py +47 -0
- controlmt/client.py +202 -0
- controlmt/device.py +129 -0
- controlmt-0.1.0.dist-info/METADATA +135 -0
- controlmt-0.1.0.dist-info/RECORD +9 -0
- controlmt-0.1.0.dist-info/WHEEL +5 -0
- controlmt-0.1.0.dist-info/top_level.txt +1 -0
controlmt/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""ControlMT — Python SDK for the ControlMT v2.3 KN↔EN translator.
|
|
2
|
+
|
|
3
|
+
Drop-in entry point for users who don't want to wire HuggingFace Transformers
|
|
4
|
+
themselves. Picks the right device + dtype + quantization automatically;
|
|
5
|
+
overridable when you need control.
|
|
6
|
+
|
|
7
|
+
Quick start:
|
|
8
|
+
from controlmt import ControlMT
|
|
9
|
+
model = ControlMT.from_hf() # auto everything
|
|
10
|
+
print(model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ.")) # → "I speak Kannada."
|
|
11
|
+
|
|
12
|
+
Explicit:
|
|
13
|
+
model = ControlMT.from_hf(device="cpu", quant="int8") # int8 dynamic on CPU
|
|
14
|
+
model = ControlMT.from_hf(device="gpu", dtype="float16") # fp16 on GPU
|
|
15
|
+
model = ControlMT.from_hf(device="auto") # GPU-with-CPU-fallback (default)
|
|
16
|
+
|
|
17
|
+
Batched (user-defined size, defaults to 1 = no batching):
|
|
18
|
+
model.batch_translate(texts, batch_size=8)
|
|
19
|
+
model.batch_translate(texts, auto_batch=True) # GPU-only auto-fit by VRAM
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from controlmt.client import ControlMT
|
|
23
|
+
from controlmt._version import __version__
|
|
24
|
+
|
|
25
|
+
__all__ = ["ControlMT", "__version__"]
|
controlmt/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; re-exported from the package root as `controlmt.__version__`.
__version__ = "0.1.0"
|
controlmt/batching.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Auto-batch sizing for GPU inference.
|
|
2
|
+
|
|
3
|
+
ControlMT's `translate()` runs beam search sentence-by-sentence, but we
|
|
4
|
+
fan translations out across N concurrent calls with a ThreadPoolExecutor. N is bounded by free VRAM.
|
|
5
|
+
|
|
6
|
+
Memory per concurrent translation at beam=2, max_len=256:
|
|
7
|
+
fp16: ~25 MB (encoder + decoder state + beam tensors + kv-cache)
|
|
8
|
+
bf16: ~25 MB
|
|
9
|
+
fp32: ~50 MB
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
import warnings
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Estimated memory (MB) consumed by one in-flight translation, keyed by the
# dtype name, or by the quantization mode when quantization is active.
_MEM_PER_SENT_MB = {
    "float16": 25,
    "bfloat16": 25,
    "float32": 50,
    "int8-dynamic": 12,
}
_MAX_BATCH = 64  # hard ceiling; ControlMT.translate() is per-sentence beam search, gain plateaus past ~16
_TARGET_VRAM_FRAC = 0.8  # fraction of the free VRAM we allow ourselves to plan for


def auto_batch_size(device: str, dtype: str, quant: str = "none",
                    free_vram_mb: float | None = None) -> int:
    """Pick a concurrency level that should fit in ~80% of the free VRAM.

    Args:
        device: Resolved device name; anything other than "cuda" is treated
            as "no GPU".
        dtype: Dtype name used to look up the per-translation footprint when
            ``quant`` is "none".
        quant: Quantization mode; when not "none" it is used as the lookup
            key instead of ``dtype``.
        free_vram_mb: Free VRAM in MB. When omitted it is read from
            ``torch.cuda.mem_get_info()``.

    Returns:
        1 (after emitting a RuntimeWarning) when ``device`` is not "cuda";
        otherwise a value clamped to the range 1.._MAX_BATCH.
    """
    on_gpu = device == "cuda"
    if not on_gpu:
        warnings.warn(
            "auto_batch=True ignored on CPU (no VRAM signal to size against). "
            "Pass an explicit batch_size for batched CPU translation.",
            RuntimeWarning, stacklevel=2)
        return 1

    if free_vram_mb is None:
        # Imported lazily so that callers who supply the figure never need torch here.
        import torch
        free_bytes = torch.cuda.mem_get_info()[0]
        free_vram_mb = free_bytes / 1024**2

    # Unknown keys fall back to the most conservative (fp32-sized) estimate.
    footprint_key = dtype if quant == "none" else quant
    per_sent_mb = _MEM_PER_SENT_MB.get(footprint_key, 50)

    budget_mb = free_vram_mb * _TARGET_VRAM_FRAC
    fit = int(budget_mb // per_sent_mb)
    return min(max(fit, 1), _MAX_BATCH)
|
controlmt/client.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""High-level client for ControlMT v2.3."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
import warnings
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
8
|
+
from typing import Iterable, Sequence
|
|
9
|
+
|
|
10
|
+
from controlmt.device import ResolvedConfig, detect_libraries, resolve
|
|
11
|
+
from controlmt.batching import auto_batch_size
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
DEFAULT_MODEL_ID = "anandkaman/controlmt-v2.3"

# Matches one code point from the Kannada Unicode block (U+0C80–U+0CFF).
# The range is spelled with escapes on purpose: the upper bound is an
# unassigned code point that editors and text-extraction tools tend to drop,
# which would leave a class matching only U+0C80 and a literal hyphen.
_KN_RE = re.compile(r"[\u0C80-\u0CFF]")


class ControlMT:
    """High-level wrapper around the HuggingFace `model.translate()` API.

    Usage:
        from controlmt import ControlMT
        model = ControlMT.from_hf()                      # auto
        model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ.")
        model.batch_translate(texts, batch_size=8)
        model.batch_translate(texts, auto_batch=True)    # GPU only
    """

    # ──────────────────────────────────────────────────────────────
    # Construction
    # ──────────────────────────────────────────────────────────────
    def __init__(self, hf_model, tokenizer, config: "ResolvedConfig", model_id: str):
        """Wrap an already-loaded model/tokenizer pair.

        Most callers should use `from_hf()`; this constructor only stores
        its arguments.
        """
        self._model = hf_model
        self._tokenizer = tokenizer
        self.config = config
        self.model_id = model_id

    @classmethod
    def from_hf(
        cls,
        model_id: str = DEFAULT_MODEL_ID,
        *,
        device: str = "auto",           # "auto" | "gpu" | "cuda" | "cpu"
        dtype: str | None = None,       # "float32" | "bfloat16" | "float16" | None
        quant: str = "none",            # "none" | "int8" (CPU-only)
        revision: str | None = None,    # HF revision
        verbose: bool = False,
    ) -> "ControlMT":
        """Load ControlMT from HuggingFace and auto-resolve the runtime config.

        Args:
            model_id: HuggingFace repo id to load.
            device: Device preference, forwarded to `resolve()`.
            dtype: Dtype preference, forwarded to `resolve()`; None lets
                `resolve()` pick one.
            quant: Quantization preference, forwarded to `resolve()`.
            revision: Optional HF revision forwarded to both `from_pretrained`
                calls.
            verbose: When true, print the resolved config and detected
                library versions before loading.

        Returns:
            A ready-to-use `ControlMT` in eval mode on the resolved device.
        """
        import torch
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

        cfg = resolve(device=device, dtype=dtype, quant=quant)

        if verbose:
            libs = detect_libraries()
            print(f"[controlmt] loading {model_id} ({cfg.describe()})")
            print(f"[controlmt] env torch={libs.get('torch')} transformers={libs.get('transformers')} "
                  f"cuda={libs.get('_cuda')} device={libs.get('_cuda_device')}")

        torch_dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16,
                       "float16": torch.float16}[cfg.dtype_str]

        # SECURITY NOTE: trust_remote_code=True executes Python code shipped in
        # the model repository. Only load repos you trust, and prefer pinning
        # `revision` so the executed code cannot change underneath you.
        shared_kw = {"trust_remote_code": True}
        if revision:
            shared_kw["revision"] = revision

        # The tokenizer never receives a dtype argument.
        tokenizer = AutoTokenizer.from_pretrained(model_id, **shared_kw)

        model_kw = dict(shared_kw)
        if cfg.quant != "int8-dynamic" and cfg.dtype_str != "float32":
            # Let HF cast during load. For fp32 and for the int8 path the
            # checkpoint is loaded without an explicit dtype.
            # NOTE(review): the `dtype` keyword is only recognised by recent
            # transformers releases; older 4.x releases expect `torch_dtype` —
            # confirm against the pinned dependency range.
            model_kw["dtype"] = torch_dtype
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, **model_kw)

        # int8 dynamic quantization (CPU-only — checked in resolve())
        if cfg.quant == "int8-dynamic":
            model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

        model = model.to(torch.device(cfg.device)).eval()
        return cls(hf_model=model, tokenizer=tokenizer, config=cfg, model_id=model_id)

    # ──────────────────────────────────────────────────────────────
    # Inference
    # ──────────────────────────────────────────────────────────────
    def translate(
        self,
        text: str,
        *,
        direction: str | None = None,   # "kn2en" | "en2kn" | None=auto-detect
        num_beams: int = 2,
        anti_lm_alpha: float = 0.5,
        max_length: int = 200,
    ) -> str:
        """Translate one sentence.

        Empty or whitespace-only input returns "" without calling the model.
        When `direction` is None it is chosen by `detect_direction(text)`.
        The text is stripped before being handed to the underlying
        `model.translate()`, to which all other arguments are forwarded.
        """
        if not text or not text.strip():
            return ""
        if direction is None:
            direction = self.detect_direction(text)
        return self._model.translate(
            text.strip(),
            tokenizer=self._tokenizer,
            direction=direction,
            num_beams=num_beams,
            anti_lm_alpha=anti_lm_alpha,
            max_length=max_length,
        )

    def batch_translate(
        self,
        texts: Sequence[str],
        *,
        batch_size: int | None = None,  # None → 1 (safe), or int N
        auto_batch: bool = False,       # GPU only — auto-fit by VRAM
        direction: str | None = None,
        num_beams: int = 2,
        anti_lm_alpha: float = 0.5,
        max_length: int = 200,
    ) -> list[str]:
        """Translate several texts, returning results in input order.

        Policy (matching DEPLOYMENT.md §11):
          - batch_size=None and auto_batch=False → one at a time (safe)
          - batch_size=N → N concurrent translations
          - auto_batch=True → size chosen by `auto_batch_size()` (GPU: probe
            free VRAM; CPU: warn + 1). This overrides any `batch_size` given.

        Concurrency is a ThreadPoolExecutor in which each task is one
        `self.translate()` call; values below 1 are clamped to 1.
        """
        if not texts:
            return []

        if auto_batch:
            batch_size = auto_batch_size(
                device=self.config.device,
                dtype=self.config.dtype_str,
                quant=self.config.quant,
            )
        if batch_size is None:
            batch_size = 1
        batch_size = max(1, int(batch_size))

        def _one(t: str) -> str:
            return self.translate(t, direction=direction, num_beams=num_beams,
                                  anti_lm_alpha=anti_lm_alpha, max_length=max_length)

        # A single worker gains nothing from a pool — skip the thread overhead.
        if batch_size == 1:
            return [_one(t) for t in texts]

        # Executor.map yields results in submission order, so output lines up with input.
        with ThreadPoolExecutor(max_workers=batch_size) as ex:
            return list(ex.map(_one, texts))

    # ──────────────────────────────────────────────────────────────
    # Helpers
    # ──────────────────────────────────────────────────────────────
    @staticmethod
    def detect_direction(text: str) -> str:
        """Heuristic: > 30% Kannada characters → "kn2en", else "en2kn".

        The ratio is Kannada-block code points over all printable,
        non-whitespace characters. Empty input, or input with no such
        characters, yields "en2kn".
        """
        if not text:
            return "en2kn"
        visible = sum(1 for c in text if not c.isspace() and c.isprintable())
        if not visible:
            return "en2kn"
        kn_chars = len(_KN_RE.findall(text))
        return "kn2en" if kn_chars / visible > 0.3 else "en2kn"

    def warmup(self) -> float:
        """Run one throwaway translation and return the elapsed seconds."""
        # perf_counter is monotonic, unlike wall-clock time.time().
        t0 = time.perf_counter()
        self.translate("hello.", direction="en2kn", num_beams=1, max_length=20)
        return time.perf_counter() - t0

    def benchmark(self, num_beams: int = 2) -> dict:
        """Run the 6-pair DEPLOYMENT.md verification suite on this loaded model.

        Calls `warmup()` first, then times each pair individually.

        Returns:
            A dict with keys "config", "num_beams", "rows" (one dict per pair
            with "direction", "src", "out", "latency_s") and
            "median_latency_s" (the statistical median of the row latencies).
        """
        from statistics import median

        TEST_PAIRS = [
            ("kn2en", "ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ."),
            ("kn2en", "ಬೆಂಗಳೂರಿನಲ್ಲಿ ಮೆಟ್ರೋ ಬಹಳ ಅನುಕೂಲಕರವಾಗಿದೆ."),
            ("kn2en", "ಆಪಲ್ ಹೊಸ ಐಫೋನ್ 17 ಬಿಡುಗಡೆ ಮಾಡಿದೆ."),
            ("en2kn", "I speak Kannada."),
            ("en2kn", "The new metro line opens next month."),
            ("en2kn", "Please transfer money to my UPI ID."),
        ]
        self.warmup()
        rows = []
        for direction, src in TEST_PAIRS:
            t0 = time.perf_counter()
            out = self.translate(src, direction=direction, num_beams=num_beams)
            rows.append({"direction": direction, "src": src, "out": out,
                         "latency_s": round(time.perf_counter() - t0, 3)})
        # With an even number of pairs the median is the mean of the two
        # middle latencies, not the upper-middle element.
        lats = [r["latency_s"] for r in rows]
        return {
            "config": self.config.describe(),
            "num_beams": num_beams,
            "rows": rows,
            "median_latency_s": round(median(lats), 4),
        }

    def __repr__(self) -> str:
        return f"<ControlMT model_id={self.model_id!r} {self.config.describe()}>"
|
controlmt/device.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Device + dtype + quantization auto-detection.
|
|
2
|
+
|
|
3
|
+
Resolves the user's (device, dtype, quant) preferences into concrete torch
|
|
4
|
+
objects. Always returns a usable config — falls back to CPU if GPU is
|
|
5
|
+
requested but unavailable, with a warning.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
import warnings
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ResolvedConfig:
    """Concrete device / dtype / quantization selection returned by `resolve()`."""

    device: str     # "cuda" or "cpu"
    dtype_str: str  # "float32" | "bfloat16" | "float16"
    quant: str      # "none" | "int8-dynamic"
    bf16_cpu: bool  # result of the runtime bf16-on-CPU probe

    def describe(self) -> str:
        """Return a compact " · "-joined summary; the quant part is omitted when it is "none"."""
        if self.quant == "none":
            parts = (self.device, self.dtype_str)
        else:
            parts = (self.device, self.dtype_str, self.quant)
        return " · ".join(parts)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def resolve(
    device: str = "auto",           # "auto" | "gpu" | "cuda" | "cpu"
    dtype: str | None = None,       # "float32" | "bfloat16" | "float16" | None
    quant: str = "none",            # "none" | "int8" / "int8-dynamic"
) -> ResolvedConfig:
    """Resolve the user's preferences into a concrete config.

    Logic:
      - device="auto" (default): GPU if available, else CPU (no warning)
      - device="gpu" or "cuda": GPU preferred; falls back to CPU with a
        RuntimeWarning if CUDA is unavailable
      - device="cpu": forces CPU
      - quant: "int8" / "int8-dynamic" is only honoured on CPU. On GPU it is
        dropped (quant becomes "none") with a RuntimeWarning (see
        DEPLOYMENT.md §9).
      - dtype=None: pick a dtype for the resolved device
          int8-dynamic: float32 (quantize_dynamic operates on fp32 weights)
          GPU: float16
          CPU: bfloat16 if the runtime probe succeeds, else float32
      - explicit dtype: accepts "float32"/"float16"/"bfloat16" and the
        aliases "fp32"/"fp16"/"bf16" (case-insensitive). When combined with
        int8-dynamic, any dtype other than float32 is overridden to float32
        with a RuntimeWarning, so the reported dtype matches what is loaded.

    Returns:
        A `ResolvedConfig` with device "cuda"/"cpu", the chosen dtype name,
        quant "none"/"int8-dynamic", and the bf16-on-CPU probe result.

    Raises:
        ValueError: on an unrecognised device, quant, or dtype value.
    """
    import torch

    # ── device ────────────────────────────────────────────────────
    device = device.lower()
    if device in ("gpu", "cuda"):
        if torch.cuda.is_available():
            resolved_device = "cuda"
        else:
            warnings.warn("device='gpu' requested but CUDA unavailable; falling back to CPU.",
                          RuntimeWarning, stacklevel=2)
            resolved_device = "cpu"
    elif device == "cpu":
        resolved_device = "cpu"
    elif device == "auto":
        resolved_device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        raise ValueError(f"unknown device {device!r} — use 'auto', 'gpu', 'cuda', or 'cpu'")

    # ── quant ─────────────────────────────────────────────────────
    # Collapse every accepted spelling ("int8", "int8-dynamic", "int8_dynamic")
    # onto the canonical "int8-dynamic".
    quant = (quant or "none").lower().replace("-dynamic", "")
    if quant in ("int8", "int8_dynamic"):
        quant = "int8-dynamic"
    if quant not in ("none", "int8-dynamic"):
        raise ValueError(f"unknown quant {quant!r} — use 'none' or 'int8' (CPU-only)")

    if quant == "int8-dynamic" and resolved_device == "cuda":
        warnings.warn(
            "int8 dynamic quantization is CPU-only on this model "
            "(see DEPLOYMENT.md §9). Falling back to fp16 on GPU.",
            RuntimeWarning, stacklevel=2)
        quant = "none"

    # ── dtype ─────────────────────────────────────────────────────
    bf16_cpu_ok = _cpu_supports_bf16(torch)
    if dtype is None:
        if quant == "int8-dynamic":
            resolved_dtype = "float32"      # quantize_dynamic operates on fp32 weights
        elif resolved_device == "cuda":
            resolved_dtype = "float16"      # widest GPU compatibility
        else:
            resolved_dtype = "bfloat16" if bf16_cpu_ok else "float32"
    else:
        # Exact alias lookup: only whole-string short names are expanded, so
        # the error message below echoes what the caller actually passed.
        aliases = {"fp32": "float32", "fp16": "float16", "bf16": "bfloat16"}
        dtype = dtype.lower()
        dtype = aliases.get(dtype, dtype)
        if dtype not in ("float32", "float16", "bfloat16"):
            raise ValueError(f"unknown dtype {dtype!r} — use float32/float16/bfloat16")
        resolved_dtype = dtype
        if quant == "int8-dynamic" and resolved_dtype != "float32":
            # The int8 path loads fp32 weights before quantizing, so a
            # different requested dtype would never take effect; report the
            # dtype that is really used instead of the ignored request.
            warnings.warn(
                f"dtype={resolved_dtype!r} is ignored with int8 dynamic quantization "
                "(weights are quantized from float32); using float32.",
                RuntimeWarning, stacklevel=2)
            resolved_dtype = "float32"

    return ResolvedConfig(
        device=resolved_device,
        dtype_str=resolved_dtype,
        quant=quant,
        bf16_cpu=bf16_cpu_ok,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _cpu_supports_bf16(torch_mod) -> bool:
|
|
102
|
+
"""Quick runtime probe — try a 1-element bf16 add. Some old CPUs and some
|
|
103
|
+
libtorch builds segfault on bf16; this catches that path before model load."""
|
|
104
|
+
try:
|
|
105
|
+
x = torch_mod.zeros(1, dtype=torch_mod.bfloat16)
|
|
106
|
+
_ = x + x
|
|
107
|
+
return True
|
|
108
|
+
except Exception:
|
|
109
|
+
return False
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def detect_libraries() -> dict:
    """Report which optional and required libraries can be imported.

    Returns:
        A dict mapping each probed package name to its `__version__` ("?"
        when the attribute is missing) or None when the import fails with
        ImportError, plus "_cuda" (bool) and "_cuda_device" (the name of
        CUDA device 0, or None).
    """
    probed = ("torch", "transformers", "accelerate", "bitsandbytes", "onnxruntime", "safetensors")
    report = {}
    for pkg in probed:
        try:
            report[pkg] = getattr(__import__(pkg), "__version__", "?")
        except ImportError:
            report[pkg] = None

    try:
        import torch
        has_cuda = torch.cuda.is_available()
        report["_cuda"] = has_cuda
        report["_cuda_device"] = torch.cuda.get_device_name(0) if has_cuda else None
    except Exception:
        # Best-effort probe: any failure (torch missing, driver trouble) is
        # reported as "no CUDA" rather than propagated.
        report["_cuda"] = False
        report["_cuda_device"] = None
    return report
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: controlmt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for the ControlMT v2.3 Kannada↔English translator
|
|
5
|
+
Author: Anand Kaman
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://huggingface.co/anandkaman/controlmt-v2.3
|
|
8
|
+
Project-URL: Source, https://github.com/anandkaman/ControlMT
|
|
9
|
+
Project-URL: Demo, https://huggingface.co/spaces/anandkaman/controlmt-demo
|
|
10
|
+
Keywords: translation,kannada,machine-translation,nlp,indic,controlmt
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: torch<3,>=2.0
|
|
22
|
+
Requires-Dist: transformers<5,>=4.40
|
|
23
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
24
|
+
Requires-Dist: safetensors>=0.4
|
|
25
|
+
Requires-Dist: huggingface_hub>=0.27
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
28
|
+
|
|
29
|
+
# controlmt
|
|
30
|
+
|
|
31
|
+
Python SDK for **[ControlMT v2.3](https://huggingface.co/anandkaman/controlmt-v2.3)** —
|
|
32
|
+
a compact 139M-parameter Kannada ↔ English translator.
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install controlmt
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick start
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from controlmt import ControlMT
|
|
42
|
+
|
|
43
|
+
model = ControlMT.from_hf() # auto everything
|
|
44
|
+
print(model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ."))
|
|
45
|
+
# → "I speak Kannada."
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
That's it. The SDK auto-detects:
|
|
49
|
+
- **Device** — GPU if CUDA is available, else CPU (overridable)
|
|
50
|
+
- **Dtype** — fp16 on GPU, bf16 on CPU (overridable; bf16 only when supported)
|
|
51
|
+
- **Direction** — Kannada → English if input is mostly Kannada chars, else reverse
|
|
52
|
+
|
|
53
|
+
## Explicit control
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
# Force CPU, even on a GPU box
|
|
57
|
+
model = ControlMT.from_hf(device="cpu")
|
|
58
|
+
|
|
59
|
+
# Force GPU; falls back to CPU with a warning if not present
|
|
60
|
+
model = ControlMT.from_hf(device="gpu")
|
|
61
|
+
|
|
62
|
+
# Pick an exact dtype
|
|
63
|
+
model = ControlMT.from_hf(device="cpu", dtype="bf16") # bfloat16
|
|
64
|
+
model = ControlMT.from_hf(device="gpu", dtype="fp16") # float16
|
|
65
|
+
model = ControlMT.from_hf(dtype="fp32") # full precision
|
|
66
|
+
|
|
67
|
+
# CPU int8 dynamic quantization (~2× faster than bf16 on CPU)
|
|
68
|
+
model = ControlMT.from_hf(device="cpu", quant="int8")
|
|
69
|
+
|
|
70
|
+
# Different HF repo (e.g. a pre-quantized variant); pass revision="..." to pin a branch, tag, or commit
|
|
71
|
+
model = ControlMT.from_hf(model_id="anandkaman/controlmt-v2.3-int8", quant="int8")
|
|
72
|
+
|
|
73
|
+
# Loud: print the auto-pick decisions
|
|
74
|
+
model = ControlMT.from_hf(verbose=True)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Inspect the resolved config:
|
|
78
|
+
```python
|
|
79
|
+
>>> model
|
|
80
|
+
<ControlMT model_id='anandkaman/controlmt-v2.3' cuda · float16>
|
|
81
|
+
>>> model.config
|
|
82
|
+
ResolvedConfig(device='cuda', dtype_str='float16', quant='none', bf16_cpu=True)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Batched translation
|
|
86
|
+
|
|
87
|
+
By design: **you must pass `batch_size` (or `auto_batch=True` on GPU) to opt into batching.** Otherwise the SDK
|
|
88
|
+
runs one sentence at a time — predictable memory, no surprises.
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
texts = ["ನಾನು ಕನ್ನಡ.", "I speak English.", ...]
|
|
92
|
+
|
|
93
|
+
# Default: one at a time (safe everywhere)
|
|
94
|
+
outs = model.batch_translate(texts)
|
|
95
|
+
|
|
96
|
+
# Explicit fixed batch size
|
|
97
|
+
outs = model.batch_translate(texts, batch_size=8)
|
|
98
|
+
|
|
99
|
+
# Auto-pick batch size from free VRAM (GPU only)
|
|
100
|
+
outs = model.batch_translate(texts, auto_batch=True)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
| Mode | CPU | GPU |
|
|
104
|
+
|---|---|---|
|
|
105
|
+
| `(no batch_size, no auto_batch)` | 1 sentence at a time | 1 sentence at a time |
|
|
106
|
+
| `batch_size=N` | uses N | uses N |
|
|
107
|
+
| `auto_batch=True` | ignored + warning → 1 | probes `torch.cuda.mem_get_info()`, picks N ≤ 64 |
|
|
108
|
+
|
|
109
|
+
## Other endpoints
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
# Heuristic direction detection (>30% KN chars → kn2en, else en2kn)
|
|
113
|
+
ControlMT.detect_direction("ನಾನು ಕನ್ನಡ.") # → "kn2en"
|
|
114
|
+
|
|
115
|
+
# JIT/compile warmup — kills the 5–10s "first request" lag in production
|
|
116
|
+
model.warmup()
|
|
117
|
+
|
|
118
|
+
# Run the 6-pair DEPLOYMENT.md benchmark suite on YOUR hardware
|
|
119
|
+
result = model.benchmark()
|
|
120
|
+
# {'config': 'cuda · float16', 'num_beams': 2, 'median_latency_s': 0.19, 'rows': [...]}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Architecture note
|
|
124
|
+
|
|
125
|
+
ControlMT v2.3 is an **encoder-decoder seq2seq** model (T5/mBART family), not a
|
|
126
|
+
decoder-only LM. That means:
|
|
127
|
+
|
|
128
|
+
- ✅ Works: this SDK, raw Transformers, FastAPI, Docker, HF Inference Endpoints
|
|
129
|
+
- ❌ Doesn't work without significant adapter work: vLLM, Ollama, llama.cpp/GGUF, HF TGI
|
|
130
|
+
|
|
131
|
+
See [DEPLOYMENT.md](https://huggingface.co/anandkaman/controlmt-v2.3/blob/main/DEPLOYMENT.md) §9 for the full "not supported" table and why.
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
Apache 2.0. Same as the underlying model weights.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
controlmt/__init__.py,sha256=gcO_c2HXEmaAEbB31WRYK7polvVQgbPr0rJJJcwN3DU,1096
|
|
2
|
+
controlmt/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
3
|
+
controlmt/batching.py,sha256=w7-G6ayshjM7tuD1HIVDLO9Os-VggmzZmjGNw2WE05E,1511
|
|
4
|
+
controlmt/client.py,sha256=1-K2mkdvY_GHY8lhcdolT8b2uRRl_VoK_PY4eHhQJT8,9221
|
|
5
|
+
controlmt/device.py,sha256=k2EImENU5Vxtu1nKUCvK2I9rMha_KYqooSjjt7gbvD8,5435
|
|
6
|
+
controlmt-0.1.0.dist-info/METADATA,sha256=AKtjzm9IcA-QGZePWOawV0ZSeoRX96GKAG5p5vQ-kIE,4639
|
|
7
|
+
controlmt-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
controlmt-0.1.0.dist-info/top_level.txt,sha256=1fMNkQVgA081BBsoTHm_8vD7BpDL-kFl3t5VvdJZkbY,10
|
|
9
|
+
controlmt-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
controlmt
|