controlmt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
controlmt/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """ControlMT — Python SDK for the ControlMT v2.3 KN↔EN translator.
2
+
3
+ Drop-in entry point for users who don't want to wire HuggingFace Transformers
4
+ themselves. Picks the right device + dtype + quantization automatically;
5
+ overridable when you need control.
6
+
7
+ Quick start:
8
+ from controlmt import ControlMT
9
+ model = ControlMT.from_hf() # auto everything
10
+ print(model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ.")) # → "I speak Kannada."
11
+
12
+ Explicit:
13
+ model = ControlMT.from_hf(device="cpu", quant="int8") # int8 dynamic on CPU
14
+ model = ControlMT.from_hf(device="gpu", dtype="float16") # fp16 on GPU
15
+ model = ControlMT.from_hf(device="auto") # GPU-with-CPU-fallback (default)
16
+
17
+ Batched (user-defined size, defaults to 1 = no batching):
18
+ model.batch_translate(texts, batch_size=8)
19
+ model.batch_translate(texts, auto_batch=True) # GPU-only auto-fit by VRAM
20
+ """
21
+
22
+ from controlmt.client import ControlMT
23
+ from controlmt._version import __version__
24
+
25
+ __all__ = ["ControlMT", "__version__"]
controlmt/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
controlmt/batching.py ADDED
@@ -0,0 +1,47 @@
1
+ """Auto-batch sizing for GPU inference.
2
+
3
+ ControlMT's `translate()` runs beam search sentence-by-sentence, so we fan
4
+ out N concurrent calls with a ThreadPoolExecutor. N is bounded by free VRAM.
5
+
6
+ Memory per concurrent translation at beam=2, max_len=256:
7
+ fp16: ~25 MB (encoder + decoder state + beam tensors + kv-cache)
8
+ bf16: ~25 MB
9
+ fp32: ~50 MB
10
+ """
11
+
12
+ from __future__ import annotations
13
+ import warnings
14
+
15
+
16
# Estimated working memory (MB) for one in-flight translation, keyed by the
# dtype string, or by the quantization mode when one is active.
_MEM_PER_SENT_MB = {
    "float16": 25,
    "bfloat16": 25,
    "float32": 50,
    "int8-dynamic": 12,
}
_MAX_BATCH = 64            # ControlMT.translate() beam-search per-sentence — gain plateaus past ~16
_TARGET_VRAM_FRAC = 0.8


def auto_batch_size(device: str, dtype: str, quant: str = "none",
                    free_vram_mb: float | None = None) -> int:
    """Pick a concurrency level that fits in roughly 80% of free VRAM.

    Args:
        device: Resolved device string; only the exact value "cuda" enables sizing.
        dtype: Dtype string used to look up the per-sentence memory estimate
            when no quantization is active.
        quant: Quantization mode; any value other than "none" is used as the
            lookup key instead of ``dtype``.
        free_vram_mb: Free VRAM in MB. When omitted it is read from
            ``torch.cuda.mem_get_info()``.

    Returns:
        1 (after emitting a RuntimeWarning) when ``device`` is not "cuda",
        otherwise a value clamped to the range 1.._MAX_BATCH.
    """
    on_gpu = device == "cuda"
    if not on_gpu:
        warnings.warn(
            "auto_batch=True ignored on CPU (no VRAM signal to size against). "
            "Pass an explicit batch_size for batched CPU translation.",
            RuntimeWarning, stacklevel=2)
        return 1

    if free_vram_mb is None:
        # Imported lazily so the module can be imported without touching torch.
        import torch
        free_bytes = torch.cuda.mem_get_info()[0]
        free_vram_mb = free_bytes / 1024**2

    # Unknown keys fall back to the most conservative (float32) estimate.
    lookup_key = dtype if quant == "none" else quant
    per_sent_mb = _MEM_PER_SENT_MB.get(lookup_key, 50)

    budget_mb = free_vram_mb * _TARGET_VRAM_FRAC
    fit = int(budget_mb // per_sent_mb)
    capped = min(_MAX_BATCH, fit)
    return capped if capped > 1 else 1
controlmt/client.py ADDED
@@ -0,0 +1,202 @@
1
+ """High-level client for ControlMT v2.3."""
2
+
3
+ from __future__ import annotations
4
+ import re
5
+ import time
6
+ import warnings
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from typing import Iterable, Sequence
9
+
10
+ from controlmt.device import ResolvedConfig, detect_libraries, resolve
11
+ from controlmt.batching import auto_batch_size
12
+
13
+
14
DEFAULT_MODEL_ID = "anandkaman/controlmt-v2.3"

# Matches one code point in the Kannada Unicode block (U+0C80–U+0CFF).
# detect_direction() uses it to measure the share of Kannada characters
# (threshold: more than 30% → 'kn2en', otherwise 'en2kn').
_KN_RE = re.compile(r"[ಀ-೿]")


class ControlMT:
    """High-level wrapper around the loaded model's own `translate()` method.

    Usage:
        from controlmt import ControlMT
        model = ControlMT.from_hf()                       # auto
        model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ.")
        model.batch_translate(texts, batch_size=8)
        model.batch_translate(texts, auto_batch=True)     # GPU only
    """

    # ──────────────────────────────────────────────────────────────
    # Construction
    # ──────────────────────────────────────────────────────────────
    def __init__(self, hf_model, tokenizer, config: "ResolvedConfig", model_id: str):
        """Wrap an already-loaded model/tokenizer pair.

        Args:
            hf_model: Loaded model object; must expose a `translate()` method
                (called by `translate()` below).
            tokenizer: Tokenizer forwarded to the model's `translate()`.
            config: Resolved device/dtype/quant configuration.
            model_id: Identifier the model was loaded from (used in `repr`).
        """
        self._model = hf_model
        self._tokenizer = tokenizer
        self.config = config
        self.model_id = model_id

    @classmethod
    def from_hf(
        cls,
        model_id: str = DEFAULT_MODEL_ID,
        *,
        device: str = "auto",          # "auto" | "gpu" | "cuda" | "cpu"
        dtype: str | None = None,      # "float32" | "bfloat16" | "float16" | None
        quant: str = "none",           # "none" | "int8" (CPU-only)
        revision: str | None = None,   # HF revision
        verbose: bool = False,
    ) -> "ControlMT":
        """Load ControlMT from HuggingFace and auto-resolve the runtime config.

        Defaults: GPU if available, else CPU. fp16 on GPU, bf16 on CPU
        (when supported). No quantization.

        Args:
            model_id: HuggingFace repo id to load.
            device: Device preference, resolved by `controlmt.device.resolve`.
            dtype: Dtype preference, or None to auto-pick.
            quant: "none" or "int8" (dynamic quantization).
            revision: Optional HF revision forwarded to both loaders.
            verbose: Print the resolved config and detected library versions.

        Returns:
            A ready-to-use `ControlMT` instance with the model in eval mode.
        """
        import torch
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

        cfg = resolve(device=device, dtype=dtype, quant=quant)

        if verbose:
            libs = detect_libraries()
            print(f"[controlmt] loading {model_id} ({cfg.describe()})")
            print(f"[controlmt] env torch={libs.get('torch')} transformers={libs.get('transformers')} "
                  f"cuda={libs.get('_cuda')} device={libs.get('_cuda_device')}")

        torch_dtype = {"float32": torch.float32, "bfloat16": torch.bfloat16,
                       "float16": torch.float16}[cfg.dtype_str]

        # NOTE(security): trust_remote_code executes Python shipped in the model
        # repo. Only load model ids / revisions you trust.
        shared_kw = {"trust_remote_code": True}
        if revision:
            shared_kw["revision"] = revision

        model_kw = dict(shared_kw)
        if cfg.quant != "int8-dynamic" and cfg.dtype_str != "float32":
            # Use `torch_dtype`, not `dtype`: the package supports
            # transformers>=4.40, and the `dtype` keyword is only understood by
            # recent 4.x releases, whereas `torch_dtype` is accepted by all 4.x.
            model_kw["torch_dtype"] = torch_dtype   # let HF cast during load

        # The tokenizer gets only the shared kwargs (no dtype).
        tokenizer = AutoTokenizer.from_pretrained(model_id, **shared_kw)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, **model_kw)

        # int8 dynamic quantization (CPU-only — checked in resolve())
        if cfg.quant == "int8-dynamic":
            model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

        model = model.to(torch.device(cfg.device)).eval()
        return cls(hf_model=model, tokenizer=tokenizer, config=cfg, model_id=model_id)

    # ──────────────────────────────────────────────────────────────
    # Inference
    # ──────────────────────────────────────────────────────────────
    def translate(
        self,
        text: str,
        *,
        direction: str | None = None,    # "kn2en" | "en2kn" | None=auto-detect
        num_beams: int = 2,
        anti_lm_alpha: float = 0.5,
        max_length: int = 200,
    ) -> str:
        """Translate one sentence.

        Args:
            text: Source text; surrounding whitespace is stripped.
            direction: "kn2en" or "en2kn"; auto-detected from the input script
                when None.
            num_beams, anti_lm_alpha, max_length: Forwarded unchanged to the
                model's `translate()`.

        Returns:
            The model's translation, or "" for empty / whitespace-only input
            (the model is not called in that case).
        """
        if not text or not text.strip():
            return ""
        if direction is None:
            direction = self.detect_direction(text)
        return self._model.translate(
            text.strip(),
            tokenizer=self._tokenizer,
            direction=direction,
            num_beams=num_beams,
            anti_lm_alpha=anti_lm_alpha,
            max_length=max_length,
        )

    def batch_translate(
        self,
        texts: Sequence[str],
        *,
        batch_size: int | None = None,   # None → 1 (safe), or int N
        auto_batch: bool = False,        # GPU only — auto-fit by VRAM
        direction: str | None = None,
        num_beams: int = 2,
        anti_lm_alpha: float = 0.5,
        max_length: int = 200,
    ) -> list[str]:
        """Translate several texts, optionally with N concurrent workers.

        Policy (matching DEPLOYMENT.md §11):
          - batch_size=None and auto_batch=False → one at a time
          - batch_size=N                         → N concurrent translations
          - auto_batch=True                      → GPU: probe free VRAM, pick N;
                                                   CPU: warn + N=1.
                                                   Overrides any batch_size given.

        Concurrency uses a ThreadPoolExecutor where each task is one
        `translate()` call; results are returned in input order.

        Returns:
            A list of translations, one per input text ([] for empty input).
        """
        if not texts:
            return []

        if auto_batch:
            batch_size = auto_batch_size(
                device=self.config.device,
                dtype=self.config.dtype_str,
                quant=self.config.quant,
            )
        if batch_size is None:
            batch_size = 1
        batch_size = max(1, int(batch_size))   # clamp 0 / negatives to 1

        def _one(t: str) -> str:
            return self.translate(t, direction=direction, num_beams=num_beams,
                                  anti_lm_alpha=anti_lm_alpha, max_length=max_length)

        # A single worker gains nothing from a pool; run inline.
        if batch_size == 1:
            return [_one(t) for t in texts]

        with ThreadPoolExecutor(max_workers=batch_size) as ex:
            return list(ex.map(_one, texts))

    # ──────────────────────────────────────────────────────────────
    # Helpers
    # ──────────────────────────────────────────────────────────────
    @staticmethod
    def detect_direction(text: str) -> str:
        """Heuristic: > 30% Kannada characters → "kn2en", else "en2kn".

        The ratio is Kannada code points over all printable, non-whitespace
        characters. Empty input (or input with no countable characters)
        yields "en2kn".
        """
        if not text:
            return "en2kn"
        kn_chars = sum(1 for c in text if _KN_RE.match(c))
        total = sum(1 for c in text if not c.isspace() and c.isprintable())
        return "kn2en" if total and kn_chars / total > 0.3 else "en2kn"

    def warmup(self) -> float:
        """Run one throwaway translation and return the elapsed seconds."""
        # perf_counter is monotonic and high-resolution; wall-clock time.time()
        # can jump (NTP adjustments) and distort short measurements.
        t0 = time.perf_counter()
        self.translate("hello.", direction="en2kn", num_beams=1, max_length=20)
        return time.perf_counter() - t0

    def benchmark(self, num_beams: int = 2) -> dict:
        """Run the 6-pair DEPLOYMENT.md verification suite on this loaded model.

        Returns:
            A dict with keys "config", "num_beams", "rows" (one entry per test
            pair with "direction", "src", "out", "latency_s") and
            "median_latency_s" (the true median of the per-row latencies).
        """
        import statistics

        TEST_PAIRS = [
            ("kn2en", "ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ."),
            ("kn2en", "ಬೆಂಗಳೂರಿನಲ್ಲಿ ಮೆಟ್ರೋ ಬಹಳ ಅನುಕೂಲಕರವಾಗಿದೆ."),
            ("kn2en", "ಆಪಲ್ ಹೊಸ ಐಫೋನ್ 17 ಬಿಡುಗಡೆ ಮಾಡಿದೆ."),
            ("en2kn", "I speak Kannada."),
            ("en2kn", "The new metro line opens next month."),
            ("en2kn", "Please transfer money to my UPI ID."),
        ]
        self.warmup()
        rows = []
        for direction, src in TEST_PAIRS:
            t0 = time.perf_counter()
            out = self.translate(src, direction=direction, num_beams=num_beams)
            rows.append({"direction": direction, "src": src, "out": out,
                         "latency_s": round(time.perf_counter() - t0, 3)})
        # With an even number of samples the median is the mean of the two
        # middle values; indexing lats[len // 2] would report the upper one.
        median_latency = statistics.median(r["latency_s"] for r in rows)
        return {
            "config": self.config.describe(),
            "num_beams": num_beams,
            "rows": rows,
            "median_latency_s": round(median_latency, 4),
        }

    def __repr__(self) -> str:
        return f"<ControlMT model_id={self.model_id!r} {self.config.describe()}>"
controlmt/device.py ADDED
@@ -0,0 +1,129 @@
1
+ """Device + dtype + quantization auto-detection.
2
+
3
+ Resolves the user's (device, dtype, quant) preferences into concrete torch
4
+ objects. Always returns a usable config — falls back to CPU if GPU is
5
+ requested but unavailable, with a warning.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ import warnings
10
+ from dataclasses import dataclass
11
+
12
+
13
@dataclass
class ResolvedConfig:
    """Concrete runtime configuration produced by `resolve()`."""

    device: str        # "cuda" or "cpu"
    dtype_str: str     # "float32" | "bfloat16" | "float16"
    quant: str         # "none" | "int8-dynamic"
    bf16_cpu: bool     # convenience flag — bf16 works on CPU iff torch supports it

    def describe(self) -> str:
        """Return a short summary such as "cuda · float16".

        The quantization mode is appended only when it is not "none".
        """
        quant_part = [] if self.quant == "none" else [self.quant]
        return " · ".join([self.device, self.dtype_str, *quant_part])
25
+
26
+
27
def resolve(
    device: str = "auto",          # "auto" | "gpu" | "cuda" | "cpu"
    dtype: str | None = None,      # "float32" | "bfloat16" | "float16" | None
    quant: str = "none",           # "none" | "int8" / "int8-dynamic"
) -> ResolvedConfig:
    """Resolve the user's preferences into a concrete config.

    Logic:
      - device="auto" (default): GPU if available, else CPU
      - device="gpu" or "cuda": GPU preferred; falls back to CPU with a
        RuntimeWarning if CUDA is unavailable
      - device="cpu": forces CPU
      - dtype=None: pick a dtype for the resolved device
          int8 quantization: float32 (quantize_dynamic operates on fp32 weights)
          GPU: float16
          CPU: bfloat16 if the runtime probe succeeds, else float32
      - dtype given: accepts float32/float16/bfloat16 and the fp32/fp16/bf16
        aliases (case-insensitive)
      - quant: "int8" / "int8-dynamic" is only valid on CPU. On GPU it is
        dropped (quant="none") with a RuntimeWarning (see DEPLOYMENT.md §9).
      - quant=int8 with an explicit non-float32 dtype: the loader in client.py
        does not apply a dtype when quantizing, so the dtype is reported as
        float32 and a RuntimeWarning is emitted instead of misreporting it.

    Raises:
        ValueError: for an unknown device, quant or dtype string.
    """
    import torch

    # ── device ────────────────────────────────────────────────────
    device = device.lower()
    if device == "auto":
        resolved_device = "cuda" if torch.cuda.is_available() else "cpu"
    elif device == "cpu":
        resolved_device = "cpu"
    elif device in ("gpu", "cuda"):
        if torch.cuda.is_available():
            resolved_device = "cuda"
        else:
            warnings.warn("device='gpu' requested but CUDA unavailable; falling back to CPU.",
                          RuntimeWarning, stacklevel=2)
            resolved_device = "cpu"
    else:
        raise ValueError(f"unknown device {device!r} — use 'auto', 'gpu', 'cuda', or 'cpu'")

    # ── quant ─────────────────────────────────────────────────────
    # Normalise "int8", "int8-dynamic" and "int8_dynamic" to "int8-dynamic";
    # None / "" are treated as "none".
    quant = (quant or "none").lower().replace("-dynamic", "")
    if quant in ("int8", "int8_dynamic"):
        quant = "int8-dynamic"
    if quant not in ("none", "int8-dynamic"):
        raise ValueError(f"unknown quant {quant!r} — use 'none' or 'int8' (CPU-only)")

    if quant == "int8-dynamic" and resolved_device == "cuda":
        warnings.warn(
            "int8 dynamic quantization is CPU-only on this model "
            "(see DEPLOYMENT.md §9). Falling back to fp16 on GPU.",
            RuntimeWarning, stacklevel=2)
        quant = "none"

    # ── dtype ─────────────────────────────────────────────────────
    bf16_cpu_ok = _cpu_supports_bf16(torch)
    if dtype is None:
        if quant == "int8-dynamic":
            resolved_dtype = "float32"     # quantize_dynamic operates on fp32 weights
        elif resolved_device == "cuda":
            resolved_dtype = "float16"     # widest GPU compatibility
        else:
            resolved_dtype = "bfloat16" if bf16_cpu_ok else "float32"
    else:
        dtype = dtype.lower().replace("fp32", "float32").replace("fp16", "float16").replace("bf16", "bfloat16")
        if dtype not in ("float32", "float16", "bfloat16"):
            raise ValueError(f"unknown dtype {dtype!r} — use float32/float16/bfloat16")
        resolved_dtype = dtype
        if quant == "int8-dynamic" and resolved_dtype != "float32":
            # Keep the reported config truthful: the int8 path quantizes from
            # float32 weights and never applies the requested dtype.
            warnings.warn(
                f"dtype={resolved_dtype!r} is ignored with int8 dynamic quantization "
                "(weights are quantized from float32); using float32.",
                RuntimeWarning, stacklevel=2)
            resolved_dtype = "float32"

    return ResolvedConfig(
        device=resolved_device,
        dtype_str=resolved_dtype,
        quant=quant,
        bf16_cpu=bf16_cpu_ok,
    )
99
+
100
+
101
+ def _cpu_supports_bf16(torch_mod) -> bool:
102
+ """Quick runtime probe — try a 1-element bf16 add. Some old CPUs and some
103
+ libtorch builds segfault on bf16; this catches that path before model load."""
104
+ try:
105
+ x = torch_mod.zeros(1, dtype=torch_mod.bfloat16)
106
+ _ = x + x
107
+ return True
108
+ except Exception:
109
+ return False
110
+
111
+
112
def detect_libraries() -> dict:
    """Probe which optional libraries are importable and whether CUDA is usable.

    Returns:
        A dict mapping each probed package name to its `__version__` string
        ("?" if the module has none, None if it cannot be imported), plus:
          "_cuda":        bool — result of `torch.cuda.is_available()`
          "_cuda_device": name of CUDA device 0, or None when CUDA is unusable

    This is a best-effort status probe and never raises: any failure while
    importing a package is reported as "not available".
    """
    import importlib

    out = {}
    for name in ("torch", "transformers", "accelerate", "bitsandbytes", "onnxruntime", "safetensors"):
        try:
            mod = importlib.import_module(name)
        except Exception:
            # Deliberately broad: a broken optional install can fail at import
            # time with errors other than ImportError (e.g. missing native
            # components), and that must not crash a status banner.
            out[name] = None
        else:
            out[name] = getattr(mod, "__version__", "?")

    try:
        import torch
        out["_cuda"] = torch.cuda.is_available()
        out["_cuda_device"] = torch.cuda.get_device_name(0) if out["_cuda"] else None
    except Exception:
        out["_cuda"] = False
        out["_cuda_device"] = None
    return out
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: controlmt
3
+ Version: 0.1.0
4
+ Summary: Python SDK for the ControlMT v2.3 Kannada↔English translator
5
+ Author: Anand Kaman
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://huggingface.co/anandkaman/controlmt-v2.3
8
+ Project-URL: Source, https://github.com/anandkaman/ControlMT
9
+ Project-URL: Demo, https://huggingface.co/spaces/anandkaman/controlmt-demo
10
+ Keywords: translation,kannada,machine-translation,nlp,indic,controlmt
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Text Processing :: Linguistic
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: torch<3,>=2.0
22
+ Requires-Dist: transformers<5,>=4.40
23
+ Requires-Dist: sentencepiece>=0.1.99
24
+ Requires-Dist: safetensors>=0.4
25
+ Requires-Dist: huggingface_hub>=0.27
26
+ Provides-Extra: test
27
+ Requires-Dist: pytest>=7; extra == "test"
28
+
29
+ # controlmt
30
+
31
+ Python SDK for **[ControlMT v2.3](https://huggingface.co/anandkaman/controlmt-v2.3)** —
32
+ a compact 139M-parameter Kannada ↔ English translator.
33
+
34
+ ```bash
35
+ pip install controlmt
36
+ ```
37
+
38
+ ## Quick start
39
+
40
+ ```python
41
+ from controlmt import ControlMT
42
+
43
+ model = ControlMT.from_hf() # auto everything
44
+ print(model.translate("ನಾನು ಕನ್ನಡ ಮಾತನಾಡುತ್ತೇನೆ."))
45
+ # → "I speak Kannada."
46
+ ```
47
+
48
+ That's it. The SDK auto-detects:
49
+ - **Device** — GPU if CUDA is available, else CPU (overridable)
50
+ - **Dtype** — fp16 on GPU, bf16 on CPU (overridable; bf16 only when supported)
51
+ - **Direction** — Kannada → English if input is mostly Kannada chars, else reverse
52
+
53
+ ## Explicit control
54
+
55
+ ```python
56
+ # Force CPU, even on a GPU box
57
+ model = ControlMT.from_hf(device="cpu")
58
+
59
+ # Force GPU; falls back to CPU with a warning if not present
60
+ model = ControlMT.from_hf(device="gpu")
61
+
62
+ # Pick an exact dtype
63
+ model = ControlMT.from_hf(device="cpu", dtype="bf16") # bfloat16
64
+ model = ControlMT.from_hf(device="gpu", dtype="fp16") # float16
65
+ model = ControlMT.from_hf(dtype="fp32") # full precision
66
+
67
+ # CPU int8 dynamic quantization (~2× faster than bf16 on CPU)
68
+ model = ControlMT.from_hf(device="cpu", quant="int8")
69
+
70
+ # Different HF repo (e.g. a pre-quantized checkpoint); use revision="..." to pin a branch or commit
71
+ model = ControlMT.from_hf(model_id="anandkaman/controlmt-v2.3-int8", quant="int8")
72
+
73
+ # Loud: print the auto-pick decisions
74
+ model = ControlMT.from_hf(verbose=True)
75
+ ```
76
+
77
+ Inspect the resolved config:
78
+ ```python
79
+ >>> model
80
+ <ControlMT model_id='anandkaman/controlmt-v2.3' cuda · float16>
81
+ >>> model.config
82
+ ResolvedConfig(device='cuda', dtype_str='float16', quant='none', bf16_cpu=True)
83
+ ```
84
+
85
+ ## Batched translation
86
+
87
+ By design: **you must specify `batch_size` (or pass `auto_batch=True` on GPU) to opt into batching.** Otherwise the SDK
88
+ runs one sentence at a time — predictable memory, no surprises.
89
+
90
+ ```python
91
+ texts = ["ನಾನು ಕನ್ನಡ.", "I speak English.", ...]
92
+
93
+ # Default: one at a time (safe everywhere)
94
+ outs = model.batch_translate(texts)
95
+
96
+ # Explicit fixed batch size
97
+ outs = model.batch_translate(texts, batch_size=8)
98
+
99
+ # Auto-pick batch size from free VRAM (GPU only)
100
+ outs = model.batch_translate(texts, auto_batch=True)
101
+ ```
102
+
103
+ | Mode | CPU | GPU |
104
+ |---|---|---|
105
+ | `(no batch_size, no auto_batch)` | 1 sentence at a time | 1 sentence at a time |
106
+ | `batch_size=N` | uses N | uses N |
107
+ | `auto_batch=True` | ignored + warning → 1 | probes `torch.cuda.mem_get_info()`, picks N ≤ 64 |
108
+
109
+ ## Other endpoints
110
+
111
+ ```python
112
+ # Heuristic direction detection (>30% KN chars → kn2en, else en2kn)
113
+ ControlMT.detect_direction("ನಾನು ಕನ್ನಡ.") # → "kn2en"
114
+
115
+ # JIT/compile warmup — kills the 5–10s "first request" lag in production
116
+ model.warmup()
117
+
118
+ # Run the 6-pair DEPLOYMENT.md benchmark suite on YOUR hardware
119
+ result = model.benchmark()
120
+ # {'config': 'cuda · float16', 'num_beams': 2, 'median_latency_s': 0.19, 'rows': [...]}
121
+ ```
122
+
123
+ ## Architecture note
124
+
125
+ ControlMT v2.3 is an **encoder-decoder seq2seq** model (T5/mBART family), not a
126
+ decoder-only LM. That means:
127
+
128
+ - ✅ Works: this SDK, raw Transformers, FastAPI, Docker, HF Inference Endpoints
129
+ - ❌ Doesn't work without significant adapter work: vLLM, Ollama, llama.cpp/GGUF, HF TGI
130
+
131
+ See [DEPLOYMENT.md](https://huggingface.co/anandkaman/controlmt-v2.3/blob/main/DEPLOYMENT.md) §9 for the full "not supported" table and why.
132
+
133
+ ## License
134
+
135
+ Apache 2.0. Same as the underlying model weights.
@@ -0,0 +1,9 @@
1
+ controlmt/__init__.py,sha256=gcO_c2HXEmaAEbB31WRYK7polvVQgbPr0rJJJcwN3DU,1096
2
+ controlmt/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
3
+ controlmt/batching.py,sha256=w7-G6ayshjM7tuD1HIVDLO9Os-VggmzZmjGNw2WE05E,1511
4
+ controlmt/client.py,sha256=1-K2mkdvY_GHY8lhcdolT8b2uRRl_VoK_PY4eHhQJT8,9221
5
+ controlmt/device.py,sha256=k2EImENU5Vxtu1nKUCvK2I9rMha_KYqooSjjt7gbvD8,5435
6
+ controlmt-0.1.0.dist-info/METADATA,sha256=AKtjzm9IcA-QGZePWOawV0ZSeoRX96GKAG5p5vQ-kIE,4639
7
+ controlmt-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ controlmt-0.1.0.dist-info/top_level.txt,sha256=1fMNkQVgA081BBsoTHm_8vD7BpDL-kFl3t5VvdJZkbY,10
9
+ controlmt-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ controlmt