model-gear 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_gear/__init__.py +11 -0
- model_gear/__main__.py +8 -0
- model_gear/assess.py +268 -0
- model_gear/cli/__init__.py +149 -0
- model_gear/cli/_commands/__init__.py +0 -0
- model_gear/cli/_commands/assess.py +53 -0
- model_gear/cli/_commands/benchmark.py +57 -0
- model_gear/cli/_commands/cli.py +38 -0
- model_gear/cli/_commands/doctor.py +152 -0
- model_gear/cli/_commands/explain.py +38 -0
- model_gear/cli/_commands/init.py +73 -0
- model_gear/cli/_commands/learn.py +125 -0
- model_gear/cli/_commands/overview.py +179 -0
- model_gear/cli/_commands/serve.py +62 -0
- model_gear/cli/_commands/status.py +60 -0
- model_gear/cli/_commands/stop.py +50 -0
- model_gear/cli/_commands/switch.py +89 -0
- model_gear/cli/_commands/whoami.py +118 -0
- model_gear/cli/_errors.py +42 -0
- model_gear/cli/_output.py +56 -0
- model_gear/cli/_runtime_ops.py +57 -0
- model_gear/explain/__init__.py +27 -0
- model_gear/explain/catalog.py +244 -0
- model_gear/runtime/__init__.py +8 -0
- model_gear/runtime/_compose.py +187 -0
- model_gear/runtime/_env.py +70 -0
- model_gear/runtime/_health.py +60 -0
- model_gear/templates/__init__.py +5 -0
- model_gear/templates/docker-compose.yml +70 -0
- model_gear/templates/env.example +29 -0
- model_gear-0.5.0.dist-info/METADATA +118 -0
- model_gear-0.5.0.dist-info/RECORD +35 -0
- model_gear-0.5.0.dist-info/WHEEL +4 -0
- model_gear-0.5.0.dist-info/entry_points.txt +2 -0
- model_gear-0.5.0.dist-info/licenses/LICENSE +21 -0
model_gear/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""model-gear — run, assess, and switch the local vLLM model."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError
|
|
4
|
+
from importlib.metadata import version as _v
|
|
5
|
+
|
|
6
|
+
# Resolve the version from the installed distribution's metadata.
try:
    __version__ = _v("model-gear")
except PackageNotFoundError:  # editable install without metadata
    # No distribution metadata was found: fall back to a local placeholder.
    __version__ = "0.0.0+local"
|
|
10
|
+
|
|
11
|
+
__all__ = ["__version__"]
|
model_gear/__main__.py
ADDED
model_gear/assess.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""API-side assessment and benchmark of a vLLM-served model (stdlib only).
|
|
2
|
+
|
|
3
|
+
Talks only to the OpenAI-compatible endpoint (``urllib``, no third-party deps).
|
|
4
|
+
Ported from the original ``_assess.py`` and split into two concerns:
|
|
5
|
+
|
|
6
|
+
* :func:`run_correctness` — fixed correctness probes + reasoning-trace detection
|
|
7
|
+
(drives ``model assess``);
|
|
8
|
+
* :func:`run_benchmark` — decode throughput + prefill latency (drives
|
|
9
|
+
``model benchmark``).
|
|
10
|
+
|
|
11
|
+
Host-side facts (image tag, GPU memory) are gathered by the command handlers via
|
|
12
|
+
:mod:`model_gear.runtime._compose` and printed alongside this output.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import contextlib
|
|
18
|
+
import json
|
|
19
|
+
import time
|
|
20
|
+
import urllib.request
|
|
21
|
+
|
|
22
|
+
from model_gear.cli._errors import EXIT_ENV_ERROR, ModelGearError
|
|
23
|
+
|
|
24
|
+
# urllib.error.URLError is a subclass of OSError, so `except OSError` covers
|
|
25
|
+
# connection failures, timeouts, and HTTPError without listing it redundantly.
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@contextlib.contextmanager
def _api_errors(what: str):
    """Translate transport and payload failures into :class:`ModelGearError`.

    Use as ``with _api_errors("label"):`` around code that calls the endpoint
    and indexes into its response. Without it, those failures reach the
    dispatcher's catch-all and surface as ``unexpected: ...`` with no
    remediation.

    Args:
        what: Short label for the operation; used as the message prefix.

    Raises:
        ModelGearError: ``EXIT_ENV_ERROR`` for any ``OSError`` (urllib's
            ``URLError``/``HTTPError`` derive from it) and for response-shape
            failures. A ``ModelGearError`` raised inside the block passes
            through unchanged.
    """
    try:
        yield
    except ModelGearError:
        # Already structured by the wrapped code; do not re-wrap.
        raise
    except OSError as exc:
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"{what} failed: {exc}",
            remediation="check 'model status' / 'docker logs model-gear-vllm'",
        ) from exc
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError) as exc:
        # AttributeError covers payloads where a dict was expected but another
        # type arrived (e.g. calling ``.get`` on a string or ``None``).
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"{what}: unexpected response shape ({exc.__class__.__name__}: {exc})",
            remediation="the served model returned an unexpected payload; check the vLLM logs",
        ) from exc
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# (prompt, expected-substring, table-label) — the two fixed correctness probes.
# ``_probe`` marks a probe as passed when the expected substring occurs in the
# reply content; ``render_correctness`` prints the label in the "Check" column.
_PROBES = [
    ("What is 17 * 23?", "391", "`17 * 23 = 391`"),
    (
        "If a train leaves at 14:45 and arrives at 17:10, how long is the journey in minutes?",
        "145",
        "train 14:45→17:10 = 145 min",
    ),
]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _post(url: str, payload: dict, timeout: int = 300) -> dict:
    """POST ``payload`` as JSON to ``<url>/v1/chat/completions``.

    Returns the response body decoded from JSON. Errors raised by ``urllib``
    or the JSON decoder are not handled here.
    """
    request = urllib.request.Request(
        url + "/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:  # local endpoint only
        return json.load(response)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _get(url: str, path: str, timeout: int = 10):
    """GET ``url + path`` and return ``(status, body)``.

    The body is parsed JSON when the response's content-type starts with
    ``application/json``; otherwise it is the decoded response text.
    """
    with urllib.request.urlopen(url + path, timeout=timeout) as response:  # local endpoint only
        content_type = response.headers.get("content-type", "")
        if content_type.startswith("application/json"):
            body = json.load(response)
        else:
            body = response.read().decode()
        return response.status, body
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _trace_field(msg: dict) -> tuple[str | None, int]:
    """Locate the reasoning trace in a chat message.

    vLLM builds vary: the ``<think>`` trace lands in ``reasoning`` on the
    nv26.04 image and in ``reasoning_content`` on older builds. The keys are
    checked in that order and the first one holding a non-empty string wins.

    Returns:
        ``(field_name, length)`` of the trace, or ``(None, 0)`` when neither
        key holds a non-empty string.
    """
    candidates = ((name, msg.get(name)) for name in ("reasoning", "reasoning_content"))
    return next(
        ((name, len(text)) for name, text in candidates if isinstance(text, str) and text),
        (None, 0),
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def health_status(url: str) -> int:
    """Fetch ``/health`` and return its HTTP status code.

    Raises:
        ModelGearError: ``EXIT_ENV_ERROR`` when the request fails with an
            ``OSError`` (endpoint unreachable).
    """
    try:
        code = _get(url, "/health")[0]
    except OSError as exc:
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"/health unreachable at {url} ({exc})",
            remediation="start the server with 'model serve --apply'",
        ) from exc
    else:
        return code
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def served_model(url: str, override: str | None = None) -> tuple[str, object]:
    """Look up the served model via ``/v1/models``.

    Args:
        url: Endpoint base URL.
        override: Model name to report instead of the first listed id.

    Returns:
        ``(model_id, max_model_len)`` taken from the first listed model;
        ``model_id`` is ``override`` when that is truthy.

    Raises:
        ModelGearError: when the listing has no models, or when the request
            or the payload shape fails (translated by ``_api_errors``).
    """
    with _api_errors("/v1/models"):
        listing = _get(url, "/v1/models")[1]
        entries = listing.get("data") if isinstance(listing, dict) else None
        if entries:
            head = entries[0]
            return (override or head["id"]), head.get("max_model_len")
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"/v1/models returned no models at {url}",
            remediation="check 'model status' / 'docker logs model-gear-vllm'",
        )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _probe(url: str, model: str, prompt: str, expect: str) -> dict:
    """Send one correctness prompt and summarise the reply.

    Args:
        url: Endpoint base URL.
        model: Model name sent in the request.
        prompt: User message to send.
        expect: Substring that must occur in the reply content to pass.

    Returns:
        A dict with ``ok`` (``expect`` found in the content), ``expect``,
        ``trace_field`` / ``trace_len`` (from ``_trace_field``), ``finish``
        (the choice's ``finish_reason``) and ``completion_tokens`` (``None``
        when the reply carries no usage data).

    Raises:
        KeyError, IndexError, TypeError: when the reply lacks the expected
            ``choices[0].message`` structure; callers wrap this function in
            ``_api_errors``.
    """
    reply = _post(
        url,
        {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.3,
        },
    )
    choice = reply["choices"][0]
    msg = choice["message"]
    # ``content`` may be missing or null; treat both as an empty reply.
    content = msg.get("content") or ""
    field, tlen = _trace_field(msg)
    # ``usage`` may be absent or an explicit null; ``.get("usage", {})`` alone
    # would return None in the latter case and fail on the chained ``.get``.
    usage = reply.get("usage") or {}
    return {
        "ok": expect in content,
        "expect": expect,
        "trace_field": field,
        "trace_len": tlen,
        "finish": choice.get("finish_reason"),
        "completion_tokens": usage.get("completion_tokens"),
    }
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _decode_throughput(url: str, model: str, n_tokens: int, runs: int = 2) -> list[float]:
    """Time ``runs`` forced-length completions and return their token rates.

    Each request asks for ``n_tokens`` tokens with ``ignore_eos`` set and
    temperature 0. The rate is the reported ``completion_tokens`` divided by
    the wall-clock duration of the request, rounded to one decimal.
    """
    request_body = {
        "model": model,
        "messages": [
            {"role": "user", "content": "Write a detailed essay about distributed systems."}
        ],
        "max_tokens": n_tokens,
        "temperature": 0,
        "ignore_eos": True,
    }

    def _timed_rate() -> float:
        # One request, timed end to end with the monotonic clock.
        started = time.monotonic()
        reply = _post(url, request_body)
        elapsed = time.monotonic() - started
        return round(reply["usage"]["completion_tokens"] / elapsed, 1)

    return [_timed_rate() for _ in range(runs)]
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _prefill(url: str, model: str) -> dict:
    """Time one long-prompt request that generates at most 16 tokens.

    Returns:
        ``{"prompt_tokens": <reported by the server>, "seconds": <wall-clock
        duration rounded to 2 decimals>}``.
    """
    filler = "The system processes events. " * 400
    long_prompt = "Summarize this. " + filler
    started = time.monotonic()
    reply = _post(
        url,
        {
            "model": model,
            "messages": [{"role": "user", "content": long_prompt}],
            "max_tokens": 16,
            "temperature": 0,
        },
    )
    elapsed = time.monotonic() - started
    prompt_tokens = reply["usage"]["prompt_tokens"]
    return {"prompt_tokens": prompt_tokens, "seconds": round(elapsed, 2)}
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def run_correctness(url: str, model: str | None = None) -> dict:
    """Run every entry of ``_PROBES`` against the endpoint.

    Args:
        url: Endpoint base URL; trailing slashes are stripped.
        model: Model name to use; when ``None`` the first served model is used.

    Returns:
        A dict with ``model``, ``endpoint``, ``health``, ``max_model_len``,
        ``probes`` (one labelled result per probe), ``trace_field`` (first
        detected trace key, or ``"(none)"``), ``trace_len`` (longest trace
        seen) and ``passed`` (all probes ok).
    """
    url = url.rstrip("/")
    hstatus = health_status(url)
    model, max_len = served_model(url, model)
    with _api_errors("correctness probe"):
        probes = [
            {**_probe(url, model, prompt, expect), "label": label}
            for prompt, expect, label in _PROBES
        ]
    detected = [p["trace_field"] for p in probes if p["trace_field"]]
    longest = max([p["trace_len"] for p in probes], default=0)
    return {
        "model": model,
        "endpoint": url,
        "health": hstatus,
        "max_model_len": max_len,
        "probes": probes,
        "trace_field": detected[0] if detected else "(none)",
        "trace_len": longest,
        "passed": all(p["ok"] for p in probes),
    }
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def run_benchmark(
    url: str, model: str | None = None, decode_tokens: int = 512, runs: int = 2
) -> dict:
    """Measure decode throughput and prefill latency for the served model.

    Args:
        url: Endpoint base URL; trailing slashes are stripped.
        model: Model name to use; when ``None`` the first served model is used.
        decode_tokens: Forced completion length for each throughput run.
        runs: Number of throughput runs.

    Returns:
        A dict with ``model``, ``endpoint``, ``max_model_len``,
        ``decode_tokens``, ``decode_rates`` and ``prefill``.
    """
    endpoint = url.rstrip("/")
    health_status(endpoint)  # raises if the endpoint is unreachable
    served, context_len = served_model(endpoint, model)
    with _api_errors("benchmark"):
        decode_rates = _decode_throughput(endpoint, served, decode_tokens, runs)
        prefill = _prefill(endpoint, served)
    return dict(
        model=served,
        endpoint=endpoint,
        max_model_len=context_len,
        decode_tokens=decode_tokens,
        decode_rates=decode_rates,
        prefill=prefill,
    )
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def render_correctness(result: dict) -> str:
    """Format a :func:`run_correctness` result as markdown for a per-model doc.

    The output is a heading, an endpoint summary line, and a two-column table
    with one row per probe plus a final row for the reasoning-trace field.
    """
    header = [
        f"## Assessment — `{result['model']}`",
        "",
        f"- Endpoint: `{result['endpoint']}` · `/health` {result['health']} · "
        f"`max_model_len` {result['max_model_len']}",
        "",
        "| Check | Result |",
        "|---|---|",
    ]
    probe_rows = [
        f"| {p['label']} | {'PASS' if p['ok'] else 'FAIL'} "
        f"(finish={p['finish']}, {p['completion_tokens']} tok) |"
        for p in result["probes"]
    ]
    trace_row = f"| reasoning trace field | `{result['trace_field']}` (len {result['trace_len']}) |"
    return "\n".join([*header, *probe_rows, trace_row])
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def render_benchmark(result: dict) -> str:
|
|
253
|
+
"""Render :func:`run_benchmark` output as a markdown block for a per-model doc."""
|
|
254
|
+
rates = "/".join(str(r) for r in result["decode_rates"])
|
|
255
|
+
pf = result["prefill"]
|
|
256
|
+
return "\n".join(
|
|
257
|
+
[
|
|
258
|
+
f"## Benchmark — `{result['model']}`",
|
|
259
|
+
"",
|
|
260
|
+
f"- Endpoint: `{result['endpoint']}` · `max_model_len` {result['max_model_len']}",
|
|
261
|
+
"",
|
|
262
|
+
"| Metric | Result |",
|
|
263
|
+
"|---|---|",
|
|
264
|
+
f"| **decode throughput** | **{rates} tok/s** (batch=1, greedy, "
|
|
265
|
+
f"{result['decode_tokens']} tok forced) |",
|
|
266
|
+
f"| prefill | {pf['prompt_tokens']} prompt tokens + 16 gen in {pf['seconds']} s |",
|
|
267
|
+
]
|
|
268
|
+
)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Unified CLI entry point for model-gear (binary: ``model``).
|
|
2
|
+
|
|
3
|
+
The model-ops verbs (``switch``, ``serve``/``stop``, ``status``, ``assess``,
|
|
4
|
+
``benchmark``, ``init``) are the heart of the tool; the agent-first verbs
|
|
5
|
+
(``whoami``, ``learn``, ``explain``, ``overview``, ``doctor``, ``cli``) keep the
|
|
6
|
+
sibling rubric satisfied. Each verb module exposes ``register(sub)`` following
|
|
7
|
+
the same pattern.
|
|
8
|
+
|
|
9
|
+
Error propagation contract
|
|
10
|
+
--------------------------
|
|
11
|
+
Every handler raises :class:`model_gear.cli._errors.ModelGearError` on failure;
|
|
12
|
+
``main()`` catches it via :func:`_dispatch` and routes through
|
|
13
|
+
:mod:`model_gear.cli._output`. Unknown exceptions are wrapped into a
|
|
14
|
+
``ModelGearError`` so no Python traceback leaks to stderr.
|
|
15
|
+
|
|
16
|
+
Argparse errors (unknown verb, missing arg) also route through the structured
|
|
17
|
+
format — ``_ModelGearArgumentParser`` overrides ``.error()`` and the subparsers
|
|
18
|
+
are built with ``parser_class=_ModelGearArgumentParser``. Whether errors render
|
|
19
|
+
as text or JSON depends on whether ``--json`` appears in the raw argv
|
|
20
|
+
(:func:`main` sets ``_json_hint`` before ``parse_args``).
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import sys
|
|
27
|
+
|
|
28
|
+
from model_gear import __version__
|
|
29
|
+
from model_gear.cli._errors import EXIT_USER_ERROR, ModelGearError
|
|
30
|
+
from model_gear.cli._output import emit_error
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _ModelGearArgumentParser(argparse.ArgumentParser):
    """ArgumentParser whose parse errors go through :func:`emit_error`.

    Argparse's stock handler prints ``prog: error: <msg>`` to stderr and exits
    with status 2, bypassing the ModelGearError plumbing (and the ``hint:``
    line agents look for). This subclass emits the structured format instead
    and exits with :attr:`EXIT_USER_ERROR`.

    JSON mode: parse-time errors occur before ``args.json`` exists, so the
    output mode comes from the class-level ``_json_hint``, which :func:`main`
    sets by scanning raw argv for ``--json``. Being a class attribute, it is
    shared by every subparser instance.
    """

    # Set by main() before parsing; read via type(self) in error().
    _json_hint: bool = False

    def error(self, message: str) -> None:  # type: ignore[override]
        """Emit ``message`` as a structured user error, then exit."""
        hint = f"run '{self.prog} --help' to see valid arguments"
        failure = ModelGearError(code=EXIT_USER_ERROR, message=message, remediation=hint)
        emit_error(failure, json_mode=type(self)._json_hint)
        raise SystemExit(failure.code)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _argv_has_json(argv: list[str] | None) -> bool:
    """Report whether ``--json`` or a ``--json=...`` token is among the arguments.

    ``argv=None`` means "inspect ``sys.argv[1:]``".
    """
    if argv is None:
        argv = sys.argv[1:]
    for token in argv:
        if token == "--json" or token.startswith("--json="):
            return True
    return False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``model`` parser with every verb registered.

    The verb modules are imported inside the function and each one's
    ``register(sub)`` is called in a fixed order: model-ops verbs first, then
    the agent-first / introspection verbs.
    """
    from model_gear.cli._commands import (
        assess,
        benchmark,
        cli,
        doctor,
        explain,
        init,
        learn,
        overview,
        serve,
        status,
        stop,
        switch,
        whoami,
    )

    root = _ModelGearArgumentParser(
        prog="model",
        description="model-gear — run, assess, and switch the local vLLM model",
    )
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    # parser_class propagates to every subparser so their .error() routes
    # through _ModelGearArgumentParser too.
    verbs = root.add_subparsers(dest="command", parser_class=_ModelGearArgumentParser)

    registration_order = (
        # Model-ops verbs (the heart of the tool).
        switch,
        serve,
        stop,
        status,
        assess,
        benchmark,
        init,
        # Agent-first / introspection verbs (sibling rubric).
        whoami,
        learn,
        explain,
        overview,
        doctor,
        cli,
    )
    for module in registration_order:
        module.register(verbs)

    return root
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _dispatch(args: argparse.Namespace) -> int:
    """Run ``args.func(args)`` and turn the outcome into an exit code.

    A handler may return ``None`` (success, exit 0) or an ``int`` exit code.
    Failures MUST raise :class:`ModelGearError`; any other exception is
    wrapped into one so no Python traceback leaks. In both failure cases the
    error is emitted (as JSON when ``args.json`` is truthy) and its ``code``
    is returned.
    """
    as_json = bool(getattr(args, "json", False))
    try:
        outcome = args.func(args)
    except ModelGearError as known:
        failure = known
    except Exception as unknown:  # noqa: BLE001 - last-resort; wrap and route cleanly
        failure = ModelGearError(
            code=EXIT_USER_ERROR,
            message=f"unexpected: {unknown.__class__.__name__}: {unknown}",
            remediation="file a bug at https://github.com/agentculture/model-gear/issues",
        )
    else:
        return 0 if outcome is None else outcome
    emit_error(failure, json_mode=as_json)
    return failure.code
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` and run the selected verb.

    With no verb given, prints the top-level help and returns 0; otherwise
    returns the exit code produced by :func:`_dispatch`.
    """
    # Pre-parse peek so argparse-level errors honour --json.
    _ModelGearArgumentParser._json_hint = _argv_has_json(argv)
    parser = _build_parser()
    namespace = parser.parse_args(argv)

    if namespace.command is not None:
        return _dispatch(namespace)

    parser.print_help()
    return 0
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Script entry point: run the CLI and exit with the code main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
|
File without changes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""``model assess`` — correctness probes against the served model.
|
|
2
|
+
|
|
3
|
+
Read-only. Runs the two fixed correctness probes and detects the reasoning-trace
|
|
4
|
+
field, then emits a markdown block (plus host-side facts) ready to paste into a
|
|
5
|
+
per-model doc under ``docs/``. Throughput lives in ``model benchmark``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
|
|
12
|
+
from model_gear import assess as _assess
|
|
13
|
+
from model_gear.cli import _runtime_ops
|
|
14
|
+
from model_gear.cli._output import emit_result
|
|
15
|
+
from model_gear.runtime import _compose, _env
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def cmd_assess(args: argparse.Namespace) -> int:
    """Handle ``model assess``: run the correctness probes and emit the result.

    The model name comes from ``--model``; when that is unset and a deployment
    dir was resolved, ``VLLM_SERVED_NAME`` is read from its env file. Output
    is the probe result plus host-side facts (image, GPU memory), as JSON
    with ``--json`` or as markdown otherwise. Returns 0.
    """
    as_json = bool(getattr(args, "json", False))
    port, deploy_dir = _runtime_ops.resolve_port_soft(args)

    served_name = args.model
    if served_name is None and deploy_dir is not None:
        served_name = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")

    result = _assess.run_correctness(f"http://localhost:{port}", served_name)
    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}

    if not as_json:
        header = (
            "### Host-side\n"
            f"- Image: `{host['image']}` · GPU memory (EngineCore): {host['gpu_memory']}\n"
        )
        emit_result(header + "\n" + _assess.render_correctness(result), json_mode=False)
    else:
        emit_result({**result, "host": host}, json_mode=True)
    return 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``assess`` verb and its options to ``sub``."""
    parser = sub.add_parser(
        "assess",
        help="Correctness probes against the served model (markdown for a per-model doc).",
    )
    # (flag, add_argument keyword arguments), in the order they appear in --help.
    options = (
        ("--port", {"type": int, "help": "Host port (default: VLLM_PORT in .env, else 8000)."}),
        (
            "--model",
            {"help": "Served model name (default: VLLM_SERVED_NAME, else first /v1/models)."},
        ),
        (
            "--compose-dir",
            {"help": "Deployment dir (default: $MODEL_GEAR_DIR or ~/.model-gear)."},
        ),
        ("--json", {"action": "store_true", "help": "Emit structured JSON."}),
    )
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    parser.set_defaults(func=cmd_assess)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""``model benchmark`` — decode throughput + prefill latency for the served model.
|
|
2
|
+
|
|
3
|
+
Read-only. Forces a fixed decode length over a couple of runs and measures a
|
|
4
|
+
large-prompt prefill, then emits a markdown block (plus host-side facts) for a
|
|
5
|
+
per-model doc under ``docs/``. Correctness lives in ``model assess``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
|
|
12
|
+
from model_gear import assess as _assess
|
|
13
|
+
from model_gear.cli import _runtime_ops
|
|
14
|
+
from model_gear.cli._output import emit_result
|
|
15
|
+
from model_gear.runtime import _compose, _env
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def cmd_benchmark(args: argparse.Namespace) -> int:
    """Handle ``model benchmark``: measure throughput/prefill and emit the result.

    The model name comes from ``--model``; when that is unset and a deployment
    dir was resolved, ``VLLM_SERVED_NAME`` is read from its env file. Output
    is the benchmark result plus host-side facts (image, GPU memory), as JSON
    with ``--json`` or as markdown otherwise. Returns 0.
    """
    as_json = bool(getattr(args, "json", False))
    port, deploy_dir = _runtime_ops.resolve_port_soft(args)

    served_name = args.model
    if served_name is None and deploy_dir is not None:
        served_name = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")

    result = _assess.run_benchmark(
        f"http://localhost:{port}",
        served_name,
        decode_tokens=args.decode_tokens,
        runs=args.runs,
    )
    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}

    if not as_json:
        header = (
            "### Host-side\n"
            f"- Image: `{host['image']}` · GPU memory (EngineCore): {host['gpu_memory']}\n"
        )
        emit_result(header + "\n" + _assess.render_benchmark(result), json_mode=False)
    else:
        emit_result({**result, "host": host}, json_mode=True)
    return 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _positive_int(text: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``benchmark`` verb and its options to ``sub``.

    ``--decode-tokens`` and ``--runs`` must be positive: zero runs would
    produce an empty throughput cell, and a non-positive decode length would
    be sent to the server as ``max_tokens``. Invalid values are rejected at
    parse time, so they surface through the parser's own error handling.
    """
    p = sub.add_parser(
        "benchmark",
        help="Decode throughput + prefill latency for the served model (markdown for a doc).",
    )
    p.add_argument("--port", type=int, help="Host port (default: VLLM_PORT in .env, else 8000).")
    p.add_argument(
        "--model", help="Served model name (default: VLLM_SERVED_NAME, else first /v1/models)."
    )
    p.add_argument(
        "--decode-tokens",
        type=_positive_int,
        default=512,
        help="Forced decode length (default 512).",
    )
    p.add_argument(
        "--runs",
        type=_positive_int,
        default=2,
        help="Decode-throughput repetitions (default 2).",
    )
    p.add_argument(
        "--compose-dir", help="Deployment dir (default: $MODEL_GEAR_DIR or ~/.model-gear)."
    )
    p.add_argument("--json", action="store_true", help="Emit structured JSON.")
    p.set_defaults(func=cmd_benchmark)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""``model cli`` — noun grouping CLI-surface introspection.
|
|
2
|
+
|
|
3
|
+
Exists to satisfy the agent-first rubric's ``overview_cli_noun_exists`` check.
|
|
4
|
+
``model cli overview`` describes the CLI surface itself (distinct from the global
|
|
5
|
+
``overview``, which describes the tool and the served model).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
|
|
12
|
+
from model_gear.cli._commands.overview import cli_sections, emit_overview
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cmd_cli_overview(args: argparse.Namespace) -> int:
    """Handle ``model cli overview``: emit the CLI-surface overview and return 0."""
    as_json = bool(getattr(args, "json", False))
    emit_overview("model cli", cli_sections(), json_mode=as_json)
    return 0
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _no_verb(args: argparse.Namespace) -> int:
    """Handle bare ``model cli`` (no sub-verb) by printing the noun's overview."""
    exit_code = cmd_cli_overview(args)
    return exit_code
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``cli`` noun and its ``overview`` sub-verb to ``sub``.

    ``--json`` is accepted both on the noun (``model cli --json``) and on the
    sub-verb (``model cli overview --json``).
    """
    p = sub.add_parser(
        "cli",
        help="CLI-surface introspection (see 'model cli overview').",
    )
    p.add_argument("--json", action="store_true", help="Emit structured JSON.")
    p.set_defaults(func=_no_verb, json=False)
    # `p` is a _ModelGearArgumentParser (the top-level subparsers were built with
    # that parser_class); propagate it so `cli overview` parse errors route through
    # the structured error contract instead of argparse's default stderr/exit 2.
    noun_sub = p.add_subparsers(dest="cli_command", parser_class=type(p))
    ov = noun_sub.add_parser("overview", help="Describe the model-gear CLI surface.")
    # argparse copies every attribute of the sub-verb's namespace onto the
    # parent namespace, so a plain store_true default (False) here would
    # overwrite a `--json` given before the sub-verb (`model cli --json
    # overview`). SUPPRESS leaves `json` unset unless the flag is passed here.
    ov.add_argument(
        "--json",
        action="store_true",
        default=argparse.SUPPRESS,
        help="Emit structured JSON.",
    )
    ov.set_defaults(func=cmd_cli_overview)
|