lobes-cli 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. lobes/__init__.py +11 -0
  2. lobes/__main__.py +8 -0
  3. lobes/_metrics.py +152 -0
  4. lobes/assess.py +404 -0
  5. lobes/catalog.py +225 -0
  6. lobes/cli/__init__.py +169 -0
  7. lobes/cli/_commands/__init__.py +0 -0
  8. lobes/cli/_commands/assess.py +57 -0
  9. lobes/cli/_commands/benchmark.py +96 -0
  10. lobes/cli/_commands/cli.py +38 -0
  11. lobes/cli/_commands/doctor.py +150 -0
  12. lobes/cli/_commands/explain.py +38 -0
  13. lobes/cli/_commands/fleet.py +181 -0
  14. lobes/cli/_commands/init.py +136 -0
  15. lobes/cli/_commands/learn.py +253 -0
  16. lobes/cli/_commands/logs.py +197 -0
  17. lobes/cli/_commands/overview.py +241 -0
  18. lobes/cli/_commands/serve.py +76 -0
  19. lobes/cli/_commands/status.py +66 -0
  20. lobes/cli/_commands/stop.py +48 -0
  21. lobes/cli/_commands/switch.py +528 -0
  22. lobes/cli/_commands/tunnel.py +181 -0
  23. lobes/cli/_commands/whoami.py +130 -0
  24. lobes/cli/_errors.py +42 -0
  25. lobes/cli/_live.py +148 -0
  26. lobes/cli/_output.py +56 -0
  27. lobes/cli/_runtime_ops.py +78 -0
  28. lobes/explain/__init__.py +27 -0
  29. lobes/explain/catalog.py +811 -0
  30. lobes/gateway/__init__.py +20 -0
  31. lobes/gateway/__main__.py +19 -0
  32. lobes/gateway/_config.py +142 -0
  33. lobes/gateway/_routing.py +126 -0
  34. lobes/gateway/server.py +533 -0
  35. lobes/profiles.py +241 -0
  36. lobes/realtime/__init__.py +21 -0
  37. lobes/realtime/__main__.py +13 -0
  38. lobes/realtime/_readiness.py +49 -0
  39. lobes/realtime/_settings.py +92 -0
  40. lobes/realtime/app.py +98 -0
  41. lobes/realtime/audio_facade.py +106 -0
  42. lobes/realtime/chatterbox_server.py +193 -0
  43. lobes/realtime/protocol.py +83 -0
  44. lobes/realtime/tts_client.py +381 -0
  45. lobes/runtime/__init__.py +8 -0
  46. lobes/runtime/_compose.py +394 -0
  47. lobes/runtime/_env.py +70 -0
  48. lobes/runtime/_health.py +60 -0
  49. lobes/runtime/_parser.py +51 -0
  50. lobes/runtime/_tunnel.py +367 -0
  51. lobes/templates/__init__.py +5 -0
  52. lobes/templates/cf-tunnel.env.example +31 -0
  53. lobes/templates/docker-compose.yml +119 -0
  54. lobes/templates/env.example +105 -0
  55. lobes/templates/fleet/Dockerfile.chatterbox +111 -0
  56. lobes/templates/fleet/Dockerfile.gateway +20 -0
  57. lobes/templates/fleet/Dockerfile.parakeet +31 -0
  58. lobes/templates/fleet/Dockerfile.realtime +20 -0
  59. lobes/templates/fleet/__init__.py +6 -0
  60. lobes/templates/fleet/_readiness.py +48 -0
  61. lobes/templates/fleet/docker-compose.audio.yml +144 -0
  62. lobes/templates/fleet/docker-compose.yml +254 -0
  63. lobes/templates/fleet/env.audio.example +40 -0
  64. lobes/templates/fleet/env.example +111 -0
  65. lobes/templates/fleet/listen_server.py +125 -0
  66. lobes/templates/mg-logwrap.sh +46 -0
  67. lobes_cli-0.27.0.dist-info/METADATA +400 -0
  68. lobes_cli-0.27.0.dist-info/RECORD +71 -0
  69. lobes_cli-0.27.0.dist-info/WHEEL +4 -0
  70. lobes_cli-0.27.0.dist-info/entry_points.txt +3 -0
  71. lobes_cli-0.27.0.dist-info/licenses/LICENSE +201 -0
lobes/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """lobes — run, assess, and switch the local vLLM model."""
2
+
3
+ from importlib.metadata import PackageNotFoundError
4
+ from importlib.metadata import version as _v
5
+
6
+ try:
7
+ __version__ = _v("lobes-cli")
8
+ except PackageNotFoundError: # editable install without metadata
9
+ __version__ = "0.0.0+local"
10
+
11
+ __all__ = ["__version__"]
lobes/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Allow running lobes as ``python -m lobes``."""
2
+
3
+ import sys
4
+
5
+ from lobes.cli import main
6
+
7
+ if __name__ == "__main__":
8
+ sys.exit(main())
lobes/_metrics.py ADDED
@@ -0,0 +1,152 @@
1
+ """Parse vLLM Prometheus ``/metrics`` + probe a backend's live state (stdlib only).
2
+
3
+ Shared by the gateway's ``/status`` fan-out and ``lobes overview --live``. The
4
+ parser is pure; the probes are best-effort and **never raise** — an unreachable
5
+ backend folds into a structured result so the live view degrades gracefully
6
+ instead of erroring. vLLM serves ``/metrics`` and ``/health`` unauthenticated, so
7
+ no API key is needed for either.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import math
14
+ import urllib.request
15
+
16
+ # Cap a single GET body so a misbehaving backend can't stress memory/latency. A
17
+ # vLLM /metrics scrape is well under this; /health is tiny.
18
+ _MAX_BODY_BYTES = 5 * 1024 * 1024
19
+
20
+ # The handful of vLLM series the live view reports. "busy" = running/waiting now;
21
+ # "usage" = cumulative tokens + finished requests by reason. Summed across the
22
+ # engine/model labels vLLM attaches (a single backend may expose >1 engine).
23
+ _KV = "vllm:gpu_cache_usage_perc"
24
+ _SUCCESS = "vllm:request_success_total"
25
+ # Series that are simply summed → the live-view field they accumulate into.
26
+ _SUM_FIELDS = {
27
+ "vllm:num_requests_running": "running",
28
+ "vllm:num_requests_waiting": "waiting",
29
+ "vllm:prompt_tokens_total": "prompt_tokens",
30
+ "vllm:generation_tokens_total": "generation_tokens",
31
+ }
32
+
33
+
34
+ def _label(label_block: str, key: str) -> str | None:
35
+ """Extract ``key="value"`` from a Prometheus ``{...}`` label block (best-effort)."""
36
+ needle = f'{key}="'
37
+ start = label_block.find(needle)
38
+ if start < 0:
39
+ return None
40
+ start += len(needle)
41
+ end = label_block.find('"', start)
42
+ return label_block[start:end] if end > start else None
43
+
44
+
45
+ def _iter_samples(text: str):
46
+ """Yield ``(name, labels, value)`` for each finite metric sample line.
47
+
48
+ Skips comments, blanks, malformed lines, and non-finite values (NaN/inf would
49
+ later make ``int()`` raise — the parser is best-effort).
50
+ """
51
+ for raw in text.splitlines():
52
+ line = raw.strip()
53
+ if not line or line.startswith("#"):
54
+ continue
55
+ try:
56
+ left, value = line.rsplit(" ", 1)
57
+ val = float(value)
58
+ except ValueError:
59
+ continue
60
+ if not math.isfinite(val):
61
+ continue
62
+ brace = left.find("{")
63
+ name = left[:brace] if brace >= 0 else left
64
+ labels = left[brace:] if brace >= 0 else ""
65
+ yield name, labels, val
66
+
67
+
68
def parse_metrics(text: str) -> dict:
    """Condense a vLLM ``/metrics`` exposition into the live-view numbers.

    Series listed in ``_SUM_FIELDS`` are summed across all their samples and
    reported as ints. ``vllm:request_success_total`` samples are grouped by their
    ``finished_reason`` label (``"?"`` when the label is missing) into
    ``by_finish_reason`` (zero counts dropped) and totalled into
    ``requests_succeeded``. ``kv_cache_usage`` — the highest KV-cache gauge value
    seen, rounded to 3 decimals — is present only when that gauge appeared.
    Lines the sample iterator rejects are ignored, so a partial scrape still
    yields what it can.
    """
    totals = {field: 0.0 for field in _SUM_FIELDS.values()}
    finished: dict[str, float] = {}
    kv_peak: float | None = None

    for name, labels, val in _iter_samples(text):
        target = _SUM_FIELDS.get(name)
        if target is not None:
            totals[target] += val
            continue
        if name == _KV:
            kv_peak = val if kv_peak is None else max(kv_peak, val)
            continue
        if name == _SUCCESS:
            reason = _label(labels, "finished_reason") or "?"
            finished[reason] = finished.get(reason, 0.0) + val

    report = {
        "running": int(totals["running"]),
        "waiting": int(totals["waiting"]),
        "prompt_tokens": int(totals["prompt_tokens"]),
        "generation_tokens": int(totals["generation_tokens"]),
        "requests_succeeded": int(sum(finished.values())),
        "by_finish_reason": {
            reason: int(count) for reason, count in finished.items() if count
        },
    }
    if kv_peak is None:
        return report
    report["kv_cache_usage"] = round(kv_peak, 3)
    return report
98
+
99
+
100
def http_get_text(
    url: str, *, timeout: float = 3.0, max_bytes: int = _MAX_BODY_BYTES
) -> str | None:
    """Best-effort GET → body text, or ``None`` if unreachable / non-2xx / oversized.

    Reads at most ``max_bytes`` (+1 to detect overflow): an over-cap body is treated
    as unavailable rather than buffered whole, so a misbehaving backend can't stress
    memory. The body is decoded as UTF-8 with undecodable bytes replaced.

    Never raises for transport problems: besides ``OSError`` (which covers
    ``URLError``/``HTTPError``/timeouts) and ``ValueError`` (malformed URL), the
    protocol-level ``http.client.HTTPException`` family — e.g. a garbage status
    line or a truncated body — is also folded into ``None``.
    """
    # Function-scope import: http.client is already loaded by urllib.request, and
    # this keeps the block self-contained.
    import http.client

    try:
        with urllib.request.urlopen(
            url, timeout=timeout
        ) as r:  # nosec B310 - http(s) only, fixed scheme
            if not (200 <= r.status < 300):
                return None
            data = r.read(max_bytes + 1)
    except (OSError, ValueError, http.client.HTTPException):
        # URLError is an OSError subclass — covered. HTTPException is NOT an
        # OSError, so it must be listed explicitly to keep the never-raises promise.
        return None
    if len(data) > max_bytes:
        return None  # oversized → best-effort fail rather than buffer it whole
    return data.decode("utf-8", errors="replace")
121
+
122
+
123
def http_get_json(url: str, *, timeout: float = 3.0) -> dict | None:
    """Best-effort GET that returns the body parsed as a JSON object.

    Yields ``None`` when the fetch itself yields nothing, when the body is not
    valid JSON, or when the decoded document is anything other than a dict.
    """
    body = http_get_text(url, timeout=timeout)
    if body is None:
        return None
    try:
        parsed = json.loads(body)
    except (ValueError, TypeError):
        return None
    if not isinstance(parsed, dict):
        return None
    return parsed
133
+
134
+
135
def health_ok(base_url: str, *, timeout: float = 3.0) -> bool:
    """Report whether a GET of ``<base_url>/health`` produced a body (i.e. 2xx)."""
    health_url = base_url.rstrip("/") + "/health"
    body = http_get_text(health_url, timeout=timeout)
    return body is not None
138
+
139
+
140
def probe_backend(base_url: str, *, timeout: float = 3.0) -> dict:
    """Return the live ``{health, metrics}`` snapshot for one backend base URL.

    ``health`` is ``"ok"`` or ``"unreachable"``. ``metrics`` is the parsed
    ``/metrics`` dict, or ``None`` when the scrape returned nothing. When the
    health check fails, ``/metrics`` is not requested at all, so a dead backend
    costs a single timeout instead of two.
    """
    root = base_url.rstrip("/")
    if health_ok(root, timeout=timeout):
        scrape = http_get_text(root + "/metrics", timeout=timeout)
        metrics = None if scrape is None else parse_metrics(scrape)
        return {"health": "ok", "metrics": metrics}
    return {"health": "unreachable", "metrics": None}
lobes/assess.py ADDED
@@ -0,0 +1,404 @@
1
+ """API-side assessment and benchmark of a vLLM-served model (stdlib only).
2
+
3
+ Talks only to the OpenAI-compatible endpoint (``urllib``, no third-party deps).
4
+ Ported from the original ``_assess.py`` and split into two concerns:
5
+
6
+ * :func:`run_correctness` — fixed correctness probes + reasoning-trace detection
7
+ (drives ``lobes assess``);
8
+ * :func:`run_benchmark` — decode throughput + prefill latency (drives
9
+ ``lobes benchmark``).
10
+
11
+ Host-side facts (image tag, GPU memory) are gathered by the command handlers via
12
+ :mod:`lobes.runtime._compose` and printed alongside this output.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import contextlib
18
+ import json
19
+ import time
20
+ import urllib.error
21
+ import urllib.request
22
+
23
+ from lobes.cli._errors import EXIT_ENV_ERROR, ModelGearError
24
+
25
+ # urllib.error.URLError is a subclass of OSError, so `except OSError` covers
26
+ # connection failures, timeouts, and HTTPError without listing it redundantly.
27
+
28
+
29
@contextlib.contextmanager
def _api_errors(what: str):
    """Turn raw HTTP / JSON / response-shape failures into a structured error.

    Without this, an ``HTTPError``/``URLError`` or an unexpected payload
    (``KeyError``/``JSONDecodeError``) bubbles to the dispatcher's catch-all and
    appears as ``unexpected: ...`` with no remediation.

    ``what`` names the operation and prefixes the error message. A
    ``ModelGearError`` raised inside the block passes through untouched.
    ``OSError`` becomes a "failed" error; JSON-decoding and shape errors —
    including ``AttributeError``, raised when the payload holds ``null`` or a
    scalar where an object is expected (e.g. ``"usage": null`` followed by
    ``.get(...)``) — become an "unexpected response shape" error.
    """
    try:
        yield
    except ModelGearError:
        raise
    except OSError as exc:
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"{what} failed: {exc}",
            remediation="check 'lobes status' / 'docker logs model-gear-vllm'",
        ) from exc
    except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError) as exc:
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"{what}: unexpected response shape ({exc.__class__.__name__}: {exc})",
            remediation="the served model returned an unexpected payload; check the vLLM logs",
        ) from exc
53
+
54
+
55
+ # (prompt, expected-substring, table-label) — the two fixed correctness probes.
56
+ _PROBES = [
57
+ ("What is 17 * 23?", "391", "`17 * 23 = 391`"),
58
+ (
59
+ "If a train leaves at 14:45 and arrives at 17:10, how long is the journey in minutes?",
60
+ "145",
61
+ "train 14:45→17:10 = 145 min",
62
+ ),
63
+ ]
64
+
65
+ # Tool-calling probe (opt-in via ``lobes assess --tools``): mirrors issue #9's
66
+ # acceptance check — a ``tool_choice:"auto"`` request must return a ``tool_calls``
67
+ # array naming the ``finish`` function. Requires the server's
68
+ # ``--enable-auto-tool-choice`` + ``--tool-call-parser`` flags.
69
+ _TOOL_PROBE_PROMPT = "Call the finish tool with summary hello."
70
+ _TOOL_PROBE_TOOLS = [
71
+ {
72
+ "type": "function",
73
+ "function": {
74
+ "name": "finish",
75
+ "description": "Finish the task with a short summary.",
76
+ "parameters": {
77
+ "type": "object",
78
+ "properties": {"summary": {"type": "string"}},
79
+ "required": ["summary"],
80
+ },
81
+ },
82
+ }
83
+ ]
84
+
85
+
86
def _post(url: str, payload: dict, timeout: int = 300) -> dict:
    """POST ``payload`` as JSON to ``<url>/v1/chat/completions``; return the decoded reply.

    Transport errors and JSON decoding errors propagate to the caller.
    """
    request = urllib.request.Request(
        url + "/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:  # local endpoint only
        return json.load(response)
95
+
96
+
97
def _get(url: str, path: str, timeout: int = 10):
    """GET ``url + path`` and return ``(status, body)``.

    The body is decoded JSON when the response's content type starts with
    ``application/json``; otherwise it is the decoded response text.
    """
    with urllib.request.urlopen(url + path, timeout=timeout) as response:  # local endpoint only
        content_type = response.headers.get("content-type", "")
        if content_type.startswith("application/json"):
            return response.status, json.load(response)
        return response.status, response.read().decode()
102
+
103
+
104
+ def _trace_field(msg: dict) -> tuple[str | None, int]:
105
+ """Return ``(field_name, length)`` of the reasoning trace, whichever key holds it.
106
+
107
+ vLLM builds vary: the ``<think>`` trace lands in ``reasoning`` on the nv26.04
108
+ image, ``reasoning_content`` on older builds.
109
+ """
110
+ for key in ("reasoning", "reasoning_content"):
111
+ val = msg.get(key)
112
+ if isinstance(val, str) and val:
113
+ return key, len(val)
114
+ return None, 0
115
+
116
+
117
def health_status(url: str) -> int:
    """Fetch ``<url>/health`` and return its HTTP status code.

    Raises ``ModelGearError`` (``EXIT_ENV_ERROR``) when the request fails with an
    ``OSError``, i.e. the endpoint cannot be reached.
    """
    try:
        code, _body = _get(url, "/health")
    except OSError as exc:
        raise ModelGearError(
            code=EXIT_ENV_ERROR,
            message=f"/health unreachable at {url} ({exc})",
            remediation="start the server with 'lobes serve --apply'",
        ) from exc
    else:
        return code
128
+
129
+
130
def served_model(url: str, override: str | None = None) -> tuple[str, object]:
    """Look up the served model via ``/v1/models``.

    Returns ``(model_id, max_model_len)`` taken from the first listed entry;
    ``override``, when given, replaces the reported id. Raises ``ModelGearError``
    when the listing is missing or empty; transport and payload-shape failures
    are converted by ``_api_errors``.
    """
    with _api_errors("/v1/models"):
        _status, listing = _get(url, "/v1/models")
        entries = listing.get("data") if isinstance(listing, dict) else None
        if not entries:
            raise ModelGearError(
                code=EXIT_ENV_ERROR,
                message=f"/v1/models returned no models at {url}",
                remediation="check 'lobes status' / 'docker logs model-gear-vllm'",
            )
        head = entries[0]
        model_id = override or head["id"]
        return model_id, head.get("max_model_len")
143
+
144
+
145
def _probe(url: str, model: str, prompt: str, expect: str) -> dict:
    """Send one correctness prompt and grade the reply.

    ``ok`` is true when ``expect`` appears as a substring of the reply content.
    The result also records which reasoning-trace field was present and its
    length, the finish reason, and the completion-token count when reported.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 2048,
        "temperature": 0.3,
    }
    reply = _post(url, request)
    message = reply["choices"][0]["message"]
    answer = message.get("content") or ""
    trace_name, trace_size = _trace_field(message)
    return {
        "ok": expect in answer,
        "expect": expect,
        "trace_field": trace_name,
        "trace_len": trace_size,
        "finish": reply["choices"][0].get("finish_reason"),
        "completion_tokens": reply.get("usage", {}).get("completion_tokens"),
    }
166
+
167
+
168
def _tool_probe(url: str, model: str) -> dict:
    """Probe OpenAI tool calling; degrade gracefully, never abort the assess run.

    Sends a ``tool_choice:"auto"`` request offering the ``finish`` tool and
    returns ``{"ok", "tool_calls", "finish", "error"}``. ``ok`` is true when a
    call named ``finish`` came back. An ``HTTPError`` (e.g. a server lacking the
    auto-tool-choice flags answering 400) and a 200 reply with an unexpected
    shape are both reported as ``ok=False`` with an ``error`` string instead of
    raising. Other transport errors from the request still propagate.
    """

    def _failed(reason: str) -> dict:
        # Uniform shape for every failure outcome.
        return {"ok": False, "tool_calls": [], "finish": None, "error": reason}

    request = {
        "model": model,
        "messages": [{"role": "user", "content": _TOOL_PROBE_PROMPT}],
        "tools": _TOOL_PROBE_TOOLS,
        "tool_choice": "auto",
        "max_tokens": 512,
        "temperature": 0,
    }
    try:
        reply = _post(url, request)
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode(errors="replace").strip()
        return _failed(f"HTTP {exc.code}: {detail[:200]}")

    # Parse defensively: every level is type-checked, and any remaining shape
    # surprise is caught below and turned into a failure result.
    try:
        choices = reply.get("choices") if isinstance(reply, dict) else None
        choice = choices[0] if isinstance(choices, list) and choices else {}
        message = choice.get("message") or {}
        listed = message.get("tool_calls")
        called = []
        for call in listed if isinstance(listed, list) else []:
            if not isinstance(call, dict):
                continue
            function = call.get("function")
            if not isinstance(function, dict):
                continue
            tool_name = function.get("name")
            if tool_name:
                called.append(tool_name)
        return {
            "ok": "finish" in called,
            "tool_calls": called,
            "finish": choice.get("finish_reason"),
            "error": None,
        }
    except (KeyError, IndexError, TypeError, AttributeError) as exc:
        return _failed(f"unexpected response shape ({exc.__class__.__name__}: {exc})")
223
+
224
+
225
def probe_tool_calls(url: str, model: str) -> dict:
    """One-shot tool-calling probe, without the arithmetic correctness probes.

    Used by ``lobes switch`` / ``lobes serve`` to verify, the moment the
    container is healthy, that ``tool_choice:"auto"`` returns a ``tool_calls``
    response (no HTTP 400, a ``finish`` call present). Returns the same
    structured dict as the in-``assess`` probe (``ok``/``tool_calls``/``finish``/
    ``error``).

    Never raises for request failures. ``_tool_probe`` already folds HTTP errors
    and malformed-200 payloads into ``ok=False``; what it lets through is caught
    here and likewise returned as a structured ``ok=False``:

    * ``OSError`` — connection failure or timeout;
    * ``ValueError`` — an undecodable body (``JSONDecodeError`` and
      ``UnicodeDecodeError`` are both subclasses) or a URL that ``urlopen``
      rejects (e.g. a missing scheme);
    * ``http.client.HTTPException`` — a protocol-level failure such as a bad
      status line or truncated body, which is not an ``OSError``.
    """
    # Function-scope import: already loaded by urllib, keeps the block self-contained.
    import http.client

    try:
        return _tool_probe(url.rstrip("/"), model)
    except (OSError, ValueError, http.client.HTTPException) as exc:
        return {"ok": False, "tool_calls": [], "finish": None, "error": f"probe failed: {exc}"}
245
+
246
+
247
def _decode_throughput(url: str, model: str, n_tokens: int, runs: int = 2) -> list[float]:
    """Time ``runs`` greedy generations and return tokens/second for each.

    Every request sets ``max_tokens`` to ``n_tokens`` with ``ignore_eos`` on. The
    rate is the reported ``completion_tokens`` divided by the wall-clock time of
    the whole request, rounded to one decimal.
    """
    rates: list[float] = []
    for _run in range(runs):
        started = time.monotonic()
        reply = _post(
            url,
            {
                "model": model,
                "messages": [
                    {"role": "user", "content": "Write a detailed essay about distributed systems."}
                ],
                "max_tokens": n_tokens,
                "temperature": 0,
                "ignore_eos": True,
            },
        )
        elapsed = time.monotonic() - started
        generated = reply["usage"]["completion_tokens"]
        rates.append(round(generated / elapsed, 1))
    return rates
267
+
268
+
269
def _prefill(url: str, model: str, input_len: int = 2000) -> dict:
    """Time one long-prompt request that generates at most 16 tokens.

    The prompt repeats a short filler phrase ``input_len // 6`` times (at least
    once), on the estimate of roughly six tokens per phrase. The returned dict
    holds the server-reported ``prompt_tokens`` and the elapsed ``seconds``
    (rounded to 2 decimals), so the estimate only needs to be close.
    """
    repeats = max(1, input_len // 6)
    long_prompt = "".join(["Summarize this. ", "The system processes events. " * repeats])
    started = time.monotonic()
    reply = _post(
        url,
        {
            "model": model,
            "messages": [{"role": "user", "content": long_prompt}],
            "max_tokens": 16,
            "temperature": 0,
        },
    )
    elapsed = time.monotonic() - started
    return {"prompt_tokens": reply["usage"]["prompt_tokens"], "seconds": round(elapsed, 2)}
287
+
288
+
289
def run_correctness(url: str, model: str | None = None, check_tools: bool = False) -> dict:
    """Run the fixed correctness probes against ``url`` and return a structured result.

    The endpoint's health and served model are resolved first. Each entry of
    ``_PROBES`` is then sent and graded. With ``check_tools`` set, the
    tool-calling probe also runs and its result is stored under
    ``tool_calling`` (``None`` otherwise). ``passed`` is computed from the
    content probes alone, so the tool-calling outcome never affects it.
    """
    endpoint = url.rstrip("/")
    health = health_status(endpoint)
    model_id, max_len = served_model(endpoint, model)

    probes = []
    tool_result = None
    with _api_errors("correctness probe"):
        for prompt, expect, label in _PROBES:
            outcome = _probe(endpoint, model_id, prompt, expect)
            outcome["label"] = label
            probes.append(outcome)
        if check_tools:
            tool_result = _tool_probe(endpoint, model_id)

    # First probe that saw a trace names the field; the longest trace gives the length.
    seen_fields = [p["trace_field"] for p in probes if p["trace_field"]]
    first_field = seen_fields[0] if seen_fields else None
    longest_trace = max((p["trace_len"] for p in probes), default=0)
    return {
        "model": model_id,
        "endpoint": endpoint,
        "health": health,
        "max_model_len": max_len,
        "probes": probes,
        "trace_field": first_field or "(none)",
        "trace_len": longest_trace,
        "passed": all(p["ok"] for p in probes),
        "tool_calling": tool_result,
    }
321
+
322
+
323
def run_benchmark(
    url: str,
    model: str | None = None,
    *,
    purpose: str = "balanced",
    input_len: int = 1000,
    output_len: int = 1000,
    runs: int = 2,
) -> dict:
    """Measure decode throughput and prefill latency for one workload shape.

    ``output_len`` sets how many tokens each of the ``runs`` decode requests
    generates; ``input_len`` sizes the prefill prompt. ``purpose`` is not used in
    any measurement — it is echoed back in the result as the label for this
    shape. The endpoint's health and served model are checked before measuring.
    """
    endpoint = url.rstrip("/")
    health_status(endpoint)
    model_id, max_len = served_model(endpoint, model)
    with _api_errors("benchmark"):
        decode_rates = _decode_throughput(endpoint, model_id, output_len, runs)
        prefill = _prefill(endpoint, model_id, input_len)
    return {
        "model": model_id,
        "endpoint": endpoint,
        "max_model_len": max_len,
        "purpose": purpose,
        "input_len": input_len,
        "output_len": output_len,
        "decode_rates": decode_rates,
        "prefill": prefill,
    }
354
+
355
+
356
def render_correctness(result: dict) -> str:
    """Format a :func:`run_correctness` result as a markdown section.

    Produces a heading, an endpoint summary line, and a two-column table with
    one PASS/FAIL row per probe, a row for the reasoning-trace field, and — only
    when ``tool_calling`` is present and not ``None`` — a tool-calling row.
    """
    rows = [
        f"## Assessment — `{result['model']}`",
        "",
        f"- Endpoint: `{result['endpoint']}` · `/health` {result['health']} · "
        f"`max_model_len` {result['max_model_len']}",
        "",
        "| Check | Result |",
        "|---|---|",
    ]
    rows.extend(
        f"| {probe['label']} | {'PASS' if probe['ok'] else 'FAIL'} "
        f"(finish={probe['finish']}, {probe['completion_tokens']} tok) |"
        for probe in result["probes"]
    )
    rows.append(
        f"| reasoning trace field | `{result['trace_field']}` (len {result['trace_len']}) |"
    )

    tool = result.get("tool_calling")
    if tool is None:
        return "\n".join(rows)

    if tool["ok"]:
        verdict = f"PASS — called {', '.join(tool['tool_calls'])}"
    else:
        # Prefer the recorded error; otherwise show which calls were returned.
        reason = tool.get("error") or f"no finish call (tool_calls={tool['tool_calls']})"
        verdict = "FAIL — " + reason
    rows.append(f"| tool calling (`tool_choice:auto`) | {verdict} |")
    return "\n".join(rows)
385
+
386
+
387
def render_benchmark(result: dict) -> str:
    """Format a :func:`run_benchmark` result as a markdown section.

    Produces a heading naming the model and purpose, an endpoint/shape summary
    line, and a two-row table: decode throughput (per-run rates joined with
    ``/``) and prefill timing.
    """
    joined_rates = "/".join(map(str, result["decode_rates"]))
    prefill = result["prefill"]
    heading = f"## Benchmark — `{result['model']}` ({result['purpose']})"
    summary = (
        f"- Endpoint: `{result['endpoint']}` · `max_model_len` {result['max_model_len']} · "
        f"shape {result['input_len']} in / {result['output_len']} out"
    )
    decode_row = (
        f"| **decode throughput** | **{joined_rates} tok/s** (batch=1, greedy, "
        f"{result['output_len']} tok forced) |"
    )
    prefill_row = (
        f"| prefill | {prefill['prompt_tokens']} prompt tokens + 16 gen in {prefill['seconds']} s |"
    )
    table_head = ["| Metric | Result |", "|---|---|"]
    return "\n".join([heading, "", summary, "", *table_head, decode_row, prefill_row])