PyPI - lobes-cli - Versions diffs - 0.27.0__py3-none-any.whl - Mend

lobes-cli 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71)

lobes/__init__.py +11 -0
lobes/__main__.py +8 -0
lobes/_metrics.py +152 -0
lobes/assess.py +404 -0
lobes/catalog.py +225 -0
lobes/cli/__init__.py +169 -0
lobes/cli/_commands/__init__.py +0 -0
lobes/cli/_commands/assess.py +57 -0
lobes/cli/_commands/benchmark.py +96 -0
lobes/cli/_commands/cli.py +38 -0
lobes/cli/_commands/doctor.py +150 -0
lobes/cli/_commands/explain.py +38 -0
lobes/cli/_commands/fleet.py +181 -0
lobes/cli/_commands/init.py +136 -0
lobes/cli/_commands/learn.py +253 -0
lobes/cli/_commands/logs.py +197 -0
lobes/cli/_commands/overview.py +241 -0
lobes/cli/_commands/serve.py +76 -0
lobes/cli/_commands/status.py +66 -0
lobes/cli/_commands/stop.py +48 -0
lobes/cli/_commands/switch.py +528 -0
lobes/cli/_commands/tunnel.py +181 -0
lobes/cli/_commands/whoami.py +130 -0
lobes/cli/_errors.py +42 -0
lobes/cli/_live.py +148 -0
lobes/cli/_output.py +56 -0
lobes/cli/_runtime_ops.py +78 -0
lobes/explain/__init__.py +27 -0
lobes/explain/catalog.py +811 -0
lobes/gateway/__init__.py +20 -0
lobes/gateway/__main__.py +19 -0
lobes/gateway/_config.py +142 -0
lobes/gateway/_routing.py +126 -0
lobes/gateway/server.py +533 -0
lobes/profiles.py +241 -0
lobes/realtime/__init__.py +21 -0
lobes/realtime/__main__.py +13 -0
lobes/realtime/_readiness.py +49 -0
lobes/realtime/_settings.py +92 -0
lobes/realtime/app.py +98 -0
lobes/realtime/audio_facade.py +106 -0
lobes/realtime/chatterbox_server.py +193 -0
lobes/realtime/protocol.py +83 -0
lobes/realtime/tts_client.py +381 -0
lobes/runtime/__init__.py +8 -0
lobes/runtime/_compose.py +394 -0
lobes/runtime/_env.py +70 -0
lobes/runtime/_health.py +60 -0
lobes/runtime/_parser.py +51 -0
lobes/runtime/_tunnel.py +367 -0
lobes/templates/__init__.py +5 -0
lobes/templates/cf-tunnel.env.example +31 -0
lobes/templates/docker-compose.yml +119 -0
lobes/templates/env.example +105 -0
lobes/templates/fleet/Dockerfile.chatterbox +111 -0
lobes/templates/fleet/Dockerfile.gateway +20 -0
lobes/templates/fleet/Dockerfile.parakeet +31 -0
lobes/templates/fleet/Dockerfile.realtime +20 -0
lobes/templates/fleet/__init__.py +6 -0
lobes/templates/fleet/_readiness.py +48 -0
lobes/templates/fleet/docker-compose.audio.yml +144 -0
lobes/templates/fleet/docker-compose.yml +254 -0
lobes/templates/fleet/env.audio.example +40 -0
lobes/templates/fleet/env.example +111 -0
lobes/templates/fleet/listen_server.py +125 -0
lobes/templates/mg-logwrap.sh +46 -0
lobes_cli-0.27.0.dist-info/METADATA +400 -0
lobes_cli-0.27.0.dist-info/RECORD +71 -0
lobes_cli-0.27.0.dist-info/WHEEL +4 -0
lobes_cli-0.27.0.dist-info/entry_points.txt +3 -0
lobes_cli-0.27.0.dist-info/licenses/LICENSE +201 -0

lobes/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""lobes — run, assess, and switch the local vLLM model."""
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version as _v
+# Resolve the package version from the installed distribution's metadata.
+try:
+    __version__ = _v("lobes-cli")
+except PackageNotFoundError:  # editable install without metadata
+    # No metadata to read: fall back to a fixed placeholder so importing the
+    # package does not fail.
+    __version__ = "0.0.0+local"
+# Only the version string is exported from the package root.
+__all__ = ["__version__"]

lobes/__main__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Allow running lobes as ``python -m lobes``."""
+import sys
+from lobes.cli import main
+# Entry point for ``python -m lobes``: pass whatever ``main()`` returns to
+# ``sys.exit`` so it becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

lobes/_metrics.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""Parse vLLM Prometheus ``/metrics`` + probe a backend's live state (stdlib only).
+Shared by the gateway's ``/status`` fan-out and ``lobes overview --live``. The
+parser is pure; the probes are best-effort and **never raise** — an unreachable
+backend folds into a structured result so the live view degrades gracefully
+instead of erroring. vLLM serves ``/metrics`` and ``/health`` unauthenticated, so
+no API key is needed for either.
+"""
+from __future__ import annotations
+import json
+import math
+import urllib.request
+# Cap a single GET body so a misbehaving backend can't stress memory/latency. A
+# vLLM /metrics scrape is well under this; /health is tiny.
+_MAX_BODY_BYTES = 5 * 1024 * 1024
+# The handful of vLLM series the live view reports. "busy" = running/waiting now;
+# "usage" = cumulative tokens + finished requests by reason. Summed across the
+# engine/model labels vLLM attaches (a single backend may expose >1 engine).
+# KV-cache usage gauge: the parser keeps the maximum sample rather than a sum.
+_KV = "vllm:gpu_cache_usage_perc"
+# Finished-request counter: the parser buckets it by its ``finished_reason`` label.
+_SUCCESS = "vllm:request_success_total"
+# Series that are simply summed → the live-view field they accumulate into.
+_SUM_FIELDS = {
+    "vllm:num_requests_running": "running",
+    "vllm:num_requests_waiting": "waiting",
+    "vllm:prompt_tokens_total": "prompt_tokens",
+    "vllm:generation_tokens_total": "generation_tokens",
+}
+def _label(label_block: str, key: str) -> str | None:
+    """Extract ``key="value"`` from a Prometheus ``{...}`` label block (best-effort)."""
+    needle = f'{key}="'
+    start = label_block.find(needle)
+    if start < 0:
+        return None
+    start += len(needle)
+    end = label_block.find('"', start)
+    return label_block[start:end] if end > start else None
+def _iter_samples(text: str):
+    """Yield ``(name, labels, value)`` for each finite metric sample line.
+    Skips comments, blanks, malformed lines, and non-finite values (NaN/inf would
+    later make ``int()`` raise — the parser is best-effort).
+    """
+    for raw in text.splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#"):
+            continue
+        try:
+            left, value = line.rsplit(" ", 1)
+            val = float(value)
+        except ValueError:
+            continue
+        if not math.isfinite(val):
+            continue
+        brace = left.find("{")
+        name = left[:brace] if brace >= 0 else left
+        labels = left[brace:] if brace >= 0 else ""
+        yield name, labels, val
+def parse_metrics(text: str) -> dict:
+    """Reduce a vLLM ``/metrics`` exposition to the live-view numbers.
+    Returns ints for counts/tokens and a ``by_finish_reason`` map; ``kv_cache_usage``
+    (0..1) is included only when the gauge is present. Unknown/malformed lines are
+    skipped, so a partial scrape still yields what it can.
+    """
+    sums = dict.fromkeys(_SUM_FIELDS.values(), 0.0)
+    kv: float | None = None
+    by_reason: dict[str, float] = {}
+    for name, labels, val in _iter_samples(text):
+        field = _SUM_FIELDS.get(name)
+        if field is not None:
+            sums[field] += val
+        elif name == _KV:
+            kv = val if kv is None else max(kv, val)
+        elif name == _SUCCESS:
+            reason = _label(labels, "finished_reason") or "?"
+            by_reason[reason] = by_reason.get(reason, 0.0) + val
+    out = {
+        "running": int(sums["running"]),
+        "waiting": int(sums["waiting"]),
+        "prompt_tokens": int(sums["prompt_tokens"]),
+        "generation_tokens": int(sums["generation_tokens"]),
+        "requests_succeeded": int(sum(by_reason.values())),
+        "by_finish_reason": {k: int(v) for k, v in by_reason.items() if v},
+    }
+    if kv is not None:
+        out["kv_cache_usage"] = round(kv, 3)
+    return out
+def http_get_text(
+    url: str, *, timeout: float = 3.0, max_bytes: int = _MAX_BODY_BYTES
+) -> str | None:
+    """Best-effort GET → body text, or ``None`` if unreachable / non-2xx / oversized.
+    Reads at most ``max_bytes`` (+1 to detect overflow): an over-cap body is treated
+    as unavailable rather than buffered whole, so a misbehaving backend can't stress
+    memory. Never raises.
+    """
+    try:
+        with urllib.request.urlopen(
+            url, timeout=timeout
+        ) as r:  # nosec B310 - http(s) only, fixed scheme
+            if not (200 <= r.status < 300):
+                return None
+            data = r.read(max_bytes + 1)
+            if len(data) > max_bytes:
+                return None  # oversized → best-effort fail rather than buffer it whole
+            return data.decode("utf-8", errors="replace")
+    except (OSError, ValueError):  # URLError is an OSError subclass — covered
+        return None
+def http_get_json(url: str, *, timeout: float = 3.0) -> dict | None:
+    """Best-effort GET → parsed JSON dict, or ``None`` (unreachable / non-dict). Never raises."""
+    text = http_get_text(url, timeout=timeout)
+    if text is None:
+        return None
+    try:
+        data = json.loads(text)
+    except (ValueError, TypeError):
+        return None
+    return data if isinstance(data, dict) else None
+def health_ok(base_url: str, *, timeout: float = 3.0) -> bool:
+    """True when ``<base_url>/health`` returns 2xx."""
+    return http_get_text(base_url.rstrip("/") + "/health", timeout=timeout) is not None
+def probe_backend(base_url: str, *, timeout: float = 3.0) -> dict:
+    """Live ``{health, metrics}`` for one backend base URL (best-effort, never raises).
+    ``health`` is ``"ok"`` / ``"unreachable"``; ``metrics`` is the parsed dict, or
+    ``None`` when ``/metrics`` is unreachable (an engine can be loading or down).
+    """
+    base = base_url.rstrip("/")
+    if not health_ok(base, timeout=timeout):
+        # Short-circuit: a down backend has no useful /metrics, so skip the second
+        # request (halves the timeout cost for a dead backend).
+        return {"health": "unreachable", "metrics": None}
+    raw = http_get_text(base + "/metrics", timeout=timeout)
+    return {"health": "ok", "metrics": parse_metrics(raw) if raw is not None else None}

lobes/assess.py ADDED Viewed

@@ -0,0 +1,404 @@
+"""API-side assessment and benchmark of a vLLM-served model (stdlib only).
+Talks only to the OpenAI-compatible endpoint (``urllib``, no third-party deps).
+Ported from the original ``_assess.py`` and split into two concerns:
+* :func:`run_correctness` — fixed correctness probes + reasoning-trace detection
+  (drives ``lobes assess``);
+* :func:`run_benchmark` — decode throughput + prefill latency (drives
+  ``lobes benchmark``).
+Host-side facts (image tag, GPU memory) are gathered by the command handlers via
+:mod:`lobes.runtime._compose` and printed alongside this output.
+"""
+from __future__ import annotations
+import contextlib
+import json
+import time
+import urllib.error
+import urllib.request
+from lobes.cli._errors import EXIT_ENV_ERROR, ModelGearError
+# urllib.error.URLError is a subclass of OSError, so `except OSError` covers
+# connection failures, timeouts, and HTTPError without listing it redundantly.
+@contextlib.contextmanager
+def _api_errors(what: str):
+    """Turn raw HTTP / JSON / response-shape failures into a structured error.
+    Without this, an ``HTTPError``/``URLError`` or an unexpected payload
+    (``KeyError``/``JSONDecodeError``) bubbles to the dispatcher's catch-all and
+    appears as ``unexpected: ...`` with no remediation.
+    """
+    try:
+        yield
+    except ModelGearError:
+        raise
+    except OSError as exc:
+        raise ModelGearError(
+            code=EXIT_ENV_ERROR,
+            message=f"{what} failed: {exc}",
+            remediation="check 'lobes status' / 'docker logs model-gear-vllm'",
+        ) from exc
+    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as exc:
+        raise ModelGearError(
+            code=EXIT_ENV_ERROR,
+            message=f"{what}: unexpected response shape ({exc.__class__.__name__}: {exc})",
+            remediation="the served model returned an unexpected payload; check the vLLM logs",
+        ) from exc
+# (prompt, expected-substring, table-label) — the two fixed correctness probes.
+# A probe passes when the expected substring occurs in the reply's ``content``.
+_PROBES = [
+    ("What is 17 * 23?", "391", "`17 * 23 = 391`"),
+    (
+        "If a train leaves at 14:45 and arrives at 17:10, how long is the journey in minutes?",
+        "145",
+        "train 14:45→17:10 = 145 min",
+    ),
+]
+# Tool-calling probe (opt-in via ``lobes assess --tools``): mirrors issue #9's
+# acceptance check — a ``tool_choice:"auto"`` request must return a ``tool_calls``
+# array naming the ``finish`` function. Requires the server's
+# ``--enable-auto-tool-choice`` + ``--tool-call-parser`` flags.
+_TOOL_PROBE_PROMPT = "Call the finish tool with summary hello."
+# The single tool offered to the model: ``finish(summary: str)``.
+_TOOL_PROBE_TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "finish",
+            "description": "Finish the task with a short summary.",
+            "parameters": {
+                "type": "object",
+                "properties": {"summary": {"type": "string"}},
+                "required": ["summary"],
+            },
+        },
+    }
+]
+def _post(url: str, payload: dict, timeout: int = 300) -> dict:
+    data = json.dumps(payload).encode()
+    req = urllib.request.Request(
+        url + "/v1/chat/completions",
+        data=data,
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as r:  # local endpoint only
+        return json.load(r)
+def _get(url: str, path: str, timeout: int = 10):
+    with urllib.request.urlopen(url + path, timeout=timeout) as r:  # local endpoint only
+        if r.headers.get("content-type", "").startswith("application/json"):
+            return r.status, json.load(r)
+        return r.status, r.read().decode()
+def _trace_field(msg: dict) -> tuple[str | None, int]:
+    """Return ``(field_name, length)`` of the reasoning trace, whichever key holds it.
+    vLLM builds vary: the ``<think>`` trace lands in ``reasoning`` on the nv26.04
+    image, ``reasoning_content`` on older builds.
+    """
+    for key in ("reasoning", "reasoning_content"):
+        val = msg.get(key)
+        if isinstance(val, str) and val:
+            return key, len(val)
+    return None, 0
+def health_status(url: str) -> int:
+    """Return the ``/health`` status code, or raise if the endpoint is unreachable."""
+    try:
+        status, _ = _get(url, "/health")
+    except OSError as exc:
+        raise ModelGearError(
+            code=EXIT_ENV_ERROR,
+            message=f"/health unreachable at {url} ({exc})",
+            remediation="start the server with 'lobes serve --apply'",
+        ) from exc
+    return status
+def served_model(url: str, override: str | None = None) -> tuple[str, object]:
+    """Return ``(model_id, max_model_len)`` from ``/v1/models``. Raises if none served."""
+    with _api_errors("/v1/models"):
+        _, models = _get(url, "/v1/models")
+        data = models.get("data") if isinstance(models, dict) else None
+        if not data:
+            raise ModelGearError(
+                code=EXIT_ENV_ERROR,
+                message=f"/v1/models returned no models at {url}",
+                remediation="check 'lobes status' / 'docker logs model-gear-vllm'",
+            )
+        first = data[0]
+        return (override or first["id"]), first.get("max_model_len")
+def _probe(url: str, model: str, prompt: str, expect: str) -> dict:
+    d = _post(
+        url,
+        {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 2048,
+            "temperature": 0.3,
+        },
+    )
+    msg = d["choices"][0]["message"]
+    content = msg.get("content") or ""
+    field, tlen = _trace_field(msg)
+    return {
+        "ok": expect in content,
+        "expect": expect,
+        "trace_field": field,
+        "trace_len": tlen,
+        "finish": d["choices"][0].get("finish_reason"),
+        "completion_tokens": d.get("usage", {}).get("completion_tokens"),
+    }
+def _tool_probe(url: str, model: str) -> dict:
+    """Probe OpenAI tool calling; degrade gracefully, never abort the assess run.
+    A server without ``--enable-auto-tool-choice`` rejects ``tool_choice:"auto"``
+    with HTTP 400. A server that *has* the flags but returns an unexpected payload
+    (no ``choices``/``message``, or a wrong-shaped ``tool_calls``) would otherwise
+    raise inside :func:`run_correctness`'s ``_api_errors`` block and abort. Both
+    cases are surfaced here as a structured ``ok=False`` result with a FAIL row.
+    """
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": _TOOL_PROBE_PROMPT}],
+        "tools": _TOOL_PROBE_TOOLS,
+        "tool_choice": "auto",
+        "max_tokens": 512,
+        "temperature": 0,
+    }
+    try:
+        d = _post(url, payload)
+    except urllib.error.HTTPError as exc:
+        body = exc.read().decode(errors="replace").strip()
+        return {
+            "ok": False,
+            "tool_calls": [],
+            "finish": None,
+            "error": f"HTTP {exc.code}: {body[:200]}",
+        }
+    # Defensive parsing: a malformed 200 must not abort the run (documented
+    # "FAIL row, no abort"). Use .get()/isinstance throughout, with a catch-all
+    # net for any remaining shape surprise.
+    try:
+        choices = d.get("choices") if isinstance(d, dict) else None
+        choice = choices[0] if isinstance(choices, list) and choices else {}
+        msg = choice.get("message") or {}
+        raw_calls = msg.get("tool_calls")
+        calls = raw_calls if isinstance(raw_calls, list) else []
+        names = []
+        for c in calls:
+            fn = c.get("function") if isinstance(c, dict) else None
+            name = fn.get("name") if isinstance(fn, dict) else None
+            if name:
+                names.append(name)
+        return {
+            "ok": "finish" in names,
+            "tool_calls": names,
+            "finish": choice.get("finish_reason"),
+            "error": None,
+        }
+    except (KeyError, IndexError, TypeError, AttributeError) as exc:
+        return {
+            "ok": False,
+            "tool_calls": [],
+            "finish": None,
+            "error": f"unexpected response shape ({exc.__class__.__name__}: {exc})",
+        }
+def probe_tool_calls(url: str, model: str) -> dict:
+    """One-shot tool-calling probe, without the arithmetic correctness probes.
+    Used by ``lobes switch`` / ``lobes serve`` to verify, the moment the
+    container is healthy, that ``tool_choice:"auto"`` returns a ``tool_calls``
+    response (no HTTP 400, a ``finish`` call present). Returns the same
+    structured dict as the in-``assess`` probe (``ok``/``tool_calls``/``finish``/
+    ``error``).
+    Never raises. ``_tool_probe`` already folds HTTP 400 and malformed-200
+    payloads into ``ok=False``; the two failure modes it lets through —
+    a connection failure (``OSError``) or an undecodable body
+    (``JSONDecodeError``) from ``_post``/``json.load`` — are caught here and
+    likewise returned as a structured ``ok=False``, so a post-switch/post-serve
+    probe can never abort the command.
+    """
+    try:
+        return _tool_probe(url.rstrip("/"), model)
+    except (OSError, json.JSONDecodeError) as exc:
+        return {"ok": False, "tool_calls": [], "finish": None, "error": f"probe failed: {exc}"}
+def _decode_throughput(url: str, model: str, n_tokens: int, runs: int = 2) -> list[float]:
+    rates = []
+    for _ in range(runs):
+        t0 = time.monotonic()
+        d = _post(
+            url,
+            {
+                "model": model,
+                "messages": [
+                    {"role": "user", "content": "Write a detailed essay about distributed systems."}
+                ],
+                "max_tokens": n_tokens,
+                "temperature": 0,
+                "ignore_eos": True,
+            },
+        )
+        dt = time.monotonic() - t0
+        ct = d["usage"]["completion_tokens"]
+        rates.append(round(ct / dt, 1))
+    return rates
+def _prefill(url: str, model: str, input_len: int = 2000) -> dict:
+    # ~6 tokens per "The system processes events. " phrase — scale the repeat
+    # count so the prompt approximates the requested input_len (the actual
+    # prompt_tokens is measured and reported, so the estimate need only be close).
+    reps = max(1, input_len // 6)
+    prompt = "Summarize this. " + "The system processes events. " * reps
+    t0 = time.monotonic()
+    d = _post(
+        url,
+        {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 16,
+            "temperature": 0,
+        },
+    )
+    dt = time.monotonic() - t0
+    return {"prompt_tokens": d["usage"]["prompt_tokens"], "seconds": round(dt, 2)}
+def run_correctness(url: str, model: str | None = None, check_tools: bool = False) -> dict:
+    """Run the fixed correctness probes; return a structured result.
+    When ``check_tools`` is set, also probe OpenAI tool calling and report it
+    under ``tool_calling`` (``None`` otherwise). ``passed`` reflects the content
+    probes only — a tool-less server still passes correctness.
+    """
+    url = url.rstrip("/")
+    hstatus = health_status(url)
+    model, max_len = served_model(url, model)
+    probes = []
+    tool_calling = None
+    with _api_errors("correctness probe"):
+        for prompt, expect, label in _PROBES:
+            result = _probe(url, model, prompt, expect)
+            result["label"] = label
+            probes.append(result)
+        if check_tools:
+            tool_calling = _tool_probe(url, model)
+    trace_field = next((p["trace_field"] for p in probes if p["trace_field"]), None)
+    trace_len = max((p["trace_len"] for p in probes), default=0)
+    return {
+        "model": model,
+        "endpoint": url,
+        "health": hstatus,
+        "max_model_len": max_len,
+        "probes": probes,
+        "trace_field": trace_field or "(none)",
+        "trace_len": trace_len,
+        "passed": all(p["ok"] for p in probes),
+        "tool_calling": tool_calling,
+    }
+def run_benchmark(
+    url: str,
+    model: str | None = None,
+    *,
+    purpose: str = "balanced",
+    input_len: int = 1000,
+    output_len: int = 1000,
+    runs: int = 2,
+) -> dict:
+    """Measure decode throughput + prefill latency for a workload shape.
+    The shape (``input_len`` prompt, ``output_len`` decode) is the workload
+    *purpose* — ``lobes benchmark`` derives it from the configured ``VLLM_PURPOSE``
+    so the numbers track the serve config (see :mod:`lobes.profiles`).
+    """
+    url = url.rstrip("/")
+    health_status(url)
+    model, max_len = served_model(url, model)
+    with _api_errors("benchmark"):
+        rates = _decode_throughput(url, model, output_len, runs)
+        pf = _prefill(url, model, input_len)
+    return {
+        "model": model,
+        "endpoint": url,
+        "max_model_len": max_len,
+        "purpose": purpose,
+        "input_len": input_len,
+        "output_len": output_len,
+        "decode_rates": rates,
+        "prefill": pf,
+    }
+def render_correctness(result: dict) -> str:
+    """Render :func:`run_correctness` output as a markdown block for a per-model doc."""
+    lines = [
+        f"## Assessment — `{result['model']}`",
+        "",
+        f"- Endpoint: `{result['endpoint']}` · `/health` {result['health']} · "
+        f"`max_model_len` {result['max_model_len']}",
+        "",
+        "| Check | Result |",
+        "|---|---|",
+    ]
+    for p in result["probes"]:
+        mark = "PASS" if p["ok"] else "FAIL"
+        lines.append(
+            f"| {p['label']} | {mark} (finish={p['finish']}, {p['completion_tokens']} tok) |"
+        )
+    lines.append(
+        f"| reasoning trace field | `{result['trace_field']}` (len {result['trace_len']}) |"
+    )
+    tc = result.get("tool_calling")
+    if tc is not None:
+        if tc["ok"]:
+            detail = f"PASS — called {', '.join(tc['tool_calls'])}"
+        else:
+            detail = "FAIL — " + (
+                tc.get("error") or f"no finish call (tool_calls={tc['tool_calls']})"
+            )
+        lines.append(f"| tool calling (`tool_choice:auto`) | {detail} |")
+    return "\n".join(lines)
+def render_benchmark(result: dict) -> str:
+    """Render :func:`run_benchmark` output as a markdown block for a per-model doc."""
+    rates = "/".join(str(r) for r in result["decode_rates"])
+    pf = result["prefill"]
+    return "\n".join(
+        [
+            f"## Benchmark — `{result['model']}` ({result['purpose']})",
+            "",
+            f"- Endpoint: `{result['endpoint']}` · `max_model_len` {result['max_model_len']} · "
+            f"shape {result['input_len']} in / {result['output_len']} out",
+            "",
+            "| Metric | Result |",
+            "|---|---|",
+            f"| **decode throughput** | **{rates} tok/s** (batch=1, greedy, "
+            f"{result['output_len']} tok forced) |",
+            f"| prefill | {pf['prompt_tokens']} prompt tokens + 16 gen in {pf['seconds']} s |",
+        ]
+    )