PyPI - lobes-cli - Versions diffs - 0.27.0__py3-none-any.whl - Mend

lobes-cli 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

lobes/__init__.py +11 -0
lobes/__main__.py +8 -0
lobes/_metrics.py +152 -0
lobes/assess.py +404 -0
lobes/catalog.py +225 -0
lobes/cli/__init__.py +169 -0
lobes/cli/_commands/__init__.py +0 -0
lobes/cli/_commands/assess.py +57 -0
lobes/cli/_commands/benchmark.py +96 -0
lobes/cli/_commands/cli.py +38 -0
lobes/cli/_commands/doctor.py +150 -0
lobes/cli/_commands/explain.py +38 -0
lobes/cli/_commands/fleet.py +181 -0
lobes/cli/_commands/init.py +136 -0
lobes/cli/_commands/learn.py +253 -0
lobes/cli/_commands/logs.py +197 -0
lobes/cli/_commands/overview.py +241 -0
lobes/cli/_commands/serve.py +76 -0
lobes/cli/_commands/status.py +66 -0
lobes/cli/_commands/stop.py +48 -0
lobes/cli/_commands/switch.py +528 -0
lobes/cli/_commands/tunnel.py +181 -0
lobes/cli/_commands/whoami.py +130 -0
lobes/cli/_errors.py +42 -0
lobes/cli/_live.py +148 -0
lobes/cli/_output.py +56 -0
lobes/cli/_runtime_ops.py +78 -0
lobes/explain/__init__.py +27 -0
lobes/explain/catalog.py +811 -0
lobes/gateway/__init__.py +20 -0
lobes/gateway/__main__.py +19 -0
lobes/gateway/_config.py +142 -0
lobes/gateway/_routing.py +126 -0
lobes/gateway/server.py +533 -0
lobes/profiles.py +241 -0
lobes/realtime/__init__.py +21 -0
lobes/realtime/__main__.py +13 -0
lobes/realtime/_readiness.py +49 -0
lobes/realtime/_settings.py +92 -0
lobes/realtime/app.py +98 -0
lobes/realtime/audio_facade.py +106 -0
lobes/realtime/chatterbox_server.py +193 -0
lobes/realtime/protocol.py +83 -0
lobes/realtime/tts_client.py +381 -0
lobes/runtime/__init__.py +8 -0
lobes/runtime/_compose.py +394 -0
lobes/runtime/_env.py +70 -0
lobes/runtime/_health.py +60 -0
lobes/runtime/_parser.py +51 -0
lobes/runtime/_tunnel.py +367 -0
lobes/templates/__init__.py +5 -0
lobes/templates/cf-tunnel.env.example +31 -0
lobes/templates/docker-compose.yml +119 -0
lobes/templates/env.example +105 -0
lobes/templates/fleet/Dockerfile.chatterbox +111 -0
lobes/templates/fleet/Dockerfile.gateway +20 -0
lobes/templates/fleet/Dockerfile.parakeet +31 -0
lobes/templates/fleet/Dockerfile.realtime +20 -0
lobes/templates/fleet/__init__.py +6 -0
lobes/templates/fleet/_readiness.py +48 -0
lobes/templates/fleet/docker-compose.audio.yml +144 -0
lobes/templates/fleet/docker-compose.yml +254 -0
lobes/templates/fleet/env.audio.example +40 -0
lobes/templates/fleet/env.example +111 -0
lobes/templates/fleet/listen_server.py +125 -0
lobes/templates/mg-logwrap.sh +46 -0
lobes_cli-0.27.0.dist-info/METADATA +400 -0
lobes_cli-0.27.0.dist-info/RECORD +71 -0
lobes_cli-0.27.0.dist-info/WHEEL +4 -0
lobes_cli-0.27.0.dist-info/entry_points.txt +3 -0
lobes_cli-0.27.0.dist-info/licenses/LICENSE +201 -0

lobes/catalog.py ADDED Viewed

@@ -0,0 +1,225 @@
+"""The supported-model catalog — the "gears" lobes can change to.
+A pure, dependency-free data module: the single source of truth for the models
+lobes knows how to serve (each one load-tested or configured on the DGX
+Spark and documented under ``docs/``). It ships *in the wheel* so both runtimes
+can read it:
+* the CLI (``lobes overview --list``) — which would otherwise scan ``docs/`` and
+  find nothing in a wheel install (``docs/`` is not packaged), and
+* the gateway (``GET /v1/models/supported``) — which runs from a pip-installed
+  wheel inside its container and has no source tree to scan.
+The per-model ``docs/`` files remain the *human* prose; this module is the
+*machine* catalog. ``tests/test_catalog.py`` asserts the two cannot silently
+diverge (every ``doc`` file exists; every parser matches ``infer_parser``).
+"""
+from __future__ import annotations
+from dataclasses import asdict, dataclass
+@dataclass(frozen=True)
+class SupportedModel:
+    """One model the fleet/CLI can serve — a gear you can change to.
+    Instances are immutable (``frozen=True``). The first nine fields are required;
+    the rest default to "not applicable" values (``""`` / ``0``), except ``task``,
+    which defaults to ``"generate"``.
+    """
+    id: str  # OpenAI model id (== the vLLM --served-model-name)
+    # The fleet's default role. The catalog in this module uses "primary",
+    # "fallback", "candidate", "embedding" and "reranker".
+    role_hint: str
+    shape: str  # architecture in a phrase, e.g. "dense" / "MoE (~3B active)"
+    context: str  # native context window, human-readable
+    # The largest --max-model-len this checkpoint serves with vLLM's *default* rope
+    # (no YaRN/rope-scaling override) — a hard ceiling: vLLM refuses a larger value
+    # and the container fails to boot. `lobes switch` clamps the machine-profile
+    # context default DOWN to this, so a high machine default (e.g. spark's 256K)
+    # can't silently boot-fail a 32K-native model. An explicit --max-model-len wins.
+    native_max_model_len: int
+    # vLLM --tool-call-parser (must match runtime._parser.infer_parser); the
+    # embedding and reranker entries leave it empty.
+    tool_parser: str
+    quantization: str  # vLLM --quantization ("" on the embedding/reranker entries)
+    status: str  # "load-tested" (measured on this hardware) | "configured" (not yet)
+    doc: str  # per-model markdown under docs/ (filename only)
+    # Per-model serve extras for MoE checkpoints. Empty for dense/hybrid models;
+    # set only where the architecture needs them. These are NOT in the default
+    # single-model template (docker compose can't conditionally omit a flag, and
+    # an empty `--moe-backend=` token breaks vLLM) — `lobes switch` surfaces them
+    # as a documented compose edit. See docs/qwen3.6-35b-a3b-nvfp4.md.
+    moe_backend: str = ""  # vLLM --moe-backend (e.g. "marlin") for MoE models
+    speculative_config: str = ""  # vLLM --speculative-config JSON (e.g. MTP draft)
+    task: str = "generate"  # "generate" | "embed" | "score"
+    dimension: int = 0  # embedding output dimension; 0 for non-embedding models
+    hf_overrides: str = ""  # vLLM --hf-overrides JSON string
+# The catalog itself: one frozen SupportedModel per servable checkpoint. The chat
+# models come first, followed by the embedding and reranker entries.
+SUPPORTED_MODELS: tuple[SupportedModel, ...] = (
+    SupportedModel(
+        id="mmangkad/Qwen3.6-27B-NVFP4",
+        # Archived former primary (superseded 2026-05-31 by the MTP build below).
+        # Kept in the catalog for two reasons: (1) it is the tokenizer source the
+        # MTP primary serves with (--tokenizer=mmangkad/Qwen3.6-27B-NVFP4), and
+        # (2) it is the only *vision-capable* 27B — the MTP primary is text-only,
+        # so this is the fallback when an image path is needed.
+        role_hint="candidate",
+        shape="hybrid Mamba/linear-attn + ViT (multimodal)",
+        context="256K native",
+        native_max_model_len=262144,
+        tool_parser="qwen3_coder",
+        quantization="modelopt_fp4",
+        status="load-tested",
+        doc="qwen3.6-27b-nvfp4.md",
+    ),
+    SupportedModel(
+        id="RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4",
+        role_hint="fallback",
+        shape="dense (vision-capable)",
+        context="128K native",
+        native_max_model_len=131072,
+        tool_parser="mistral",
+        quantization="compressed-tensors",
+        status="load-tested",
+        doc="mistral-small-3.2-24b-nvfp4.md",
+    ),
+    SupportedModel(
+        id="nvidia/Qwen3-32B-NVFP4",
+        role_hint="candidate",
+        shape="dense",
+        context="32K (→131K via YaRN)",
+        # 32K native: 131K needs an explicit YaRN --rope-scaling override (pass
+        # --max-model-len 131072 with it). Without that, 32768 is the boot ceiling.
+        native_max_model_len=32768,
+        tool_parser="hermes",
+        quantization="modelopt_fp4",
+        status="load-tested",
+        doc="qwen3-32b-nvfp4.md",
+    ),
+    SupportedModel(
+        id="sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP",
+        # Fleet default primary since 2026-05-31 (promoted from candidate after the
+        # tool-calling gate passed: a valid qwen3_coder tool call + full tool
+        # round-trip + reasoning trace, all under the production compose, with MTP
+        # spec-decode active at 78.6% draft acceptance and 18.7 tok/s decode —
+        # ~2.4x the archived baseline 27B). Replaces mmangkad/Qwen3.6-27B-NVFP4.
+        role_hint="primary",
+        shape="hybrid Mamba/linear-attn (text-only, MTP draft head)",
+        context="256K native (served at full 256K on the shared GB10)",
+        native_max_model_len=262144,
+        tool_parser="qwen3_coder",
+        quantization="modelopt",
+        status="load-tested",
+        doc="qwen3.6-27b-text-nvfp4-mtp.md",
+        # MTP primary (issue #26): an MTP-grafted re-export of the archived 27B —
+        # the baseline NVFP4 export drops the MTP draft head (0% draft acceptance),
+        # so this repo restores it in bf16 for vLLM speculative decoding. The
+        # --speculative-config is catalog data (like moe_backend): compose can't omit
+        # an empty flag, so `lobes switch` surfaces it as a hand edit. Load-tested on
+        # the GB10 2026-05-31: 19.1 tok/s decode (~2.4x the baseline 27B) at 72% MTP
+        # acceptance on vLLM 0.19.0+nv26.04. Also needs --trust-remote-code +
+        # --language-model-only, VLLM_MAX_NUM_SEQS=2 (4 OOMs at n=3/256K), and a
+        # tokenizer override (--tokenizer=mmangkad/Qwen3.6-27B-NVFP4 — the checkpoint's
+        # tokenizer_config declares TokenizersBackend, absent from the nv26.04 image).
+        # Quantization `modelopt` resolves to modelopt_fp4. See the doc.
+        # NOTE(review): this note quotes 19.1 tok/s at 72% acceptance, while the note
+        # at the top of this entry quotes 18.7 tok/s at 78.6% — presumably separate
+        # runs; confirm against the doc and reconcile the two figures.
+        speculative_config='{"method": "qwen3_5_mtp", "num_speculative_tokens": 3}',
+    ),
+    SupportedModel(
+        id="mmangkad/Qwen3.6-35B-A3B-NVFP4",
+        role_hint="candidate",
+        shape="MoE (~3B active per token)",
+        context="32K",
+        native_max_model_len=32768,
+        tool_parser="qwen3_coder",
+        quantization="modelopt_fp4",
+        status="configured",
+        doc="qwen3.6-35b-a3b-nvfp4.md",
+        # MoE-only serve extra: the marlin MoE kernel — verified to load this
+        # checkpoint *solo* on the GB10 (2026-05-31, util 0.70). lobes switch
+        # surfaces it as a compose edit; it must not land on the dense/hybrid models.
+        # shahizat's MTP --speculative-config is intentionally NOT carried: it is
+        # tied to the nvidia/ checkpoint and FAILS to load on this mmangkad copy
+        # (qwen3_5_mtp.py weight-shape mismatch on vLLM nv26.04). See the doc.
+        moe_backend="marlin",
+    ),
+    SupportedModel(
+        id="Qwen/Qwen3-Embedding-0.6B",
+        # Embedding gear (issue #44): 1024-dim dense text embeddings with Matryoshka
+        # nesting (32/64/128/256/512/768/1024). No tool parser and no quantization —
+        # this is a pooling model, not a chat/completion model. Served via vLLM's
+        # embedding endpoint (/v1/embeddings). The hf_overrides enables Matryoshka
+        # truncation so consumers can request sub-1024 dimensions without re-serving.
+        role_hint="embedding",
+        shape="dense embedding (text)",
+        context="32K native",
+        native_max_model_len=32768,
+        tool_parser="",
+        quantization="",
+        status="load-tested",  # GB10 2026-06-19: dim 1024, MRL 256 ✓, ~28ms warm, co-resident
+        doc="qwen3-embedding-0.6b.md",
+        task="embed",
+        dimension=1024,
+        hf_overrides=(
+            '{"is_matryoshka": true,'
+            ' "matryoshka_dimensions": [32, 64, 128, 256, 512, 768, 1024]}'
+        ),
+    ),
+    SupportedModel(
+        id="Qwen/Qwen3-Reranker-0.6B",
+        # Reranker gear (issue #44): cross-encoder that scores (query, passage) pairs.
+        # Built on Qwen3ForSequenceClassification with a binary yes/no logit head;
+        # served via vLLM's score endpoint (/v1/score). The hf_overrides declare the
+        # non-standard architecture class and the two classifier tokens so vLLM can
+        # load the head correctly. No tool parser and no quantization (score-only model).
+        role_hint="reranker",
+        shape="dense cross-encoder (Qwen3ForSequenceClassification)",
+        context="32K native",
+        native_max_model_len=32768,
+        tool_parser="",
+        quantization="",
+        status="load-tested",  # GB10 2026-06-19: /v1/rerank+/v1/score ✓, ~25ms warm, co-resident
+        doc="qwen3-reranker-0.6b.md",
+        task="score",
+        dimension=0,
+        hf_overrides=(
+            '{"architectures": ["Qwen3ForSequenceClassification"],'
+            ' "classifier_from_token": ["no", "yes"],'
+            ' "is_original_qwen3_reranker": true}'
+        ),
+    ),
+)
+def supported_models() -> tuple[SupportedModel, ...]:
+    """Return every catalog entry, in declaration order.
+    Hands back the module-level ``SUPPORTED_MODELS`` tuple itself, not a copy.
+    """
+    return SUPPORTED_MODELS
+def as_dicts() -> list[dict[str, object]]:
+    """The catalog as plain dicts — for JSON emission without importing the dataclass.
+    Each dict is produced by ``dataclasses.asdict`` and is keyed by field name. The
+    values are not all strings: ``native_max_model_len`` and ``dimension`` are ints,
+    which is why the value type is ``object`` rather than ``str``.
+    """
+    return [asdict(model) for model in SUPPORTED_MODELS]
+# The tokenizer the MTP primary serves with — a base-checkpoint override (the MTP
+# checkpoint's tokenizer_config declares a class absent from the nv26.04 image; see
+# docs/qwen3.6-27b-text-nvfp4-mtp.md caveat 1). Drop once fixed upstream (issue #29).
+MTP_TOKENIZER_OVERRIDE = "mmangkad/Qwen3.6-27B-NVFP4"
+def mtp_compose_command_items() -> list[str]:
+    """Return the extra compose ``command:`` tokens the MTP default primary needs.
+    The four flags are shared by the packaged compose templates and by
+    ``lobes switch`` (which names them as the lines to remove for a non-MTP model),
+    so they are generated here instead of being typed twice. The speculative config
+    is copied from the first catalog entry whose ``role_hint`` is ``"primary"`` and
+    whose ``speculative_config`` is non-empty; if no entry qualifies, a placeholder
+    JSON string is used instead.
+    Returns argv tokens (no YAML quoting) in compose ``command:`` order.
+    """
+    # Start from the placeholder and overwrite it with the first matching entry.
+    spec_json = '{"method": "..."}'
+    for entry in SUPPORTED_MODELS:
+        if entry.role_hint == "primary" and entry.speculative_config:
+            spec_json = entry.speculative_config
+            break
+    speculative_flag = f"--speculative-config={spec_json}"
+    tokenizer_flag = f"--tokenizer={MTP_TOKENIZER_OVERRIDE}"
+    return [speculative_flag, "--trust-remote-code", "--language-model-only", tokenizer_flag]

lobes/cli/__init__.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""Unified CLI entry point for lobes (binary: ``lobes``; ``model`` is a deprecated alias).
+The model-ops verbs (``switch``, ``serve``/``stop``, ``status``, ``assess``,
+``benchmark``, ``init``, ``tunnel``) are the heart of the tool; the agent-first verbs
+(``whoami``, ``learn``, ``explain``, ``overview``, ``doctor``, ``cli``) keep the
+sibling rubric satisfied. Each verb module exposes ``register(sub)`` following
+the same pattern.
+Error propagation contract
+--------------------------
+Every handler raises :class:`lobes.cli._errors.ModelGearError` on failure;
+``main()`` catches it via :func:`_dispatch` and routes through
+:mod:`lobes.cli._output`. Unknown exceptions are wrapped into a
+``ModelGearError`` so no Python traceback leaks to stderr.
+Argparse errors (unknown verb, missing arg) also route through the structured
+format — ``_ModelGearArgumentParser`` overrides ``.error()`` and the subparsers
+are built with ``parser_class=_ModelGearArgumentParser``. Whether errors render
+as text or JSON depends on whether ``--json`` appears in the raw argv
+(:func:`main` sets ``_json_hint`` before ``parse_args``).
+"""
+from __future__ import annotations
+import argparse
+import sys
+from lobes import __version__
+from lobes.cli._errors import EXIT_USER_ERROR, ModelGearError
+from lobes.cli._output import emit_error
+class _ModelGearArgumentParser(argparse.ArgumentParser):
+    """ArgumentParser whose parse failures go through :func:`emit_error`.
+    Stock argparse prints ``prog: error: <msg>`` to stderr and exits 2, which
+    bypasses the ModelGearError plumbing (and the ``hint:`` line agents look for).
+    Here the failure is rendered in the structured format and the process exits
+    with :attr:`EXIT_USER_ERROR`.
+    JSON mode: ``args.json`` does not exist yet when parsing fails, so the choice
+    is read from the class-level ``_json_hint`` that :func:`main` fills in by
+    scanning raw argv for ``--json``. Being a class attribute, it is shared by
+    every subparser instance.
+    """
+    _json_hint: bool = False
+    def error(self, message: str) -> None:  # type: ignore[override]
+        # Point the user at this parser's own --help (self.prog includes the verb).
+        remediation = f"run '{self.prog} --help' to see valid arguments"
+        failure = ModelGearError(
+            code=EXIT_USER_ERROR,
+            message=message,
+            remediation=remediation,
+        )
+        emit_error(failure, json_mode=type(self)._json_hint)
+        raise SystemExit(failure.code)
+def _argv_has_json(argv: list[str] | None) -> bool:
+    """Report whether ``--json`` (or ``--json=<value>``) appears among the raw tokens.
+    ``argv`` of ``None`` means "use the process arguments", i.e. ``sys.argv[1:]``.
+    """
+    if argv is None:
+        argv = sys.argv[1:]
+    for token in argv:
+        if token == "--json" or token.startswith("--json="):
+            return True
+    return False
+def _detect_prog() -> str:
+    """Return the invocation name so ``--version`` and help text match the binary.
+    When invoked as ``lobes`` → ``"lobes"``; as ``model`` (deprecated alias) → ``"model"``;
+    as ``python -m lobes`` → ``"lobes"``. A launcher suffix (``.py``, or ``.exe`` for
+    Windows console-script wrappers) is ignored, so ``model.exe`` still reports
+    ``"model"``. Anything unrecognised falls back to ``"lobes"``.
+    """
+    import os
+    argv0 = os.path.basename(sys.argv[0]) if sys.argv else "lobes"
+    # Drop a launcher extension; compared case-insensitively because Windows file
+    # names are. Other extensions are left alone and fall through to the default.
+    stem, ext = os.path.splitext(argv0)
+    if ext.lower() in (".py", ".exe"):
+        argv0 = stem
+    # python -m lobes → __main__.py, which collapses to "" here and hits the default.
+    name = argv0.removesuffix("__main__")
+    return name if name in ("lobes", "model") else "lobes"
+def _build_parser() -> argparse.ArgumentParser:
+    """Assemble the top-level parser and register every verb on it.
+    The verb modules are imported inside the function rather than at module load.
+    Each one exposes ``register(sub)``; they are registered in a fixed order —
+    model-ops verbs first, then the agent-first / introspection verbs.
+    """
+    from lobes.cli._commands import (
+        assess,
+        benchmark,
+        cli as cli_group,
+        doctor,
+        explain,
+        fleet,
+        init,
+        learn,
+        logs,
+        overview,
+        serve,
+        status,
+        stop,
+        switch,
+        tunnel,
+        whoami,
+    )
+    root = _ModelGearArgumentParser(
+        prog=_detect_prog(),
+        description="lobes — run, assess, and switch the local vLLM model",
+    )
+    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+    # Passing parser_class makes every subparser a _ModelGearArgumentParser too, so
+    # their .error() goes through the same structured error path.
+    sub = root.add_subparsers(dest="command", parser_class=_ModelGearArgumentParser)
+    # Model-ops verbs (the heart of the tool).
+    model_ops = (switch, serve, stop, status, assess, benchmark, init, fleet, logs, tunnel)
+    # Agent-first / introspection verbs (sibling rubric).
+    introspection = (whoami, learn, explain, overview, doctor, cli_group)
+    for verb_module in model_ops + introspection:
+        verb_module.register(sub)
+    return root
+def _dispatch(args: argparse.Namespace) -> int:
+    """Run the handler stored in ``args.func`` and map the outcome to an exit code.
+    A handler returning ``None`` counts as success (exit 0); an ``int`` is passed
+    through as the exit code. A :class:`ModelGearError` is emitted and its ``code``
+    returned. Any other ``Exception`` is wrapped in a ``ModelGearError`` first, so
+    no Python traceback leaks.
+    """
+    as_json = bool(getattr(args, "json", False))
+    try:
+        outcome = args.func(args)
+    except ModelGearError as known:
+        failure = known
+    except Exception as exc:  # noqa: BLE001 - last-resort; wrap and route cleanly
+        failure = ModelGearError(
+            code=EXIT_USER_ERROR,
+            message=f"unexpected: {exc.__class__.__name__}: {exc}",
+            remediation="file a bug at https://github.com/agentculture/lobes-cli/issues",
+        )
+    else:
+        return 0 if outcome is None else outcome
+    # Both failure paths share the same reporting step.
+    emit_error(failure, json_mode=as_json)
+    return failure.code
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse ``argv`` and run the selected verb.
+    With no verb, the top-level help is printed and 0 is returned; otherwise the
+    result of :func:`_dispatch` is the exit code.
+    """
+    # Peek at the raw tokens first so that argparse-level errors honour --json.
+    _ModelGearArgumentParser._json_hint = _argv_has_json(argv)
+    cli_parser = _build_parser()
+    namespace = cli_parser.parse_args(argv)
+    if namespace.command is not None:
+        return _dispatch(namespace)
+    cli_parser.print_help()
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())

lobes/cli/_commands/__init__.py ADDED Viewed

File without changes

lobes/cli/_commands/assess.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""``lobes assess`` — correctness probes against the served model.
+Read-only. Runs the two fixed correctness probes and detects the reasoning-trace
+field, then emits a markdown block (plus host-side facts) ready to paste into a
+per-model doc under ``docs/``. ``--tools`` additionally probes OpenAI tool
+calling. Throughput lives in ``lobes benchmark``.
+"""
+from __future__ import annotations
+import argparse
+from lobes import assess as _assess
+from lobes.cli import _runtime_ops
+from lobes.cli._output import emit_result
+from lobes.runtime import _compose, _env
+def cmd_assess(args: argparse.Namespace) -> int:
+    """Run the correctness probes against the local server and print the result.
+    The port and deployment dir come from ``_runtime_ops.resolve_port_soft``. When
+    ``--model`` was not given and a deployment dir is known, the model name is read
+    from ``VLLM_SERVED_NAME`` in that dir's env file. Output is JSON when
+    ``args.json`` is truthy, otherwise a host-side header followed by the rendered
+    markdown. Returns 0 on completion.
+    """
+    port, deploy_dir = _runtime_ops.resolve_port_soft(args)
+    served_name = args.model
+    if served_name is None and deploy_dir is not None:
+        served_name = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")
+    probe_tools = bool(getattr(args, "tools", False))
+    result = _assess.run_correctness(
+        f"http://localhost:{port}", served_name, check_tools=probe_tools
+    )
+    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}
+    if getattr(args, "json", False):
+        emit_result({**result, "host": host}, json_mode=True)
+        return 0
+    host_section = (
+        "### Host-side\n"
+        f"- Image: `{host['image']}`  ·  GPU memory (EngineCore): {host['gpu_memory']}\n"
+    )
+    emit_result(host_section + "\n" + _assess.render_correctness(result), json_mode=False)
+    return 0
+def register(sub: argparse._SubParsersAction) -> None:
+    """Add the ``assess`` verb, its flags, and its handler to ``sub``."""
+    assess_parser = sub.add_parser(
+        "assess",
+        help="Correctness probes against the served model (markdown for a per-model doc).",
+    )
+    assess_parser.add_argument(
+        "--port",
+        type=int,
+        help="Host port (default: VLLM_PORT in .env, else 8000).",
+    )
+    assess_parser.add_argument(
+        "--model",
+        help="Served model name (default: VLLM_SERVED_NAME, else first /v1/models).",
+    )
+    assess_parser.add_argument(
+        "--tools",
+        action="store_true",
+        help="Also probe OpenAI tool calling (tool_choice:auto must return a tool_calls array).",
+    )
+    assess_parser.add_argument(
+        "--compose-dir",
+        help="Deployment dir (default: $LOBES_DIR or ~/.lobes).",
+    )
+    assess_parser.add_argument("--json", action="store_true", help="Emit structured JSON.")
+    assess_parser.set_defaults(func=cmd_assess)

lobes/cli/_commands/benchmark.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""``lobes benchmark`` — decode throughput + prefill latency for the served model.
+Read-only. The workload shape is the active *purpose*: it defaults to the
+configured ``VLLM_PURPOSE`` (so the numbers track the serve config) and can be
+overridden with ``--purpose`` or explicit ``--input-len`` / ``--output-len``.
+Forces a fixed decode length over a couple of runs and measures a prompt-sized
+prefill, then emits a markdown block (plus host-side facts) for a per-model doc
+under ``docs/``. Correctness lives in ``lobes assess``.
+"""
+from __future__ import annotations
+import argparse
+from lobes import assess as _assess
+from lobes import profiles
+from lobes.cli import _runtime_ops
+from lobes.cli._output import emit_result
+from lobes.runtime import _compose, _env
+def _resolve_shape(args, deploy_dir) -> tuple[profiles.WorkloadProfile, int, int]:
+    """Work out the benchmark shape as ``(workload profile, input_len, output_len)``.
+    Purpose precedence is flag > ``VLLM_PURPOSE`` in the deployment env file >
+    ``profiles.DEFAULT_PURPOSE``. Each length comes from its flag when given,
+    otherwise from the resolved profile's ``bench_*_len``.
+    """
+    chosen = args.purpose
+    if chosen is None and deploy_dir is not None:
+        env_path = deploy_dir / _compose.ENV_FILE
+        chosen = _env.read_env(env_path, "VLLM_PURPOSE", profiles.DEFAULT_PURPOSE)
+    if not chosen:
+        # No flag and nothing usable from the env file.
+        chosen = profiles.DEFAULT_PURPOSE
+    workload = profiles.workload_profile(chosen)
+    input_len = args.input_len
+    if input_len is None:
+        input_len = workload.bench_input_len
+    output_len = args.output_len
+    if output_len is None:
+        output_len = workload.bench_output_len
+    return workload, input_len, output_len
+def cmd_benchmark(args: argparse.Namespace) -> int:
+    """Benchmark the local server and print the result.
+    The port and deployment dir come from ``_runtime_ops.resolve_port_soft``. When
+    ``--model`` was not given and a deployment dir is known, the model name is read
+    from ``VLLM_SERVED_NAME`` in that dir's env file. The workload shape is resolved
+    by :func:`_resolve_shape`. Output is JSON when ``args.json`` is truthy, otherwise
+    a host-side header followed by the rendered markdown. Returns 0 on completion.
+    """
+    port, deploy_dir = _runtime_ops.resolve_port_soft(args)
+    served_name = args.model
+    if served_name is None and deploy_dir is not None:
+        served_name = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")
+    workload, input_len, output_len = _resolve_shape(args, deploy_dir)
+    result = _assess.run_benchmark(
+        f"http://localhost:{port}",
+        served_name,
+        purpose=workload.name,
+        input_len=input_len,
+        output_len=output_len,
+        runs=args.runs,
+    )
+    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}
+    if getattr(args, "json", False):
+        emit_result({**result, "host": host}, json_mode=True)
+        return 0
+    host_section = (
+        "### Host-side\n"
+        f"- Image: `{host['image']}`  ·  GPU memory (EngineCore): {host['gpu_memory']}\n"
+    )
+    emit_result(host_section + "\n" + _assess.render_benchmark(result), json_mode=False)
+    return 0
+def register(sub: argparse._SubParsersAction) -> None:
+    """Add the ``benchmark`` verb, its flags, and its handler to ``sub``."""
+    bench_parser = sub.add_parser(
+        "benchmark",
+        help="Decode throughput + prefill latency for the served model (markdown for a doc).",
+    )
+    bench_parser.add_argument(
+        "--port",
+        type=int,
+        help="Host port (default: VLLM_PORT in .env, else 8000).",
+    )
+    bench_parser.add_argument(
+        "--model",
+        help="Served model name (default: VLLM_SERVED_NAME, else first /v1/models).",
+    )
+    # The accepted purposes are exactly the names of the declared workload profiles.
+    purpose_names = [profile.name for profile in profiles.WORKLOAD_PROFILES]
+    bench_parser.add_argument(
+        "--purpose",
+        choices=purpose_names,
+        default=None,
+        help="Workload shape (default: the configured VLLM_PURPOSE, else balanced).",
+    )
+    bench_parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="Override prompt length (default: the purpose's shape).",
+    )
+    bench_parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Override forced decode length (default: the purpose's shape).",
+    )
+    bench_parser.add_argument(
+        "--runs",
+        type=int,
+        default=2,
+        help="Decode-throughput repetitions (default 2).",
+    )
+    bench_parser.add_argument(
+        "--compose-dir",
+        help="Deployment dir (default: $LOBES_DIR or ~/.lobes).",
+    )
+    bench_parser.add_argument("--json", action="store_true", help="Emit structured JSON.")
+    bench_parser.set_defaults(func=cmd_benchmark)

lobes/cli/_commands/cli.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""``model cli`` — noun grouping CLI-surface introspection.
+Exists to satisfy the agent-first rubric's ``overview_cli_noun_exists`` check.
+``model cli overview`` describes the CLI surface itself (distinct from the global
+``overview``, which describes the tool and the served model).
+"""
+from __future__ import annotations
+import argparse
+from lobes.cli._commands.overview import cli_sections, emit_overview
+def cmd_cli_overview(args: argparse.Namespace) -> int:
+    """Emit the CLI-surface overview (JSON when ``args.json`` is truthy); returns 0."""
+    # NOTE(review): the subject string still uses the deprecated ``model`` alias —
+    # confirm whether it should read "lobes cli" before changing it.
+    emit_overview("model cli", cli_sections(), json_mode=bool(getattr(args, "json", False)))
+    return 0
+def _no_verb(args: argparse.Namespace) -> int:
+    """Handler for the bare noun: behaves exactly like ``cli overview``."""
+    # `model cli` with no sub-verb prints the noun's overview.
+    return cmd_cli_overview(args)
+def register(sub: argparse._SubParsersAction) -> None:
+    """Add the ``cli`` noun and its ``overview`` sub-verb to ``sub``.
+    ``--json`` is accepted both before and after ``overview``. The sub-verb's copy
+    uses ``default=argparse.SUPPRESS``: argparse copies every attribute of a
+    subparser's namespace over the parent's, so a plain ``False`` default there
+    would silently undo ``cli --json overview``.
+    """
+    p = sub.add_parser(
+        "cli",
+        help="CLI-surface introspection (see 'model cli overview').",
+    )
+    p.add_argument("--json", action="store_true", help="Emit structured JSON.")
+    p.set_defaults(func=_no_verb, json=False)
+    # `p` is a _ModelGearArgumentParser (the top-level subparsers were built with
+    # that parser_class); propagate it so `cli overview` parse errors route through
+    # the structured error contract instead of argparse's default stderr/exit 2.
+    noun_sub = p.add_subparsers(dest="cli_command", parser_class=type(p))
+    ov = noun_sub.add_parser("overview", help="Describe the lobes CLI surface.")
+    # SUPPRESS: only set `json` when the flag is actually given after `overview`,
+    # leaving the value parsed by the parent `cli` parser intact otherwise.
+    ov.add_argument(
+        "--json",
+        action="store_true",
+        default=argparse.SUPPRESS,
+        help="Emit structured JSON.",
+    )
+    ov.set_defaults(func=cmd_cli_overview)