lobes-cli 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. lobes/__init__.py +11 -0
  2. lobes/__main__.py +8 -0
  3. lobes/_metrics.py +152 -0
  4. lobes/assess.py +404 -0
  5. lobes/catalog.py +225 -0
  6. lobes/cli/__init__.py +169 -0
  7. lobes/cli/_commands/__init__.py +0 -0
  8. lobes/cli/_commands/assess.py +57 -0
  9. lobes/cli/_commands/benchmark.py +96 -0
  10. lobes/cli/_commands/cli.py +38 -0
  11. lobes/cli/_commands/doctor.py +150 -0
  12. lobes/cli/_commands/explain.py +38 -0
  13. lobes/cli/_commands/fleet.py +181 -0
  14. lobes/cli/_commands/init.py +136 -0
  15. lobes/cli/_commands/learn.py +253 -0
  16. lobes/cli/_commands/logs.py +197 -0
  17. lobes/cli/_commands/overview.py +241 -0
  18. lobes/cli/_commands/serve.py +76 -0
  19. lobes/cli/_commands/status.py +66 -0
  20. lobes/cli/_commands/stop.py +48 -0
  21. lobes/cli/_commands/switch.py +528 -0
  22. lobes/cli/_commands/tunnel.py +181 -0
  23. lobes/cli/_commands/whoami.py +130 -0
  24. lobes/cli/_errors.py +42 -0
  25. lobes/cli/_live.py +148 -0
  26. lobes/cli/_output.py +56 -0
  27. lobes/cli/_runtime_ops.py +78 -0
  28. lobes/explain/__init__.py +27 -0
  29. lobes/explain/catalog.py +811 -0
  30. lobes/gateway/__init__.py +20 -0
  31. lobes/gateway/__main__.py +19 -0
  32. lobes/gateway/_config.py +142 -0
  33. lobes/gateway/_routing.py +126 -0
  34. lobes/gateway/server.py +533 -0
  35. lobes/profiles.py +241 -0
  36. lobes/realtime/__init__.py +21 -0
  37. lobes/realtime/__main__.py +13 -0
  38. lobes/realtime/_readiness.py +49 -0
  39. lobes/realtime/_settings.py +92 -0
  40. lobes/realtime/app.py +98 -0
  41. lobes/realtime/audio_facade.py +106 -0
  42. lobes/realtime/chatterbox_server.py +193 -0
  43. lobes/realtime/protocol.py +83 -0
  44. lobes/realtime/tts_client.py +381 -0
  45. lobes/runtime/__init__.py +8 -0
  46. lobes/runtime/_compose.py +394 -0
  47. lobes/runtime/_env.py +70 -0
  48. lobes/runtime/_health.py +60 -0
  49. lobes/runtime/_parser.py +51 -0
  50. lobes/runtime/_tunnel.py +367 -0
  51. lobes/templates/__init__.py +5 -0
  52. lobes/templates/cf-tunnel.env.example +31 -0
  53. lobes/templates/docker-compose.yml +119 -0
  54. lobes/templates/env.example +105 -0
  55. lobes/templates/fleet/Dockerfile.chatterbox +111 -0
  56. lobes/templates/fleet/Dockerfile.gateway +20 -0
  57. lobes/templates/fleet/Dockerfile.parakeet +31 -0
  58. lobes/templates/fleet/Dockerfile.realtime +20 -0
  59. lobes/templates/fleet/__init__.py +6 -0
  60. lobes/templates/fleet/_readiness.py +48 -0
  61. lobes/templates/fleet/docker-compose.audio.yml +144 -0
  62. lobes/templates/fleet/docker-compose.yml +254 -0
  63. lobes/templates/fleet/env.audio.example +40 -0
  64. lobes/templates/fleet/env.example +111 -0
  65. lobes/templates/fleet/listen_server.py +125 -0
  66. lobes/templates/mg-logwrap.sh +46 -0
  67. lobes_cli-0.27.0.dist-info/METADATA +400 -0
  68. lobes_cli-0.27.0.dist-info/RECORD +71 -0
  69. lobes_cli-0.27.0.dist-info/WHEEL +4 -0
  70. lobes_cli-0.27.0.dist-info/entry_points.txt +3 -0
  71. lobes_cli-0.27.0.dist-info/licenses/LICENSE +201 -0
lobes/catalog.py ADDED
@@ -0,0 +1,225 @@
1
+ """The supported-model catalog — the "gears" lobes can change to.
2
+
3
+ A pure, dependency-free data module: the single source of truth for the models
4
+ lobes knows how to serve (each one load-tested or configured on the DGX
5
+ Spark and documented under ``docs/``). It ships *in the wheel* so both runtimes
6
+ can read it:
7
+
8
+ * the CLI (``lobes overview --list``) — which would otherwise scan ``docs/`` and
9
+ find nothing in a wheel install (``docs/`` is not packaged), and
10
+ * the gateway (``GET /v1/models/supported``) — which runs from a pip-installed
11
+ wheel inside its container and has no source tree to scan.
12
+
13
+ The per-model ``docs/`` files remain the *human* prose; this module is the
14
+ *machine* catalog. ``tests/test_catalog.py`` asserts the two cannot silently
15
+ diverge (every ``doc`` file exists; every parser matches ``infer_parser``).
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from dataclasses import asdict, dataclass
21
+
22
+
23
@dataclass(frozen=True)
class SupportedModel:
    """One model the fleet/CLI can serve — a gear you can change to.

    A frozen (immutable) record. The first nine fields are required; the
    remaining ones default to "not applicable" values so entries only spell
    out the extras their architecture or task needs.
    """

    id: str  # OpenAI model id (== the vLLM --served-model-name)
    # The fleet's default role for this model. The catalog in this module uses
    # "primary", "fallback", "candidate", "embedding" and "reranker".
    role_hint: str
    shape: str  # architecture in a phrase, e.g. "dense" / "MoE (~3B active)"
    context: str  # native context window, human-readable
    # The largest --max-model-len this checkpoint serves with vLLM's *default* rope
    # (no YaRN/rope-scaling override) — a hard ceiling: vLLM refuses a larger value
    # and the container fails to boot. `lobes switch` clamps the machine-profile
    # context default DOWN to this, so a high machine default (e.g. spark's 256K)
    # can't silently boot-fail a 32K-native model. An explicit --max-model-len wins.
    native_max_model_len: int
    # vLLM --tool-call-parser (must match runtime._parser.infer_parser); the
    # embedding and reranker entries leave it empty.
    tool_parser: str
    quantization: str  # vLLM --quantization (empty on the embedding/reranker entries)
    status: str  # "load-tested" (measured on this hardware) | "configured" (not yet)
    doc: str  # per-model markdown under docs/ (filename only)
    # Per-model serve extras for MoE checkpoints. Empty for dense/hybrid models;
    # set only where the architecture needs them. These are NOT in the default
    # single-model template (docker compose can't conditionally omit a flag, and
    # an empty `--moe-backend=` token breaks vLLM) — `lobes switch` surfaces them
    # as a documented compose edit. See docs/qwen3.6-35b-a3b-nvfp4.md.
    moe_backend: str = ""  # vLLM --moe-backend (e.g. "marlin") for MoE models
    speculative_config: str = ""  # vLLM --speculative-config JSON (e.g. MTP draft)
    task: str = "generate"  # "generate" | "embed" | "score"
    dimension: int = 0  # embedding output dimension; 0 for non-embedding models
    hf_overrides: str = ""  # vLLM --hf-overrides JSON string
51
+
52
+
53
# The catalog itself: one SupportedModel per gear, in display order. Entries are
# plain data; the comments inside each entry record why it is in the catalog and
# what extra serve flags it needs.
SUPPORTED_MODELS: tuple[SupportedModel, ...] = (
    SupportedModel(
        id="mmangkad/Qwen3.6-27B-NVFP4",
        # Archived former primary (superseded 2026-05-31 by the MTP build below).
        # Kept in the catalog for two reasons: (1) it is the tokenizer source the
        # MTP primary serves with (--tokenizer=mmangkad/Qwen3.6-27B-NVFP4), and
        # (2) it is the only *vision-capable* 27B — the MTP primary is text-only,
        # so this is the fallback when an image path is needed.
        role_hint="candidate",
        shape="hybrid Mamba/linear-attn + ViT (multimodal)",
        context="256K native",
        native_max_model_len=262144,
        tool_parser="qwen3_coder",
        quantization="modelopt_fp4",
        status="load-tested",
        doc="qwen3.6-27b-nvfp4.md",
    ),
    SupportedModel(
        id="RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4",
        role_hint="fallback",
        shape="dense (vision-capable)",
        context="128K native",
        native_max_model_len=131072,
        tool_parser="mistral",
        quantization="compressed-tensors",
        status="load-tested",
        doc="mistral-small-3.2-24b-nvfp4.md",
    ),
    SupportedModel(
        id="nvidia/Qwen3-32B-NVFP4",
        role_hint="candidate",
        shape="dense",
        context="32K (→131K via YaRN)",
        # 32K native: 131K needs an explicit YaRN --rope-scaling override (pass
        # --max-model-len 131072 with it). Without that, 32768 is the boot ceiling.
        native_max_model_len=32768,
        tool_parser="hermes",
        quantization="modelopt_fp4",
        status="load-tested",
        doc="qwen3-32b-nvfp4.md",
    ),
    SupportedModel(
        id="sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP",
        # Fleet default primary since 2026-05-31 (promoted from candidate after the
        # tool-calling gate passed: a valid qwen3_coder tool call + full tool
        # round-trip + reasoning trace, all under the production compose, with MTP
        # spec-decode active at 78.6% draft acceptance and 18.7 tok/s decode —
        # ~2.4x the archived baseline 27B). Replaces mmangkad/Qwen3.6-27B-NVFP4.
        # NOTE(review): the comment further down in this entry quotes 19.1 tok/s at
        # 72% acceptance for the same date — presumably two separate runs; confirm
        # which figures the per-model doc records.
        role_hint="primary",
        shape="hybrid Mamba/linear-attn (text-only, MTP draft head)",
        context="256K native (served at full 256K on the shared GB10)",
        native_max_model_len=262144,
        tool_parser="qwen3_coder",
        quantization="modelopt",
        status="load-tested",
        doc="qwen3.6-27b-text-nvfp4-mtp.md",
        # MTP primary (issue #26): an MTP-grafted re-export of the archived 27B —
        # the baseline NVFP4 export drops the MTP draft head (0% draft acceptance),
        # so this repo restores it in bf16 for vLLM speculative decoding. The
        # --speculative-config is catalog data (like moe_backend): compose can't omit
        # an empty flag, so `lobes switch` surfaces it as a hand edit. Load-tested on
        # the GB10 2026-05-31: 19.1 tok/s decode (~2.4x the baseline 27B) at 72% MTP
        # acceptance on vLLM 0.19.0+nv26.04. Also needs --trust-remote-code +
        # --language-model-only, VLLM_MAX_NUM_SEQS=2 (4 OOMs at n=3/256K), and a
        # tokenizer override (--tokenizer=mmangkad/Qwen3.6-27B-NVFP4 — the checkpoint's
        # tokenizer_config declares TokenizersBackend, absent from the nv26.04 image).
        # Quantization `modelopt` resolves to modelopt_fp4. See the doc.
        speculative_config='{"method": "qwen3_5_mtp", "num_speculative_tokens": 3}',
    ),
    SupportedModel(
        id="mmangkad/Qwen3.6-35B-A3B-NVFP4",
        role_hint="candidate",
        shape="MoE (~3B active per token)",
        context="32K",
        native_max_model_len=32768,
        tool_parser="qwen3_coder",
        quantization="modelopt_fp4",
        status="configured",
        doc="qwen3.6-35b-a3b-nvfp4.md",
        # MoE-only serve extra: the marlin MoE kernel — verified to load this
        # checkpoint *solo* on the GB10 (2026-05-31, util 0.70). lobes switch
        # surfaces it as a compose edit; it must not land on the dense/hybrid models.
        # shahizat's MTP --speculative-config is intentionally NOT carried: it is
        # tied to the nvidia/ checkpoint and FAILS to load on this mmangkad copy
        # (qwen3_5_mtp.py weight-shape mismatch on vLLM nv26.04). See the doc.
        moe_backend="marlin",
    ),
    SupportedModel(
        id="Qwen/Qwen3-Embedding-0.6B",
        # Embedding gear (issue #44): 1024-dim dense text embeddings with Matryoshka
        # nesting (32/64/128/256/512/768/1024). No tool parser and no quantization —
        # this is a pooling model, not a chat/completion model. Served via vLLM's
        # embedding endpoint (/v1/embeddings). The hf_overrides enables Matryoshka
        # truncation so consumers can request sub-1024 dimensions without re-serving.
        role_hint="embedding",
        shape="dense embedding (text)",
        context="32K native",
        native_max_model_len=32768,
        tool_parser="",
        quantization="",
        status="load-tested",  # GB10 2026-06-19: dim 1024, MRL 256 ✓, ~28ms warm, co-resident
        doc="qwen3-embedding-0.6b.md",
        task="embed",
        dimension=1024,
        hf_overrides=(
            '{"is_matryoshka": true,'
            ' "matryoshka_dimensions": [32, 64, 128, 256, 512, 768, 1024]}'
        ),
    ),
    SupportedModel(
        id="Qwen/Qwen3-Reranker-0.6B",
        # Reranker gear (issue #44): cross-encoder that scores (query, passage) pairs.
        # Built on Qwen3ForSequenceClassification with a binary yes/no logit head;
        # served via vLLM's score endpoint (/v1/score). The hf_overrides declare the
        # non-standard architecture class and the two classifier tokens so vLLM can
        # load the head correctly. No tool parser and no quantization (score-only model).
        role_hint="reranker",
        shape="dense cross-encoder (Qwen3ForSequenceClassification)",
        context="32K native",
        native_max_model_len=32768,
        tool_parser="",
        quantization="",
        status="load-tested",  # GB10 2026-06-19: /v1/rerank+/v1/score ✓, ~25ms warm, co-resident
        doc="qwen3-reranker-0.6b.md",
        task="score",
        dimension=0,
        hf_overrides=(
            '{"architectures": ["Qwen3ForSequenceClassification"],'
            ' "classifier_from_token": ["no", "yes"],'
            ' "is_original_qwen3_reranker": true}'
        ),
    ),
)
186
+
187
+
188
def supported_models() -> tuple[SupportedModel, ...]:
    """Return every catalog entry (the gears you can change to).

    The module-level ``SUPPORTED_MODELS`` tuple is handed back as-is, not copied.
    """
    catalog = SUPPORTED_MODELS
    return catalog
191
+
192
+
193
def as_dicts() -> list[dict[str, str | int]]:
    """The catalog as plain dicts — for JSON emission without importing the dataclass.

    Returns one dict per ``SUPPORTED_MODELS`` entry, in catalog order, keyed by
    field name. Most values are strings; ``native_max_model_len`` and
    ``dimension`` are integers.
    """
    return [asdict(model) for model in SUPPORTED_MODELS]
196
+
197
+
198
# The tokenizer the MTP primary serves with — a base-checkpoint override (the MTP
# checkpoint's tokenizer_config declares a class absent from the nv26.04 image; see
# docs/qwen3.6-27b-text-nvfp4-mtp.md caveat 1). Drop once fixed upstream (issue #29).
# mtp_compose_command_items() emits it as the value of the ``--tokenizer=`` flag.
MTP_TOKENIZER_OVERRIDE = "mmangkad/Qwen3.6-27B-NVFP4"
202
+
203
+
204
def mtp_compose_command_items() -> list[str]:
    """Build the extra compose ``command:`` items the MTP default primary needs.

    The four flags returned here are the ones baked into the packaged compose
    templates and the ones ``lobes switch`` names as lines to remove when moving
    to a non-MTP model. Keeping them in one place stops the two from drifting;
    ``tests/test_catalog.py`` asserts the packaged templates contain exactly
    these items.

    The speculative config is read from the first catalog entry whose
    ``role_hint`` is ``"primary"`` and whose ``speculative_config`` is non-empty.
    If there is no such entry, a placeholder JSON string is used instead.

    Returns argv tokens (no YAML quoting) in compose ``command:`` order.
    """
    # Placeholder that survives only when no MTP primary is in the catalog.
    spec = '{"method": "..."}'
    for model in SUPPORTED_MODELS:
        if model.role_hint == "primary" and model.speculative_config:
            spec = model.speculative_config
            break

    items = [f"--speculative-config={spec}"]
    items.append("--trust-remote-code")
    items.append("--language-model-only")
    items.append(f"--tokenizer={MTP_TOKENIZER_OVERRIDE}")
    return items
lobes/cli/__init__.py ADDED
@@ -0,0 +1,169 @@
1
+ """Unified CLI entry point for lobes (binary: ``lobes``; ``model`` is a deprecated alias).
2
+
3
+ The model-ops verbs (``switch``, ``serve``/``stop``, ``status``, ``assess``,
4
+ ``benchmark``, ``init``, ``tunnel``) are the heart of the tool; the agent-first verbs
5
+ (``whoami``, ``learn``, ``explain``, ``overview``, ``doctor``, ``cli``) keep the
6
+ sibling rubric satisfied. Each verb module exposes ``register(sub)`` following
7
+ the same pattern.
8
+
9
+ Error propagation contract
10
+ --------------------------
11
+ Every handler raises :class:`lobes.cli._errors.ModelGearError` on failure;
12
+ ``main()`` catches it via :func:`_dispatch` and routes through
13
+ :mod:`lobes.cli._output`. Unknown exceptions are wrapped into a
14
+ ``ModelGearError`` so no Python traceback leaks to stderr.
15
+
16
+ Argparse errors (unknown verb, missing arg) also route through the structured
17
+ format — ``_ModelGearArgumentParser`` overrides ``.error()`` and the subparsers
18
+ are built with ``parser_class=_ModelGearArgumentParser``. Whether errors render
19
+ as text or JSON depends on whether ``--json`` appears in the raw argv
20
+ (:func:`main` sets ``_json_hint`` before ``parse_args``).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import sys
27
+
28
+ from lobes import __version__
29
+ from lobes.cli._errors import EXIT_USER_ERROR, ModelGearError
30
+ from lobes.cli._output import emit_error
31
+
32
+
33
class _ModelGearArgumentParser(argparse.ArgumentParser):
    """ArgumentParser whose parse errors go through :func:`emit_error`.

    Stock argparse prints ``prog: error: <msg>`` to stderr and exits 2, which
    bypasses the ModelGearError plumbing (and the ``hint:`` line agents look
    for). This subclass emits the structured format instead and exits with
    :attr:`EXIT_USER_ERROR`.

    JSON mode: parse-time errors occur before ``args.json`` exists, so the
    choice of text vs JSON comes from the class-level ``_json_hint``, which
    :func:`main` sets by scanning raw argv for ``--json``. Being a class
    attribute, it is shared by every subparser instance.
    """

    # Set by main() before parsing; read at error time.
    _json_hint: bool = False

    def error(self, message: str) -> None:  # type: ignore[override]
        """Emit *message* as a structured user error, then raise ``SystemExit``."""
        failure = ModelGearError(
            code=EXIT_USER_ERROR,
            message=message,
            remediation=f"run '{self.prog} --help' to see valid arguments",
        )
        as_json = type(self)._json_hint
        emit_error(failure, json_mode=as_json)
        raise SystemExit(failure.code)
56
+
57
+
58
+ def _argv_has_json(argv: list[str] | None) -> bool:
59
+ tokens = argv if argv is not None else sys.argv[1:]
60
+ return any(t == "--json" or t.startswith("--json=") for t in tokens)
61
+
62
+
63
+ def _detect_prog() -> str:
64
+ """Return the invocation name so ``--version`` and help text match the binary.
65
+
66
+ When invoked as ``lobes`` → ``"lobes"``; as ``model`` (deprecated alias) → ``"model"``;
67
+ as ``python -m lobes`` → ``"lobes"``.
68
+ """
69
+ import os
70
+
71
+ argv0 = os.path.basename(sys.argv[0]) if sys.argv else "lobes"
72
+ # Strip .py suffix (python -m lobes → __main__.py on some Python versions)
73
+ name = argv0.removesuffix(".py").removesuffix("__main__")
74
+ return name if name in ("lobes", "model") else "lobes"
75
+
76
+
77
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level parser and register every verb on it.

    The command modules are imported here rather than at module import time.
    Each exposes ``register(sub)``; registration order is model-ops verbs
    first, then the introspection verbs.
    """
    from lobes.cli._commands import (
        assess as _assess_cmd,
        benchmark as _benchmark_cmd,
        cli as _cli_group,
        doctor as _doctor_cmd,
        explain as _explain_cmd,
        fleet as _fleet_cmd,
        init as _init_cmd,
        learn as _learn_cmd,
        logs as _logs_cmd,
        overview as _overview_cmd,
        serve as _serve_cmd,
        status as _status_cmd,
        stop as _stop_cmd,
        switch as _switch_cmd,
        tunnel as _tunnel_cmd,
        whoami as _whoami_cmd,
    )

    parser = _ModelGearArgumentParser(
        prog=_detect_prog(),
        description="lobes — run, assess, and switch the local vLLM model",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    # Passing parser_class makes every subparser a _ModelGearArgumentParser, so
    # their .error() takes the structured route as well.
    sub = parser.add_subparsers(dest="command", parser_class=_ModelGearArgumentParser)

    # Model-ops verbs (the heart of the tool).
    model_ops = (
        _switch_cmd,
        _serve_cmd,
        _stop_cmd,
        _status_cmd,
        _assess_cmd,
        _benchmark_cmd,
        _init_cmd,
        _fleet_cmd,
        _logs_cmd,
        _tunnel_cmd,
    )
    # Agent-first / introspection verbs (sibling rubric).
    introspection = (
        _whoami_cmd,
        _learn_cmd,
        _explain_cmd,
        _overview_cmd,
        _doctor_cmd,
        _cli_group,
    )
    for command_module in (*model_ops, *introspection):
        command_module.register(sub)

    return parser
129
+
130
+
131
def _dispatch(args: argparse.Namespace) -> int:
    """Run the handler stored on *args* and turn its outcome into an exit code.

    A handler returns ``None`` (treated as exit 0) or an ``int`` exit code.
    A :class:`ModelGearError` is emitted and its ``code`` returned. Any other
    ``Exception`` is wrapped in a ``ModelGearError`` with ``EXIT_USER_ERROR``
    and emitted the same way, so no Python traceback leaks.
    """
    as_json = bool(getattr(args, "json", False))
    try:
        outcome = args.func(args)
    except ModelGearError as known:
        emit_error(known, json_mode=as_json)
        return known.code
    except Exception as exc:  # noqa: BLE001 - last-resort; wrap and route cleanly
        fallback = ModelGearError(
            code=EXIT_USER_ERROR,
            message=f"unexpected: {exc.__class__.__name__}: {exc}",
            remediation="file a bug at https://github.com/agentculture/lobes-cli/issues",
        )
        emit_error(fallback, json_mode=as_json)
        return fallback.code

    if outcome is None:
        return 0
    return outcome
153
+
154
+
155
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* and dispatch to the selected verb.

    With no verb given, prints the top-level help and returns 0.
    """
    # Peek at raw argv first so argparse-level errors honour --json.
    _ModelGearArgumentParser._json_hint = _argv_has_json(argv)

    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command is not None:
        return _dispatch(args)

    parser.print_help()
    return 0
166
+
167
+
168
+ if __name__ == "__main__":
169
+ sys.exit(main())
File without changes
@@ -0,0 +1,57 @@
1
+ """``lobes assess`` — correctness probes against the served model.
2
+
3
+ Read-only. Runs the two fixed correctness probes and detects the reasoning-trace
4
+ field, then emits a markdown block (plus host-side facts) ready to paste into a
5
+ per-model doc under ``docs/``. ``--tools`` additionally probes OpenAI tool
6
+ calling. Throughput lives in ``lobes benchmark``.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+
13
+ from lobes import assess as _assess
14
+ from lobes.cli import _runtime_ops
15
+ from lobes.cli._output import emit_result
16
+ from lobes.runtime import _compose, _env
17
+
18
+
19
def cmd_assess(args: argparse.Namespace) -> int:
    """Handle ``lobes assess``: run the correctness probes and emit the result.

    The port and deployment dir come from ``_runtime_ops.resolve_port_soft``.
    When ``--model`` is not given and a deployment dir is known, the model name
    is read from ``VLLM_SERVED_NAME`` in the env file. Output is either JSON
    (result plus a ``host`` key) or a markdown block with a host-side header.
    Always returns 0.
    """
    port, deploy_dir = _runtime_ops.resolve_port_soft(args)

    model = args.model
    if model is None and deploy_dir is not None:
        model = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")

    probe_tools = bool(getattr(args, "tools", False))
    result = _assess.run_correctness(f"http://localhost:{port}", model, check_tools=probe_tools)
    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}

    if bool(getattr(args, "json", False)):
        emit_result({**result, "host": host}, json_mode=True)
        return 0

    header = (
        "### Host-side\n"
        f"- Image: `{host['image']}` · GPU memory (EngineCore): {host['gpu_memory']}\n"
    )
    report = header + "\n" + _assess.render_correctness(result)
    emit_result(report, json_mode=False)
    return 0
39
+
40
+
41
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``assess`` verb and its flags to the *sub* subparsers action."""
    assess_parser = sub.add_parser(
        "assess",
        help="Correctness probes against the served model (markdown for a per-model doc).",
    )
    assess_parser.add_argument(
        "--port",
        type=int,
        help="Host port (default: VLLM_PORT in .env, else 8000).",
    )
    assess_parser.add_argument(
        "--model",
        help="Served model name (default: VLLM_SERVED_NAME, else first /v1/models).",
    )
    assess_parser.add_argument(
        "--tools",
        action="store_true",
        help="Also probe OpenAI tool calling (tool_choice:auto must return a tool_calls array).",
    )
    assess_parser.add_argument(
        "--compose-dir",
        help="Deployment dir (default: $LOBES_DIR or ~/.lobes).",
    )
    assess_parser.add_argument(
        "--json",
        action="store_true",
        help="Emit structured JSON.",
    )
    assess_parser.set_defaults(func=cmd_assess)
@@ -0,0 +1,96 @@
1
+ """``lobes benchmark`` — decode throughput + prefill latency for the served model.
2
+
3
+ Read-only. The workload shape is the active *purpose*: it defaults to the
4
+ configured ``VLLM_PURPOSE`` (so the numbers track the serve config) and can be
5
+ overridden with ``--purpose`` or explicit ``--input-len`` / ``--output-len``.
6
+ Forces a fixed decode length over a couple of runs and measures a prompt-sized
7
+ prefill, then emits a markdown block (plus host-side facts) for a per-model doc
8
+ under ``docs/``. Correctness lives in ``lobes assess``.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+
15
+ from lobes import assess as _assess
16
+ from lobes import profiles
17
+ from lobes.cli import _runtime_ops
18
+ from lobes.cli._output import emit_result
19
+ from lobes.runtime import _compose, _env
20
+
21
+
22
def _resolve_shape(args, deploy_dir) -> tuple[profiles.WorkloadProfile, int, int]:
    """Work out the benchmark shape as ``(workload profile, input_len, output_len)``.

    Purpose precedence: ``--purpose`` flag, then ``VLLM_PURPOSE`` from the env
    file (only when *deploy_dir* is not ``None``), then
    ``profiles.DEFAULT_PURPOSE``. ``--input-len`` / ``--output-len`` override
    the profile's benchmark lengths when given.
    """
    chosen = args.purpose
    if chosen is None and deploy_dir is not None:
        env_file = deploy_dir / _compose.ENV_FILE
        chosen = _env.read_env(env_file, "VLLM_PURPOSE", profiles.DEFAULT_PURPOSE)
    workload = profiles.workload_profile(chosen or profiles.DEFAULT_PURPOSE)

    if args.input_len is not None:
        input_len = args.input_len
    else:
        input_len = workload.bench_input_len

    if args.output_len is not None:
        output_len = args.output_len
    else:
        output_len = workload.bench_output_len

    return workload, input_len, output_len
33
+
34
+
35
def cmd_benchmark(args: argparse.Namespace) -> int:
    """Handle ``lobes benchmark``: measure the served model and emit the result.

    The port and deployment dir come from ``_runtime_ops.resolve_port_soft``.
    When ``--model`` is not given and a deployment dir is known, the model name
    is read from ``VLLM_SERVED_NAME`` in the env file. The workload shape comes
    from ``_resolve_shape``. Output is either JSON (result plus a ``host`` key)
    or a markdown block with a host-side header. Always returns 0.
    """
    port, deploy_dir = _runtime_ops.resolve_port_soft(args)

    model = args.model
    if model is None and deploy_dir is not None:
        model = _env.read_env(deploy_dir / _compose.ENV_FILE, "VLLM_SERVED_NAME")

    workload, input_len, output_len = _resolve_shape(args, deploy_dir)

    result = _assess.run_benchmark(
        f"http://localhost:{port}",
        model,
        purpose=workload.name,
        input_len=input_len,
        output_len=output_len,
        runs=args.runs,
    )
    host = {"image": _compose.container_image(), "gpu_memory": _compose.gpu_engine_mem()}

    if bool(getattr(args, "json", False)):
        emit_result({**result, "host": host}, json_mode=True)
        return 0

    header = (
        "### Host-side\n"
        f"- Image: `{host['image']}` · GPU memory (EngineCore): {host['gpu_memory']}\n"
    )
    report = header + "\n" + _assess.render_benchmark(result)
    emit_result(report, json_mode=False)
    return 0
64
+
65
+
66
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``benchmark`` verb and its flags to the *sub* subparsers action."""
    bench_parser = sub.add_parser(
        "benchmark",
        help="Decode throughput + prefill latency for the served model (markdown for a doc).",
    )
    bench_parser.add_argument(
        "--port",
        type=int,
        help="Host port (default: VLLM_PORT in .env, else 8000).",
    )
    bench_parser.add_argument(
        "--model",
        help="Served model name (default: VLLM_SERVED_NAME, else first /v1/models).",
    )

    # Valid --purpose values are the names of the known workload profiles.
    purpose_names = [profile.name for profile in profiles.WORKLOAD_PROFILES]
    bench_parser.add_argument(
        "--purpose",
        choices=purpose_names,
        default=None,
        help="Workload shape (default: the configured VLLM_PURPOSE, else balanced).",
    )
    bench_parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Override prompt length (default: the purpose's shape).",
    )
    bench_parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Override forced decode length (default: the purpose's shape).",
    )
    bench_parser.add_argument(
        "--runs",
        type=int,
        default=2,
        help="Decode-throughput repetitions (default 2).",
    )
    bench_parser.add_argument(
        "--compose-dir",
        help="Deployment dir (default: $LOBES_DIR or ~/.lobes).",
    )
    bench_parser.add_argument(
        "--json",
        action="store_true",
        help="Emit structured JSON.",
    )
    bench_parser.set_defaults(func=cmd_benchmark)
@@ -0,0 +1,38 @@
1
+ """``model cli`` — noun grouping CLI-surface introspection.
2
+
3
+ Exists to satisfy the agent-first rubric's ``overview_cli_noun_exists`` check.
4
+ ``model cli overview`` describes the CLI surface itself (distinct from the global
5
+ ``overview``, which describes the tool and the served model).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+
12
+ from lobes.cli._commands.overview import cli_sections, emit_overview
13
+
14
+
15
def cmd_cli_overview(args: argparse.Namespace) -> int:
    """Emit the CLI-surface overview (text or JSON per ``args.json``); returns 0."""
    as_json = bool(getattr(args, "json", False))
    sections = cli_sections()
    emit_overview("model cli", sections, json_mode=as_json)
    return 0
18
+
19
+
20
def _no_verb(args: argparse.Namespace) -> int:
    """Default handler for the bare ``cli`` noun: show the noun's overview."""
    exit_code = cmd_cli_overview(args)
    return exit_code
23
+
24
+
25
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``cli`` noun and its ``overview`` sub-verb to *sub*.

    ``cli`` with no sub-verb falls back to ``_no_verb``; ``cli overview`` runs
    ``cmd_cli_overview``. ``--json`` is accepted both before and after the
    sub-verb.
    """
    p = sub.add_parser(
        "cli",
        help="CLI-surface introspection (see 'model cli overview').",
    )
    p.add_argument("--json", action="store_true", help="Emit structured JSON.")
    p.set_defaults(func=_no_verb, json=False)
    # `p` is a _ModelGearArgumentParser (the top-level subparsers were built with
    # that parser_class); propagate it so `cli overview` parse errors route through
    # the structured error contract instead of argparse's default stderr/exit 2.
    noun_sub = p.add_subparsers(dest="cli_command", parser_class=type(p))
    ov = noun_sub.add_parser("overview", help="Describe the lobes CLI surface.")
    # argparse parses a sub-command into a fresh namespace and copies every
    # attribute, defaults included, over the parent's. A plain store_true here
    # would reset `json` to False and silently drop `cli --json overview`.
    # SUPPRESS makes the nested flag set `json` only when it is actually given.
    ov.add_argument(
        "--json",
        action="store_true",
        default=argparse.SUPPRESS,
        help="Emit structured JSON.",
    )
    ov.set_defaults(func=cmd_cli_overview)