avp-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avp_cli/cli.py ADDED
@@ -0,0 +1,1858 @@
1
+ """`avp` - the local AVP CLI for building, running, and iterating on Commissions.
2
+
3
+ avp getting-started + agent routing (no args)
4
+ avp init [KEY] [--dir D] scaffold an eval (in place) + its commissions (to ~/.avp)
5
+ avp eval run CONFIG run an eval config, print a ranked board
6
+ avp eval list list recent eval runs (with their ids)
7
+ avp eval view [ID] open an eval on agentvoyagerproject.com (default: most recent)
8
+ avp eval delete ID [--all] delete one recorded run by id (or --all for every run)
9
+ avp run "TASK" --agent A [--env E] commission an agent to do a task (in a sandbox)
10
+ avp agent install NAME install a prebuilt agent (release, or --binary/--wheel local)
11
+ avp env create NAME [flags] create a declarative environment (--image/--pip/--file/--net ...)
12
+ avp env run NAME -- CMD run a command inside a declarative environment (sandboxed)
13
+ avp sandbox status|stop inspect or stop the managed sandbox server
14
+ avp cm create [ID] build a commission into your library (wizard, or pass flags)
15
+ avp cm list list your portable commission library
16
+ avp cm describe ID render a library commission by id
17
+ avp cm check ID|FILE check a library commission, or a Commission JSON file
18
+ avp cm delete ID remove a commission from your library
19
+
20
+ An eval is a JSON config file authored in place (no code); commissions are
21
+ portable artifacts in ~/.avp/commissions referenced by id. The CLI is the engine.
22
+ Every run executes inside an OpenSandbox container (Docker is the one
23
+ prerequisite; the CLI manages the rest); provider credentials forward from the
24
+ host environment into the sandbox. When the sandbox stack can't run, exit is 2.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import re
32
+ import sys
33
+ import webbrowser
34
+ from pathlib import Path
35
+
36
+ import questionary
37
+
38
+ from avp_cli import (
39
+ brand,
40
+ catalog,
41
+ config,
42
+ console,
43
+ images,
44
+ library,
45
+ live,
46
+ osb,
47
+ paths,
48
+ run_manifest,
49
+ state,
50
+ viz,
51
+ )
52
+ from avp_cli import commission as commission_mod
53
+ from avp_cli.agent import SandboxContext, SandboxedAgent
54
+ from avp_cli.agents import (
55
+ DEFAULT_AGENT,
56
+ NoContainerRecipe,
57
+ container_recipe,
58
+ known_agents,
59
+ preflight,
60
+ resolve_agent,
61
+ )
62
+ from avp_cli.eval.engine import Eval, RunObserver, RunResult, run_matrix, setups_for
63
+ from avp_cli.eval.report import board_table, comparison_table, dump_json, failures
64
+ from avp_cli.observability import tool_tally
65
+ from avp_cli.onboarding import welcome
66
+
67
# Exit status returned by commands in this module when they skip instead of fail.
_SKIP = 2  # the sandbox stack / agent can't run here; skip cleanly rather than error

# Branded picker palette: sail-gold marker/pointer/answer, sky highlight, and
# gold titles with dim descriptions in the choice rows.
# `noreverse` on highlighted/selected: convey state with the pointer and the
# filled circle (●) + gold text, not a reversed background block.
# The `ben-*` entries are custom style classes; the benchmark picker's choice
# rows reference them as `class:ben-title` / `class:ben-desc` / `class:ben-needs`.
_PICKER_STYLE = questionary.Style(
    [
        ("qmark", f"fg:{brand.SAIL} bold"),
        ("question", "bold"),
        ("pointer", f"fg:{brand.SAIL} bold noreverse"),
        ("highlighted", f"fg:{brand.SKY} bold noreverse"),
        ("selected", f"fg:{brand.SAIL} noreverse"),
        ("answer", f"fg:{brand.SAIL} bold"),
        ("instruction", "fg:#7fa0b3"),
        ("ben-title", f"fg:{brand.SAIL} bold"),
        ("ben-desc", "fg:#7fa0b3"),
        ("ben-needs", f"fg:{brand.HULL}"),
    ]
)
87
+
88
+
89
def _default_out_dir() -> Path:
    """Return `paths.runs_dir()`, the fallback directory searched for run output."""
    runs_root = paths.runs_dir()
    return runs_root
91
+
92
+
93
def _tilde(p: Path) -> str:
    """Render a path with $HOME collapsed to ~ for tidy display.

    Args:
        p: The path to render.

    Returns:
        ``"~"`` when `p` is the home directory itself, ``"~/<rest>"`` when `p`
        lies under it, and ``str(p)`` otherwise (including when the home
        directory cannot be determined).
    """
    try:
        home = Path.home()
    except RuntimeError:
        # No resolvable home directory: this is display-only, so fall back to
        # the plain path instead of failing the command.
        return str(p)
    if p == home:
        # relative_to() would yield "." here and render as the odd "~/.".
        return "~"
    if p.is_relative_to(home):
        return f"~/{p.relative_to(home)}"
    return str(p)
97
+
98
+
99
def _run_and_report(
    ev: Eval,
    *,
    run_id: str,
    agent_specs: list[str],
    out: Path,
    json_path: str | None,
    max_items: int | None,
    model: str | None,
    quiet: bool,
    sandbox_ctx: SandboxContext,
    env_obj,
    config_path: str | None = None,
    timeout_s: float = 300.0,
    resume: bool = False,
) -> int:
    """Prepare the agents, run the eval matrix, and print/write the results.

    Steps: resolve and prepare every agent spec (unpreparable ones are dropped),
    write the run manifest, execute `run_matrix` with either a live display or
    plain note lines, print one board per agent (plus a comparison table when
    several agents ran), optionally dump each board as JSON, and hand off to
    `_finish_run`.

    Args:
        ev: The eval to run.
        run_id: Identifier recorded for this run.
        agent_specs: Agent specs to resolve via `resolve_agent`.
        out: Output directory for run artifacts.
        json_path: When set, where to write the machine-readable board. With
            several boards each gets `<stem>.<agent_label>.json` beside it.
        max_items: Optional cap on dataset items, forwarded to `run_matrix`.
        model: Optional model override, forwarded to `run_matrix`.
        quiet: Suppress progress output and build logs.
        sandbox_ctx: Sandbox context passed to `run_matrix`.
        env_obj: Environment object used to derive each agent's image.
        config_path: Path of the eval config, recorded in the manifest.
        timeout_s: Timeout forwarded to `run_matrix`.
        resume: Forwarded to `run_matrix`.

    Returns:
        0 after reporting, or `_SKIP` when no agent could be prepared.
    """
    runnable = []
    for spec in agent_specs:
        prepared = _prepare_agent(resolve_agent(spec), env_obj, quiet=quiet)
        if prepared is not None:
            runnable.append(prepared)
    if not runnable:
        console.warn("no runnable agents; nothing to do.")
        return _SKIP

    # Snapshot what this run uses BEFORE the matrix, so a crashed run still records
    # its inputs and the record is immune to later edits of the library commissions.
    run_manifest.write(
        out,
        run_id=run_id,
        setups=ev.setups,
        eval_config_path=config_path,
        agents=[a.name for a in runnable],
        model_override=model,
        max_items=max_items,
        threshold_override=getattr(ev.scorer, "threshold", None),
    )

    compare = len(runnable) > 1  # multiple agents -> interleave by task + head-to-head
    dataset_size = len(ev.dataset)
    # A cap larger than the dataset must not inflate the progress total.
    n_items = min(max_items, dataset_size) if max_items else dataset_size
    total = n_items * sum(len(setups_for(ev.setups, a.name)) for a in runnable)
    label = " vs ".join(a.name for a in runnable)
    # Live voyage when we have an interactive terminal; plain note lines otherwise.
    live_mode = not quiet and console.err.is_terminal

    def _execute(observer):
        # Single call site so both display modes pass identical arguments.
        return run_matrix(
            ev,
            runnable,
            sandbox_ctx,
            out_dir=out,
            max_items=max_items,
            model=model,
            timeout_s=timeout_s,
            observer=observer,
            compare=compare,
            resume=resume,
        )

    if live_mode:
        with live.VoyageLive(label, total, compare=compare) as vl:
            boards = _execute(vl.observer())
    else:
        boards = _execute(None if quiet else _note_observer(total, compare=compare))

    for board in boards:
        console.out.print(board_table(board))
        if board.interrupted:
            console.warn("stopped early (Ctrl-C) — board reflects only the runs that finished")
        fails = failures(board)
        if fails:
            console.diag("failures", "\n".join(fails))
        if json_path:
            path = (
                json_path
                if len(boards) == 1
                else _per_agent_json_path(json_path, board.agent_label)
            )
            dump_json(board, path)
            console.note(f"wrote machine-readable board to {path}")

    if compare and len(boards) > 1:
        console.out.print(comparison_table(boards))

    _finish_run(ev, boards, out, run_id)
    return 0


def _per_agent_json_path(json_path: str, agent_label: str) -> str:
    """Derive `<dir>/<stem>.<agent_label>.json` from the requested JSON path.

    Only the final path component loses its extension, so a dot in a directory
    part (e.g. `./out/board`) is left alone.
    """
    requested = Path(json_path)
    return str(requested.parent / f"{requested.stem}.{agent_label}.json")
195
+
196
+
197
def _prepare_agent(agent, env_obj, *, quiet: bool) -> SandboxedAgent | None:
    """Turn a resolved agent into its sandboxed form, or None if it can't run.

    Looks up the agent's container recipe, ensures the derived image exists
    (streaming build lines to stderr unless `quiet`), and merges the recipe's
    env with the manifest's env (manifest values win). A missing recipe or a
    failed image build produces a warning and a None result.
    """
    try:
        recipe = container_recipe(agent)
    except NoContainerRecipe as exc:
        console.warn(f"skipping agent '{agent.name}': {exc}")
        return None

    expected_tag = images.image_tag(env_obj, recipe)

    def _echo(line):
        console.err.print(line, style="dim", markup=False, highlight=False)

    try:
        image = images.ensure_image(env_obj, recipe, on_line=None if quiet else _echo)
    except images.ImageBuildError as exc:
        console.warn(f"skipping agent '{agent.name}': {exc}")
        return None

    if not quiet and image == expected_tag:
        console.note(f"sandbox image for {agent.name}: {image}")

    merged_env = dict(recipe.env)
    merged_env.update(agent.manifest.env)
    return SandboxedAgent(
        name=agent.name,
        image=image,
        command=recipe.command,
        env=merged_env,
    )
222
+
223
+
224
def _prepare_sandbox(env_spec: str | None, workspace_root: Path):
    """Parse the env (or the default), bring up the sandbox server, and seed the
    run workspace.

    Args:
        env_spec: Environment spec resolved through `_resolve_env_file`; when
            falsy an empty (default) environment is parsed and the current
            working directory is used as the base dir.
        workspace_root: Directory under which `workspace/` is seeded.

    Returns:
        `(env_obj, SandboxContext)`.

    Raises:
        EnvError / SandboxUnavailable / OSError / JSONDecodeError.
    """
    from avp_cli import environment as env_mod

    if env_spec:
        env_file = _resolve_env_file(env_spec)
        # JSON is UTF-8 by specification; don't depend on the locale's default encoding.
        env_obj = env_mod.Environment.parse(json.loads(env_file.read_text(encoding="utf-8")))
        base_dir = env_file.parent
    else:
        env_obj = env_mod.Environment.parse({})
        base_dir = Path.cwd()
    conn = osb.ensure_server()
    workspace = env_mod.seed_workspace(env_obj, workspace_root / "workspace", base_dir=base_dir)
    return env_obj, SandboxContext(
        connection=conn,
        workspace=workspace,
        setup=env_obj.setup,
        net=env_obj.net,
        resources=env_obj.resources,
    )
246
+
247
+
248
def _workspace_root(out: Path, run_id: str) -> Path:
    """Pick the directory that will hold the run's workspace.

    The resolved `out` is used when it sits inside the resolved ~/.avp home
    (the sandbox server only bind-mounts paths there); otherwise the workspace
    goes to `<runs_dir>/<run_id>`.
    """
    avp_home = paths.avp_home().resolve()
    resolved_out = out.resolve()
    if resolved_out.is_relative_to(avp_home):
        return resolved_out
    return paths.runs_dir() / run_id
254
+
255
+
256
def _finish_run(ev: Eval, boards: list, out: Path, run_id: str) -> None:
    """Write the per-agent trajectories files, record the run, and print its id.

    Does nothing when there are no boards or no board has rows. Otherwise one
    `trajectories.json` is written into `out` (or `trajectories.<agent>.json`
    per board when there are several), the run is recorded through
    `state.record_run` so it can be found by id later, and a line with the run
    id and the `eval view` command is printed.
    """
    from datetime import datetime

    if not any(b.rows for b in boards):
        return

    eval_version = datetime.now().strftime("%Y%m%d-%H%M%S")
    out.mkdir(parents=True, exist_ok=True)

    single_board = len(boards) == 1
    for board in boards:
        payload = viz.to_trajectories_payload(board, eval_version=eval_version, ev=ev)
        if single_board:
            filename = "trajectories.json"
        else:
            filename = f"trajectories.{board.agent_label}.json"
        (out / filename).write_text(json.dumps(payload, indent=2))

    # dict.fromkeys de-duplicates the setup ids while keeping first-seen order.
    commission_ids = list(dict.fromkeys(s.id for s in ev.setups))
    state.record_run(
        run_id=run_id,
        out_dir=out,
        dataset=ev.dataset.name,
        agents=[b.agent_label for b in boards],
        commissions=commission_ids,
    )

    console.out.print()
    console.out.print(
        f" [bold {brand.SAIL}]{brand.SAILBOAT} {run_id}[/] "
        f"view it: [bold]uv run avp eval view {run_id}[/bold]"
    )
289
+
290
+
291
def _note_observer(total: int, *, compare: bool) -> RunObserver:
    """Build a RunObserver that reports progress as plain note lines.

    One line is emitted when a run starts and one when it ends; when `compare`
    is true a per-task head-to-head line is emitted as well. The counter shown
    is always `n/total` using the `total` given here, not the callbacks' own
    total argument.
    """
    dash = "—"

    def on_start(n: int, _total: int, agent: str, setup: str, item: str) -> None:
        console.note(f"[{n}/{total}] {agent} / {setup} / {item} ... running")

    def on_end(n: int, _total: int, agent: str, r: RunResult) -> None:
        prefix = f"[{n}/{total}] {agent} / {r.setup_name} / {r.item_id} "
        if r.spawn_error is not None:
            console.note(f"{prefix}ERROR: {r.spawn_error}")
            return
        verdict = "PASS" if (r.score and r.score.passed) else "fail"
        score = f"{r.score.value:.0%}" if r.score else dash
        if r.summary:
            cost = f"${r.summary.total_cost_usd:.4f}"
            turns = r.summary.total_turns
            tally_text = tool_tally(r.summary)
        else:
            cost = turns = dash
            tally_text = ""
        tally = f" {tally_text}" if tally_text else ""
        console.note(f"{prefix}{verdict} score={score} {cost} {turns} turns{tally}")

    def on_compare(setup: str, item: str, pairs: list[tuple[str, RunResult]]) -> None:
        def _cell(agent: str, r: RunResult) -> str:
            if r.spawn_error is not None:
                return f"{agent} err"
            acc = f"{r.score.value:.0%}" if r.score else dash
            cost = f"${r.summary.total_cost_usd:.4f}" if r.summary else dash
            return f"{agent} {acc} {cost}"

        cells = [_cell(agent, r) for agent, r in pairs]
        console.note(f" ⚖ {setup} / {item} " + " · ".join(cells))

    compare_hook = on_compare if compare else None
    return RunObserver(on_start=on_start, on_end=on_end, on_compare=compare_hook)
325
+
326
+
327
+ # ── init ─────────────────────────────────────────────────────────────────────
328
+
329
+
330
def _pick_entry() -> catalog.CatalogEntry:
    """Let the user choose a benchmark with an arrow-key picker.

    When stdin is not a TTY the first catalog entry is returned without
    prompting. Cancelling the picker exits the process with status 0.
    """
    if not sys.stdin.isatty():
        return catalog.ENTRIES[0]

    def _row(entry):
        # Two-tone, brand-colored row: gold title, dim description, hull "needs".
        segments = [
            ("class:ben-title", entry.title),
            ("class:ben-desc", f" — {entry.description}"),
        ]
        if entry.needs:
            extras = " --extra ".join(entry.needs)
            segments.append(("class:ben-needs", f" (needs --extra {extras})"))
        return questionary.Choice(title=segments, value=entry.key)

    prompt = questionary.select(
        "Pick a benchmark to scaffold:",
        choices=[_row(e) for e in catalog.ENTRIES],
        qmark=brand.SAILBOAT,
        pointer="»",
        instruction=" ",
        style=_PICKER_STYLE,
    )
    chosen_key = prompt.ask()
    if chosen_key is None:  # user hit Ctrl-C / Esc
        raise SystemExit(0)
    return catalog.get(chosen_key)  # type: ignore[return-value]
352
+
353
+
354
def _pick_agents() -> list[str]:
    """Ask which agent(s) the scaffolded eval should run against.

    On a TTY this is a space-bar multiselect with nothing pre-checked and at
    least one selection required; cancelling exits with status 0. Off a TTY
    the default agent is returned when it is a known agent, otherwise the
    first known agent (or an empty list when none are known).
    """
    names = known_agents()

    if not sys.stdin.isatty():
        if DEFAULT_AGENT in names:
            return [DEFAULT_AGENT]
        return names[:1]

    def _require_one(selection):
        return bool(selection) or "select at least one agent (space to toggle)"

    # Nothing pre-checked: the user actively chooses, and must pick at least one.
    selection = questionary.checkbox(
        "Which agent(s) should this eval run against?",
        choices=[questionary.Choice(title=agent, value=agent) for agent in names],
        qmark=brand.SAILBOAT,
        pointer="»",
        instruction="(space to toggle, enter to confirm)",
        validate=_require_one,
        style=_PICKER_STYLE,
    ).ask()
    if selection is None:
        raise SystemExit(0)
    return selection
377
+
378
+
379
def _cmd_init(args: argparse.Namespace) -> int:
    """Scaffold an eval for a catalog benchmark and print what was created.

    Reads `args.key` (benchmark key; picker when absent), `args.agent`
    (comma-separated agent names; picker when absent) and `args.dir` (target
    directory).

    Returns:
        0 on success; 1 when the benchmark key is unknown or `--agent` was
        given but names no agent.
    """
    requested = []
    if args.agent:
        requested = [a.strip() for a in args.agent.split(",") if a.strip()]
        if not requested:
            # e.g. `--agent ","`: refuse rather than pin an empty agent list
            # into the scaffolded config.
            console.error_panel(
                "no agents given",
                f"--agent {args.agent!r} names no agent; pass a comma-separated list of names.",
            )
            return 1

    if args.key:
        entry = catalog.get(args.key)
        if entry is None:
            keys = ", ".join(e.key for e in catalog.ENTRIES)
            console.error_panel("unknown benchmark", f"{args.key!r}; choose from: {keys}")
            return 1
    else:
        entry = _pick_entry()

    agents = requested or _pick_agents()

    result = catalog.scaffold(entry, Path(args.dir).resolve(), agents=agents)
    target = result.eval_path
    body = [f"eval file (edit me): [bold]{target}[/bold]"]
    if result.installed:
        body.append(f"commissions → library: [bold]{', '.join(result.installed)}[/bold]")
    if result.skipped:
        body.append(f"[dim]already in your library (reused): {', '.join(result.skipped)}[/dim]")
    body.append(f'agents: [bold]{", ".join(agents)}[/bold] (edit the "agents" key or pass --agent)')
    if entry.needs:
        body.append(
            f"\nthis benchmark needs: [bold]uv sync --extra {' --extra '.join(entry.needs)}[/bold]"
        )
    body.append(f"\nnext:\n uv run avp eval run {target.name}")
    console.panel(entry.title, "\n".join(body), style="green")
    return 0
408
+
409
+
410
+ # ── commission ────────────────────────────────────────────────────────────────
411
+
412
+
413
def _cmd_commission(args: argparse.Namespace) -> int:
    """Dispatch the `cm` subcommands; anything not matched below is `describe`.

    `create`, `list` and `check` delegate to their handlers; `delete` removes a
    library commission by id. The describe path prints a library commission by
    id, else tries `args.id` as a Commission JSON file.

    Returns:
        The handler's exit code: 0 on success, 1 when the commission is
        missing, unreadable, or invalid.
    """
    cmd = args.commission_cmd
    if cmd == "create":
        return _cmd_commission_create(args)
    if cmd == "list":
        return _cmd_commission_list()
    if cmd == "check":
        return _cmd_commission_validate(args.target)
    if cmd == "delete":
        if not library.delete(args.id):
            console.error_panel(
                f"no commission {args.id!r}",
                f"not in your library ({_tilde(paths.commissions_dir())}) — see `avp cm list`.",
            )
            return 1
        console.note(f"deleted commission {args.id}")
        return 0

    # describe: a library id (the raw wire Commission) first, else a Commission JSON file
    if library.exists(args.id):
        try:
            c = library.load(args.id)
        except library.CommissionError as exc:
            # Same handling as `cm check`: a present-but-broken library entry is
            # reported as an error panel, not a traceback.
            console.error_panel(f"invalid commission {args.id!r}", str(exc))
            return 1
        console.print_json(commission_mod.full_dict(c))
        console.note(
            "the raw AVP Commission. `{input}` in the prompt is filled per dataset case, "
            "and run_id + supervisor are assigned at run time."
        )
        return 0
    try:
        c = commission_mod.load_commission_file(args.id)
    except Exception as exc:  # CLI boundary: any load failure is reported to the user
        console.error_panel(
            f"no commission {args.id!r}",
            f"not a library id (see `avp cm list`) nor a readable Commission file: {exc}",
        )
        return 1
    console.print_json(commission_mod.full_dict(c))
    return 0
449
+
450
+
451
# A commission id becomes `<id>.json` in the library: lowercase letters, digits,
# '-' and '_' only. Anchored with `\Z` (not `$`) so an id with a trailing newline
# is rejected — `$` also matches just before a final "\n".
_ID_RE = re.compile(r"^[a-z0-9_-]+\Z")

# Flags that, if any are set, mean "the caller specified this commission via
# flags" — so we skip the interactive wizard even on a TTY. `--agent` is NOT
# here: it just pre-selects the anchor for the wizard's pickers.
_CONTENT_FLAG_ATTRS = (
    "from_id",
    "model",
    "prompt",
    "system_prompt",
    "provider_id",
    "provider_base_url",
    "credential",
    "enable_tools",
    "enable_subagents",
    "enable_skills",
    "enable_mcp",
    "tags",
)
470
+
471
+
472
def _describe_for_create(spec: str) -> tuple[object | None, str | None]:
    """Resolve `spec` to an agent and ask it for its Descriptor.

    Returns a `(descriptor, error)` pair: `(None, message)` when the agent
    can't be resolved or fails preflight, otherwise whatever `describe_agent`
    returns.
    """
    from avp_cli.agent import describe_agent

    try:
        agent = resolve_agent(spec)
    except SystemExit as exc:  # resolve_agent raises this on a missing manifest
        return None, str(exc)

    blocker = preflight(agent.name)
    if blocker is None:
        return describe_agent(agent.manifest, agent.manifest_cwd)
    return None, blocker
484
+
485
+
486
def _cmd_commission_create(args: argparse.Namespace) -> int:
    """Build a wire Commission into the library, via wizard or flags.

    Flags fully specify a commission for non-interactive / coding-agent use; with
    none of them on a TTY, an interactive wizard fills the fields. Either way the
    bulky fields (`output_schema`, inline `mcp_servers` / `skills`) come from a
    cloned base (`--from <id>`), not inline authoring; edit the JSON for the rest.

    Returns:
        0 once the commission is saved; 1 when the id is missing or invalid,
        the id already exists without `--force`, the `--from` base can't be
        loaded, or the commission can't be built. Cancelling any wizard prompt
        exits the process with status 0.
    """
    flag_mode = any(getattr(args, attr) is not None for attr in _CONTENT_FLAG_ATTRS)
    interactive = sys.stdin.isatty() and console.err.is_terminal and not flag_mode

    cid = args.id
    if not cid and interactive:
        cid = questionary.text(
            "Commission id (becomes <id>.json in your library):",
            qmark=brand.SAILBOAT,
            style=_PICKER_STYLE,
            validate=lambda s: bool(_ID_RE.fullmatch(s.strip())) or "use a-z, 0-9, '-', '_'",
        ).ask()
        if cid is None:
            raise SystemExit(0)
        cid = cid.strip()
    if not cid:
        console.error_panel(
            "commission id required",
            "pass one: `avp cm create <id> [flags]` (or run it on a TTY for the wizard).",
        )
        return 1
    # fullmatch: the whole id must be valid. A `$`-anchored match would also
    # accept an id ending in a newline, which then lands in the file name.
    if not _ID_RE.fullmatch(cid):
        console.error_panel(
            "invalid commission id",
            f"{cid!r}: use lowercase letters, digits, '-' and '_' (it becomes <id>.json).",
        )
        return 1
    if library.exists(cid) and not args.force:
        console.error_panel(
            f"commission {cid!r} already exists",
            f"in {_tilde(paths.commissions_dir())} — pass --force to overwrite, or pick another id.",
        )
        return 1

    base = None
    if args.from_id:
        try:
            base = library.load(args.from_id)
        except library.CommissionError as exc:
            console.error_panel(f"can't clone {args.from_id!r}", str(exc))
            return 1

    # Anchor agent: pre-selected by --agent, else picked in the wizard. Describing
    # it unlocks the enabled_* pickers and validates enabled_* names.
    descriptor = None
    agent_spec = args.agent
    if agent_spec is None and interactive:
        agent_spec = _pick_anchor_agent()
    if agent_spec:
        descriptor, err = _describe_for_create(agent_spec)
        if descriptor is None:
            console.warn(
                f"couldn't describe '{agent_spec}': {err}. "
                "Continuing without its tool/skill pickers + validation."
            )

    # Start from the flag values; the wizard (when active) overrides them below.
    model = args.model
    prompt = args.prompt
    system_prompt = args.system_prompt
    enabled = {
        "enabled_builtin_tools": args.enable_tools,
        "enabled_builtin_subagents": args.enable_subagents,
        "enabled_builtin_skills": args.enable_skills,
        "enabled_builtin_mcp_servers": args.enable_mcp,
    }
    if interactive:
        model = _ask_text(
            "Model (blank for the agent's default):",
            (base.model if base else None) or (descriptor.default_model if descriptor else None),
        )
        prompt = _ask_text(
            "User prompt (use {input} where the dataset case goes; blank to skip):",
            base.prompt if base else None,
        )
        system_prompt = _ask_text(
            "System prompt (blank to skip):", base.system_prompt if base else None
        )
        if descriptor is not None:
            enabled = _ask_enabled(descriptor, base)

    try:
        c = commission_mod.build_commission(
            cid,
            base=base,
            model=model,
            prompt=prompt,
            system_prompt=system_prompt,
            provider_id=args.provider_id,
            provider_base_url=args.provider_base_url,
            credential=args.credential,
            tags=args.tags,
            descriptor=descriptor,
            **enabled,
        )
    except commission_mod.BuildError as exc:
        console.error_panel("can't build that commission", str(exc))
        return 1

    library.save(cid, c, overwrite=args.force)
    console.out.print(
        f"[green]✓[/] created [bold {brand.SAIL}]{cid}[/] in {_tilde(paths.commissions_dir())}"
    )
    console.note(f"avp cm describe {cid} · see the full wire Commission")
    console.note(f'reference it in an eval\'s "commissions" list as "{cid}"')
    return 0
598
+
599
+
600
def _pick_anchor_agent() -> str | None:
    """Wizard step: choose the agent to build against.

    Returns the chosen agent name, or None when the "None" row is picked (a
    plain commission). Cancelling exits the process with status 0.
    """
    skip_row = questionary.Choice(
        title="None - do not use agent built-in capabilities", value=""
    )
    agent_rows = [questionary.Choice(title=agent, value=agent) for agent in known_agents()]
    answer = questionary.select(
        "Build against which agent? (unlocks its tool / skill / subagent pickers)",
        choices=[skip_row, *agent_rows],
        qmark=brand.SAILBOAT,
        pointer="»",
        instruction=" ",
        style=_PICKER_STYLE,
    ).ask()
    if answer is None:  # Esc / Ctrl-C
        raise SystemExit(0)
    # The skip row carries "", which is reported to the caller as None.
    return answer if answer else None
617
+
618
+
619
def _ask_text(message: str, default: str | None) -> str | None:
    """Prompt for a line of text, prefilled with `default`.

    Returns the answer with surrounding whitespace removed, or None when the
    trimmed answer is empty. Cancelling exits the process with status 0.
    """
    prefill = default or ""
    answer = questionary.text(
        message, default=prefill, qmark=brand.SAILBOAT, style=_PICKER_STYLE
    ).ask()
    if answer is None:
        raise SystemExit(0)
    trimmed = answer.strip()
    return trimmed if trimmed else None
627
+
628
+
629
def _ask_enabled(descriptor, base) -> dict[str, list[str] | None]:
    """Ask, per builtin category, whether to restrict what the agent may use.

    For each category the descriptor actually lists entries for, the user first
    confirms whether to restrict it. Declining (or an empty category) leaves
    the field `None`; restricting and then selecting nothing yields `[]`. When
    `base` is given, its current values drive the confirm default and the
    pre-checked entries. Cancelling any prompt exits with status 0.
    """

    def _ids(items, attr="name"):
        return [getattr(item, attr) for item in items or []]

    categories = (
        ("enabled_builtin_tools", "tools", _ids(descriptor.tools)),
        ("enabled_builtin_subagents", "subagents", _ids(descriptor.subagents)),
        ("enabled_builtin_skills", "skills", _ids(descriptor.skills)),
        ("enabled_builtin_mcp_servers", "MCP servers", _ids(descriptor.mcp_servers, "id")),
    )

    answers: dict[str, list[str] | None] = {}
    for field, label, names in categories:
        answers[field] = None  # default: every entry exposed
        if not names:
            continue

        inherited = getattr(base, field) if base else None
        wants_restriction = questionary.confirm(
            f"Restrict which {label} the agent may use? (default: expose all {len(names)})",
            default=inherited is not None,
            qmark=brand.SAILBOAT,
            style=_PICKER_STYLE,
        ).ask()
        if wants_restriction is None:
            raise SystemExit(0)
        if not wants_restriction:
            continue

        prechecked = set(inherited or [])
        options = [
            questionary.Choice(title=entry, value=entry, checked=entry in prechecked)
            for entry in names
        ]
        chosen = questionary.checkbox(
            f"Select {label} to expose (none selected = expose none):",
            choices=options,
            qmark=brand.SAILBOAT,
            pointer="»",
            instruction="(space to toggle, enter to confirm)",
            style=_PICKER_STYLE,
        ).ask()
        if chosen is None:
            raise SystemExit(0)
        answers[field] = chosen
    return answers
675
+
676
+
677
def _cmd_commission_validate(target: str) -> int:
    """Check that `target` is a valid AVP wire Commission.

    `target` is tried as a library commission id first, then as a path to a
    Commission JSON file. Returns 0 when it validates and 1 otherwise
    (including when it is neither a library id nor an existing file).
    """
    if not library.exists(target):
        if not Path(target).is_file():
            console.error_panel(
                f"no commission {target!r}",
                "not a library id (see `avp cm list`) nor a path to a Commission .json file.",
            )
            return 1
        ok, msg = commission_mod.validate_file(target)
        if not ok:
            console.error_panel("invalid Commission", msg)
            return 1
        console.out.print(f"[green]✓[/] {target} is a valid Commission")
        return 0

    try:
        library.load(target)
    except library.CommissionError as exc:
        console.error_panel(f"invalid commission {target!r}", str(exc))
        return 1
    console.out.print(f"[green]✓[/] commission '{target}' is a valid Commission")
    return 0
702
+
703
+
704
def _cmd_commission_list() -> int:
    """List the portable commission library (`~/.avp/commissions/`).

    Prints a table of id / model / prompt, or a note when the library is
    empty. Always returns 0.
    """
    from rich.markup import escape
    from rich.table import Table

    commissions = library.list_commissions()
    if not commissions:
        console.note(f"no commissions in {paths.commissions_dir()} — `avp init` scaffolds some.")
        return 0
    table = Table(title="", header_style="bold")
    table.add_column("id", style=f"bold {brand.SAIL}", no_wrap=True)
    table.add_column("model")
    table.add_column("prompt", overflow="fold")
    for cid, c in commissions:
        # Cell text is parsed as Rich markup. Escape the user-authored values so
        # a prompt containing e.g. "[answer]" or "[/]" is shown literally instead
        # of being swallowed as a tag or raising a markup error.
        model_cell = escape(str(c.model)) if c.model else "[dim](agent default)[/dim]"
        prompt_cell = escape(str(c.prompt)) if c.prompt else "[dim]—[/dim]"
        table.add_row(escape(str(cid)), model_cell, prompt_cell)
    console.out.print(table)
    console.note(f"in {_tilde(paths.commissions_dir())}")
    console.note("avp cm describe <id> · see the full Commission")
    return 0
722
+
723
+
724
+ # ── agent ───────────────────────────────────────────────────────────────────────
725
+
726
+
727
def _cmd_agent(args: argparse.Namespace) -> int:
    """Dispatch the `agent` subcommands; anything unmatched is `describe`."""
    subcommand = args.agent_cmd
    if subcommand == "list":
        return _cmd_agent_list()

    # These two handlers share the (args) -> int shape.
    with_args = {
        "install": _cmd_agent_install,
        "uninstall": _cmd_agent_uninstall,
    }
    handler = with_args.get(subcommand)
    if handler is not None:
        return handler(args)
    return _cmd_agent_describe(args.name, json_out=args.json_out)
735
+
736
+
737
def _cmd_agent_list() -> int:
    """Print a table of known agents with their install state and readiness.

    An agent counts as present when it has install info or an in-repo dev
    fallback; only present agents are preflighted. Always returns 0.
    """
    from rich.table import Table

    from avp_cli import agent_install
    from avp_cli.agents import has_dev_fallback

    table = Table(title="agents", header_style="bold")
    table.add_column("name", style=f"bold {brand.SAIL}", no_wrap=True)
    table.add_column("installed")
    table.add_column("status")

    for agent_name in known_agents():
        info = agent_install.installed_info(agent_name)
        dev = has_dev_fallback(agent_name)

        if info:
            installed_cell = f"v{info.get('version', '?')} ({info.get('source', '?')})"
        elif dev:
            installed_cell = "[dim]in-repo (dev)[/dim]"
        else:
            installed_cell = "[yellow]not installed[/yellow]"

        if not (info or dev):
            status_cell = f"[yellow]run: avp agent install {agent_name}[/yellow]"
        else:
            problem = preflight(agent_name)
            if problem is None:
                status_cell = "[green]ready[/]"
            else:
                status_cell = f"[yellow]{problem}[/yellow]"

        table.add_row(agent_name, installed_cell, status_cell)

    console.out.print(table)
    console.note("avp agent install <name> · install the prebuilt agent")
    console.note("avp agent describe <name> · see its tools, models, skills")
    return 0
767
+
768
+
769
def _cmd_agent_install(args: argparse.Namespace) -> int:
    """Install an agent through `agent_install.install` and report the outcome.

    Returns 1 when the install raises `InstallError`; otherwise prints what was
    installed, warns if preflight still reports a missing prerequisite, and
    returns 0.
    """
    from avp_cli import agent_install

    install_kwargs = {
        "version": args.version,
        "binary": args.binary,
        "wheels": args.wheel or None,
        "force": args.force,
    }
    try:
        result = agent_install.install(args.name, **install_kwargs)
    except agent_install.InstallError as exc:
        console.error_panel(f"couldn't install '{args.name}'", str(exc))
        return 1

    console.out.print(
        f"[green]✓[/] installed [bold {brand.SAIL}]{result.name}[/] "
        f"(v{result.version}, {result.source} {result.kind}) → {_tilde(result.install_dir)}"
    )
    missing = preflight(result.name)
    if missing is not None:
        console.warn(f"runtime prerequisite still needed: {missing}")
    console.note(f"avp agent describe {result.name} · verify it boots")
    return 0
793
+
794
+
795
def _cmd_agent_uninstall(args: argparse.Namespace) -> int:
    """Uninstall `args.name`; return 0 if something was removed, else 1."""
    from avp_cli import agent_install

    removed = agent_install.uninstall(args.name)
    if not removed:
        console.error_panel(
            f"'{args.name}' is not installed",
            f"nothing at {_tilde(paths.agents_dir() / args.name)} — see `avp agent list`.",
        )
        return 1
    console.note(f"uninstalled agent {args.name}")
    return 0
807
+
808
+
809
def _cmd_agent_describe(name: str, *, json_out: bool) -> int:
    """Fetch an agent's descriptor and print it as JSON or as a readable view.

    Returns `_SKIP` when preflight reports the agent can't run, 1 when the
    describe call yields no descriptor, and 0 after printing.
    """
    from avp_cli.agent import describe_agent

    agent = resolve_agent(name)

    blocker = preflight(agent.name)
    if blocker is not None:
        console.error_panel(f"can't describe '{agent.name}'", blocker)
        return _SKIP

    descriptor, err = describe_agent(agent.manifest, agent.manifest_cwd)
    if descriptor is None:
        console.error_panel(f"describe failed for '{agent.name}'", err or "no descriptor")
        return 1

    if not json_out:
        _render_descriptor(name, descriptor)
        return 0
    as_json = descriptor.model_dump(mode="json", by_alias=True, exclude_none=True)
    console.print_json(as_json)
    return 0
827
+
828
+
829
def _render_descriptor(name: str, d) -> None:
    """A readable view of an AgentDescriptor (tools with their first-line docs).

    Args:
        name: The agent name as the user typed it (used in the closing hint).
        d: The descriptor to render.

    Agent-supplied text (names, system prompt, tool descriptions, parameter
    names) is escaped before printing, because the console parses `[...]` as
    Rich markup and would otherwise drop or choke on bracketed text.
    """
    from rich.markup import escape

    def _esc(value) -> str:
        return escape(str(value))

    console.out.print(
        f"[bold {brand.SAIL}]{_esc(d.agent_name)}[/] v{d.agent_version} · spec {d.spec_version}"
    )
    if d.default_model:
        console.out.print(f" default_model: {d.default_model}")
    if d.supported_models:
        console.out.print(f" supported_models: {', '.join(d.supported_models)}")
    if d.system_prompt:
        # Truncate first, then escape, so the 100-character cap applies to the raw text.
        console.out.print(f" system_prompt: [dim]{_esc(d.system_prompt[:100])}[/dim]")

    tools = d.tools or []
    console.out.print(f"\n[bold]tools[/] ({len(tools)})")
    for t in tools:
        # Parameter names are the keys of the schema's "properties" mapping, if any.
        params = list((t.inputSchema or {}).get("properties") or {})
        doc_lines = (t.description or "").strip().splitlines()
        desc = doc_lines[0][:80] if doc_lines else ""
        server_id = getattr(t, "mcp_server_id", None)
        suffix = f" · via {_esc(server_id)}" if server_id else ""
        console.out.print(f" [bold {brand.SAIL}]{_esc(t.name)}[/]{suffix}")
        if desc:
            console.out.print(f" [dim]{_esc(desc)}[/dim]")
        if params:
            console.out.print(f" args: {_esc(', '.join(params))}")

    if d.skills:
        console.out.print(f"\nskills: {[s.name for s in d.skills]}")
    if d.mcp_servers:
        console.out.print(f"mcp_servers: {[m.id for m in d.mcp_servers]}")
    if d.subagents:
        console.out.print(f"subagents: {[s.name for s in d.subagents]}")
    if d.capabilities:
        console.out.print(f"capabilities: {', '.join(d.capabilities)}")
    console.note(f"avp agent describe {name} --json · full tool input schemas")
863
+
864
+
865
+ # ── eval ───────────────────────────────────────────────────────────────────────
866
+
867
# Browsers cap total URL length (~2 MB in Chrome); budget the encoded blob under it.
# Compared against `len(url)` of the finished view link, so the unit is characters.
_MAX_VIEW_URL = 1_900_000
869
+
870
+
871
def _run_trajectories(path: str | None) -> list[Path]:
    """Collect every `trajectories*.json` belonging to one run.

    With a `path`, it is tried as a recorded run id first (via `state.find_run`);
    failing that, an existing file is returned as-is, and anything else is treated
    as a directory to search. Without a `path`, the most recent recorded run's
    out dir is searched, falling back to the default out dir.

    Returns:
        The matching files in sorted order, or an empty list when the chosen
        directory does not exist.
    """
    if not path:
        latest = state.last_run()
        folder = Path(latest["out_dir"]) if latest else _default_out_dir()
    else:
        recorded = state.find_run(path)
        literal = Path(path)
        if recorded is None and literal.is_file():
            return [literal]
        folder = Path(recorded["out_dir"]) if recorded else literal

    if not folder.is_dir():
        return []
    return sorted(folder.glob("trajectories*.json"))
887
+
888
+
889
def _cmd_eval_view(args: argparse.Namespace) -> int:
    """Open a finished eval on the site by encoding its trajectories into the URL.

    A run compared against several agents has one trajectories file per agent;
    they are combined into a single link (one page, all agents) unless `--agent`
    narrows it to one.

    Returns:
        0 when the link was printed or handed to the browser; 1 when no
        trajectories were found, a trajectories file could not be read or parsed,
        the requested agent is not in the run, or the link exceeds
        `_MAX_VIEW_URL`.
    """
    files = _run_trajectories(args.path)
    if not files:
        console.error_panel(
            "no trajectories.json found",
            "run `avp eval run <config>` first, or pass a voyage id from "
            "`avp eval list` (or a path to a trajectories.json / eval out dir).",
        )
        return 1

    # Point at the immutable input snapshot for this run, if present.
    mani = run_manifest.read(files[0].parent)
    if mani:
        cids = ", ".join(mani.get("commissions", {}))
        console.note(
            f"config snapshot: {files[0].parent / run_manifest.MANIFEST_NAME}"
            + (f" (commissions: {cids})" if cids else "")
        )

    runs = []
    for f in files:
        # A truncated or hand-edited file should produce an error panel, not a
        # traceback. ValueError covers both JSON and text-decoding failures.
        try:
            payload = json.loads(f.read_text())
        except (OSError, ValueError) as exc:
            console.error_panel(f"can't read {f.name}", f"{f}: {exc}")
            return 1
        if not isinstance(payload, dict):
            console.error_panel(
                f"can't read {f.name}", f"{f}: expected a JSON object at the top level"
            )
            return 1
        runs.append((payload.get("agent") or f.stem, payload))

    available = [label for label, _ in runs]
    if args.agent:
        runs = [(label, p) for label, p in runs if label == args.agent]
        if not runs:
            console.error_panel(
                f"no trajectories for agent {args.agent!r}",
                f"this run has: {', '.join(available)}",
            )
            return 1

    # One link, all agents: the site shows them head-to-head on a single page.
    labels = [label for label, _ in runs]
    url = viz.view_url(viz.combine_payloads([p for _, p in runs]), site=args.site)
    if len(url) > _MAX_VIEW_URL:
        console.error_panel(
            "eval too large to view via link",
            f"the encoded link is ~{len(url) // 1024} KB, over the ~{_MAX_VIEW_URL // 1024} KB "
            "URL budget. Narrow it (`--agent <name>`, or fewer items/commissions) to view it.",
        )
        return 1
    if args.no_open:
        # The URL must come out as one unbroken line so it can be copied or piped:
        # no folding at the console width, no markup parsing, no highlighting.
        console.out.print(url, soft_wrap=True, markup=False, highlight=False)
    else:
        webbrowser.open(url)
        console.note(f"opening {args.site}/view for: {', '.join(labels)}")
    return 0
941
+
942
+
943
def _cmd_eval_list(args: argparse.Namespace) -> int:
    """List recent eval runs (newest first) so you can `view` one.

    Prints a table with each run's id, timestamp, dataset, agents, and commission
    count, followed by a hint for viewing a run. Always returns 0, including when
    there are no recorded runs.
    """
    from rich.table import Table

    runs = state.recent_runs()
    if not runs:
        console.note("no recent eval runs — run `avp eval run <config>` first.")
        return 0
    table = Table(title="recent eval runs", header_style="bold")
    table.add_column("id", style=f"bold {brand.SAIL}", no_wrap=True)
    table.add_column("when")
    table.add_column("dataset")
    table.add_column("agents")
    table.add_column("commissions", justify="right")
    for r in runs:
        # Trim the timestamp to seconds; a missing or null `ts` renders blank.
        when = (r.get("ts") or "")[:19].replace("T", " ")
        table.add_row(
            r.get("id", "?"),
            when,
            r.get("dataset", "?"),
            ", ".join(r.get("agents", [])),
            str(len(r.get("commissions", []))),
        )
    console.out.print(table)
    # Use the installed `avp` entry point, like every other hint this CLI prints.
    console.note("view one: avp eval view <id> (no arg opens the most recent)")
    return 0
969
+
970
+
971
def _resolve_run_id(args: argparse.Namespace) -> str:
    """Pick this run's voyage id.

    Precedence: an explicit `--name`, then a non-blank answer to an interactive
    prompt (only offered when not `--quiet` and both stdin and the error console
    are terminals), then a freshly generated id.
    """
    if args.name:
        return state.claim_id(args.name)

    interactive = not args.quiet and sys.stdin.isatty() and console.err.is_terminal
    if not interactive:
        return state.new_id()

    question = questionary.text(
        "Name this voyage (blank to autogenerate):",
        qmark=brand.SAILBOAT,
        style=_PICKER_STYLE,
    )
    typed = question.ask()
    if typed and typed.strip():
        return state.claim_id(typed)
    return state.new_id()
984
+
985
+
986
def _cmd_eval_delete(args: argparse.Namespace) -> int:
    """Remove recorded runs: a single one by id, or all of them with `--all`.

    Returns:
        0 after a successful delete (or after `--all`, whatever the count);
        1 when neither an id nor `--all` was given, or the id matches no run.
    """
    if args.all:
        removed = state.clear_runs()
        plural = "" if removed == 1 else "s"
        console.note(f"deleted {removed} run{plural} from {paths.runs_dir()}")
        return 0

    target = args.name
    if not target:
        console.error_panel(
            "nothing to delete",
            "name a run to delete (a voyage id from `avp eval list`), or pass --all "
            "to delete every recorded run.",
        )
        return 1

    if state.delete_run(target):
        console.note(f"deleted run {target}")
        return 0

    console.error_panel(
        f"no run {target!r}", "no recorded run with that id — see `avp eval list`."
    )
    return 1
1006
+
1007
+
1008
def _resolve_resume_dir(args: argparse.Namespace) -> Path | None:
    """Find the output dir of the run that `--resume` names.

    Candidates are tried in order: an explicit `--out`, the dir recorded for the
    run id, then the default `<out dir>/<id>`. The first one that exists as a
    directory wins. When none does, an error panel is shown and None is returned,
    since there would be nothing to resume.
    """
    explicit = [Path(args.out)] if args.out else []
    record = state.find_run(args.resume)
    recorded = [Path(record["out_dir"])] if record else []
    fallback = _default_out_dir() / args.resume

    found = next((d for d in (*explicit, *recorded, fallback) if d.is_dir()), None)
    if found is None:
        console.error_panel(
            f"can't resume {args.resume!r}",
            "no run dir found. Pass an id from `avp eval list`, or `--out <dir>` "
            "pointing at the run's trajectories.",
        )
    return found
1028
+
1029
+
1030
def _resume_drift(out: Path, ev: Eval) -> str | None:
    """Explain why resuming `out` with the current eval is unsafe, if it is.

    The run manifest in `out` records which commissions originally ran. If that
    set is non-empty and differs from the ids of the eval's current setups, the
    two would be spliced into one board, so a human-readable reason is returned.
    Returns None when the sets agree, the manifest lists no commissions, or there
    is no manifest at all (an older run: the caller is trusted).
    """
    manifest = run_manifest.read(out)
    if not manifest:
        return None

    original_ids = set(manifest.get("commissions", {}))
    current_ids = {setup.id for setup in ev.setups}
    if not original_ids or original_ids == current_ids:
        return None

    return (
        f"this run's commissions were {sorted(original_ids)}, but the config now has "
        f"{sorted(current_ids)}. Resume the original config, or start a fresh run."
    )
1046
+
1047
+
1048
def _cmd_eval(args: argparse.Namespace) -> int:
    """Dispatch `avp eval <subcommand>`; anything that isn't view/list/delete is a run.

    For a run: load the eval config, apply a `--threshold` override when the
    scorer has a `threshold` attribute, settle the run id and output dir (resumed
    or fresh), bring up the sandbox once, choose the agents, and hand everything
    to `_run_and_report`.

    Returns:
        The sub-handler's or runner's exit code; 1 on a bad config, a failed
        resume, or an environment error; `_SKIP` when the sandbox is unavailable.
    """
    simple_handlers = {
        "view": _cmd_eval_view,
        "list": _cmd_eval_list,
        "delete": _cmd_eval_delete,
    }
    handler = simple_handlers.get(args.eval_cmd)
    if handler is not None:
        return handler(args)

    try:
        ev = config.load_eval(args.path)
    except config.EvalConfigError as exc:
        console.error_panel("bad eval config", str(exc))
        return 1

    if args.threshold is not None and hasattr(ev.scorer, "threshold"):
        ev.scorer.threshold = args.threshold

    # One voyage per run: a default run lands in its own subdir (so runs don't
    # clobber each other and each has a stable home `view <id>` resolves to); an
    # explicit --out is taken literally.
    if not args.resume:
        run_id = _resolve_run_id(args)
        out = Path(args.out) if args.out else _default_out_dir() / run_id
    else:
        run_id = args.resume
        out = _resolve_resume_dir(args)
        if out is None:
            return 1
        mismatch = _resume_drift(out, ev)
        if mismatch is not None:
            console.error_panel("can't resume: config changed", mismatch)
            return 1
        console.note(f"resuming {run_id}: reusing finished cells in {_tilde(out)}")

    # Bring up the sandbox stack once (not per cell): Docker preflight, the
    # managed server, and the seeded run workspace every cell mounts.
    try:
        env_obj, sandbox_ctx = _prepare_sandbox(args.env, _workspace_root(out, run_id))
    except osb.SandboxUnavailable as exc:
        console.error_panel("sandbox unavailable", str(exc))
        return _SKIP
    except Exception as exc:
        console.error_panel(f"environment '{args.env}'", str(exc))
        return 1
    if args.env:
        console.note(f"environment: {args.env} → {_tilde(sandbox_ctx.workspace)}")

    # --agent wins over the config's own list, which wins over the default agent.
    if args.agent:
        trimmed = (piece.strip() for piece in args.agent.split(","))
        agent_specs = [spec for spec in trimmed if spec]
    else:
        agent_specs = ev.agents or [DEFAULT_AGENT]

    return _run_and_report(
        ev,
        run_id=run_id,
        agent_specs=agent_specs,
        out=out,
        json_path=args.json_path,
        max_items=args.max_items,
        model=args.model,
        quiet=args.quiet,
        sandbox_ctx=sandbox_ctx,
        env_obj=env_obj,
        config_path=args.path,
        timeout_s=args.timeout,
        resume=bool(args.resume),
    )
1110
+
1111
+
1112
def _resolve_env_file(spec: str) -> Path:
    """Turn an env spec into the JSON file it refers to.

    The spec is tried as a file path first, then as `<spec>.json` inside the
    environments directory.

    Raises:
        EnvError: when neither location is an existing file.
    """
    from avp_cli import environment as env_mod

    direct = Path(spec)
    if direct.is_file():
        return direct

    by_name = paths.environments_dir() / f"{spec}.json"
    if not by_name.is_file():
        raise env_mod.EnvError(
            f"not a file, and no environment named {spec!r} in {_tilde(paths.environments_dir())}"
        )
    return by_name
1125
+
1126
+
1127
+ # ── env ───────────────────────────────────────────────────────────────────────
1128
+
1129
+
1130
+ def _task_event(ev) -> None:
1131
+ """Compact live progress for a single task run: one line per tool call + stop."""
1132
+ from avp.trajectory import AgentStoppedEvent, ToolInvokedEvent
1133
+
1134
+ if isinstance(ev, ToolInvokedEvent):
1135
+ console.note(f" ⚒ {ev.data.tool_name}")
1136
+ elif isinstance(ev, AgentStoppedEvent):
1137
+ console.note(f" ■ {ev.data.reason}")
1138
+
1139
+
1140
def _cmd_run(args: argparse.Namespace) -> int:
    """Commission one agent to do one task, optionally inside an environment.

    The task is the Commission prompt; the env (if given) supplies the working
    context (code via `paths`, fixtures via `files`, a runtime). The workspace
    persists so you can inspect what the agent changed.

    `--model` is checked before a run directory is created or the sandbox and
    agent are prepared, so a malformed slug fails immediately and leaves nothing
    behind.

    Returns:
        0 on success; 1 on a bad model slug, an environment error, or a failed
        run; `_SKIP` when the sandbox is unavailable or `_prepare_agent` returns
        None.
    """
    from avp.commission import Commission
    from avp_cli.agent import run_agent
    from avp_cli.eval.engine import extract_final_output
    from avp_cli.observability import summarize

    # Cheap, purely local validation comes first: no point bringing up the
    # sandbox for a run that can't start.
    if not args.model or "/" not in args.model:
        console.error_panel(
            "model must be a provider/model slug",
            f"got {args.model!r}; pass e.g. --model anthropic/claude-opus-4-8 "
            "or openai/gpt-4o (canonical models.dev namespace).",
        )
        return 1

    agent = resolve_agent(args.agent)
    run_id = state.new_id()
    rundir = paths.runs_dir() / run_id
    rundir.mkdir(parents=True, exist_ok=True)

    try:
        env_obj, sandbox_ctx = _prepare_sandbox(args.env, rundir)
    except osb.SandboxUnavailable as exc:
        console.error_panel("sandbox unavailable", str(exc))
        return _SKIP
    except Exception as exc:
        console.error_panel(f"environment '{args.env}'", str(exc))
        return 1
    prepared = _prepare_agent(agent, env_obj, quiet=False)
    if prepared is None:
        return _SKIP

    commission = Commission(
        schema_version="0.1", run_id=run_id, prompt=args.prompt, model=args.model
    )
    traj = rundir / "trajectory.ndjson"
    where = f" in {args.env}" if args.env else ""
    console.note(f"{agent.name} working on the task{where} …")
    events, err = run_agent(
        prepared,
        sandbox_ctx,
        commission,
        out_path=traj,
        timeout_s=args.timeout,
        on_event=_task_event,
    )
    if err is not None or events is None:
        console.error_panel(f"'{agent.name}' run failed", err or "no events")
        return 1

    summary = summarize(events)
    final = extract_final_output(events)
    console.out.print()
    if summary:
        tally_text = tool_tally(summary)
        tally = f" {tally_text}" if tally_text else ""
        console.out.print(
            f"[bold {brand.SAIL}]done[/] {summary.total_turns} turns · "
            f"${summary.total_cost_usd:.4f}{tally}"
        )
    if final.text:
        # The agent's answer is arbitrary text (Markdown links, code, brackets);
        # print it verbatim rather than letting it be parsed as console markup.
        console.out.print(final.text.strip(), markup=False)
    console.note(f"trajectory: {_tilde(traj)}")
    console.note(f"workspace (inspect what changed): {_tilde(sandbox_ctx.workspace)}")
    return 0
1209
+
1210
+
1211
+ def _cmd_env(args: argparse.Namespace) -> int:
1212
+ if args.env_cmd == "list":
1213
+ return _cmd_env_list()
1214
+ if args.env_cmd == "create":
1215
+ return _cmd_env_create(args)
1216
+ if args.env_cmd == "show":
1217
+ return _cmd_env_show(args.env)
1218
+ if args.env_cmd == "delete":
1219
+ return _cmd_env_delete(args.name)
1220
+ if args.env_cmd == "secret":
1221
+ return _cmd_env_secret(args)
1222
+ return _cmd_env_run(args)
1223
+
1224
+
1225
def _cmd_env_secret(args: argparse.Namespace) -> int:
    """Manage the vault: secrets a Commission references by handle. The value is
    stored on the host (~/.avp/secrets.toml, 0600) and injected by the broker at
    run time, so it never enters the sandbox or the wire.

    Handles three actions: `list` prints every handle, `delete` removes one, and
    anything else creates one (refusing to overwrite without `--force`, and
    prompting for the value without echo when `--value` was not given).

    Returns:
        0 on success; 1 when deleting an unknown handle, creating an existing one
        without `--force`, or the vault rejects the store.
    """
    from avp_cli import vault

    action = args.secret_cmd

    if action == "list":
        stored = vault.names()
        if stored:
            for name in stored:
                console.out.print(name)
        else:
            console.note(f"no secrets stored ({_tilde(vault.secrets_path())})")
        return 0

    if action == "delete":
        if not vault.remove(args.handle):
            console.error_panel("no such secret", f"{args.handle!r} is not in the vault")
            return 1
        console.note(f"removed {args.handle!r}")
        return 0

    # Everything else is a create.
    if args.handle in vault.names() and not args.force:
        console.error_panel(
            f"secret {args.handle!r} already exists",
            "pass --force to overwrite, or pick another handle.",
        )
        return 1

    secret = args.value
    if secret is None:
        import getpass

        secret = getpass.getpass(f"value for {args.handle!r}: ")

    try:
        vault.store(args.handle, secret)
    except vault.VaultError as exc:
        console.error_panel("could not store secret", str(exc))
        return 1

    console.note(
        f'stored {args.handle!r} — reference it in a Commission as {{"vault": "{args.handle}"}}'
    )
    return 0
1266
+
1267
+
1268
def _cmd_env_create(args: argparse.Namespace) -> int:
    """Build an environment block from the flags and save it as `<name>.json`.

    The name must match `_ID_RE`, and an existing file is only replaced with
    `--force`. Every `--path` is checked for existence and stored as an absolute
    path before the block is built.

    Returns:
        0 once the file is written; 1 on an invalid name, a name clash without
        `--force`, a missing `--path`, or a block the environment module rejects.
    """
    from avp_cli import environment as env_mod

    def _as_tuple(values):
        # Repeatable flags are None when never passed.
        return tuple(values or ())

    env_name = args.name
    if _ID_RE.match(env_name) is None:
        console.error_panel(
            "invalid environment name", f"{env_name!r}: use lowercase letters, digits, '-', '_'."
        )
        return 1

    target = paths.environments_dir() / f"{env_name}.json"
    if target.exists() and not args.force:
        console.error_panel(
            f"environment {env_name!r} already exists",
            f"in {_tilde(paths.environments_dir())} — pass --force, or pick another name.",
        )
        return 1

    # Resolve --path to absolute now (so it still resolves when materialized from
    # ~/.avp later) and fail early if it's missing.
    resolved_paths = []
    for raw in args.path or ():
        candidate = Path(raw)
        if not candidate.exists():
            console.error_panel("path not found", f"{raw!r} does not exist")
            return 1
        resolved_paths.append(str(candidate.resolve()))

    try:
        block = env_mod.build_block(
            image=args.image,
            apt=_as_tuple(args.apt),
            pip=_as_tuple(args.pip),
            paths=tuple(resolved_paths),
            files=_as_tuple(args.file),
            setup=_as_tuple(args.setup),
            net=_as_tuple(args.net),
            cpu=args.cpu,
            memory=args.memory,
        )
    except env_mod.EnvError as exc:
        console.error_panel("can't build that environment", str(exc))
        return 1

    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(block, indent=2) + "\n"
    target.write_text(serialized)
    console.out.print(
        f"[green]✓[/] created environment [bold {brand.SAIL}]{env_name}[/] → {_tilde(target)}"
    )
    console.note(f"avp env show {env_name} · avp env run {env_name} -- <command>")
    return 0
1315
+
1316
+
1317
def _cmd_env_delete(name: str) -> int:
    """Delete `<name>.json` from the environments directory.

    Returns 0 after removing the file, or 1 (with an error panel) when no such
    file exists.
    """
    target = paths.environments_dir() / f"{name}.json"
    if target.is_file():
        target.unlink()
        console.note(f"deleted environment {name}")
        return 0

    console.error_panel(
        f"no environment {name!r}", f"nothing at {_tilde(target)} — see `avp env list`."
    )
    return 1
1328
+
1329
+
1330
def _cmd_env_list() -> int:
    """List named environments in ~/.avp/environments.

    Each `*.json` file that parses as an environment becomes a table row (name,
    image, package counts per ecosystem). Files that cannot be read, decoded, or
    parsed are left out of the table and named in a follow-up note, so a broken
    environment doesn't silently disappear from the listing. Always returns 0.
    """
    from rich.table import Table

    from avp_cli import environment as env_mod

    d = paths.environments_dir()
    files = sorted(d.glob("*.json")) if d.is_dir() else []
    if not files:
        console.note(
            f"no environments in {_tilde(d)} — drop an env JSON there, "
            "or pass a path to `avp env show/run`."
        )
        return 0
    table = Table(title="environments", header_style="bold")
    table.add_column("name", style=f"bold {brand.SAIL}", no_wrap=True)
    table.add_column("image")
    table.add_column("packages")
    skipped = []
    for f in files:
        # ValueError covers json.JSONDecodeError and the UnicodeDecodeError that
        # read_text raises on a file that isn't valid text.
        try:
            e = env_mod.Environment.parse(json.loads(f.read_text()))
        except (env_mod.EnvError, ValueError, OSError):
            skipped.append(f.name)
            continue
        pkgs = ", ".join(f"{k}:{len(v)}" for k, v in e.packages.items())
        table.add_row(f.stem, e.image, pkgs or "[dim]—[/dim]")
    console.out.print(table)
    if skipped:
        console.note(
            f"skipped {len(skipped)} unreadable environment file(s): {', '.join(skipped)}"
        )
    console.note("avp env show <name> · avp env run <name> -- <command>")
    return 0
1358
+
1359
+
1360
def _cmd_env_show(spec: str) -> int:
    """Print an environment's definition without building it.

    Args:
        spec: A path to an env JSON file, or an environment name.

    Returns:
        0 after printing; 1 when the spec can't be resolved, read, or parsed.
    """
    from avp_cli import environment as env_mod

    try:
        location = _resolve_env_file(spec)
        env = env_mod.Environment.parse(json.loads(location.read_text()))
    except (env_mod.EnvError, json.JSONDecodeError, OSError) as exc:
        console.error_panel(f"environment '{spec}'", str(exc))
        return 1

    emit = console.out.print
    emit(f"[bold {brand.SAIL}]{spec}[/] · {_tilde(location)}")
    emit(f" image: {env.image}")
    for ecosystem, names in env.packages.items():
        emit(f" {ecosystem}: {', '.join(names)}")

    # Optional sections appear only when the environment defines them.
    if env.paths:
        emit(f" paths: {', '.join(env.paths)}")
    if env.files:
        emit(f" files: {', '.join(env.files)}")
    if env.setup:
        emit(f" setup: {len(env.setup)} command(s)")
    if env.net:
        emit(f" network: {', '.join(env.net)}")
    if env.resources:
        limits = ", ".join(f"{key}={amount}" for key, amount in env.resources.items())
        emit(" resources: " + limits)

    console.note(f"avp env run {spec} -- <command> · run a command inside it")
    return 0
1386
+
1387
+
1388
def _cmd_env_run(args: argparse.Namespace) -> int:
    """Run a command inside an environment's sandbox (image + workspace + egress).

    The same world an agent would get, minus the agent: the env's derived image
    (no agent recipe), the seeded workspace mounted rw, setup run, default-deny
    egress. The workspace persists under ~/.avp/runs for inspection.

    Returns:
        The command's exit code (0 when the sandbox reports none); 1 when no
        command was given, the environment or image can't be prepared, the
        sandbox can't be created, or setup fails; `_SKIP` when the sandbox is
        unavailable.
    """
    import contextlib as _contextlib
    import shlex as _shlex
    from datetime import timedelta

    from opensandbox import SandboxSync
    from opensandbox.models.execd import RunCommandOpts
    from opensandbox.models.sandboxes import Host, Volume

    from avp_cli.agent import _WORKSPACE_MNT, _run_setup

    command = list(args.command)
    if command and command[0] == "--":  # argparse REMAINDER keeps the separator
        command = command[1:]
    if not command:
        console.error_panel("no command", "usage: avp env run <env> -- <command> ...")
        return 1

    # Each invocation gets its own run id and a run directory under the runs dir.
    run_id = state.new_id()
    rundir = paths.runs_dir() / run_id
    try:
        env_obj, ctx = _prepare_sandbox(args.env, rundir)
    except osb.SandboxUnavailable as exc:
        console.error_panel("sandbox unavailable", str(exc))
        return _SKIP
    except Exception as exc:
        console.error_panel(f"environment '{args.env}'", str(exc))
        return 1
    try:
        # An empty recipe: the image is derived from the environment alone.
        # Build output is echoed dimmed on the error console, verbatim.
        image = images.ensure_image(
            env_obj,
            images.ContainerRecipe(install=(), command=()),
            on_line=lambda line: console.err.print(
                line, style="dim", markup=False, highlight=False
            ),
        )
    except images.ImageBuildError as exc:
        console.error_panel(f"environment '{args.env}'", str(exc))
        return 1

    console.note(f"running in {_tilde(ctx.workspace)} (sandboxed)")
    try:
        # Mount the host workspace into the sandbox at the shared workspace path,
        # with the environment's network policy and resource limits applied.
        box = SandboxSync.create(
            image,
            connection_config=ctx.connection.config(),
            volumes=[
                Volume(
                    name="workspace",
                    host=Host(path=str(ctx.workspace.resolve())),
                    mount_path=_WORKSPACE_MNT,
                )
            ],
            network_policy=osb.network_policy(ctx.net),
            resource=ctx.resources or None,
            timeout=timedelta(hours=1),
        )
    except Exception as exc:
        console.error_panel("sandbox create failed", str(exc))
        return 1
    try:
        # Setup commands run first; a non-None result is an error message.
        err = _run_setup(box, ctx.setup)
        if err is not None:
            console.error_panel("setup failed", err)
            return 1
        # Re-quote the argv into one shell-safe string and run it from the
        # mounted workspace.
        execution = box.commands.run(
            _shlex.join(command),
            opts=RunCommandOpts(working_directory=_WORKSPACE_MNT, timeout=timedelta(hours=1)),
        )
        # Output is replayed after the command finishes, verbatim (no markup
        # parsing or highlighting), stdout and stderr to their own consoles.
        for log in execution.logs.stdout or []:
            console.out.print(log.text, end="", markup=False, highlight=False)
        for log in execution.logs.stderr or []:
            console.err.print(log.text, end="", markup=False, highlight=False)
        return execution.exit_code or 0
    finally:
        # Always tear the sandbox down; a failure to kill it is ignored so it
        # can't mask the command's own result.
        with _contextlib.suppress(Exception):
            box.kill()
1470
+
1471
+
1472
def _cmd_sandbox(args: argparse.Namespace) -> int:
    """Stop the managed sandbox server, or print its status.

    `stop` reports whether a server was actually stopped. Any other subcommand
    prints the Docker state, the config path (flagged when not yet generated),
    and, when present in the status, the server address with its health and the
    sandbox count. Always returns 0.
    """
    if args.sandbox_cmd == "stop":
        stopped = osb.stop_server()
        console.note(
            "sandbox server stopped" if stopped else "no managed sandbox server is running"
        )
        return 0

    info = osb.server_status()
    console.out.print(f" docker: {info['docker']}")

    config_note = "" if info["configured"] else " (not yet generated)"
    console.out.print(f" config: {info['config']}" + config_note)

    if "domain" in info:
        state_word = "healthy" if info.get("healthy") else "not running"
        console.out.print(f" server: {info['domain']} ({state_word})")
    if info.get("sandboxes") is not None:
        console.out.print(f" sandboxes: {info['sandboxes']}")
    return 0
1490
+
1491
+
1492
+ # ── parser ─────────────────────────────────────────────────────────────────────
1493
+
1494
+
1495
def _add_run_args(p: argparse.ArgumentParser, *, needs_path: bool) -> None:
    """Register the options shared by commands that run agents.

    Args:
        p: The (sub)parser to add the arguments to.
        needs_path: Also add the positional eval-config `path` argument.
    """
    if needs_path:
        p.add_argument("path", help="Path to an eval config (.eval.json)")
    p.add_argument(
        "--agent",
        default=None,
        help=(
            f"Agent(s) to run, comma-separated. Known: {', '.join(known_agents())}, "
            "or a path to any agent's avp-conformance.json. One board per agent. "
            'Overrides the config\'s "agents"; falls back to that, then to '
            f"{DEFAULT_AGENT}."
        ),
    )
    p.add_argument(
        "--model",
        default=None,
        # The example uses the provider/model slug form, matching the form the
        # rest of the CLI documents and validates.
        help="Override the model every commission runs (e.g. anthropic/claude-sonnet-4-6)",
    )
    p.add_argument(
        "--name",
        default=None,
        help="Name this voyage (its id for `eval view`/`list`). Prompts if a TTY; else autogenerates.",
    )
    p.add_argument("--out", default=None, help="Directory for NDJSON trajectories")
    p.add_argument("--json", dest="json_path", default=None, help="Write a machine-readable board")
    p.add_argument(
        "--threshold", type=float, default=None, help="Override the scorer pass threshold"
    )
    p.add_argument(
        "--max-items", type=int, default=None, help="Cap items per commission (cost control)"
    )
    p.add_argument(
        "--timeout",
        type=float,
        default=300.0,
        help="Max seconds per run before it's recorded as an error (default: 300)",
    )
    p.add_argument(
        "--resume",
        metavar="RUN_ID",
        default=None,
        help="Resume a run by id: reuse cells whose trajectory finished, re-run the rest",
    )
    p.add_argument(
        "--env",
        default=None,
        help="Run agents inside a declarative environment (a path to an env JSON, or a name "
        "in ~/.avp/environments). Defines the sandbox image, workspace, and egress.",
    )
    p.add_argument("--quiet", action="store_true", help="Suppress per-run progress on stderr")
1545
+
1546
+
1547
+ def _build_parser() -> argparse.ArgumentParser:
1548
+ parser = argparse.ArgumentParser(prog="avp", description="The local AVP CLI.")
1549
+ groups = parser.add_subparsers(dest="group") # not required: no args -> WELCOME
1550
+
1551
+ p_init = groups.add_parser("init", help="Scaffold an eval config from the catalog")
1552
+ p_init.add_argument(
1553
+ "key", nargs="?", default=None, help="Catalog key (omit to pick interactively)"
1554
+ )
1555
+ p_init.add_argument("--dir", default=".", help="Directory to write the config into")
1556
+ p_init.add_argument(
1557
+ "--agent",
1558
+ default=None,
1559
+ help=(
1560
+ f"Agent(s) to pin in the config, comma-separated. Known: {', '.join(known_agents())}. "
1561
+ "Omit to pick interactively."
1562
+ ),
1563
+ )
1564
+
1565
+ eval_p = groups.add_parser("eval", help="Run and compare Commission setups")
1566
+ esub = eval_p.add_subparsers(dest="eval_cmd", required=True)
1567
+ _add_run_args(
1568
+ esub.add_parser("run", help="Run an eval config and print the board"), needs_path=True
1569
+ )
1570
+ esub.add_parser("list", help="List recent eval runs (newest first)")
1571
+ p_del = esub.add_parser("delete", help="Delete one recorded run by id (or --all for every run)")
1572
+ p_del.add_argument(
1573
+ "name",
1574
+ nargs="?",
1575
+ default=None,
1576
+ metavar="ID",
1577
+ help="Voyage id of the run to delete (from `avp eval list`)",
1578
+ )
1579
+ p_del.add_argument(
1580
+ "--all",
1581
+ action="store_true",
1582
+ help="Delete every recorded run + its outputs (~/.avp/runs)",
1583
+ )
1584
+
1585
+ p_view = esub.add_parser(
1586
+ "view", help="Open a finished eval on agentvoyagerproject.com (encodes it into the URL)"
1587
+ )
1588
+ p_view.add_argument(
1589
+ "path",
1590
+ nargs="?",
1591
+ default=None,
1592
+ metavar="ID",
1593
+ help="A voyage id from `avp eval list` (or a trajectories.json / out dir). "
1594
+ "Default: the most recent run.",
1595
+ )
1596
+ p_view.add_argument(
1597
+ "--agent",
1598
+ default=None,
1599
+ help="Open only this agent's trajectories (default: all in the run)",
1600
+ )
1601
+ p_view.add_argument("--site", default=viz.SITE, help="Viewer base URL (override for dev)")
1602
+ p_view.add_argument(
1603
+ "--no-open", action="store_true", help="Print the URL(s), don't open a browser"
1604
+ )
1605
+
1606
+ com_p = groups.add_parser("cm", help="Build and inspect commissions in your library")
1607
+ csub = com_p.add_subparsers(dest="commission_cmd", required=True)
1608
+ p_create = csub.add_parser(
1609
+ "create",
1610
+ help="Build a commission into your library (interactive wizard, or fully via flags)",
1611
+ )
1612
+ p_create.add_argument(
1613
+ "id", nargs="?", default=None, help="Commission id (becomes <id>.json; prompts if a TTY)"
1614
+ )
1615
+ p_create.add_argument(
1616
+ "--from",
1617
+ dest="from_id",
1618
+ default=None,
1619
+ metavar="ID",
1620
+ help="Clone an existing library commission as the base (carries its output_schema, "
1621
+ "mcp_servers, skills); other flags override on top.",
1622
+ )
1623
+ p_create.add_argument(
1624
+ "--agent",
1625
+ default=None,
1626
+ help=f"Anchor agent for the enabled-* pickers + validation ({', '.join(known_agents())}, "
1627
+ "or a manifest path).",
1628
+ )
1629
+ p_create.add_argument(
1630
+ "--model", default=None, help="Model slug, e.g. anthropic/claude-opus-4-8"
1631
+ )
1632
+ p_create.add_argument(
1633
+ "--prompt", default=None, help="User prompt; use {input} where the dataset case goes"
1634
+ )
1635
+ p_create.add_argument("--system-prompt", dest="system_prompt", default=None)
1636
+ p_create.add_argument(
1637
+ "--provider-id",
1638
+ dest="provider_id",
1639
+ default=None,
1640
+ metavar="ID",
1641
+ help="Provider/storefront id (e.g. anthropic, openrouter); sets the provider block",
1642
+ )
1643
+ p_create.add_argument(
1644
+ "--provider-base-url",
1645
+ dest="provider_base_url",
1646
+ default=None,
1647
+ metavar="URL",
1648
+ help="Provider endpoint override (requires --provider-id)",
1649
+ )
1650
+ p_create.add_argument(
1651
+ "--credential",
1652
+ dest="credential",
1653
+ default=None,
1654
+ metavar="HANDLE",
1655
+ help="Vault handle for the provider key (a SecretRef; see `avp env secret create`)",
1656
+ )
1657
+ p_create.add_argument(
1658
+ "--enable-tool",
1659
+ dest="enable_tools",
1660
+ action="append",
1661
+ metavar="NAME",
1662
+ help="Expose only this builtin tool (repeatable); validated against --agent. "
1663
+ "Omit to expose all the agent's tools.",
1664
+ )
1665
+ p_create.add_argument(
1666
+ "--enable-subagent", dest="enable_subagents", action="append", metavar="NAME"
1667
+ )
1668
+ p_create.add_argument("--enable-skill", dest="enable_skills", action="append", metavar="NAME")
1669
+ p_create.add_argument(
1670
+ "--enable-mcp",
1671
+ dest="enable_mcp",
1672
+ action="append",
1673
+ metavar="ID",
1674
+ help="Expose only this builtin MCP server (repeatable).",
1675
+ )
1676
+ p_create.add_argument("--tag", dest="tags", action="append", metavar="TAG")
1677
+ p_create.add_argument(
1678
+ "--force", action="store_true", help="Overwrite an existing commission with this id"
1679
+ )
1680
+ csub.add_parser("list", help="List the commissions in your library (~/.avp/commissions)")
1681
+ p_describe = csub.add_parser(
1682
+ "describe",
1683
+ help="Describe a library commission (the raw wire Commission) by id, or a Commission file",
1684
+ )
1685
+ p_describe.add_argument(
1686
+ "id", help="A commission id from your library (or a Commission .json file)"
1687
+ )
1688
+ p_check = csub.add_parser("check", help="Check a library commission by id or path")
1689
+ p_check.add_argument(
1690
+ "target", help="A commission id from your library, or a path to a Commission .json file"
1691
+ )
1692
+ p_cdel = csub.add_parser("delete", help="Delete a commission from your library by id")
1693
+ p_cdel.add_argument("id", help="A commission id from your library")
1694
+
1695
+ agent_p = groups.add_parser("agent", help="Install and inspect the agents you can run against")
1696
+ asub = agent_p.add_subparsers(dest="agent_cmd", required=True)
1697
+ asub.add_parser("list", help="List known agents, whether each is installed and ready")
1698
+ p_adesc = asub.add_parser(
1699
+ "describe", help="Print an agent's AgentDescriptor (its tools, models, skills, MCP)"
1700
+ )
1701
+ p_adesc.add_argument(
1702
+ "name", help=f"Agent name ({', '.join(known_agents())}) or a path to a manifest"
1703
+ )
1704
+ p_adesc.add_argument(
1705
+ "--json",
1706
+ dest="json_out",
1707
+ action="store_true",
1708
+ help="Print the raw AgentDescriptor JSON (full tool input schemas)",
1709
+ )
1710
+
1711
+ p_ainstall = asub.add_parser(
1712
+ "install", help="Install a prebuilt agent (from a GitHub release, or local artifacts)"
1713
+ )
1714
+ p_ainstall.add_argument("name", help=f"Agent to install ({', '.join(known_agents())})")
1715
+ p_ainstall.add_argument(
1716
+ "--version", default=None, help="Release version to install (e.g. 0.0.1). Default: latest."
1717
+ )
1718
+ p_ainstall.add_argument(
1719
+ "--binary",
1720
+ default=None,
1721
+ metavar="PATH",
1722
+ help="Install a binary agent from this local executable, skipping the release "
1723
+ "(the local-package dev loop).",
1724
+ )
1725
+ p_ainstall.add_argument(
1726
+ "--wheel",
1727
+ action="append",
1728
+ metavar="WHL",
1729
+ help="Install a Python agent from local wheel(s), skipping the release (repeatable).",
1730
+ )
1731
+ p_ainstall.add_argument(
1732
+ "--force", action="store_true", help="Reinstall over an existing install"
1733
+ )
1734
+
1735
+ p_auninstall = asub.add_parser("uninstall", help="Remove an installed agent")
1736
+ p_auninstall.add_argument("name", help="Installed agent name")
1737
+
1738
+ run_p = groups.add_parser(
1739
+ "run", help="Commission an agent to do a task, optionally inside an environment"
1740
+ )
1741
+ run_p.add_argument("prompt", help="The task for the agent")
1742
+ run_p.add_argument(
1743
+ "--agent",
1744
+ required=True,
1745
+ help=f"Agent to run ({', '.join(known_agents())}, or a manifest path)",
1746
+ )
1747
+ run_p.add_argument(
1748
+ "--env", default=None, help="Environment to run inside (a name or a path to an env JSON)"
1749
+ )
1750
+ run_p.add_argument("--model", default=None, help="Model override (else the agent's default)")
1751
+ run_p.add_argument(
1752
+ "--timeout", type=float, default=600.0, help="Max seconds for the run (default: 600)"
1753
+ )
1754
+
1755
+ env_p = groups.add_parser(
1756
+ "env", help="Define + run agent environments (a container image + workspace + egress)"
1757
+ )
1758
+ ensub = env_p.add_subparsers(dest="env_cmd", required=True)
1759
+ ensub.add_parser("list", help="List named environments (~/.avp/environments)")
1760
+ p_ecreate = ensub.add_parser("create", help="Create an environment in your library")
1761
+ p_ecreate.add_argument("name", help="Environment name (becomes <name>.json)")
1762
+ p_ecreate.add_argument(
1763
+ "--image",
1764
+ default=None,
1765
+ metavar="IMAGE",
1766
+ help="Base container image (default: python:3.12-slim)",
1767
+ )
1768
+ p_ecreate.add_argument("--apt", action="append", metavar="PKG", help="apt package (repeatable)")
1769
+ p_ecreate.add_argument("--pip", action="append", metavar="PKG", help="pip package (repeatable)")
1770
+ p_ecreate.add_argument(
1771
+ "--path",
1772
+ action="append",
1773
+ metavar="SRC",
1774
+ help="Copy a local file or directory into the env workspace (repeatable). "
1775
+ "Re-copied each run, so edits to SRC are picked up.",
1776
+ )
1777
+ p_ecreate.add_argument(
1778
+ "--file",
1779
+ action="append",
1780
+ metavar="PATH=SRC",
1781
+ help="Seed a file: PATH=inline-content, or PATH=@localfile (repeatable)",
1782
+ )
1783
+ p_ecreate.add_argument(
1784
+ "--setup",
1785
+ action="append",
1786
+ metavar="CMD",
1787
+ help="Command run in the sandbox workspace before the agent (repeatable)",
1788
+ )
1789
+ p_ecreate.add_argument(
1790
+ "--net", action="append", metavar="DOMAIN", help="Allowed egress domain (repeatable)"
1791
+ )
1792
+ p_ecreate.add_argument("--cpu", default=None, metavar="N", help='CPU cap (e.g. "2")')
1793
+ p_ecreate.add_argument("--memory", default=None, metavar="SIZE", help='Memory cap (e.g. "4Gi")')
1794
+ p_ecreate.add_argument("--force", action="store_true", help="Overwrite an existing environment")
1795
+ p_eshow = ensub.add_parser("show", help="Show an environment (a name or a path to an env JSON)")
1796
+ p_eshow.add_argument("env", help="Environment name (in ~/.avp/environments) or a path")
1797
+ p_erun = ensub.add_parser("run", help="Run a command inside an environment's sandbox")
1798
+ p_erun.add_argument("env", help="Environment name or path to an env JSON")
1799
+ p_erun.add_argument(
1800
+ "command", nargs=argparse.REMAINDER, help="Command to run inside the env, after `--`"
1801
+ )
1802
+ p_edel = ensub.add_parser("delete", help="Delete an environment from your library")
1803
+ p_edel.add_argument("name", help="Environment name")
1804
+
1805
+ # Secrets a Commission references by handle ({"vault": "<handle>"}). Stored
1806
+ # in ~/.avp/secrets.toml (0600); the credential broker injects the value at
1807
+ # run time so it never enters the sandbox.
1808
+ p_secret = ensub.add_parser(
1809
+ "secret", help="Store credentials a Commission references by vault handle"
1810
+ )
1811
+ secsub = p_secret.add_subparsers(dest="secret_cmd", required=True)
1812
+ p_secset = secsub.add_parser("create", help="Store a secret by handle")
1813
+ p_secset.add_argument("handle", help='Vault handle (a Commission\'s {"vault": <handle>})')
1814
+ p_secset.add_argument(
1815
+ "value",
1816
+ nargs="?",
1817
+ default=None,
1818
+ help="The secret value. Omit to be prompted (keeps it out of shell history).",
1819
+ )
1820
+ p_secset.add_argument("--force", action="store_true", help="Overwrite an existing secret")
1821
+ secsub.add_parser("list", help="List stored handles (never the values)")
1822
+ p_secrm = secsub.add_parser("delete", help="Delete a stored secret by handle")
1823
+ p_secrm.add_argument("handle", help="Vault handle to delete")
1824
+
1825
+ sandbox_p = groups.add_parser("sandbox", help="The managed sandbox server (OpenSandbox)")
1826
+ ssub = sandbox_p.add_subparsers(dest="sandbox_cmd", required=True)
1827
+ ssub.add_parser("status", help="Docker + server health, config path, live sandbox count")
1828
+ ssub.add_parser("stop", help="Stop the managed sandbox server")
1829
+
1830
+ return parser
1831
+
1832
+
1833
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and hand off to the selected group's handler.

    Args:
        argv: Arguments to parse. ``None`` is forwarded to ``parse_args``
            unchanged, which makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` after printing the logo and welcome text when no group was
        given; otherwise whatever the group's handler returns, or ``2`` when
        the group has no handler.
    """
    parsed = _build_parser().parse_args(argv)
    group = parsed.group

    # Bare `avp`: show the banner and the getting-started text, then exit cleanly.
    if group is None:
        console.out.print(brand.logo())
        console.out.print()
        console.out.print(welcome())
        return 0

    # Built per call so the handler names are resolved at dispatch time.
    handlers = {
        "init": _cmd_init,
        "eval": _cmd_eval,
        "cm": _cmd_commission,
        "agent": _cmd_agent,
        "env": _cmd_env,
        "run": _cmd_run,
        "sandbox": _cmd_sandbox,
    }
    handler = handlers.get(group)
    if handler is None:
        return 2
    return handler(parsed)
1855
+
1856
+
1857
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())