avp-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avp/__init__.py +31 -0
- avp/commission.py +236 -0
- avp/content.py +273 -0
- avp/data/__init__.py +0 -0
- avp/data/prices.json +21945 -0
- avp/descriptor.py +204 -0
- avp/envelope.py +108 -0
- avp/gen_ai.py +160 -0
- avp/history.py +86 -0
- avp/pricing.py +138 -0
- avp/sink.py +62 -0
- avp/trajectory.py +530 -0
- avp_cli/__init__.py +82 -0
- avp_cli/agent.py +566 -0
- avp_cli/agent_install.py +331 -0
- avp_cli/agent_manifest.py +73 -0
- avp_cli/agents.py +258 -0
- avp_cli/brand.py +46 -0
- avp_cli/broker.py +227 -0
- avp_cli/catalog/__init__.py +128 -0
- avp_cli/catalog/capitals.json +67 -0
- avp_cli/catalog/custom.json +35 -0
- avp_cli/catalog/parsebench.json +44 -0
- avp_cli/cli.py +1858 -0
- avp_cli/commission.py +144 -0
- avp_cli/config.py +250 -0
- avp_cli/console.py +51 -0
- avp_cli/environment.py +218 -0
- avp_cli/eval/__init__.py +0 -0
- avp_cli/eval/dataset.py +37 -0
- avp_cli/eval/engine.py +426 -0
- avp_cli/eval/report.py +178 -0
- avp_cli/eval/scoring.py +260 -0
- avp_cli/eval/setup.py +69 -0
- avp_cli/images.py +119 -0
- avp_cli/library.py +95 -0
- avp_cli/live.py +185 -0
- avp_cli/observability.py +128 -0
- avp_cli/onboarding.py +80 -0
- avp_cli/osb.py +347 -0
- avp_cli/paths.py +47 -0
- avp_cli/run_manifest.py +113 -0
- avp_cli/state.py +195 -0
- avp_cli/vault.py +116 -0
- avp_cli/viz.py +303 -0
- avp_cli-0.1.0.dist-info/METADATA +359 -0
- avp_cli-0.1.0.dist-info/RECORD +49 -0
- avp_cli-0.1.0.dist-info/WHEEL +4 -0
- avp_cli-0.1.0.dist-info/entry_points.txt +2 -0
avp_cli/agent.py
ADDED
|
@@ -0,0 +1,566 @@
|
|
|
1
|
+
"""Run a real AVP agent against one Commission, inside a sandbox.
|
|
2
|
+
|
|
3
|
+
Every conforming AVP agent honors the same run contract the conformance
|
|
4
|
+
harness uses:
|
|
5
|
+
|
|
6
|
+
<command> run --commission <path> --out <ndjson>
|
|
7
|
+
|
|
8
|
+
Here that command executes inside an OpenSandbox container built from the
|
|
9
|
+
environment's derived image (see `avp_cli.images`): the per-run host workspace
|
|
10
|
+
and an io dir are bind-mounted in, the Commission goes in as a file, and the
|
|
11
|
+
NDJSON trajectory lands back on the host where it is tailed live and parsed.
|
|
12
|
+
The host machine is not part of the agent's world; everything the agent sees
|
|
13
|
+
is declared (image, mounts, env vars, egress policy).
|
|
14
|
+
|
|
15
|
+
`describe_agent` stays a host-side subprocess: it is the free pre-flight view
|
|
16
|
+
(no model turn, no tools), driven from the agent's manifest like the
|
|
17
|
+
conformance harness does.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import contextlib
|
|
23
|
+
import json
|
|
24
|
+
import os
|
|
25
|
+
import shlex
|
|
26
|
+
import shutil
|
|
27
|
+
import subprocess
|
|
28
|
+
import time
|
|
29
|
+
import uuid
|
|
30
|
+
from collections.abc import Callable
|
|
31
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
32
|
+
from dataclasses import dataclass, field
|
|
33
|
+
from datetime import timedelta
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any
|
|
36
|
+
from urllib.parse import urlparse
|
|
37
|
+
|
|
38
|
+
from opensandbox import SandboxSync
|
|
39
|
+
from opensandbox.models.execd import RunCommandOpts
|
|
40
|
+
from opensandbox.models.sandboxes import Host, Volume
|
|
41
|
+
from pydantic import BaseModel
|
|
42
|
+
|
|
43
|
+
from avp.commission import Commission
|
|
44
|
+
from avp.descriptor import AgentDescriptor
|
|
45
|
+
from avp.trajectory import parse_event
|
|
46
|
+
from avp_cli import broker, osb, paths, vault
|
|
47
|
+
from avp_cli.agent_manifest import AgentManifest
|
|
48
|
+
|
|
49
|
+
# How often the streaming path re-reads the growing --out file (seconds).
# Used as the sleep between polls in `_exec_streaming`.
_TAIL_INTERVAL = 0.06

# In-sandbox layout: one rw mount for the (possibly run-shared) workspace, one
# rw mount for this run's io (commission in, trajectory + stderr out).
_WORKSPACE_MNT = "/avp/workspace"
_IO_MNT = "/avp/io"

# Host env vars forwarded into the sandbox: model-provider credentials and
# agent routing knobs. CLAUDE_ covers CLAUDE_CODE_OAUTH_TOKEN (the
# `claude setup-token` subscription credential the claude CLI accepts in
# place of an API key). The rest of the host environment stays on the host;
# the sandbox env is otherwise fully declared.
# Matched against variable names with `str.startswith` in `_sandbox_env`.
_ENV_PASSTHROUGH_PREFIXES = (
    "ANTHROPIC_",
    "CLAUDE_",
    "OPENAI_",
    "GOOGLE_",
    "GEMINI_",
    "MISTRAL_",
    "OPENROUTER_",
    "GOOSE_",
)

# Sandbox lifetime margin beyond the run timeout: covers image boot, setup
# commands, and trajectory readback before the server reaps the sandbox.
_SANDBOX_TTL_MARGIN_S = 180.0

# Baseline resource request for a sandbox; `run_agent` merges
# `SandboxContext.resources` over it, so the context's keys win.
_DEFAULT_RESOURCES = {"cpu": "2", "memory": "4Gi"}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class SandboxedAgent:
    """An agent resolved to its in-sandbox form: the derived image that has it
    installed, the argv that runs it there, and its manifest env.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """

    # Agent name (not read by the functions in this module).
    name: str
    # Image reference handed to `SandboxSync.create` as the sandbox image.
    image: str
    # argv prefix; `run_agent` appends `run --commission <path> --out <path>`.
    command: tuple[str, ...]
    # Extra variables applied in `_sandbox_env` after the provider routing
    # entries, so a key set here overrides a routing value of the same name.
    env: dict[str, str] = field(default_factory=dict)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class SandboxContext:
    """The run-level sandbox facts shared by every cell of an eval (or the one
    cell of `avp run`): server connection, the seeded host workspace, and the
    environment's setup / egress / resource asks."""

    # Server connection; `run_agent` passes `connection.config()` to
    # `SandboxSync.create`.
    connection: osb.Connection
    # Host directory bind-mounted into the sandbox at `_WORKSPACE_MNT`
    # (resolved to an absolute path first).
    workspace: Path
    # Commands `_run_setup` executes in the workspace, in order, before the agent.
    setup: list[str] = field(default_factory=list)
    # Egress entries handed to `osb.network_policy` together with the per-run
    # additions from `_egress_extra`.
    net: list[str] = field(default_factory=list)
    # Resource overrides merged on top of `_DEFAULT_RESOURCES`.
    resources: dict[str, str] = field(default_factory=dict)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def load_manifest(path: str | Path) -> tuple[AgentManifest, Path]:
    """Read and validate the agent manifest at `path`.

    Returns the validated manifest together with its working directory: the
    manifest's `cwd` joined onto the directory that holds the manifest file,
    then resolved.
    """
    manifest_file = Path(path).resolve()
    raw = json.loads(manifest_file.read_text())
    manifest = AgentManifest.model_validate(raw)
    working_dir = (manifest_file.parent / manifest.cwd).resolve()
    return manifest, working_dir
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def run_agent(
    agent: SandboxedAgent,
    ctx: SandboxContext,
    commission: Commission,
    *,
    out_path: str | Path,
    timeout_s: float = 300.0,
    on_event: Callable[[BaseModel | dict[str, Any]], None] | None = None,
) -> tuple[list[BaseModel | dict[str, Any]] | None, str | None]:
    """Run the agent for one Commission in a fresh sandbox. Returns (events, error).

    On success `error` is None and `events` is the parsed NDJSON trajectory
    (custom event types pass through as dicts). On failure `events` is None and
    `error` is a short diagnostic (nonzero exit, timeout, sandbox create
    failure) so one bad run is recorded per-cell rather than aborting a matrix.

    If `on_event` is given, the trajectory file is tailed on the host (through
    the io bind mount) while the agent runs and `on_event` fires per event for
    live progress. The returned list is always re-parsed from the finished
    file, so it stays authoritative regardless of what the live tail saw.

    The sandbox is always killed before returning; the workspace mount is the
    only place agent writes survive.

    `timeout_s` bounds the agent command; the sandbox itself is created with a
    lifetime of `timeout_s + _SANDBOX_TTL_MARGIN_S`. The per-run io directory
    is removed on every exit path, so the trajectory only outlives the call
    when the run succeeded and it was moved to `out_path`.
    """
    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    # io lives under ~/.avp: the server's allowed_host_paths confines bind
    # mounts there, and --out may point anywhere on the host. The trajectory
    # is moved to `out` once the run settles.
    io_dir = paths.avp_home() / "tmp" / uuid.uuid4().hex
    io_dir.mkdir(parents=True)
    traj_host = io_dir / "trajectory.ndjson"

    # Vault broker: when the Commission references any secret (provider
    # credential or MCP auth), a host-side credential-injecting proxy serves
    # those endpoints so the resolved value never enters the sandbox. The
    # written commission + sandbox env then point at the broker with sentinels.
    brk: broker.Broker | None = None
    try:
        brk = _start_broker(commission)
    except vault.VaultError as exc:
        # The outer try/finally has not been entered yet, so clean up here.
        shutil.rmtree(io_dir, ignore_errors=True)
        return None, f"vault: {exc}"
    except Exception as exc:
        shutil.rmtree(io_dir, ignore_errors=True)
        return None, f"vault broker startup failed: {exc}"

    try:
        (io_dir / "commission.json").write_text(_commission_for_sandbox(commission, brk))
        if brk is not None:
            (io_dir / "broker-preflight.sh").write_text(_BROKER_PREFLIGHT_SH)
        try:
            box = SandboxSync.create(
                agent.image,
                connection_config=ctx.connection.config(),
                env=_sandbox_env(agent, commission, brk),
                volumes=[
                    Volume(
                        name="workspace",
                        host=Host(path=str(ctx.workspace.resolve())),
                        mount_path=_WORKSPACE_MNT,
                    ),
                    Volume(name="io", host=Host(path=str(io_dir)), mount_path=_IO_MNT),
                ],
                # Egress is default-deny floored by runtime-base domains, plus the
                # env's `net`, plus the hosts the run reaches: under broker mode
                # just the broker host (the broker reaches providers/MCP from the
                # host); otherwise the provider + MCP hosts the Commission names.
                # A run can never be commissioned to call a URL the sandbox blocks.
                network_policy=osb.network_policy([*ctx.net, *_egress_extra(commission, brk)]),
                resource={**_DEFAULT_RESOURCES, **ctx.resources},
                timeout=timedelta(seconds=timeout_s + _SANDBOX_TTL_MARGIN_S),
            )
        except Exception as exc:
            # No sandbox exists to kill; the outer `finally` still stops the
            # broker and removes the io dir.
            return None, f"sandbox create failed: {exc}"
        try:
            # Fail closed: if the vault broker isn't reachable from the sandbox,
            # refuse the run rather than ever placing the secret inside it.
            err = _preflight_broker(box, brk) if brk is not None else None
            # Each stage runs only if every earlier stage reported no error.
            if err is None:
                err = _run_setup(box, ctx.setup)
            if err is None:
                argv = [
                    *agent.command,
                    "run",
                    "--commission",
                    f"{_IO_MNT}/commission.json",
                    "--out",
                    f"{_IO_MNT}/trajectory.ndjson",
                ]
                # Agent stderr is redirected to a file in the io mount so
                # `_exit_error` can quote its tail from the host.
                command = f"{shlex.join(argv)} 2>{_IO_MNT}/stderr.log"
                if on_event is None:
                    err = _exec(box, command, timeout_s, io_dir)
                else:
                    err = _exec_streaming(box, command, timeout_s, io_dir, traj_host, on_event)
        finally:
            with contextlib.suppress(Exception):  # reaped by TTL if the kill call fails
                box.kill()
        if err is not None:
            return None, err
        if not traj_host.exists():
            return None, "agent exited 0 but wrote no trajectory"
        if traj_host != out:
            shutil.move(traj_host, out)
        return read_trajectory(out), None
    finally:
        if brk is not None:
            brk.stop()
        shutil.rmtree(io_dir, ignore_errors=True)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# The sentinel key the agent receives in place of a vault-managed credential.
# Non-empty so SDKs that require a key present are satisfied; the broker
# overwrites it with the real value on the host before forwarding.
# `_sandbox_env` exports it as the provider's `*_API_KEY` variable whenever a
# credentialed provider is routed through the broker.
_VAULT_SENTINEL = "avp-vault-managed"
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _provider_credentialed(commission: Commission) -> bool:
    """True when the Commission names a provider that carries a credential."""
    provider = commission.provider
    if provider is None:
        return False
    return provider.credential is not None
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _start_broker(commission: Commission) -> broker.Broker | None:
    """Build and start the vault broker for this Commission, if it needs one.

    One route is registered for a credentialed provider (`llm/<id>`) and one
    for every MCP server that carries `auth` (`mcp/<id>`). Secrets are resolved
    here, on the host, and handed only to the broker routes; resolution may
    raise `VaultError`. Returns None when there is nothing to route.
    """
    pending: list[tuple[str, broker.Route]] = []

    provider = commission.provider
    if _provider_credentialed(commission):
        # Explicit base_url wins; otherwise fall back to the registry entry's
        # second element, and finally to an empty string.
        endpoint = provider.base_url
        if not endpoint:
            registry_entry = osb.PROVIDER_REGISTRY.get(provider.id) or (None, None)
            endpoint = registry_entry[1] or ""
        if provider.id == "anthropic":
            header, prefix = "x-api-key", ""
        else:
            header, prefix = "authorization", "Bearer "
        llm_key = f"llm/{provider.id}"
        llm_route = broker.Route(
            upstream=broker.origin_of(endpoint),
            header=header,
            prefix=prefix,
            secret=vault.resolve_ref(provider.credential),
        )
        pending.append((llm_key, llm_route))

    for server in commission.mcp_servers or []:
        auth = getattr(server, "auth", None)
        if auth is None:
            continue
        mcp_key = f"mcp/{server.id}"
        mcp_route = broker.Route(
            upstream=server.url,
            header="authorization",
            prefix="Bearer ",
            secret=vault.resolve(auth.vault),
        )
        pending.append((mcp_key, mcp_route))

    if not pending:
        return None

    # Only construct and start the broker once every secret has resolved.
    started = broker.Broker()
    for key, route in pending:
        started.add_route(key, route)
    started.start()
    return started
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _sandbox_env(
    agent: SandboxedAgent, commission: Commission, brk: broker.Broker | None
) -> dict[str, str]:
    """The declared sandbox environment: provider routing (broker urls +
    sentinels for vault-credentialed providers, real base_url otherwise), the
    manifest's env, and the AVP workspace convention (AVP_WORKSPACE /
    AVP_ENV_ROOT). Host provider vars (ANTHROPIC_*, …) are forwarded for the
    no-vault case where the user supplies their own ambient key.

    Later writes win: forwarded host vars are laid down first, provider routing
    overwrites them, `agent.env` overwrites that, and the two AVP_* entries are
    set last.
    """
    # Start from the allow-listed slice of the host environment.
    env = {k: v for k, v in os.environ.items() if k.startswith(_ENV_PASSTHROUGH_PREFIXES)}
    prov = commission.provider
    if prov is not None:
        # Env-var stem for non-anthropic providers, e.g. "my-prov" -> "MY_PROV".
        up = prov.id.upper().replace("-", "_")
        if prov.credential is not None and brk is not None:
            # Vault: route through the broker, hand the agent only a sentinel.
            route = brk.route_url(f"llm/{prov.id}")
            if prov.id == "anthropic":
                env["ANTHROPIC_BASE_URL"] = route
                env["ANTHROPIC_API_KEY"] = _VAULT_SENTINEL
            else:
                env[f"{up}_HOST"] = route
                env[f"{up}_API_KEY"] = _VAULT_SENTINEL
                # NOTE(review): nesting reconstructed from a rendering that
                # dropped indentation — GOOSE_PROVIDER is assumed to be set on
                # the non-anthropic branch only; confirm against the package.
                env["GOOSE_PROVIDER"] = prov.id
        else:
            # Non-vault provider: real endpoint, ambient key (forwarded above).
            base = prov.base_url or (osb.PROVIDER_REGISTRY.get(prov.id) or (None, None))[1]
            if prov.id == "anthropic":
                if base:
                    env["ANTHROPIC_BASE_URL"] = base
            else:
                if base:
                    env[f"{up}_HOST"] = base
                # NOTE(review): same reconstructed-nesting assumption as above.
                env["GOOSE_PROVIDER"] = prov.id
    # Manifest env is applied after routing, so it can override routing keys.
    env.update(agent.env)
    env["AVP_WORKSPACE"] = _WORKSPACE_MNT
    env["AVP_ENV_ROOT"] = "/avp"
    return env
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _commission_for_sandbox(commission: Commission, brk: broker.Broker | None) -> str:
    """Serialize the Commission into the commission.json text the agent reads.

    The `provider` field and every MCP server's `auth` field are removed: the
    CLI delivers provider routing and MCP auth through the sandbox env and the
    broker instead, and dropping them keeps the file acceptable to agents whose
    wire types reject those fields. When a broker is running, each `http`
    server that carried `auth` has its url replaced by the broker route
    `mcp/<id>` before the `auth` field is dropped.
    """
    payload = commission.model_dump(by_alias=True, exclude_none=True, mode="json")
    payload.pop("provider", None)

    servers = payload.get("mcp_servers") or []
    for entry in servers:
        brokered = (
            brk is not None
            and entry.get("type") == "http"
            and entry.get("auth") is not None
        )
        if brokered:
            entry["url"] = brk.route_url(f"mcp/{entry['id']}")
        entry.pop("auth", None)

    return json.dumps(payload)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _egress_extra(commission: Commission, brk: broker.Broker | None) -> list[str]:
    """Hosts to add to the sandbox egress policy for this one run.

    With no broker the list comes straight from
    `osb.commission_egress_hosts`. With a broker it is the broker's sandbox
    host alias, followed by the hostnames of whatever is not routed through the
    broker: a provider without a credential and MCP servers without `auth`.
    Entries that resolve to an empty hostname are dropped.
    """
    if brk is None:
        return osb.commission_egress_hosts(commission)

    candidates = [broker.SANDBOX_HOST_ALIAS]

    provider = commission.provider
    if provider is not None and provider.credential is None:
        endpoint = (
            provider.base_url
            or (osb.PROVIDER_REGISTRY.get(provider.id) or (None, None))[1]
        )
        if endpoint:
            candidates.append(urlparse(endpoint).hostname or "")

    for server in commission.mcp_servers or []:
        unauthed_url = getattr(server, "url", None) and getattr(server, "auth", None) is None
        if unauthed_url:
            candidates.append(urlparse(server.url).hostname or "")

    return list(filter(None, candidates))
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# Reach the host-side vault broker from inside the sandbox, cross-platform.
|
|
355
|
+
# Docker Desktop / OrbStack inject `host.docker.internal`; plain Linux Docker
|
|
356
|
+
# does not, so we map it to the default-route gateway (the host) in /etc/hosts.
|
|
357
|
+
# That keeps a single broker address (host.docker.internal) working everywhere —
|
|
358
|
+
# the agent's base_urls / MCP urls never have to change per host. Written to the
|
|
359
|
+
# io dir and run by `_preflight_broker`; exit 0 iff the broker is reachable.
|
|
360
|
+
_BROKER_PREFLIGHT_SH = r"""#!/bin/sh
|
|
361
|
+
URL="http://host.docker.internal:$1/health"
|
|
362
|
+
fetch() {
|
|
363
|
+
curl -fsS -m 3 "$URL" >/dev/null 2>&1 && return 0
|
|
364
|
+
wget -q -T 3 -O - "$URL" >/dev/null 2>&1 && return 0
|
|
365
|
+
python3 -c 'import urllib.request,sys; urllib.request.urlopen(sys.argv[1],timeout=3)' "$URL" >/dev/null 2>&1
|
|
366
|
+
}
|
|
367
|
+
fetch && exit 0
|
|
368
|
+
gw="$(ip -4 route show default 2>/dev/null | awk '{print $3; exit}')"
|
|
369
|
+
if [ -z "$gw" ]; then
|
|
370
|
+
gw="$(python3 -c 'import socket, struct
|
|
371
|
+
for line in open("/proc/net/route").read().splitlines()[1:]:
|
|
372
|
+
f = line.split()
|
|
373
|
+
if len(f) > 3 and f[1] == "00000000" and int(f[3], 16) & 2:
|
|
374
|
+
print(socket.inet_ntoa(struct.pack("<L", int(f[2], 16)))); break' 2>/dev/null)"
|
|
375
|
+
fi
|
|
376
|
+
[ -n "$gw" ] && printf '%s host.docker.internal\n' "$gw" >> /etc/hosts 2>/dev/null
|
|
377
|
+
fetch
|
|
378
|
+
"""
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _preflight_broker(box: SandboxSync, brk: broker.Broker) -> str | None:
    """Check, from inside the sandbox, that the host vault broker answers.

    Runs the preflight script from the io mount with the broker port as its
    argument. Returns None when the script exits 0 (or reports no exit code)
    and an error string otherwise, so the caller can refuse the run instead of
    ever putting the secret into the sandbox.
    """
    try:
        result = box.commands.run(
            f"sh {_IO_MNT}/broker-preflight.sh {brk.port}",
            opts=RunCommandOpts(timeout=timedelta(seconds=25)),
        )
    except Exception as exc:
        return f"vault broker preflight failed to run ({exc}); refusing to run"

    code = result.exit_code
    if code is None or code == 0:
        return None
    return (
        f"vault broker unreachable from the sandbox (host broker on port {brk.port}); "
        "refusing to run rather than expose the secret"
    )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _run_setup(box: SandboxSync, setup: list[str]) -> str | None:
    """Execute each setup command inside the workspace mount, in order.

    Stops at the first command that raises or exits nonzero and returns a
    diagnostic naming it; returns None once every command has succeeded (or
    when there are none).
    """
    for step in setup:
        try:
            outcome = box.commands.run(
                step,
                opts=RunCommandOpts(
                    working_directory=_WORKSPACE_MNT, timeout=timedelta(minutes=10)
                ),
            )
        except Exception as exc:
            return f"setup failed ({step!r}): {exc}"

        code = outcome.exit_code
        if code is None or code == 0:
            continue
        log_tail = _logs_tail(outcome)
        return f"setup exit {outcome.exit_code} ({step!r}): {log_tail}"
    return None
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _exec(box: SandboxSync, command: str, timeout_s: float, io_dir: Path) -> str | None:
    """Run the agent command to completion in the workspace mount.

    Returns None on a clean exit. An SDK exception becomes an
    "agent run failed" message (or the timeout message when it looks like a
    timeout); a nonzero exit becomes the `_exit_error` diagnostic.
    """
    try:
        outcome = box.commands.run(
            command,
            opts=RunCommandOpts(
                working_directory=_WORKSPACE_MNT, timeout=timedelta(seconds=timeout_s)
            ),
        )
    except Exception as exc:
        failure = f"agent run failed: {exc}"
        return _timeout_or(failure, exc, timeout_s)
    else:
        return _exit_error(outcome, io_dir)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _exec_streaming(
    box: SandboxSync,
    command: str,
    timeout_s: float,
    io_dir: Path,
    traj_host: Path,
    on_event: Callable[[BaseModel | dict[str, Any]], None],
) -> str | None:
    """Run the agent command in a worker thread while tailing the growing
    trajectory file (visible on the host through the io bind mount).

    The worker runs `_exec`; this thread polls `traj_host` every
    `_TAIL_INTERVAL` seconds and feeds complete lines to `on_event`. Returns
    `_exec`'s result (None on success, an error string otherwise), or a
    timeout message if the worker is still running once
    `timeout_s + _SANDBOX_TTL_MARGIN_S` has elapsed. KeyboardInterrupt is
    re-raised after the sandbox has been killed.
    """
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(_exec, box, command, timeout_s, io_dir)
        deadline = time.monotonic() + timeout_s + _SANDBOX_TTL_MARGIN_S
        pos = 0
        buf = ""
        try:
            while not future.done():
                pos, buf = _drain(traj_host, pos, buf, on_event)
                if time.monotonic() > deadline:  # belt-and-suspenders over execd's timeout
                    # Leaving the `with` block joins the worker thread. If the
                    # sandbox were left running, this return would wait on the
                    # very call that failed to time out, so tear it down first.
                    with contextlib.suppress(Exception):
                        box.kill()
                    return f"timed out after {timeout_s:.0f}s"
                time.sleep(_TAIL_INTERVAL)
        except KeyboardInterrupt:
            with contextlib.suppress(Exception):
                box.kill()  # unblocks the worker; the caller decides what stops
            raise
        _drain(traj_host, pos, buf, on_event)  # lines written between last poll and exit
        return future.result()
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _exit_error(execution: Any, io_dir: Path) -> str | None:
    """Turn a finished execution into an error string, or None on success.

    An exit code of 0 or None counts as success. Otherwise the message is
    `exit <code>: <tail>`, where the tail is the last three lines of
    `stderr.log` in `io_dir`, falling back to the execution's own captured
    logs and then to `(no stderr)`.
    """
    if execution.exit_code in (0, None):
        return None
    stderr = io_dir / "stderr.log"
    try:
        # The agent controls these bytes, so decode leniently: a stray
        # non-UTF-8 byte must not turn an error report into a crash.
        text = stderr.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Missing or unreadable log: fall through to the captured logs.
        text = ""
    tail = "\n".join(text.strip().splitlines()[-3:])
    return f"exit {execution.exit_code}: {tail or _logs_tail(execution) or '(no stderr)'}"
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def _logs_tail(execution: Any, lines: int = 3) -> str:
    """Return the last `lines` lines of an execution's captured output.

    Captured stderr is used when it is non-empty, otherwise captured stdout;
    the chunk texts are concatenated and stripped before the tail is taken.
    """
    captured = execution.logs.stderr or execution.logs.stdout or []
    text = "".join(entry.text for entry in captured).strip()
    return "\n".join(text.splitlines()[-lines:])
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def _timeout_or(message: str, exc: Exception, timeout_s: float) -> str:
    """Pick the error text for a failed run.

    execd reports a run timeout as an SDK exception, so when the exception
    text mentions a timeout the uniform "timed out after Ns" message is
    returned; any other exception keeps the caller's `message`.
    """
    text = str(exc).lower()
    if any(marker in text for marker in ("timeout", "timed out")):
        return f"timed out after {timeout_s:.0f}s"
    return message
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def read_trajectory(path: Path) -> list[BaseModel | dict[str, Any]]:
    """Load a completed NDJSON trajectory file as a list of events.

    Each non-blank line is decoded as JSON and passed through `parse_event`
    (custom event types come back as dicts). This reverses what an agent
    writes to `--out`, and serves both to return a fresh run's events and to
    re-read an earlier run for `--resume`.
    """
    stripped = (raw.strip() for raw in path.read_text().splitlines())
    return [parse_event(json.loads(record)) for record in stripped if record]
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def describe_agent(
    manifest: AgentManifest,
    manifest_cwd: Path,
    *,
    timeout_s: float = 120.0,
) -> tuple[AgentDescriptor | None, str | None]:
    """Ask an agent to describe itself with `<command> describe --out <file>`.

    The command runs as a host subprocess (no sandbox) in `manifest_cwd`, with
    the host environment overlaid by the manifest's env, and writes its
    descriptor to a temporary file. Returns `(descriptor, None)` on success or
    `(None, error)` when the process fails, writes nothing, or writes
    something that does not validate as a descriptor.
    """
    import tempfile

    merged_env = {**os.environ, **manifest.env}
    with tempfile.TemporaryDirectory() as scratch:
        descriptor_path = Path(scratch) / "descriptor.json"
        argv = [*manifest.command, "describe", "--out", str(descriptor_path)]

        failure = _run_blocking(argv, manifest_cwd, merged_env, timeout_s)
        if failure is not None:
            return None, failure
        if not descriptor_path.exists():
            return None, "agent exited 0 but wrote no descriptor"

        try:
            payload = json.loads(descriptor_path.read_text())
            descriptor = AgentDescriptor.model_validate(payload)
        except Exception as exc:  # malformed / not a descriptor
            return None, f"unparseable descriptor: {exc}"
        return descriptor, None
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _run_blocking(cmd: list[str], cwd: Path, env: dict[str, str], timeout_s: float) -> str | None:
    """Run `cmd` as a host subprocess and wait for it to finish.

    Returns None when the process exits 0. Otherwise returns a short
    diagnostic: a timeout message, a spawn failure, or the exit code followed
    by the last three lines of stderr.
    """
    try:
        proc = subprocess.run(
            cmd,
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        return f"timed out after {timeout_s:.0f}s"
    except OSError as exc:
        return f"spawn failed: {exc}"

    if proc.returncode == 0:
        return None
    last_lines = proc.stderr.strip().splitlines()[-3:]
    tail = "\n".join(last_lines) or "(no stderr)"
    return f"exit {proc.returncode}: {tail}"
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def _drain(
    out: Path, pos: int, buf: str, on_event: Callable[[BaseModel | dict[str, Any]], None]
) -> tuple[int, str]:
    """Pick up whatever has been appended to `out` since offset `pos`.

    New text is appended to the carried-over `buf`; every complete,
    non-blank line is parsed and handed to `on_event`. Returns the new offset
    and the unterminated remainder to carry into the next call. A missing
    file or a read/decode error leaves `pos` and `buf` as they were.
    """
    if not out.exists():
        return pos, buf
    try:
        with out.open("r") as stream:
            stream.seek(pos)
            buf += stream.read()
            pos = stream.tell()
    except (OSError, UnicodeDecodeError):
        return pos, buf  # transient (mid-write / partial multibyte); retry next tick

    # Everything before the last newline is complete; the final piece (possibly
    # empty) is the partial line that stays buffered.
    *complete, buf = buf.split("\n")
    for raw in complete:
        record = raw.strip()
        if record:
            # A partial line or a live-display glitch must not abort the run;
            # the final full-file parse in run_agent is authoritative anyway.
            with contextlib.suppress(Exception):
                on_event(parse_event(json.loads(record)))
    return pos, buf
|