avp-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avp_cli/brand.py ADDED
@@ -0,0 +1,46 @@
1
+ """AVP branding for the terminal: the ship logo + the palette.
2
+
3
+ The logo is the same monospace ASCII ship as `assets/logo.svg` (the `<avp>`
4
+ sail on a mast, over a hull). Palette matches the logo and the constellation
5
+ viz: sail gold, mast/light, hull, keel.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from rich.text import Text
11
+
12
+ # Brand palette (hex), shared with the trajectory constellation.
13
+ SAIL = "#f3d28a"
14
+ MAST = "#e6eef3"
15
+ HULL = "#c98b4a"
16
+ KEEL = "#8a5b2c"
17
+ SKY = "#9fd6e7"
18
+
19
+ # A compact sailboat mark for prompts (the brand ship's one-glyph proxy).
20
+ SAILBOAT = "⛵"
21
+
22
+ # (text, color) per line, reconstructed from assets/logo.svg's row layout.
23
+ _SHIP: list[list[tuple[str, str]]] = [
24
+ [(" _", SAIL)],
25
+ [(" |", MAST), ("<avp>", SAIL)],
26
+ [(" |", MAST), (" ‾", SAIL)],
27
+ [(" |", MAST), ("\\", SAIL)],
28
+ [(" |", MAST), ("_\\", SAIL)],
29
+ [(" |", MAST), ("__\\", SAIL)],
30
+ [(" |", MAST), ("___\\", SAIL)],
31
+ [(" |", MAST), ("____\\", SAIL)],
32
+ [(" |", MAST), ("_____\\", SAIL)],
33
+ [("\\______________/", HULL)],
34
+ [(" \\____________/", KEEL)],
35
+ ]
36
+
37
+
38
def logo() -> Text:
    """Render the AVP ship as one styled rich ``Text`` block.

    Every row of ``_SHIP`` is a list of ``(text, color)`` spans. Each span is
    appended with its color as the style, and consecutive rows are separated
    by a single newline (there is no trailing newline after the last row).
    """
    ship = Text()
    first_row = True
    for row in _SHIP:
        # Emit the separator *before* every row except the first, which
        # yields the same output as "after every row except the last".
        if not first_row:
            ship.append("\n")
        first_row = False
        for fragment, hue in row:
            ship.append(fragment, style=hue)
    return ship
avp_cli/broker.py ADDED
@@ -0,0 +1,227 @@
1
+ """The vault broker: a host-side credential-injecting reverse proxy.
2
+
3
+ The vault's promise is "the agent can *use* a credential it can never *read*."
4
+ The wire keeps secrets off the Commission (credentials travel as handles); this
5
+ broker keeps the resolved value out of the sandbox entirely.
6
+
7
+ How it fits together. OpenSandbox filters egress by DNS only, so a sandboxed
8
+ agent can reach a host process at `host.docker.internal` once that name is in
9
+ the egress allowlist (verified empirically). avp points the agent's provider
10
+ base_url and MCP urls at this broker and hands the agent only sentinels; the
11
+ broker, running on the host where the real secret lives, overwrites the auth
12
+ header with the real value and forwards over TLS to the real upstream. The
13
+ secret never crosses into the sandbox; only the broker (host) and the upstream
14
+ ever see it.
15
+
16
+ The broker is per-run: started in `run_agent`, its routes built from the
17
+ Commission + resolved vault handles, torn down in the run's `finally`. Secrets
18
+ live only in the in-memory route table; nothing is written to disk.
19
+
20
+ This is also the real egress boundary for secret-bearing traffic: a request
21
+ that matches no route is refused, so the broker can only ever reach the
22
+ commission-declared upstreams (tighter than the DNS allowlist, and
23
+ destination-specific).
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import threading
29
+ from dataclasses import dataclass
30
+ from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
31
+ from urllib.parse import urlsplit, urlunsplit
32
+
33
+ import httpx
34
+
35
+ __all__ = ["Broker", "Route"]
36
+
37
+ # The host alias an OpenSandbox bridge container uses to reach the host. avp
38
+ # adds this to the egress allowlist for broker-mode runs. Docker Desktop /
39
+ # OrbStack inject it; on plain Linux Docker the broker preflight maps it to the
40
+ # default-route gateway (the host) in the sandbox's /etc/hosts, so this single
41
+ # address reaches the broker on every host.
42
+ SANDBOX_HOST_ALIAS = "host.docker.internal"
43
+
44
+ # Hop-by-hop / recomputed headers we never forward verbatim.
45
+ _DROP_REQUEST_HEADERS = frozenset(
46
+ {"host", "content-length", "connection", "keep-alive", "transfer-encoding", "te", "upgrade"}
47
+ )
48
+ # Hop-by-hop, plus `content-length` (we re-frame the body as chunked, so the
49
+ # upstream length no longer applies). `content-encoding` is deliberately kept:
50
+ # we forward the body raw (`iter_raw`, still gzip/br-encoded if the upstream
51
+ # compressed it), so the client needs the header to decode it.
52
+ _DROP_RESPONSE_HEADERS = frozenset(
53
+ {"connection", "keep-alive", "transfer-encoding", "te", "upgrade", "content-length"}
54
+ )
55
+
56
+
57
@dataclass(frozen=True)
class Route:
    """One destination the broker injects credentials for.

    `upstream` is where the request is forwarded. For a provider it is the
    origin (`https://api.anthropic.com`) and the agent's SDK supplies the path;
    for an MCP server it is the full real url. `header`/`prefix` name the auth
    header to overwrite (e.g. `authorization` + `Bearer `, or `x-api-key` + ``);
    `secret` is the resolved value, held only here on the host.

    The repr is overridden so that `secret` is never printed: the generated
    dataclass repr would otherwise expose the credential wherever a Route is
    logged or shown in a traceback.
    """

    upstream: str
    header: str
    prefix: str
    secret: str

    def __repr__(self) -> str:
        # Same shape as the generated dataclass repr, with the secret masked.
        return (
            f"{type(self).__name__}(upstream={self.upstream!r}, "
            f"header={self.header!r}, prefix={self.prefix!r}, secret='***')"
        )
72
+
73
+
74
class Broker:
    """A per-run credential-injecting reverse proxy bound on the host.

    Add routes keyed by the first two path segments (`llm/<id>`, `mcp/<id>`),
    then `start()`. The sandbox reaches it at `base_url()`.
    """

    def __init__(self) -> None:
        self._routes: dict[str, Route] = {}
        self._server: ThreadingHTTPServer | None = None
        self._thread: threading.Thread | None = None
        # Redirects are not followed: a redirect would carry the request (and
        # the injected header) to a location the route did not declare.
        self._client = httpx.Client(
            timeout=httpx.Timeout(600.0, connect=15.0), follow_redirects=False
        )

    def add_route(self, key: str, route: Route) -> None:
        """Register a route. `key` is the path prefix after the leading slash,
        e.g. `llm/anthropic` or `mcp/network`."""
        self._routes[key.strip("/")] = route

    @property
    def port(self) -> int:
        """The bound port. Raises RuntimeError if the broker is not started."""
        if self._server is None:
            raise RuntimeError("broker not started")
        return self._server.server_address[1]

    def base_url(self) -> str:
        """The URL the sandboxed agent uses to reach the broker."""
        return f"http://{SANDBOX_HOST_ALIAS}:{self.port}"

    def route_url(self, key: str) -> str:
        """The full broker URL for a given route key."""
        return f"{self.base_url()}/{key.strip('/')}"

    def start(self) -> None:
        """Bind the server and serve on a daemon thread.

        Calling `start()` on a broker that is already running is a no-op;
        binding again would orphan the first server and its thread.
        """
        if self._server is not None:
            return
        # Bind on all interfaces so the bridge can reach us; port 0 = ephemeral.
        handler = _make_handler(self)
        self._server = ThreadingHTTPServer(("0.0.0.0", 0), handler)
        self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Shut the server down, wait for the serving thread, close the client."""
        server, thread = self._server, self._thread
        self._server = None
        self._thread = None
        if server is not None:
            server.shutdown()
            server.server_close()
        if thread is not None:
            # `shutdown()` has already ended `serve_forever`; the timeout only
            # guards against an unexpected hang.
            thread.join(timeout=5.0)
        self._client.close()

    def __enter__(self) -> Broker:
        self.start()
        return self

    def __exit__(self, *exc: object) -> None:
        self.stop()

    # ── request handling ─────────────────────────────────────────────────────

    def _match(self, path: str) -> tuple[Route, str] | None:
        """Resolve a request path to (route, remainder-after-key).

        Returns None when the path has fewer than two segments, when the first
        two segments name no registered route, or when the remainder contains
        a `..` segment (literal or percent-encoded). The remainder comes from
        the sandbox and is appended to the route's upstream; `..` segments are
        commonly resolved by HTTP clients and servers, which would let a
        request climb out of a path-scoped upstream while still carrying the
        injected credential.
        """
        from urllib.parse import unquote

        parts = path.lstrip("/").split("/", 2)
        if len(parts) < 2:
            return None
        key = f"{parts[0]}/{parts[1]}"
        route = self._routes.get(key)
        if route is None:
            return None
        if len(parts) < 3:
            return route, ""
        if any(unquote(segment) == ".." for segment in parts[2].split("/")):
            return None
        return route, f"/{parts[2]}"

    def _target_url(self, route: Route, remainder: str, query: str) -> str:
        """Join the route's upstream, the path remainder and the query string."""
        base = route.upstream.rstrip("/")
        url = base + remainder if remainder else base
        if query:
            url = f"{url}?{query}"
        return url

    def _forward_headers(self, route: Route, headers: dict[str, str]) -> dict[str, str]:
        """Build the upstream request headers from the client's headers.

        Headers listed in `_DROP_REQUEST_HEADERS` and whatever the client sent
        under the route's auth header name (compared case-insensitively) are
        discarded; the auth header and `Host` are then set by the broker.
        """
        out: dict[str, str] = {}
        for name, value in headers.items():
            low = name.lower()
            if low in _DROP_REQUEST_HEADERS or low == route.header.lower():
                continue
            out[name] = value
        # Overwrite (never append) the auth header with the real secret.
        out[route.header] = f"{route.prefix}{route.secret}"
        # Pin Host to the upstream so the upstream's TLS/vhost routing is correct.
        out["Host"] = urlsplit(self._target_url(route, "", "")).netloc
        return out
162
+
163
+
164
def _make_handler(broker: Broker) -> type[BaseHTTPRequestHandler]:
    """Build the request-handler class bound to *broker*.

    The returned class answers `/health` itself and forwards every other
    GET/POST/PUT/DELETE/PATCH request through `broker`: the path is resolved
    with `broker._match`, the headers are rewritten with
    `broker._forward_headers`, and the upstream response is streamed back raw,
    re-framed as chunked.
    """

    class Handler(BaseHTTPRequestHandler):
        protocol_version = "HTTP/1.1"

        def log_message(self, *args: object) -> None:  # silence default logging
            pass

        def _health(self) -> bool:
            """Answer a `/health` probe with `200 ok`; return True if handled."""
            if self.path.rstrip("/") != "/health":
                return False
            self.send_response(200)
            self.send_header("Content-Length", "2")
            self.end_headers()
            self.wfile.write(b"ok")
            return True

        def _handle(self) -> None:
            """Forward one request to its route's upstream and relay the reply."""
            if self._health():
                return
            split = urlsplit(self.path)
            matched = broker._match(split.path)
            if matched is None:
                self.send_error(404, "no broker route")
                return
            route, remainder = matched
            target = broker._target_url(route, remainder, split.query)
            # A malformed length must be a client error, not an unhandled
            # exception; a negative one would turn `read()` into read-to-EOF.
            try:
                length = int(self.headers.get("Content-Length") or 0)
            except ValueError:
                length = -1
            if length < 0:
                self.send_error(400, "invalid Content-Length")
                return
            body = self.rfile.read(length) if length else None
            headers = broker._forward_headers(route, dict(self.headers.items()))
            head_sent = False
            try:
                with broker._client.stream(
                    self.command, target, headers=headers, content=body
                ) as upstream:
                    status = upstream.status_code
                    # 1xx, 204 and 304 responses never carry a body, so they
                    # must not be framed as chunked or followed by a terminator.
                    bodiless = status < 200 or status in (204, 304)
                    self.send_response(status)
                    for name, value in upstream.headers.items():
                        if name.lower() in _DROP_RESPONSE_HEADERS:
                            continue
                        self.send_header(name, value)
                    if not bodiless:
                        self.send_header("Transfer-Encoding", "chunked")
                    self.end_headers()
                    head_sent = True
                    if bodiless:
                        return
                    for chunk in upstream.iter_raw():
                        if chunk:
                            self.wfile.write(f"{len(chunk):X}\r\n".encode())
                            self.wfile.write(chunk)
                            self.wfile.write(b"\r\n")
                            self.wfile.flush()
                    self.wfile.write(b"0\r\n\r\n")
            except Exception as exc:  # upstream unreachable / stream error
                if head_sent:
                    # The status line is already on the wire: a 502 written now
                    # would land inside the chunked body. Drop the connection
                    # so the client sees a truncated response instead.
                    self.close_connection = True
                    return
                self.send_error(502, f"broker upstream error: {exc}")

        do_GET = _handle
        do_POST = _handle
        do_PUT = _handle
        do_DELETE = _handle
        do_PATCH = _handle

    return Handler
221
+
222
+
223
def origin_of(url: str) -> str:
    """Reduce *url* to its `scheme://host[:port]` origin.

    Provider routes forward to this origin; the agent SDK supplies the path.
    Path, query and fragment are blanked out and the rest is re-assembled.
    """
    pieces = urlsplit(url)
    return urlunsplit(pieces._replace(path="", query="", fragment=""))
@@ -0,0 +1,128 @@
1
+ """The `avp init` catalog: real, scaffold-able evals (JSON, not code).
2
+
3
+ Each packaged entry is a `{eval, commissions}` document. `avp init <key>`
4
+ **installs** its commissions into the portable library (`~/.avp/commissions/`,
5
+ skipping any id you already have) and writes the eval file *in place* as
6
+ `<key>.eval.json` so you can edit and commit it. The eval references the
7
+ commissions by id. `capitals` is the smallest entry (the default off a
8
+ non-interactive terminal): inline data, runs in seconds for pennies, no extra deps.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from dataclasses import dataclass, field
15
+ from importlib.resources import files
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from avp.commission import Commission
20
+ from avp_cli import library
21
+
22
+
23
@dataclass(frozen=True)
class CatalogEntry:
    """One scaffold-able eval offered by the `avp init` picker."""

    # Identifier passed to `avp init <key>`; also the stem of the eval file.
    key: str
    # Short label and one-line blurb describing the entry.
    title: str
    description: str
    # Name of the packaged `{eval, commissions}` JSON document.
    file: str
    # Optional extras the config requires.
    needs: list[str] = field(default_factory=list)
30
+
31
+
32
@dataclass(frozen=True)
class ScaffoldResult:
    """What a `scaffold` call produced."""

    # Where the eval file was written.
    eval_path: Path
    # Commission ids written into the library.
    installed: list[str]
    # Ids already present, left untouched.
    skipped: list[str]
37
+
38
+
39
+ # Ordered for the picker. Each `file` is a `{eval, commissions}` JSON in this package.
40
+ ENTRIES: list[CatalogEntry] = [
41
+ CatalogEntry(
42
+ key="capitals",
43
+ title="Capitals (structured extraction)",
44
+ description="Tiny structured-extraction sample. Inline data, runs for pennies, no extra deps.",
45
+ file="capitals.json",
46
+ ),
47
+ CatalogEntry(
48
+ key="parsebench",
49
+ title="ParseBench (tables)",
50
+ description="PDF pages to HTML, scored on structural fidelity. Real LlamaIndex benchmark.",
51
+ file="parsebench.json",
52
+ needs=["parsebench"],
53
+ ),
54
+ CatalogEntry(
55
+ key="custom",
56
+ title="Custom (start from scratch)",
57
+ description="A minimal real eval you fill in with your own task and commissions.",
58
+ file="custom.json",
59
+ ),
60
+ ]
61
+
62
+
63
def get(key: str) -> CatalogEntry | None:
    """Return the first entry in `ENTRIES` whose `key` matches, else None."""
    for candidate in ENTRIES:
        if candidate.key == key:
            return candidate
    return None
65
+
66
+
67
def load(entry: CatalogEntry) -> dict[str, Any]:
    """Parse the entry's packaged `{eval, commissions}` document.

    The resource is looked up as `entry.file` inside the `avp_cli.catalog`
    package and decoded explicitly as UTF-8, so the result does not depend on
    the platform's default text encoding.
    """
    resource = files("avp_cli.catalog") / entry.file
    return json.loads(resource.read_text(encoding="utf-8"))
70
+
71
+
72
def _unique_eval_path(dest_dir: Path, key: str) -> Path:
    """Pick the first free eval filename for `key` inside `dest_dir`.

    Tries `<key>.eval.json`, then `<key>-2.eval.json`, `<key>-3.eval.json`,
    and so on, returning the first path that does not exist yet.

    `avp init` is non-destructive: scaffolding a benchmark you already have
    writes a fresh, editable copy rather than clobbering or refusing.
    """
    candidate = dest_dir / f"{key}.eval.json"
    suffix = 2
    while True:
        if not candidate.exists():
            return candidate
        candidate = dest_dir / f"{key}-{suffix}.eval.json"
        suffix += 1
84
+
85
+
86
def scaffold(
    entry: CatalogEntry,
    dest_dir: Path,
    agents: list[str] | None = None,
    *,
    commissions_dir: Path | None = None,
) -> ScaffoldResult:
    """Install the entry's commissions into the library and write its eval file.

    Commissions are saved to the library one by one; an id that is already
    there is left untouched and reported in `skipped`. The eval file is then
    written to `<dest>/<key>.eval.json` (or a `-2`, `-3`, ... suffix if that
    name is taken) and references the commissions by id.

    The eval's `commissions` block is copied verbatim from the entry when it
    has one (so an entry can bind commissions to agents via the
    `{agent: [ids]}` map); otherwise it is a flat list of every commission id
    in the entry. When `agents` is given and the binding is not such a map,
    they are pinned in an `"agents"` key so `avp eval run` targets them
    without `--agent`.
    """
    document = load(entry)
    specs: dict[str, Any] = document["commissions"]  # {id: wire Commission}

    installed: list[str] = []
    skipped: list[str] = []
    for commission_id, wire in specs.items():
        if not library.exists(commission_id, commissions_dir=commissions_dir):
            library.save(
                commission_id,
                Commission.model_validate(wire),
                commissions_dir=commissions_dir,
            )
            installed.append(commission_id)
            continue
        skipped.append(commission_id)

    eval_path = _unique_eval_path(dest_dir, entry.key)
    spec = document["eval"]
    name = spec.get("name", entry.key)
    binding = spec.get("commissions", list(specs))
    # A map binding names its agents in the keys; only pin a separate "agents"
    # key for the flat-list form.
    pinned: dict[str, Any] = {}
    if agents and not isinstance(binding, dict):
        pinned["agents"] = agents
    # Key order is the order the keys appear in the written file.
    eval_doc: dict[str, Any] = {
        "name": name,
        **pinned,
        "dataset": spec["dataset"],
        "scorer": spec["scorer"],
        "commissions": binding,
    }
    eval_path.write_text(json.dumps(eval_doc, indent=2) + "\n")
    return ScaffoldResult(eval_path=eval_path, installed=installed, skipped=skipped)
@@ -0,0 +1,67 @@
1
+ {
2
+ "eval": {
3
+ "name": "capitals-extraction",
4
+ "dataset": {
5
+ "source": "inline",
6
+ "items": [
7
+ {
8
+ "id": "paris",
9
+ "prompt": "Paris is the capital of France; about 2 million people live there.",
10
+ "expected": {
11
+ "city": "Paris",
12
+ "country": "France",
13
+ "population_millions": 2
14
+ }
15
+ },
16
+ {
17
+ "id": "tokyo",
18
+ "prompt": "Tokyo, the capital of Japan, is home to around 14 million.",
19
+ "expected": {
20
+ "city": "Tokyo",
21
+ "country": "Japan",
22
+ "population_millions": 14
23
+ }
24
+ }
25
+ ]
26
+ },
27
+ "scorer": {
28
+ "name": "structural-match",
29
+ "threshold": 1.0
30
+ }
31
+ },
32
+ "commissions": {
33
+ "capitals-baseline": {
34
+ "schema_version": "0.1",
35
+ "run_id": "capitals-baseline",
36
+ "model": "anthropic/claude-haiku-4-5",
37
+ "prompt": "Extract the structured facts from this sentence: {input}",
38
+ "output_schema": {
39
+ "type": "object",
40
+ "properties": {
41
+ "city": { "type": "string" },
42
+ "country": { "type": "string" },
43
+ "population_millions": { "type": "number" }
44
+ },
45
+ "required": ["city", "country", "population_millions"],
46
+ "additionalProperties": false
47
+ }
48
+ },
49
+ "capitals-few-shot": {
50
+ "schema_version": "0.1",
51
+ "run_id": "capitals-few-shot",
52
+ "model": "anthropic/claude-haiku-4-5",
53
+ "system_prompt": "You extract facts as JSON. Example:\nInput: \"Berlin is the capital of Germany, about 3.7 million people.\"\nOutput: {\"city\": \"Berlin\", \"country\": \"Germany\", \"population_millions\": 3.7}",
54
+ "prompt": "Now do the same for: {input}",
55
+ "output_schema": {
56
+ "type": "object",
57
+ "properties": {
58
+ "city": { "type": "string" },
59
+ "country": { "type": "string" },
60
+ "population_millions": { "type": "number" }
61
+ },
62
+ "required": ["city", "country", "population_millions"],
63
+ "additionalProperties": false
64
+ }
65
+ }
66
+ }
67
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "eval": {
3
+ "name": "my-eval",
4
+ "dataset": {
5
+ "source": "inline",
6
+ "items": [
7
+ {
8
+ "id": "example-1",
9
+ "prompt": "Replace this with your task input.",
10
+ "expected": {
11
+ "answer": "what a correct result looks like"
12
+ }
13
+ }
14
+ ]
15
+ },
16
+ "scorer": {
17
+ "name": "structural-match",
18
+ "threshold": 1.0
19
+ }
20
+ },
21
+ "commissions": {
22
+ "baseline": {
23
+ "schema_version": "0.1",
24
+ "run_id": "baseline",
25
+ "model": "anthropic/claude-haiku-4-5",
26
+ "prompt": "{input}"
27
+ },
28
+ "variant-a": {
29
+ "schema_version": "0.1",
30
+ "run_id": "variant-a",
31
+ "model": "anthropic/claude-haiku-4-5",
32
+ "prompt": "{input}"
33
+ }
34
+ }
35
+ }
@@ -0,0 +1,44 @@
1
+ {
2
+ "eval": {
3
+ "name": "parsebench-table",
4
+ "dataset": {
5
+ "source": "huggingface",
6
+ "id": "llamaindex/ParseBench",
7
+ "split": "table[:2]",
8
+ "input": "https://huggingface.co/datasets/llamaindex/ParseBench/resolve/main/{pdf}",
9
+ "expected_field": "expected_markdown",
10
+ "id_field": "id"
11
+ },
12
+ "scorer": {
13
+ "name": "structural-fidelity",
14
+ "threshold": 0.8
15
+ },
16
+ "commissions": {
17
+ "goose": ["parsebench-goose"],
18
+ "claude-code": ["parsebench-claude"]
19
+ }
20
+ },
21
+ "commissions": {
22
+ "parsebench-goose": {
23
+ "schema_version": "0.1",
24
+ "run_id": "parsebench-goose",
25
+ "model": "anthropic/claude-opus-4-8",
26
+ "prompt": "Reproduce the table from a PDF page as a single HTML <table>.\n1. Download it: curl -sL '{input}' -o page.pdf\n2. Render the page to an image with the pdf-vision get_page_image tool and LOOK at it. The image is the ground truth for the 2D layout: how many columns there are, which cells are merged or span rows, section-header rows, and which values visually share one cell (e.g. multiple holders inside a single cell). Use get_page_text only to copy exact text; trust the image for structure.\n3. Rebuild as ONE HTML <table> that matches what you see: one <tr> per visual row, <th> for header cells, <td> for data, colspan/rowspan for merged cells. Keep column order and exact cell text. Do NOT split a value into extra columns or merge rows that are visually separate.\n4. Verify against the image: the header column count and each row's cell count must match the page. Fix and redo if not. Do not submit a table you can see is wrong.\n5. Output only the final HTML <table>.",
27
+ "enabled_builtin_tools": ["shell", "write", "edit"],
28
+ "mcp_servers": [
29
+ {
30
+ "type": "stdio",
31
+ "id": "pdf-vision",
32
+ "command": ["uvx", "--from", "git+https://github.com/I-CAN-hack/pdf-mcp.git", "pdf-mcp"]
33
+ }
34
+ ]
35
+ },
36
+ "parsebench-claude": {
37
+ "schema_version": "0.1",
38
+ "run_id": "parsebench-claude",
39
+ "model": "anthropic/claude-opus-4-8",
40
+ "prompt": "Download the PDF page at {input}, read it, rebuild it as a single HTML <table>, then re-read the original and verify your table matches before returning. Output only the HTML.",
41
+ "enabled_builtin_tools": ["Bash", "Read", "Write", "WebFetch"]
42
+ }
43
+ }
44
+ }