avp-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avp/__init__.py +31 -0
- avp/commission.py +236 -0
- avp/content.py +273 -0
- avp/data/__init__.py +0 -0
- avp/data/prices.json +21945 -0
- avp/descriptor.py +204 -0
- avp/envelope.py +108 -0
- avp/gen_ai.py +160 -0
- avp/history.py +86 -0
- avp/pricing.py +138 -0
- avp/sink.py +62 -0
- avp/trajectory.py +530 -0
- avp_cli/__init__.py +82 -0
- avp_cli/agent.py +566 -0
- avp_cli/agent_install.py +331 -0
- avp_cli/agent_manifest.py +73 -0
- avp_cli/agents.py +258 -0
- avp_cli/brand.py +46 -0
- avp_cli/broker.py +227 -0
- avp_cli/catalog/__init__.py +128 -0
- avp_cli/catalog/capitals.json +67 -0
- avp_cli/catalog/custom.json +35 -0
- avp_cli/catalog/parsebench.json +44 -0
- avp_cli/cli.py +1858 -0
- avp_cli/commission.py +144 -0
- avp_cli/config.py +250 -0
- avp_cli/console.py +51 -0
- avp_cli/environment.py +218 -0
- avp_cli/eval/__init__.py +0 -0
- avp_cli/eval/dataset.py +37 -0
- avp_cli/eval/engine.py +426 -0
- avp_cli/eval/report.py +178 -0
- avp_cli/eval/scoring.py +260 -0
- avp_cli/eval/setup.py +69 -0
- avp_cli/images.py +119 -0
- avp_cli/library.py +95 -0
- avp_cli/live.py +185 -0
- avp_cli/observability.py +128 -0
- avp_cli/onboarding.py +80 -0
- avp_cli/osb.py +347 -0
- avp_cli/paths.py +47 -0
- avp_cli/run_manifest.py +113 -0
- avp_cli/state.py +195 -0
- avp_cli/vault.py +116 -0
- avp_cli/viz.py +303 -0
- avp_cli-0.1.0.dist-info/METADATA +359 -0
- avp_cli-0.1.0.dist-info/RECORD +49 -0
- avp_cli-0.1.0.dist-info/WHEEL +4 -0
- avp_cli-0.1.0.dist-info/entry_points.txt +2 -0
avp_cli/brand.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""AVP branding for the terminal: the ship logo + the palette.
|
|
2
|
+
|
|
3
|
+
The logo is the same monospace ASCII ship as `assets/logo.svg` (the `<avp>`
|
|
4
|
+
sail on a mast, over a hull). Palette matches the logo and the constellation
|
|
5
|
+
viz: sail gold, mast/light, hull, keel.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from rich.text import Text
|
|
11
|
+
|
|
12
|
+
# Brand palette (hex), shared with the trajectory constellation.
|
|
13
|
+
SAIL = "#f3d28a"
|
|
14
|
+
MAST = "#e6eef3"
|
|
15
|
+
HULL = "#c98b4a"
|
|
16
|
+
KEEL = "#8a5b2c"
|
|
17
|
+
SKY = "#9fd6e7"
|
|
18
|
+
|
|
19
|
+
# A compact sailboat mark for prompts (the brand ship's one-glyph proxy).
|
|
20
|
+
SAILBOAT = "⛵"
|
|
21
|
+
|
|
22
|
+
# (text, color) per line, reconstructed from assets/logo.svg's row layout.
|
|
23
|
+
_SHIP: list[list[tuple[str, str]]] = [
|
|
24
|
+
[(" _", SAIL)],
|
|
25
|
+
[(" |", MAST), ("<avp>", SAIL)],
|
|
26
|
+
[(" |", MAST), (" ‾", SAIL)],
|
|
27
|
+
[(" |", MAST), ("\\", SAIL)],
|
|
28
|
+
[(" |", MAST), ("_\\", SAIL)],
|
|
29
|
+
[(" |", MAST), ("__\\", SAIL)],
|
|
30
|
+
[(" |", MAST), ("___\\", SAIL)],
|
|
31
|
+
[(" |", MAST), ("____\\", SAIL)],
|
|
32
|
+
[(" |", MAST), ("_____\\", SAIL)],
|
|
33
|
+
[("\\______________/", HULL)],
|
|
34
|
+
[(" \\____________/", KEEL)],
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def logo() -> Text:
    """Render the AVP ship as one multi-line, per-span colored rich `Text`.

    Each row of `_SHIP` is a list of `(text, color)` spans; spans are appended
    in order with the color as their style, and rows are separated by a
    newline (no trailing newline after the last row).
    """
    art = Text()
    final_row = len(_SHIP) - 1
    for row_no, segments in enumerate(_SHIP):
        for piece, hue in segments:
            art.append(piece, style=hue)
        # Newline between rows only, never after the last one.
        if row_no != final_row:
            art.append("\n")
    return art
|
avp_cli/broker.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""The vault broker: a host-side credential-injecting reverse proxy.
|
|
2
|
+
|
|
3
|
+
The vault's promise is "the agent can *use* a credential it can never *read*."
|
|
4
|
+
The wire keeps secrets off the Commission (credentials travel as handles); this
|
|
5
|
+
broker keeps the resolved value out of the sandbox entirely.
|
|
6
|
+
|
|
7
|
+
How it fits together. OpenSandbox filters egress by DNS only, so a sandboxed
|
|
8
|
+
agent can reach a host process at `host.docker.internal` once that name is in
|
|
9
|
+
the egress allowlist (verified empirically). avp points the agent's provider
|
|
10
|
+
base_url and MCP urls at this broker and hands the agent only sentinels; the
|
|
11
|
+
broker, running on the host where the real secret lives, overwrites the auth
|
|
12
|
+
header with the real value and forwards over TLS to the real upstream. The
|
|
13
|
+
secret never crosses into the sandbox; only the broker (host) and the upstream
|
|
14
|
+
ever see it.
|
|
15
|
+
|
|
16
|
+
The broker is per-run: started in `run_agent`, its routes built from the
|
|
17
|
+
Commission + resolved vault handles, torn down in the run's `finally`. Secrets
|
|
18
|
+
live only in the in-memory route table; nothing is written to disk.
|
|
19
|
+
|
|
20
|
+
This is also the real egress boundary for secret-bearing traffic: a request
|
|
21
|
+
that matches no route is refused, so the broker can only ever reach the
|
|
22
|
+
commission-declared upstreams (tighter than the DNS allowlist, and
|
|
23
|
+
destination-specific).
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import threading
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
31
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
32
|
+
|
|
33
|
+
import httpx
|
|
34
|
+
|
|
35
|
+
__all__ = ["Broker", "Route"]
|
|
36
|
+
|
|
37
|
+
# The host alias an OpenSandbox bridge container uses to reach the host. avp
|
|
38
|
+
# adds this to the egress allowlist for broker-mode runs. Docker Desktop /
|
|
39
|
+
# OrbStack inject it; on plain Linux Docker the broker preflight maps it to the
|
|
40
|
+
# default-route gateway (the host) in the sandbox's /etc/hosts, so this single
|
|
41
|
+
# address reaches the broker on every host.
|
|
42
|
+
SANDBOX_HOST_ALIAS = "host.docker.internal"
|
|
43
|
+
|
|
44
|
+
# Hop-by-hop / recomputed headers we never forward verbatim: the hop-by-hop
# fields of RFC 9110 section 7.6.1 (including the proxy-* ones, which are
# addressed to this hop and must not travel on to the upstream), plus `host`
# (pinned to the upstream when the forwarded headers are built) and
# `content-length` (the body is re-sent by the HTTP client, which frames it).
_DROP_REQUEST_HEADERS = frozenset(
    {
        "host",
        "content-length",
        "connection",
        "keep-alive",
        "proxy-authorization",
        "proxy-connection",
        "te",
        "trailer",
        "transfer-encoding",
        "upgrade",
    }
)
# Hop-by-hop, plus `content-length` (we re-frame the body as chunked, so the
# upstream length no longer applies). `trailer` is dropped because the re-framed
# body carries no trailers. `content-encoding` is deliberately kept: we forward
# the body raw (`iter_raw`, still gzip/br-encoded if the upstream compressed
# it), so the client needs the header to decode it.
_DROP_RESPONSE_HEADERS = frozenset(
    {
        "connection",
        "content-length",
        "keep-alive",
        "proxy-authenticate",
        "proxy-connection",
        "te",
        "trailer",
        "transfer-encoding",
        "upgrade",
    }
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class Route:
    """One destination the broker injects credentials for.

    `upstream` is where the request is forwarded. For a provider it is the
    origin (`https://api.anthropic.com`) and the agent's SDK supplies the path;
    for an MCP server it is the full real url. `header`/`prefix` name the auth
    header to overwrite (e.g. `authorization` + `Bearer `, or `x-api-key` + ``);
    `secret` is the resolved value, held only here on the host.

    The dataclass-generated `__repr__` would print `secret` verbatim (into
    logs, tracebacks, debugger output), so it is replaced by one that masks
    the value. Equality and hashing still take `secret` into account.
    """

    upstream: str
    header: str
    prefix: str
    secret: str

    def __repr__(self) -> str:
        # Never render the resolved credential; everything else is safe to show.
        return (
            f"{type(self).__name__}(upstream={self.upstream!r}, "
            f"header={self.header!r}, prefix={self.prefix!r}, secret='***')"
        )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Broker:
    """A per-run credential-injecting reverse proxy bound on the host.

    Add routes keyed by the first two path segments (`llm/<id>`, `mcp/<id>`),
    then `start()`. The sandbox reaches it at `base_url()`.

    `stop()` closes the shared HTTP client, which is created only in
    `__init__`, so a stopped broker is not meant to be started again.
    """

    def __init__(self) -> None:
        self._routes: dict[str, Route] = {}
        self._server: ThreadingHTTPServer | None = None
        self._thread: threading.Thread | None = None
        # Long read timeout for streamed responses; redirects are never
        # followed, so requests only go to the URL built from the route.
        self._client = httpx.Client(
            timeout=httpx.Timeout(600.0, connect=15.0), follow_redirects=False
        )

    def add_route(self, key: str, route: Route) -> None:
        """Register a route. `key` is the path prefix after the leading slash,
        e.g. `llm/anthropic` or `mcp/network`."""
        self._routes[key.strip("/")] = route

    @property
    def port(self) -> int:
        """The bound TCP port; raises `RuntimeError` before `start()`."""
        if self._server is None:
            raise RuntimeError("broker not started")
        return self._server.server_address[1]

    def base_url(self) -> str:
        """The URL the sandboxed agent uses to reach the broker."""
        return f"http://{SANDBOX_HOST_ALIAS}:{self.port}"

    def route_url(self, key: str) -> str:
        """The full broker URL for a given route key."""
        return f"{self.base_url()}/{key.strip('/')}"

    def start(self) -> None:
        """Bind an ephemeral port and serve on a daemon thread.

        Calling `start()` on a broker that is already serving is a no-op;
        rebinding would orphan the first server and its thread.
        """
        if self._server is not None:
            return
        # Bind on all interfaces so the bridge can reach us; port 0 = ephemeral.
        handler = _make_handler(self)
        self._server = ThreadingHTTPServer(("0.0.0.0", 0), handler)
        self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Shut the server down, wait for its thread, and close the client.

        Safe to call more than once and before `start()`.
        """
        server, self._server = self._server, None
        thread, self._thread = self._thread, None
        try:
            if server is not None:
                server.shutdown()
                server.server_close()
            if thread is not None:
                # serve_forever has returned once shutdown() does; the bound
                # wait only guards against an unexpected hang.
                thread.join(timeout=5.0)
        finally:
            # Always release the connection pool, even if shutdown raised.
            self._client.close()

    def __enter__(self) -> Broker:
        self.start()
        return self

    def __exit__(self, *exc: object) -> None:
        self.stop()

    # ── request handling ─────────────────────────────────────────────────────

    def _match(self, path: str) -> tuple[Route, str] | None:
        """Resolve a request path to (route, remainder-after-key).

        Returns `None` when the path has fewer than two segments, when no
        route is registered for the first two, or when the remainder contains
        a `..` segment (literal or percent-encoded). Dot segments are
        collapsed during URL normalization, so `<route>/../x` would otherwise
        carry the injected credential to a path outside the route's upstream.
        """
        from urllib.parse import unquote

        parts = path.lstrip("/").split("/", 2)
        if len(parts) < 2:
            return None
        route = self._routes.get(f"{parts[0]}/{parts[1]}")
        if route is None:
            return None
        remainder = f"/{parts[2]}" if len(parts) == 3 else ""
        if ".." in unquote(remainder).split("/"):
            return None
        return route, remainder

    def _target_url(self, route: Route, remainder: str, query: str) -> str:
        """Join the route's upstream with the remainder and the raw query string."""
        base = route.upstream.rstrip("/")
        url = base + remainder if remainder else base
        if query:
            url = f"{url}?{query}"
        return url

    def _forward_headers(self, route: Route, headers: dict[str, str]) -> dict[str, str]:
        """Build the upstream request headers from the sandbox's headers.

        Drops the headers in `_DROP_REQUEST_HEADERS` and whatever the sandbox
        sent under the route's auth header name (compared case-insensitively),
        then sets the real credential and the upstream `Host`.
        """
        auth_name = route.header.lower()
        out: dict[str, str] = {}
        for name, value in headers.items():
            low = name.lower()
            if low in _DROP_REQUEST_HEADERS or low == auth_name:
                continue
            out[name] = value
        # Overwrite (never append) the auth header with the real secret.
        out[route.header] = f"{route.prefix}{route.secret}"
        # Pin Host to the upstream so the upstream's TLS/vhost routing is correct.
        out["Host"] = urlsplit(self._target_url(route, "", "")).netloc
        return out
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _make_handler(broker: Broker) -> type[BaseHTTPRequestHandler]:
    """Build the HTTP/1.1 request-handler class bound to `broker`.

    The handler answers `/health` itself, returns 404 for paths no route
    matches, and otherwise forwards the request through `broker._client` with
    the route's credential injected, streaming the upstream body back raw as
    chunked transfer encoding.
    """

    class Handler(BaseHTTPRequestHandler):
        protocol_version = "HTTP/1.1"

        def log_message(self, *args: object) -> None:  # silence default logging
            pass

        def _health(self) -> bool:
            """Answer `/health` with `200 ok`; return whether it was handled."""
            if self.path.rstrip("/") == "/health":
                self.send_response(200)
                self.send_header("Content-Length", "2")
                self.end_headers()
                self.wfile.write(b"ok")
                return True
            return False

        def _handle(self) -> None:
            if self._health():
                return
            split = urlsplit(self.path)
            matched = broker._match(split.path)
            if matched is None:
                self.send_error(404, "no broker route")
                return
            route, remainder = matched
            target = broker._target_url(route, remainder, split.query)

            # Only Content-Length framed request bodies are read below. A
            # chunked body would be forwarded as empty and its bytes left on
            # the socket to be parsed as the next request, so refuse it
            # explicitly (send_error also closes the connection).
            if "chunked" in (self.headers.get("Transfer-Encoding") or "").lower():
                self.send_error(411, "broker requires Content-Length")
                return
            try:
                length = int(self.headers.get("Content-Length") or 0)
            except ValueError:
                length = -1
            if length < 0:
                self.send_error(400, "invalid Content-Length")
                return
            body = self.rfile.read(length) if length else None
            headers = broker._forward_headers(route, dict(self.headers.items()))

            responding = False  # True once we begin emitting the upstream response
            try:
                with broker._client.stream(
                    self.command, target, headers=headers, content=body
                ) as upstream:
                    status = upstream.status_code
                    # 1xx, 204 and 304 responses carry no body and must not
                    # advertise Transfer-Encoding.
                    bodiless = status < 200 or status in (204, 304)
                    responding = True
                    self.send_response(status)
                    for name, value in upstream.headers.items():
                        if name.lower() in _DROP_RESPONSE_HEADERS:
                            continue
                        self.send_header(name, value)
                    if not bodiless:
                        self.send_header("Transfer-Encoding", "chunked")
                    self.end_headers()
                    if bodiless:
                        return
                    for chunk in upstream.iter_raw():
                        if chunk:
                            self.wfile.write(f"{len(chunk):X}\r\n".encode())
                            self.wfile.write(chunk)
                            self.wfile.write(b"\r\n")
                            self.wfile.flush()
                    self.wfile.write(b"0\r\n\r\n")
            except Exception as exc:  # upstream unreachable / stream error
                if responding:
                    # A status line is already (partly) on the wire; writing a
                    # second response would corrupt the stream. Drop the
                    # connection so the client sees a truncated response.
                    self.close_connection = True
                    return
                # Report only the exception type: the message text may quote
                # request details (e.g. a rejected header value) and must not
                # carry the injected credential back into the sandbox.
                self.send_error(502, f"broker upstream error: {type(exc).__name__}")

        do_GET = _handle
        do_POST = _handle
        do_PUT = _handle
        do_DELETE = _handle
        do_PATCH = _handle

    return Handler
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def origin_of(url: str) -> str:
    """The scheme://host[:port] origin of a url (provider routes forward here;
    the agent SDK supplies the path).

    Path, query and fragment are discarded, and so is any `user:password@`
    userinfo: it is not part of an origin and must not end up in a route's
    upstream URL.
    """
    s = urlsplit(url)
    # netloc is "[userinfo@]host[:port]"; keep only what follows the last "@".
    host_port = s.netloc.rpartition("@")[2]
    return urlunsplit((s.scheme, host_port, "", "", ""))
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""The `avp init` catalog: real, scaffold-able evals (JSON, not code).
|
|
2
|
+
|
|
3
|
+
Each packaged entry is a `{eval, commissions}` document. `avp init <key>`
|
|
4
|
+
**installs** its commissions into the portable library (`~/.avp/commissions/`,
|
|
5
|
+
skipping any id you already have) and writes the eval file *in place* as
|
|
6
|
+
`<key>.eval.json` so you can edit and commit it. The eval references the
|
|
7
|
+
commissions by id. `capitals` is the smallest entry (the default off a
|
|
8
|
+
non-interactive terminal): inline data, runs in seconds for pennies, no extra deps.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from importlib.resources import files
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from avp.commission import Commission
|
|
20
|
+
from avp_cli import library
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class CatalogEntry:
    """One scaffold-able eval offered by the `avp init` catalog."""

    key: str  # identifier matched by `get()`; also the stem of the scaffolded `<key>.eval.json`
    title: str  # short human-readable name (presumably shown in the picker; confirm against the CLI)
    description: str  # one-line summary of the entry
    file: str  # JSON resource name inside the `avp_cli.catalog` package, read by `load()`
    needs: list[str] = field(default_factory=list)  # optional extras the config requires
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class ScaffoldResult:
    """Outcome of `scaffold()`: the eval file written and the commissions touched."""

    eval_path: Path  # path of the eval file that was written
    installed: list[str]  # commission ids written into the library
    skipped: list[str]  # ids already present, left untouched
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Ordered for the picker. Each `file` is a `{eval, commissions}` JSON in this package.
|
|
40
|
+
ENTRIES: list[CatalogEntry] = [
|
|
41
|
+
CatalogEntry(
|
|
42
|
+
key="capitals",
|
|
43
|
+
title="Capitals (structured extraction)",
|
|
44
|
+
description="Tiny structured-extraction sample. Inline data, runs for pennies, no extra deps.",
|
|
45
|
+
file="capitals.json",
|
|
46
|
+
),
|
|
47
|
+
CatalogEntry(
|
|
48
|
+
key="parsebench",
|
|
49
|
+
title="ParseBench (tables)",
|
|
50
|
+
description="PDF pages to HTML, scored on structural fidelity. Real LlamaIndex benchmark.",
|
|
51
|
+
file="parsebench.json",
|
|
52
|
+
needs=["parsebench"],
|
|
53
|
+
),
|
|
54
|
+
CatalogEntry(
|
|
55
|
+
key="custom",
|
|
56
|
+
title="Custom (start from scratch)",
|
|
57
|
+
description="A minimal real eval you fill in with your own task and commissions.",
|
|
58
|
+
file="custom.json",
|
|
59
|
+
),
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get(key: str) -> CatalogEntry | None:
    """Return the first entry in `ENTRIES` whose `key` equals `key`, else `None`."""
    for candidate in ENTRIES:
        if candidate.key == key:
            return candidate
    return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def load(entry: CatalogEntry) -> dict[str, Any]:
    """Parse the entry's packaged `{eval, commissions}` document.

    The resource is read from the `avp_cli.catalog` package and decoded as
    UTF-8 explicitly, so parsing does not depend on the platform's locale
    encoding.
    """
    resource = files("avp_cli.catalog") / entry.file
    return json.loads(resource.read_text(encoding="utf-8"))
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _unique_eval_path(dest_dir: Path, key: str) -> Path:
    """Pick the first free eval filename for `key` inside `dest_dir`.

    Candidates are tried in order: `<key>.eval.json`, then `<key>-2.eval.json`,
    `<key>-3.eval.json`, and so on; the first one that does not exist yet is
    returned. This keeps `avp init` non-destructive: scaffolding a benchmark
    you already have yields a fresh, editable copy rather than clobbering the
    existing file or refusing.
    """

    def _candidates():
        yield dest_dir / f"{key}.eval.json"
        suffix = 2
        while True:
            yield dest_dir / f"{key}-{suffix}.eval.json"
            suffix += 1

    return next(path for path in _candidates() if not path.exists())
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def scaffold(
    entry: CatalogEntry,
    dest_dir: Path,
    agents: list[str] | None = None,
    *,
    commissions_dir: Path | None = None,
) -> ScaffoldResult:
    """Install the entry's commissions into the library and write its eval file.

    The eval file goes to `<dest>/<key>.eval.json` (or a `-2`, `-3`, ... suffix if
    that name is taken) and references the commissions by id. Commissions go to the
    library; an id you already have is left untouched (reported in `skipped`).

    The eval's `commissions` block is taken verbatim from the entry when present
    (so an entry can bind commissions to agents via the `{agent: [ids]}` map);
    otherwise it defaults to a flat list of every installed id. When `agents` is
    given and the entry doesn't already bind agents via a map, they're pinned in
    an `"agents"` key so `avp eval run` targets them without `--agent`.

    Everything that can reject a malformed entry (the required `eval` keys and
    validation of each new commission) runs before the first write, so a bad
    entry does not leave a half-installed set of commissions behind.
    """
    doc = load(entry)
    commissions: dict[str, Any] = doc["commissions"]  # {id: wire Commission}
    eval_spec = doc["eval"]

    # Assemble the eval document up front; a missing "dataset"/"scorer" raises
    # KeyError here, before anything has been saved.
    eval_doc: dict[str, Any] = {"name": eval_spec.get("name", entry.key)}
    binding = eval_spec.get("commissions", list(commissions))
    # A map binding names its agents in the keys; only pin a separate "agents"
    # key for the flat-list form.
    if agents and not isinstance(binding, dict):
        eval_doc["agents"] = agents
    eval_doc["dataset"] = eval_spec["dataset"]
    eval_doc["scorer"] = eval_spec["scorer"]
    eval_doc["commissions"] = binding

    # Ids already in the library are skipped without being validated.
    skipped: list[str] = [
        cid for cid in commissions if library.exists(cid, commissions_dir=commissions_dir)
    ]
    already_present = set(skipped)
    # Validate every new commission before saving the first one.
    pending = {
        cid: Commission.model_validate(spec)
        for cid, spec in commissions.items()
        if cid not in already_present
    }
    installed: list[str] = []
    for cid, commission in pending.items():
        library.save(cid, commission, commissions_dir=commissions_dir)
        installed.append(cid)

    target = _unique_eval_path(dest_dir, entry.key)
    # Explicit UTF-8 so the written file does not depend on the locale encoding.
    target.write_text(json.dumps(eval_doc, indent=2) + "\n", encoding="utf-8")
    return ScaffoldResult(eval_path=target, installed=installed, skipped=skipped)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"eval": {
|
|
3
|
+
"name": "capitals-extraction",
|
|
4
|
+
"dataset": {
|
|
5
|
+
"source": "inline",
|
|
6
|
+
"items": [
|
|
7
|
+
{
|
|
8
|
+
"id": "paris",
|
|
9
|
+
"prompt": "Paris is the capital of France; about 2 million people live there.",
|
|
10
|
+
"expected": {
|
|
11
|
+
"city": "Paris",
|
|
12
|
+
"country": "France",
|
|
13
|
+
"population_millions": 2
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "tokyo",
|
|
18
|
+
"prompt": "Tokyo, the capital of Japan, is home to around 14 million.",
|
|
19
|
+
"expected": {
|
|
20
|
+
"city": "Tokyo",
|
|
21
|
+
"country": "Japan",
|
|
22
|
+
"population_millions": 14
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
},
|
|
27
|
+
"scorer": {
|
|
28
|
+
"name": "structural-match",
|
|
29
|
+
"threshold": 1.0
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"commissions": {
|
|
33
|
+
"capitals-baseline": {
|
|
34
|
+
"schema_version": "0.1",
|
|
35
|
+
"run_id": "capitals-baseline",
|
|
36
|
+
"model": "anthropic/claude-haiku-4-5",
|
|
37
|
+
"prompt": "Extract the structured facts from this sentence: {input}",
|
|
38
|
+
"output_schema": {
|
|
39
|
+
"type": "object",
|
|
40
|
+
"properties": {
|
|
41
|
+
"city": { "type": "string" },
|
|
42
|
+
"country": { "type": "string" },
|
|
43
|
+
"population_millions": { "type": "number" }
|
|
44
|
+
},
|
|
45
|
+
"required": ["city", "country", "population_millions"],
|
|
46
|
+
"additionalProperties": false
|
|
47
|
+
}
|
|
48
|
+
},
|
|
49
|
+
"capitals-few-shot": {
|
|
50
|
+
"schema_version": "0.1",
|
|
51
|
+
"run_id": "capitals-few-shot",
|
|
52
|
+
"model": "anthropic/claude-haiku-4-5",
|
|
53
|
+
"system_prompt": "You extract facts as JSON. Example:\nInput: \"Berlin is the capital of Germany, about 3.7 million people.\"\nOutput: {\"city\": \"Berlin\", \"country\": \"Germany\", \"population_millions\": 3.7}",
|
|
54
|
+
"prompt": "Now do the same for: {input}",
|
|
55
|
+
"output_schema": {
|
|
56
|
+
"type": "object",
|
|
57
|
+
"properties": {
|
|
58
|
+
"city": { "type": "string" },
|
|
59
|
+
"country": { "type": "string" },
|
|
60
|
+
"population_millions": { "type": "number" }
|
|
61
|
+
},
|
|
62
|
+
"required": ["city", "country", "population_millions"],
|
|
63
|
+
"additionalProperties": false
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"eval": {
|
|
3
|
+
"name": "my-eval",
|
|
4
|
+
"dataset": {
|
|
5
|
+
"source": "inline",
|
|
6
|
+
"items": [
|
|
7
|
+
{
|
|
8
|
+
"id": "example-1",
|
|
9
|
+
"prompt": "Replace this with your task input.",
|
|
10
|
+
"expected": {
|
|
11
|
+
"answer": "what a correct result looks like"
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
"scorer": {
|
|
17
|
+
"name": "structural-match",
|
|
18
|
+
"threshold": 1.0
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"commissions": {
|
|
22
|
+
"baseline": {
|
|
23
|
+
"schema_version": "0.1",
|
|
24
|
+
"run_id": "baseline",
|
|
25
|
+
"model": "anthropic/claude-haiku-4-5",
|
|
26
|
+
"prompt": "{input}"
|
|
27
|
+
},
|
|
28
|
+
"variant-a": {
|
|
29
|
+
"schema_version": "0.1",
|
|
30
|
+
"run_id": "variant-a",
|
|
31
|
+
"model": "anthropic/claude-haiku-4-5",
|
|
32
|
+
"prompt": "{input}"
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"eval": {
|
|
3
|
+
"name": "parsebench-table",
|
|
4
|
+
"dataset": {
|
|
5
|
+
"source": "huggingface",
|
|
6
|
+
"id": "llamaindex/ParseBench",
|
|
7
|
+
"split": "table[:2]",
|
|
8
|
+
"input": "https://huggingface.co/datasets/llamaindex/ParseBench/resolve/main/{pdf}",
|
|
9
|
+
"expected_field": "expected_markdown",
|
|
10
|
+
"id_field": "id"
|
|
11
|
+
},
|
|
12
|
+
"scorer": {
|
|
13
|
+
"name": "structural-fidelity",
|
|
14
|
+
"threshold": 0.8
|
|
15
|
+
},
|
|
16
|
+
"commissions": {
|
|
17
|
+
"goose": ["parsebench-goose"],
|
|
18
|
+
"claude-code": ["parsebench-claude"]
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"commissions": {
|
|
22
|
+
"parsebench-goose": {
|
|
23
|
+
"schema_version": "0.1",
|
|
24
|
+
"run_id": "parsebench-goose",
|
|
25
|
+
"model": "anthropic/claude-opus-4-8",
|
|
26
|
+
"prompt": "Reproduce the table from a PDF page as a single HTML <table>.\n1. Download it: curl -sL '{input}' -o page.pdf\n2. Render the page to an image with the pdf-vision get_page_image tool and LOOK at it. The image is the ground truth for the 2D layout: how many columns there are, which cells are merged or span rows, section-header rows, and which values visually share one cell (e.g. multiple holders inside a single cell). Use get_page_text only to copy exact text; trust the image for structure.\n3. Rebuild as ONE HTML <table> that matches what you see: one <tr> per visual row, <th> for header cells, <td> for data, colspan/rowspan for merged cells. Keep column order and exact cell text. Do NOT split a value into extra columns or merge rows that are visually separate.\n4. Verify against the image: the header column count and each row's cell count must match the page. Fix and redo if not. Do not submit a table you can see is wrong.\n5. Output only the final HTML <table>.",
|
|
27
|
+
"enabled_builtin_tools": ["shell", "write", "edit"],
|
|
28
|
+
"mcp_servers": [
|
|
29
|
+
{
|
|
30
|
+
"type": "stdio",
|
|
31
|
+
"id": "pdf-vision",
|
|
32
|
+
"command": ["uvx", "--from", "git+https://github.com/I-CAN-hack/pdf-mcp.git", "pdf-mcp"]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
"parsebench-claude": {
|
|
37
|
+
"schema_version": "0.1",
|
|
38
|
+
"run_id": "parsebench-claude",
|
|
39
|
+
"model": "anthropic/claude-opus-4-8",
|
|
40
|
+
"prompt": "Download the PDF page at {input}, read it, rebuild it as a single HTML <table>, then re-read the original and verify your table matches before returning. Output only the HTML.",
|
|
41
|
+
"enabled_builtin_tools": ["Bash", "Read", "Write", "WebFetch"]
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|