@tangle-network/agent-eval 0.70.0 → 0.72.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/http.js.map +1 -1
- package/dist/campaign/index.d.ts +10 -0
- package/dist/campaign/index.js +48 -11
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-ZZCQQHW7.js → chunk-4QJN7RDX.js} +4 -4
- package/dist/chunk-4QJN7RDX.js.map +1 -0
- package/dist/{chunk-3B7Y5AUR.js → chunk-GWGO2K6Y.js} +3 -2
- package/dist/chunk-GWGO2K6Y.js.map +1 -0
- package/dist/{chunk-Z4ZCBC7M.js → chunk-ODGETRTM.js} +4 -3
- package/dist/chunk-ODGETRTM.js.map +1 -0
- package/dist/chunk-SL55X4VN.js +186 -0
- package/dist/chunk-SL55X4VN.js.map +1 -0
- package/dist/{chunk-GYELOWB6.js → chunk-UD6EF73X.js} +3 -3
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.js +3 -3
- package/dist/index.js +31 -171
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/rl.d.ts +155 -1
- package/dist/rl.js +195 -6
- package/dist/rl.js.map +1 -1
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/traces.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-3B7Y5AUR.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- package/dist/chunk-Z4ZCBC7M.js.map +0 -1
- package/dist/chunk-ZZCQQHW7.js.map +0 -1
- /package/dist/{chunk-GYELOWB6.js.map → chunk-UD6EF73X.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,30 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
|
|
|
4
4
|
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
+
## [0.72.0] — 2026-05-31 — cost axis prices unpriced-at-source models (every run carries a real, labeled cost)
|
|
8
|
+
|
|
9
|
+
A live tax-agent full-loop run (real sandbox, `deepseek-v4-pro`, real tokens) exposed the second root of the cost-ledger split: the sandbox reported `totalCostUsd: 0` despite `17537` input / `622` output tokens — not a stub, not a mis-wired ledger, but a model the **source** can't rate. The cost / Pareto / `tokens_per_dollar` axes blanked even though the substrate's pricing table prices `deepseek` correctly; the table was simply never consulted on the matrix cost projection. A $0 cost on a run that burned real tokens reads as "free," which is the more misleading state.
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- **`runProfileMatrix` prices measured tokens when the source reports $0.** Cost precedence is now explicit: **source-billed > token-estimated > none**. When `cell.costUsd === 0` and real output tokens flowed and the model is priced (`isModelPriced`), `buildRunRecord` sets the cost from `estimateCost(in, out, model)` (real published rate × real tokens) and stamps `raw.cost_estimated = 1`. A billed cost is never overridden; a model the table also can't rate stays $0 (no fabrication). The estimate flows into `record.costUsd`, so `byProfile.totalCostUsd`, `integrity.totalCostUsd`, and `tokens_per_dollar` / `cost_per_quality` all populate.
|
|
14
|
+
- **Every cost surface in the matrix result agrees.** The embedded `campaigns[id].aggregates.totalCostUsd` is reconciled to the priced total instead of `runCampaign`'s raw `ctx.cost` ledger (which only sees the source's $0). A single result no longer carries two `totalCostUsd` values that disagree.
|
|
15
|
+
- **Honest integrity diagnosis.** `summarizeBackendIntegrity`'s uncosted-records message now names **both** roots — mis-wired ledger OR unpriced-at-source model — and points at `estimateCost` for the latter, instead of asserting the ledger is broken.
|
|
16
|
+
|
|
17
|
+
Live proof: the same tax case that recorded `$0` now records **`$0.0059453`** (`17537 × 0.0003/1k + 622 × 0.0011/1k`, exact), `cost_estimated: 1`, `uncostedRecords: 0`, verdict `real`. The fix generalizes to every consumer of `runProfileMatrix`. New regression tests: priced-when-source-zero, billed-takes-precedence, truly-unpriced-stays-$0, campaign-aggregate-reconciled. Full suite (1663) green.
|
|
18
|
+
|
|
19
|
+
## [0.71.0] — 2026-05-31 — corpus-by-default + multi-dimensional capture (datasets as eval exhaust)
|
|
20
|
+
|
|
21
|
+
Every matrix run now emits a multi-dimensional, dataset-able record with no side-channel — the groundwork for "datasets gathered for free by running evals."
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
|
|
25
|
+
- **Multi-dim guardrail projection in `buildRunRecord`.** Each `RunRecord.outcome.raw` carries `cost_usd`, `tokens_input` / `tokens_output` (+ `tokens_cached` when present), `latency_ms`, and the guarded ratios `tokens_per_dollar` / `cost_per_quality`. RAW-ONLY — the composite stays the judge objective (anti-Goodhart); these are tracked + dashboarded + carried into datasets, never optimized.
|
|
26
|
+
- **Corpus-by-default via `corpusText`.** An optional `corpusText(artifact, scenario) => {prompt, completion}` stamps the trajectory text onto each record (the `CorpusRecord` shape), so a run is dataset-able with no side-channel. Fail-soft: a throwing extractor omits the text and keeps the graded record.
|
|
27
|
+
- **`appendToCorpus` / `readCorpus` / `buildDatasetFromCorpus`** (`src/rl/corpus.ts`) — append-only JSONL corpus (deduped by `runId`), with score/split filtering into a train/holdout dataset.
|
|
28
|
+
|
|
29
|
+
`buildRunRecord` is generic over `<TScenario, TArtifact>`; a `scenarioById` map threads each scenario into the projection.
|
|
30
|
+
|
|
7
31
|
## [0.70.0] — 2026-05-31 — error-grounded reflection (the driver targets real failures, not blind rewrites)
|
|
8
32
|
|
|
9
33
|
Adversarial verification on TWO domains (legal + tax, two worker models) found the same root cause: the gepaDriver's candidates **regressed** the baseline, so the gate correctly held — but nothing improved. The driver was reflecting on per-scenario *scores* only; the judge's `notes` (the "why it failed") were computed but **dropped** before the reflection. So it proposed generic rewrites a capable model already knows, which distract rather than help.
|
package/dist/adapters/http.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url =\n opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue\n ? {\n Authorization: authValue.startsWith('Bearer ')\n ? authValue\n : `Bearer ${authValue}`,\n }\n : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function')\n (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (\n req: HttpDispatchRequestBody<TScenario>,\n signal: AbortSignal,\n ) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\n \"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\",\n )\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth =\n typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers['authorization']\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(\n Buffer.concat(chunks).toString('utf8'),\n ) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MACJ,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACzF,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YACA;AAAA,cACE,eAAe,UAAU,WAAW,SAAS,IACzC,YACA,UAAU,SAAS;AAAA,YACzB,IACA,CAAC;AAAA,YACL,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;A
AEA,gBAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU;AACjD,MAAC,EAA4B,MAAM;AAAA,EACvC,CAAC;AACH;AAgEA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eACJ,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKpF,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ,eAAe;AACvC,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK;AAAA,QAChB,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM;AAAA,MACvC;AACA,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAA
a;AACjB,YAAI,IAAI,gBAAgB;AACxB;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url =\n opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue\n ? {\n Authorization: authValue.startsWith('Bearer ')\n ? authValue\n : `Bearer ${authValue}`,\n }\n : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function')\n (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (\n req: HttpDispatchRequestBody<TScenario>,\n signal: AbortSignal,\n ) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\n \"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\",\n )\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth =\n typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers.authorization\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(\n Buffer.concat(chunks).toString('utf8'),\n ) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MACJ,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACzF,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YACA;AAAA,cACE,eAAe,UAAU,WAAW,SAAS,IACzC,YACA,UAAU,SAAS;AAAA,YACzB,IACA,CAAC;AAAA,YACL,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;A
AEA,gBAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU;AACjD,MAAC,EAA4B,MAAM;AAAA,EACvC,CAAC;AACH;AAgEA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eACJ,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKpF,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ;AACxB,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK;AAAA,QAChB,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM;AAAA,MACvC;AACA,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAAa;AAC
jB,YAAI,IAAI,gBAAgB;AACxB;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -538,6 +538,16 @@ interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {
|
|
|
538
538
|
* Default true — catches bad model snapshots and non-finite judge dims at
|
|
539
539
|
* the boundary instead of letting them poison downstream analysis. */
|
|
540
540
|
validate?: boolean;
|
|
541
|
+
/** Corpus-by-default: derive the trajectory text (`prompt` + `completion`)
|
|
542
|
+
* for each cell from its artifact + scenario. When set, every produced
|
|
543
|
+
* record carries `prompt`/`completion` (a `CorpusRecord`) so the run's
|
|
544
|
+
* graded trajectories can be appended to the durable RL corpus with no
|
|
545
|
+
* side-channel — `appendToCorpus(result.records, path)`. Fail-soft: a
|
|
546
|
+
* throwing or undefined-returning extractor just omits the text. */
|
|
547
|
+
corpusText?: (artifact: TArtifact, scenario: TScenario) => {
|
|
548
|
+
prompt: string;
|
|
549
|
+
completion: string;
|
|
550
|
+
} | undefined;
|
|
541
551
|
}
|
|
542
552
|
interface ProfileSummary {
|
|
543
553
|
profileId: string;
|
package/dist/campaign/index.js
CHANGED
|
@@ -7,10 +7,12 @@ import {
|
|
|
7
7
|
heldoutSignificance,
|
|
8
8
|
pairHoldout,
|
|
9
9
|
runEval
|
|
10
|
-
} from "../chunk-
|
|
10
|
+
} from "../chunk-UD6EF73X.js";
|
|
11
11
|
import {
|
|
12
|
-
agentProfileHash
|
|
13
|
-
|
|
12
|
+
agentProfileHash,
|
|
13
|
+
estimateCost,
|
|
14
|
+
isModelPriced
|
|
15
|
+
} from "../chunk-SL55X4VN.js";
|
|
14
16
|
import {
|
|
15
17
|
buildLoopProvenanceRecord,
|
|
16
18
|
campaignBreakdown,
|
|
@@ -31,14 +33,14 @@ import {
|
|
|
31
33
|
runOptimization,
|
|
32
34
|
surfaceContentHash,
|
|
33
35
|
surfaceHash
|
|
34
|
-
} from "../chunk-
|
|
36
|
+
} from "../chunk-4QJN7RDX.js";
|
|
35
37
|
import {
|
|
36
38
|
assertRealBackend,
|
|
37
39
|
fsCampaignStorage,
|
|
38
40
|
inMemoryCampaignStorage,
|
|
39
41
|
runCampaign,
|
|
40
42
|
summarizeBackendIntegrity
|
|
41
|
-
} from "../chunk-
|
|
43
|
+
} from "../chunk-ZPSKPT3V.js";
|
|
42
44
|
import "../chunk-YV7J7X5N.js";
|
|
43
45
|
import {
|
|
44
46
|
validateRunRecord
|
|
@@ -866,12 +868,29 @@ function buildRunRecord(args) {
|
|
|
866
868
|
perJudge[judgeName] = { ...js.dimensions };
|
|
867
869
|
for (const [dim, value] of Object.entries(js.dimensions)) {
|
|
868
870
|
raw[`${judgeName}.${dim}`] = value;
|
|
869
|
-
|
|
871
|
+
dimAccum[dim] ??= [];
|
|
872
|
+
dimAccum[dim].push(value);
|
|
870
873
|
}
|
|
871
874
|
if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
|
|
872
875
|
}
|
|
873
876
|
const perDimMean = {};
|
|
874
877
|
for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
|
|
878
|
+
let costUsd = cell.costUsd;
|
|
879
|
+
let costEstimated = false;
|
|
880
|
+
if (costUsd === 0 && cell.tokenUsage.output > 0 && isModelPriced(profile.model)) {
|
|
881
|
+
costUsd = estimateCost(cell.tokenUsage.input, cell.tokenUsage.output, profile.model);
|
|
882
|
+
costEstimated = costUsd > 0;
|
|
883
|
+
}
|
|
884
|
+
raw.cost_usd = costUsd;
|
|
885
|
+
raw.cost_estimated = costEstimated ? 1 : 0;
|
|
886
|
+
raw.tokens_input = cell.tokenUsage.input;
|
|
887
|
+
raw.tokens_output = cell.tokenUsage.output;
|
|
888
|
+
if (typeof cell.tokenUsage.cached === "number") raw.tokens_cached = cell.tokenUsage.cached;
|
|
889
|
+
raw.latency_ms = cell.durationMs;
|
|
890
|
+
if (costUsd > 0) {
|
|
891
|
+
raw.tokens_per_dollar = (cell.tokenUsage.input + cell.tokenUsage.output) / costUsd;
|
|
892
|
+
}
|
|
893
|
+
if (composite > 0.01) raw.cost_per_quality = costUsd / composite;
|
|
875
894
|
const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
|
|
876
895
|
if (Object.keys(perJudge).length > 0) {
|
|
877
896
|
outcome.judgeScores = {
|
|
@@ -881,7 +900,7 @@ function buildRunRecord(args) {
|
|
|
881
900
|
...notes.length > 0 ? { notes: notes.join(" | ") } : {}
|
|
882
901
|
};
|
|
883
902
|
}
|
|
884
|
-
|
|
903
|
+
const record = {
|
|
885
904
|
runId: `${matrixId}:${profile.id}:${cell.cellId}`,
|
|
886
905
|
experimentId,
|
|
887
906
|
candidateId: profile.id,
|
|
@@ -891,13 +910,24 @@ function buildRunRecord(args) {
|
|
|
891
910
|
configHash,
|
|
892
911
|
commitSha,
|
|
893
912
|
wallMs: cell.durationMs,
|
|
894
|
-
costUsd
|
|
913
|
+
costUsd,
|
|
895
914
|
tokenUsage: cell.tokenUsage,
|
|
896
915
|
outcome,
|
|
897
916
|
splitTag,
|
|
898
917
|
scenarioId: cell.scenarioId,
|
|
899
918
|
...cell.error ? { failureMode: cell.error } : {}
|
|
900
919
|
};
|
|
920
|
+
if (args.corpusText && args.scenario) {
|
|
921
|
+
try {
|
|
922
|
+
const text = args.corpusText(cell.artifact, args.scenario);
|
|
923
|
+
if (text && typeof text.prompt === "string" && typeof text.completion === "string") {
|
|
924
|
+
record.prompt = text.prompt;
|
|
925
|
+
record.completion = text.completion;
|
|
926
|
+
}
|
|
927
|
+
} catch {
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
return record;
|
|
901
931
|
}
|
|
902
932
|
async function runProfileMatrix(opts) {
|
|
903
933
|
if (opts.profiles.length === 0) throw new ProfileMatrixError("profiles must not be empty");
|
|
@@ -909,6 +939,7 @@ async function runProfileMatrix(opts) {
|
|
|
909
939
|
const profileIds = opts.profiles.map((p) => p.id);
|
|
910
940
|
const experimentId = opts.experimentId ?? `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`;
|
|
911
941
|
const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`;
|
|
942
|
+
const scenarioById = new Map(opts.scenarios.map((s) => [s.id, s]));
|
|
912
943
|
for (const profile of opts.profiles) {
|
|
913
944
|
const profileHash = agentProfileHash(profile);
|
|
914
945
|
try {
|
|
@@ -960,7 +991,6 @@ async function runProfileMatrix(opts) {
|
|
|
960
991
|
now: opts.now,
|
|
961
992
|
runDir: join2(opts.runDir, sanitize(profile.id))
|
|
962
993
|
});
|
|
963
|
-
campaigns[profile.id] = campaign;
|
|
964
994
|
const profileRecords = [];
|
|
965
995
|
for (const cell of campaign.cells) {
|
|
966
996
|
const record = buildRunRecord({
|
|
@@ -971,19 +1001,26 @@ async function runProfileMatrix(opts) {
|
|
|
971
1001
|
experimentId,
|
|
972
1002
|
splitTag,
|
|
973
1003
|
commitSha: opts.commitSha,
|
|
974
|
-
matrixId
|
|
1004
|
+
matrixId,
|
|
1005
|
+
scenario: scenarioById.get(cell.scenarioId),
|
|
1006
|
+
corpusText: opts.corpusText
|
|
975
1007
|
});
|
|
976
1008
|
if (validate) validateRunRecord(record);
|
|
977
1009
|
profileRecords.push(record);
|
|
978
1010
|
records.push(record);
|
|
979
1011
|
}
|
|
1012
|
+
const pricedTotalCostUsd = profileRecords.reduce((a, r) => a + r.costUsd, 0);
|
|
1013
|
+
campaigns[profile.id] = {
|
|
1014
|
+
...campaign,
|
|
1015
|
+
aggregates: { ...campaign.aggregates, totalCostUsd: pricedTotalCostUsd }
|
|
1016
|
+
};
|
|
980
1017
|
byProfile[profile.id] = {
|
|
981
1018
|
profileId: profile.id,
|
|
982
1019
|
profileHash,
|
|
983
1020
|
model: profile.model,
|
|
984
1021
|
records: profileRecords.length,
|
|
985
1022
|
meanComposite: mean2(profileRecords.map(compositeOf)),
|
|
986
|
-
totalCostUsd:
|
|
1023
|
+
totalCostUsd: pricedTotalCostUsd,
|
|
987
1024
|
integrity: summarizeBackendIntegrity(profileRecords)
|
|
988
1025
|
};
|
|
989
1026
|
}
|