@tangle-network/agent-eval 0.45.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { S as Scenario, D as DispatchFn, g as DispatchContext } from '../types-BURGZ8Ug.js';
1
+ import { S as Scenario, g as DispatchFn, D as DispatchContext } from '../types-8u72Gc76.js';
2
2
 
3
3
  /**
4
4
  * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
@@ -37,7 +37,9 @@ function httpDispatch(opts) {
37
37
  method: "POST",
38
38
  headers: {
39
39
  "Content-Type": "application/json",
40
- ...authValue ? { Authorization: authValue.startsWith("Bearer ") ? authValue : `Bearer ${authValue}` } : {},
40
+ ...authValue ? {
41
+ Authorization: authValue.startsWith("Bearer ") ? authValue : `Bearer ${authValue}`
42
+ } : {},
41
43
  ...opts.headers
42
44
  },
43
45
  body: JSON.stringify(body),
@@ -67,12 +69,15 @@ function httpDispatch(opts) {
67
69
  function sleep(ms) {
68
70
  return new Promise((resolve) => {
69
71
  const t = setTimeout(resolve, ms);
70
- if (typeof t.unref === "function") t.unref();
72
+ if (typeof t.unref === "function")
73
+ t.unref();
71
74
  });
72
75
  }
73
76
  async function runDispatchServer(opts) {
74
77
  if (opts.auth === void 0) {
75
- throw new Error("runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).");
78
+ throw new Error(
79
+ "runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment)."
80
+ );
76
81
  }
77
82
  const path = opts.path ?? "/dispatch";
78
83
  const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024;
@@ -113,7 +118,9 @@ async function runDispatchServer(opts) {
113
118
  }
114
119
  chunks.push(buf);
115
120
  }
116
- const body = JSON.parse(Buffer.concat(chunks).toString("utf8"));
121
+ const body = JSON.parse(
122
+ Buffer.concat(chunks).toString("utf8")
123
+ );
117
124
  cellId = body.cellId;
118
125
  const ctx = opts.contextFactory ? await opts.contextFactory(body, aborter.signal) : {
119
126
  cellId: body.cellId,
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url = opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue ? { Authorization: authValue.startsWith('Bearer ') ? authValue : `Bearer ${authValue}` } : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function') (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (req: HttpDispatchRequestBody<TScenario>, signal: AbortSignal) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\")\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth = typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers['authorization']\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(Buffer.concat(chunks).toString('utf8')) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MAAM,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACnG,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YAAY,EAAE,eAAe,UAAU,WAAW,SAAS,IAAI,YAAY,UAAU,SAAS,GAAG,IAAI,CAAC;AAAA,YAC1G,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;AAEA,gBAAM,M
AAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU,WAAY,CAAC,EAA4B,MAAM;AAAA,EACpG,CAAC;AACH;AA6DA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI,MAAM,uIAAuI;AAAA,EACzJ;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eAAe,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKvG,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ,eAAe;AACvC,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK,MAAM,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM,CAAC;AAC9D,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAAa;AACjB,YAAI,IAAI,gBAAgB;AACxB
;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
1
+ {"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url =\n opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue\n ? {\n Authorization: authValue.startsWith('Bearer ')\n ? authValue\n : `Bearer ${authValue}`,\n }\n : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function')\n (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (\n req: HttpDispatchRequestBody<TScenario>,\n signal: AbortSignal,\n ) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\n \"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\",\n )\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth =\n typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers['authorization']\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(\n Buffer.concat(chunks).toString('utf8'),\n ) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MACJ,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACzF,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YACA;AAAA,cACE,eAAe,UAAU,WAAW,SAAS,IACzC,YACA,UAAU,SAAS;AAAA,YACzB,IACA,CAAC;AAAA,YACL,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;A
AEA,gBAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU;AACjD,MAAC,EAA4B,MAAM;AAAA,EACvC,CAAC;AACH;AAgEA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eACJ,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKpF,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ,eAAe;AACvC,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK;AAAA,QAChB,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM;AAAA,MACvC;AACA,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAA
a;AACjB,YAAI,IAAI,gBAAgB;AACxB;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
@@ -1,4 +1,4 @@
1
- import { S as Scenario, n as JudgeScore, D as DispatchFn, J as JudgeConfig } from '../types-BURGZ8Ug.js';
1
+ import { S as Scenario, n as JudgeScore, g as DispatchFn, J as JudgeConfig } from '../types-8u72Gc76.js';
2
2
 
3
3
  /**
4
4
  * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
@@ -1,6 +1,6 @@
1
- export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, m as OpenAutoPrResult, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, n as RunOptimizationOptions, o as RunOptimizationResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, p as openAutoPr, r as runCampaign, k as runEval, l as runImprovementLoop, q as runOptimization, s as surfaceHash } from '../run-improvement-loop-pJ4yrx4X.js';
2
- import { L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, f as CodeSurface } from '../types-BURGZ8Ug.js';
3
- export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, g as DispatchContext, D as DispatchFn, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, u as LabeledScenarioSource, M as MutableSurface, o as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario, v as ScenarioAggregate, p as SessionScript, T as TraceSpan } from '../types-BURGZ8Ug.js';
1
+ export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, m as OpenAutoPrResult, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, R as RunImprovementLoopResult, n as RunOptimizationOptions, o as RunOptimizationResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, p as openAutoPr, r as runCampaign, k as runEval, l as runImprovementLoop, q as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bfam3MT1.js';
2
+ import { L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, f as CodeSurface } from '../types-8u72Gc76.js';
3
+ export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, D as DispatchContext, g as DispatchFn, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, u as LabeledScenarioSource, M as MutableSurface, o as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario, v as ScenarioAggregate, p as SessionScript, T as TraceSpan } from '../types-8u72Gc76.js';
4
4
  import '../llm-client-BXVRUZyX.js';
5
5
  import '../errors-mje_cKOs.js';
6
6
  import '../raw-provider-sink-C46HDghv.js';
@@ -1,5 +1,7 @@
1
- export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-BURGZ8Ug.js';
2
- export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-pJ4yrx4X.js';
1
+ import { S as Scenario, M as MutableSurface, D as DispatchContext, J as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-8u72Gc76.js';
2
+ export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, g as Dispatch, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, m as JudgeDimension, n as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-8u72Gc76.js';
3
+ import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bfam3MT1.js';
4
+ export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-Bfam3MT1.js';
3
5
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
4
6
  import '../llm-client-BXVRUZyX.js';
5
7
  import '../errors-mje_cKOs.js';
@@ -8,3 +10,198 @@ import '@tangle-network/agent-runtime';
8
10
  import '../red-team-30II1T4o.js';
9
11
  import '../dataset-BlwAtYYf.js';
10
12
  import '../store-Db2Bv8Cf.js';
13
+
14
+ /**
15
+ * # `selfImprove()` — the LAND-tier one-shot.
16
+ *
17
+ * The cheapest possible call site to run a real closed-loop self-
18
+ * improvement over your agent. Wraps `runImprovementLoop` with smart
19
+ * defaults and a budget-shaped options API; every escape hatch the
20
+ * substrate exposes is reachable from here without losing the
21
+ * one-function feel.
22
+ *
23
+ * Defaults picked to match the LAND-tier story:
24
+ * - In-memory storage (no filesystem touch).
25
+ * - `gepaDriver` reflective mutation with copywriting-flavored primitives
26
+ * (override `driver` or `mutationPrimitives` for any domain).
27
+ * - `defaultProductionGate` with `deltaThreshold: 0.05`.
28
+ * - Held-out split = 25% of scenarios, deterministic by id hash.
29
+ * - 3 generations × population 2 (raise via `budget` for more search).
30
+ * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).
31
+ *
32
+ * Want one-click? Provide `agent` + `scenarios` + `judge` + `baselineSurface`. Done.
33
+ * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed
34
+ * agent. Want a code-tier surface? Pass a `MutableSurface` + your own
35
+ * `driver`. Same function.
36
+ */
37
+
38
+ interface SelfImproveBudget {
39
+ /** Hard $ ceiling across all cells in baseline + every generation. Cells
40
+ * beyond the ceiling are skipped (cost-aware, not aborted). Forwarded to `runImprovementLoop` as `costCeiling`; left undefined when omitted. */
41
+ dollars?: number;
42
+ /** How many improvement generations to explore. Default 3. Set 0 to
43
+ * skip improvement entirely (selfImprove becomes a baseline-only run). Forwarded as `maxGenerations`. */
44
+ generations?: number;
45
+ /** Candidates the driver proposes per generation. Default 2. */
46
+ populationSize?: number;
47
+ /** Max concurrent cells across the loop. Default 2. */
48
+ maxConcurrency?: number;
49
+ /** Fraction of `scenarios` held out from training, used for the gate.
50
+ * Default 0.25. The held-out count is `round(n * fraction)` clamped to at least 1 and at most `n - 1`. Ignored when `holdoutScenarios` is set explicitly. */
51
+ holdoutFraction?: number;
52
+ /** Explicit held-out scenarios; overrides `holdoutFraction`. Entries of `scenarios` whose `id` matches one of these are removed from the training split. */
53
+ holdoutScenarios?: Scenario[];
54
+ }
55
+ interface SelfImproveLlm {
56
+ /** Endpoint base URL. Default Tangle Router (`https://router.tangle.tools/v1`). */
57
+ baseUrl?: string;
58
+ /** Bearer token. Default `process.env.OPENAI_API_KEY`; an empty string is used when neither is set. */
59
+ apiKey?: string;
60
+ /** Model id used by `gepaDriver` reflection. Default
61
+ * `anthropic/claude-sonnet-4.6`. */
62
+ model?: string;
63
+ }
64
+ type SelfImproveProgressEvent = { // Union of progress events, discriminated by `kind`.
65
+ kind: 'baseline.started'; // emitted by selfImprove before the improvement loop starts
66
+ scenarios: number; // length of `opts.scenarios` as passed in
67
+ } | {
68
+ kind: 'baseline.completed'; // emitted by selfImprove only after the whole loop has resolved
69
+ compositeMean: number; // baseline composite mean on the held-out set
70
+ durationMs: number; // elapsed time since the selfImprove call started
71
+ } | {
72
+ kind: 'generation.started'; // NOTE(review): no emit site for this kind in selfImprove's own body — confirm where it is fired
73
+ index: number;
74
+ populationSize: number;
75
+ } | {
76
+ kind: 'generation.completed'; // NOTE(review): no emit site for this kind in selfImprove's own body — confirm where it is fired
77
+ index: number;
78
+ bestComposite: number;
79
+ durationMs: number;
80
+ } | {
81
+ kind: 'gate.decided'; // emitted by selfImprove after the loop has resolved
82
+ decision: string; // the gate result's `decision`
83
+ lift: number; // winner held-out composite mean minus baseline held-out composite mean
84
+ };
85
+ interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
86
+ /**
87
+ * Your agent — a function that takes the current `MutableSurface`
88
+ * (typically a system prompt the loop is optimizing) plus the
89
+ * scenario + cell ctx, and returns the artifact your judge scores.
90
+ *
91
+ * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a
92
+ * plain `Dispatch` if you don't have a surface seam:
93
+ *
94
+ * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)
95
+ *
96
+ * That mode evaluates without mutating any surface — useful as a
97
+ * baseline-only run (set `budget.generations = 0`).
98
+ */
99
+ agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
100
+ /** Scenarios to evaluate against. Train/holdout split is computed from
101
+ * these unless `budget.holdoutScenarios` is set explicitly. */
102
+ scenarios: TScenario[];
103
+ /** Judge that scores artifacts; handed to the loop as its only judge. Bring your own; use `langchainJudge`
104
+ * from `/adapters/langchain` for a Runnable-shaped one. */
105
+ judge: JudgeConfig<TArtifact, TScenario>;
106
+ /** Starting surface — system prompt, JSON config, anything `MutableSurface`
107
+ * accepts. The driver mutates this each generation. */
108
+ baselineSurface: MutableSurface;
109
+ /** Budget + loop shape. All fields optional; defaults pick the LAND-tier
110
+ * story. */
111
+ budget?: SelfImproveBudget;
112
+ /** Custom driver. Default is `gepaDriver` configured from `llm` +
113
+ * `mutationPrimitives` + `driverTarget`. */
114
+ driver?: ImprovementDriver;
115
+ /** Default-driver overrides — used when `driver` is unset. */
116
+ mutationPrimitives?: string[];
117
+ driverTarget?: string; // passed to the default driver as its `target`; a generic description is used when unset
118
+ /** Custom gate. Default is `defaultProductionGate` with
119
+ * `deltaThreshold: 0.05` on the held-out split. */
120
+ gate?: Gate<TArtifact, TScenario>;
121
+ /** LLM config consumed by the default `gepaDriver`. Ignored if you pass
122
+ * your own `driver`. */
123
+ llm?: SelfImproveLlm;
124
+ /** Storage backend. Default `inMemoryCampaignStorage()` — nothing
125
+ * persists past the call. Pass `fsCampaignStorage()` to write to disk. */
126
+ storage?: CampaignStorage;
127
+ /** Run directory (logical for in-memory storage, real path for fs).
128
+ * Default `mem://selfImprove-<timestamp>`, where the timestamp is `Date.now()` at call start. */
129
+ runDir?: string;
130
+ /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
131
+ * Returns an opaque placement key the substrate forwards to your agent
132
+ * as `ctx.placement`. Combined with `httpDispatch` from
133
+ * `/adapters/http`, fans cells across regions. */
134
+ cellPlacement?: (input: {
135
+ scenario: TScenario;
136
+ rep: number;
137
+ generation?: number;
138
+ }) => string | undefined;
139
+ /** Progress hook. `selfImprove` itself emits `baseline.started` before the loop, then
140
+ * `baseline.completed` and `gate.decided` once the loop has finished. Consumer routes events wherever (UI, dashboard, logs). */
141
+ onProgress?: (event: SelfImproveProgressEvent) => void;
142
+ /** Auto-promotion behavior on a ship decision. Default `'none'` — we
143
+ * return the winner; you ship it however you ship. `'pr'` opens a
144
+ * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */
145
+ autoOnPromote?: 'pr' | 'none';
146
+ ghOwner?: string;
147
+ ghRepo?: string;
148
+ }
149
+ interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
150
+ /** Composite mean on the held-out set, baseline-surface run (mean of the per-scenario composite means). */
151
+ baseline: {
152
+ compositeMean: number;
153
+ perScenario: Record<string, number>;
154
+ };
155
+ /** Composite mean on the held-out set, winner run. */
156
+ winner: {
157
+ compositeMean: number;
158
+ perScenario: Record<string, number>;
159
+ surface: MutableSurface;
160
+ };
161
+ /** `winner.compositeMean - baseline.compositeMean`, both measured on the held-out set. Positive
162
+ * means the winner scored higher than the baseline there. */
163
+ lift: number;
164
+ /** Decision reported by the gate in use (`opts.gate`, otherwise the default production gate). */
165
+ gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
166
+ /** Number of generations actually explored (may be less than the
167
+ * budget if the driver gave up early). */
168
+ generationsExplored: number;
169
+ /** Wall-clock total, measured from the start of the `selfImprove` call. */
170
+ durationMs: number;
171
+ /** Sum of `totalCostUsd` over the baseline campaign and every generation surface's campaign. */
172
+ totalCostUsd: number;
173
+ /**
174
+ * Raw substrate result for advanced inspection — full per-generation
175
+ * candidates, full campaign artifacts, all judge scores. Useful for
176
+ * debugging or reporting beyond the summary.
177
+ */
178
+ raw: RunImprovementLoopResult<TArtifact, TScenario>;
179
+ }
180
+ /**
181
+ * One-shot self-improvement loop. See module docstring for defaults +
182
+ * extension points. Rejects with an `Error` when the train or holdout split is empty.
183
+ *
184
+ * @example Minimum (LAND tier):
185
+ *
186
+ * const result = await selfImprove({
187
+ * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),
188
+ * scenarios,
189
+ * judge,
190
+ * baselineSurface: DEFAULT_PROMPT,
191
+ * })
192
+ * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
193
+ *
194
+ * @example Distributed (workers in three regions):
195
+ * NOTE(review): `agent` is invoked as `(surface, scenario, ctx)` — confirm the function `httpDispatch` returns accepts that shape, or wrap it.
196
+ * await selfImprove({
197
+ * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),
198
+ * scenarios,
199
+ * judge,
200
+ * baselineSurface: DEFAULT_PROMPT,
201
+ * cellPlacement: ({ scenario }) => scenario.region,
202
+ * budget: { maxConcurrency: 12 },
203
+ * })
204
+ */
205
+ declare function selfImprove<TScenario extends Scenario, TArtifact>(opts: SelfImproveOptions<TScenario, TArtifact>): Promise<SelfImproveResult<TScenario, TArtifact>>;
206
+
207
+ export { CampaignStorage, DispatchContext, Gate, ImprovementDriver, JudgeConfig, MutableSurface, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, selfImprove };
@@ -24,6 +24,130 @@ import "../chunk-VXNVVBZO.js";
24
24
  import "../chunk-PC4UYEBM.js";
25
25
  import "../chunk-QYJT52YW.js";
26
26
  import "../chunk-NSBPE2FW.js";
27
+
28
+ // src/contract/self-improve.ts
29
+ function splitTrainHoldout(scenarios, fraction) {
30
+ function hash(s) {
31
+ let h = 2166136261 >>> 0;
32
+ for (let i = 0; i < s.length; i++) {
33
+ h ^= s.charCodeAt(i);
34
+ h = Math.imul(h, 16777619) >>> 0;
35
+ }
36
+ return h;
37
+ }
38
+ const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id));
39
+ const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)));
40
+ return {
41
+ holdout: sorted.slice(0, nHoldout),
42
+ train: sorted.slice(nHoldout)
43
+ };
44
+ }
45
/**
 * Collapses per-scenario aggregates into a summary.
 *
 * @param byScenario Map of scenario id -> aggregate carrying `meanComposite`.
 * @returns `compositeMean` (unweighted mean of the per-scenario values, 0 for
 *   an empty map) and `perScenario` (scenario id -> its `meanComposite`).
 */
function meanComposite(byScenario) {
  const perScenario = {};
  let total = 0;
  let count = 0;
  Object.entries(byScenario).forEach(([scenarioId, aggregate]) => {
    const score = aggregate.meanComposite;
    perScenario[scenarioId] = score;
    total += score;
    count += 1;
  });
  const compositeMean = count > 0 ? total / count : 0;
  return { compositeMean, perScenario };
}
57
+ var DEFAULT_MUTATION_PRIMITIVES = [ // Copywriting-flavored hints given to the default gepaDriver when opts.mutationPrimitives is unset.
58
+ "Tighten the hook: lead with the specific user outcome.",
59
+ "Replace generic adjectives with specific verbs or proof numbers.",
60
+ "Anchor every claim in something the scenario's brief literally supports.",
61
+ "Honor the surface-shape constraint (length, register, audience vocabulary)."
62
+ ];
63
/**
 * One-shot self-improvement loop: resolves defaults, splits scenarios into
 * train/holdout, runs `runImprovementLoop`, and condenses its result into a
 * summary.
 *
 * Defaults applied here: 3 generations, population 2, concurrency 2, holdout
 * fraction 0.25, in-memory storage, `autoOnPromote: "none"`, a `gepaDriver`
 * built from `opts.llm`, and `defaultProductionGate` with
 * `deltaThreshold: 0.05`.
 *
 * @param opts Agent, scenarios, judge, baseline surface and optional overrides.
 * @returns Baseline and winner stats (both measured on the held-out set),
 *   their difference as `lift`, the gate decision, generation count, elapsed
 *   time, summed cost, and the raw loop result.
 * @throws Error when `budget.holdoutFraction` is NaN (and no explicit holdout
 *   is given), or when either the train or the holdout split ends up empty.
 */
async function selfImprove(opts) {
  const startedAt = Date.now();

  const budget = opts.budget ?? {};
  const generations = budget.generations ?? 3;
  const populationSize = budget.populationSize ?? 2;
  const maxConcurrency = budget.maxConcurrency ?? 2;
  const holdoutFraction = budget.holdoutFraction ?? 0.25;
  const costCeiling = budget.dollars;
  const explicitHoldout = budget.holdoutScenarios;

  let train;
  let holdout;
  if (explicitHoldout) {
    // Index held-out ids once so the filter is a set lookup per scenario
    // rather than a scan of the whole holdout list.
    const heldOutIds = new Set(explicitHoldout.map((s) => s.id));
    train = opts.scenarios.filter((s) => !heldOutIds.has(s.id));
    holdout = explicitHoldout;
  } else {
    // A NaN fraction would otherwise produce an empty holdout and surface as
    // the unrelated "Pass more scenarios" error below.
    if (Number.isNaN(holdoutFraction)) {
      throw new Error("selfImprove: budget.holdoutFraction is NaN. Pass a number between 0 and 1.");
    }
    ({ train, holdout } = splitTrainHoldout(opts.scenarios, holdoutFraction));
  }

  if (train.length === 0) {
    throw new Error("selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.");
  }
  if (holdout.length === 0) {
    throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
  }

  // Default driver: reflective GEPA mutation configured from opts.llm.
  // Note the API key falls back to an empty string when neither opts.llm.apiKey
  // nor OPENAI_API_KEY is set.
  const driver = opts.driver ?? gepaDriver({
    llm: {
      baseUrl: opts.llm?.baseUrl ?? "https://router.tangle.tools/v1",
      apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? ""
    },
    model: opts.llm?.model ?? "anthropic/claude-sonnet-4.6",
    target: opts.driverTarget ?? "agent surface (system prompt or config) being optimized by selfImprove",
    mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES
  });
  const gate = opts.gate ?? defaultProductionGate({
    holdoutScenarios: holdout,
    deltaThreshold: 0.05
  });
  const storage = opts.storage ?? inMemoryCampaignStorage();
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;

  if (opts.onProgress) {
    opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
  }

  const result = await runImprovementLoop({
    scenarios: train,
    baselineSurface: opts.baselineSurface,
    dispatchWithSurface: opts.agent,
    driver,
    judges: [opts.judge],
    populationSize,
    maxGenerations: generations,
    holdoutScenarios: holdout,
    gate,
    autoOnPromote: opts.autoOnPromote ?? "none",
    ghOwner: opts.ghOwner,
    ghRepo: opts.ghRepo,
    storage,
    runDir,
    maxConcurrency,
    cellPlacement: opts.cellPlacement,
    costCeiling
  });

  // Both summaries come from the held-out evaluations, so `lift` compares
  // like with like.
  const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario);
  const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario);
  const lift = winnerStats.compositeMean - baseline.compositeMean;

  // These events are reported only now, after the whole loop has resolved;
  // `durationMs` is therefore the elapsed time of the full call so far.
  if (opts.onProgress) {
    opts.onProgress({
      kind: "baseline.completed",
      compositeMean: baseline.compositeMean,
      durationMs: Date.now() - startedAt
    });
    opts.onProgress({
      kind: "gate.decided",
      decision: result.gateResult.decision,
      lift
    });
  }

  // Cost = baseline campaign + every candidate surface's campaign in every generation.
  const totalCost = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
    (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
    0
  );

  return {
    baseline,
    winner: {
      ...winnerStats,
      surface: result.winnerSurface
    },
    lift,
    gateDecision: result.gateResult.decision,
    generationsExplored: result.generations.length,
    durationMs: Date.now() - startedAt,
    totalCostUsd: totalCost,
    raw: result
  };
}
27
151
  export {
28
152
  FileSystemOutcomeStore,
29
153
  InMemoryOutcomeStore,
@@ -36,6 +160,7 @@ export {
36
160
  inMemoryCampaignStorage,
37
161
  runCampaign,
38
162
  runEval,
39
- runImprovementLoop
163
+ runImprovementLoop,
164
+ selfImprove
40
165
  };
41
166
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
1
+ {"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { runImprovementLoop, type RunImprovementLoopResult } from '../campaign/presets/run-improvement-loop'\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. 
Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). */\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. 
Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: DispatchContext,\n ) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. 
*/\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). */\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. 
Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(\n byScenario: Record<string, { meanComposite: number }>,\n): { compositeMean: number; perScenario: Record<string, number> } {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n 'Anchor every claim in something the scenario\\'s brief literally supports.',\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. 
See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error('selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.')\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 
'anthropic/claude-sonnet-4.6',\n target: opts.driverTarget ?? 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n return {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n 
generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8LA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cACP,YACgE;AAChE,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,mFAAmF;AAAA,EACrG;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QAAQ,KAAK,gBAAgB;AAAA,IAC7B,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,S
AAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QAAQ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IAC7F;AAAA,EACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.44.1",
5
+ "version": "0.45.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
package/dist/rl.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { d as CampaignResult } from './types-BURGZ8Ug.js';
2
+ import { d as CampaignResult } from './types-8u72Gc76.js';
3
3
  import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
4
  export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
@@ -1,4 +1,4 @@
1
- import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-BURGZ8Ug.js';
1
+ import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { RunRecord } from '@tangle-network/agent-runtime';
4
4
  import { R as RedTeamCase } from './red-team-30II1T4o.js';
@@ -414,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
414
414
  }
415
415
  declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
416
416
 
417
- export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type RunEvalOptions as a, type RunImprovementLoopOptions as b, type RunImprovementLoopResult as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
417
+ export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
372
372
  scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
373
373
  }
374
374
 
375
- export type { CampaignAggregates as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchContext as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
375
+ export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
@@ -32,11 +32,11 @@ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysi
32
32
 
33
33
  | Tier | What they do | What they get | Billing |
34
34
  |---|---|---|---|
35
- | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
35
+ | **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) — **with optional Tangle Router as a $0-friction inference upsell.** When a builder points `OPENAI_BASE_URL` at `router.tangle.tools/v1`, every campaign call (agent + judge + reflective mutation) routes through us; we earn the routing margin. Same code, opt-in monetization vector that ships today. |
36
36
  | **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
37
37
  | **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
38
38
 
39
- The free lib casts the widest possible net at near-zero cost (it's already published). Value capture is EXPAND: hosting their data/intelligence = a billable surface on the dimensions we already meter (ingested/retained volume, eval-campaign compute, loop runs, seats). "We don't host observability unless they route to us" is the *business model*, not a gap.
39
+ The free lib casts the widest possible net at near-zero cost (it's already published). LAND is **not actually zero-revenue** — pointing the loop at Tangle Router is a one-line config change with no other code differences, so we monetize inference for any LAND-tier adopter who opts in. The wedge ladder is therefore four steps: no-revenue install → router routing margin (LAND with router) → metered data hosting (EXPAND) → sandbox usage (PLATFORM). Each step is a one-line config change, never a rewrite. Value capture concentrates at EXPAND (hosting their data/intelligence is the biggest billable surface), but LAND-with-router is the immediate upsell available from day one.
40
40
 
41
41
  ## Plan & gates — land-first, validate, then build
42
42
 
@@ -0,0 +1,188 @@
1
+ # Phase-B partner pairing kit
2
+
3
+ Everything we hand a design partner — the pitch, the discovery doc,
4
+ the judge worksheet, the 4-hour pairing agenda, the success criteria.
5
+
6
+ > This file is **partner-facing**. The internal driving runbook is in
7
+ > [`phase-b-runbook.md`](./phase-b-runbook.md).
8
+
9
+ ---
10
+
11
+ ## The pitch (one-pager)
12
+
13
+ You have a working agent. You don't have evals. You don't have a
14
+ self-improvement loop. You don't know which prompt change actually
15
+ made the agent better last week.
16
+
17
+ We have all of that on a shelf — same engine our six internal product
18
+ agents use in production. It's open source, free at the LAND tier, and
19
+ sandbox-free if you don't want our sandbox.
20
+
21
+ **The Phase-B offer:** in one 4-hour pairing, we wrap your agent
22
+ behind our `Dispatch`, author your domain-specific judge with you,
23
+ and run one real campaign + improvement loop on **your actual use
24
+ case**. You walk away with:
25
+
26
+ - A reproducible eval harness against scenarios you control.
27
+ - A judge that scores your outputs on dimensions you defined.
28
+ - One measurable lift on your real product, with a held-out gate.
29
+ - Trace artifacts you own (locally on disk; nothing leaves your
30
+ network unless you point at our hosted tier).
31
+
32
+ What we get: design-partner evidence the substrate works on a foreign
33
+ agent we did not build. That validates the wedge for us. Nothing else
34
+ changes hands.
35
+
36
+ **Cost to you:** 4 hours of pairing + your LLM bill for the campaign
37
+ run (typically $5-$50 depending on model + scenario count). No
38
+ commitment, no contract, no exclusivity. We don't take your code, your
39
+ data, or your secrets.
40
+
41
+ ---
42
+
43
+ ## Discovery questions (15 min, before the pairing)
44
+
45
+ Send these to the partner ahead of the pairing so they walk in with
46
+ their answers.
47
+
48
+ ### About the agent
49
+
50
+ 1. What does your agent **do** — one paragraph, end-user perspective?
51
+ 2. What's the **input** it accepts and the **output** it produces?
52
+ (Schemas help; English is fine.)
53
+ 3. What framework / stack? (LangChain / Mastra / OpenAI Agents SDK /
54
+ bespoke / something else.)
55
+ 4. Where does it run? (Local node / serverless / your sandbox /
56
+ browser / mobile / other.)
57
+ 5. What model(s) does it use today? Any model-routing layer
58
+ (OpenRouter, Portkey, your own)?
59
+
60
+ ### About quality
61
+
62
+ 6. How do you currently know your agent is good? (Eyeballing /
63
+ user feedback / metrics / nothing yet — all fine answers.)
64
+ 7. What does a **bad** output look like for you? Give 2-3 concrete
65
+ examples. Be specific.
66
+ 8. What does a **good** output look like? Same.
67
+ 9. Are there outputs that are *technically correct but feel wrong*?
68
+ What's the signal?
69
+ 10. How would a senior person on your team **score** an output, if
70
+ they had to give it a 1-10? Walk us through the rubric they'd
71
+ use, even informally.
72
+
73
+ ### About the loop
74
+
75
+ 11. If we could improve one thing about the agent in 4 hours, what
76
+ would move the needle the most for you?
77
+ 12. Are there *prompt* changes you've wanted to try but haven't had
78
+ the loop to validate?
79
+ 13. Anything you've explicitly tried that **didn't** work? (Saves us
80
+ suggesting it.)
81
+
82
+ ---
83
+
84
+ ## Judge-design worksheet (45 min, during the pairing)
85
+
86
+ The judge is the most under-discussed piece of an eval system. Most
87
+ projects fail at the judge, not the agent.
88
+
89
+ We start with a **strawman** — the 6 dimensions in our canonical
90
+ marketing-quality judge:
91
+
92
+ | Dim | What it measures |
93
+ |---|---|
94
+ | hook_strength | Opens with concrete user outcome, not category |
95
+ | voice_match | Reads human-written; no AI slop |
96
+ | cta_clarity | Next step unambiguous for the audience |
97
+ | factual_grounding | Only claims things the brief supports |
98
+ | surface_fit | Length + register correct for medium |
99
+ | audience_specificity | Vocabulary the audience actually responds to |
100
+
101
+ **Your job in this 45 min:** rip this apart. We expect:
102
+
103
+ - **2-3 of these are wrong for you.** Replace them.
104
+ - **2-3 dimensions are missing.** Add them. (E.g., "tone matches our
105
+ brand book" or "safety-critical claim has a citation" or "answer is
106
+ decisive — no hedging when the user wants a recommendation".)
107
+ - **Weights are wrong.** For your use case some dims matter 5x more.
108
+
109
+ The deliverable: a judge with 4-8 dimensions, each scored 0.0 - 1.0,
110
+ each unambiguous enough that two independent humans would score the
111
+ same artifact within 0.1.
112
+
113
+ If a dimension is squishy, throw it out. A noisy judge poisons the
114
+ loop.
115
+
116
+ ---
117
+
118
+ ## The 4-hour pairing agenda
119
+
120
+ ### Hour 1 — Discovery + Dispatch wiring
121
+
122
+ | Time | What | Deliverable |
123
+ |---|---|---|
124
+ | 0:00 - 0:15 | Review discovery answers, align on scope | Shared doc with goals + constraints |
125
+ | 0:15 - 0:45 | Wire `Dispatch` around their agent — typically 1 function | Working `Dispatch<TScenario, TArtifact>` |
126
+ | 0:45 - 1:00 | Run 1-2 scenarios through `Dispatch` manually; see real artifacts | Confirmed wire shape |
127
+
128
+ ### Hour 2 — Judge calibration
129
+
130
+ | Time | What | Deliverable |
131
+ |---|---|---|
132
+ | 1:00 - 1:45 | Walk through the strawman judge; redesign dimensions with the partner | Final `JudgeConfig` for their domain |
133
+ | 1:45 - 2:00 | Calibrate judge against the 2 manual outputs from Hour 1 | Confirmed judge gives same scores a human would |
134
+
135
+ ### Hour 3 — First campaign + tuning
136
+
137
+ | Time | What | Deliverable |
138
+ |---|---|---|
139
+ | 2:00 - 2:30 | Define 8-15 scenarios with the partner (or use ours as a template) | Scenario set with train + holdout split |
140
+ | 2:30 - 3:00 | Run `runEval` for baseline; review per-scenario scores | Baseline score + identified failure modes |
141
+
142
+ ### Hour 4 — Improvement loop + go/no-go
143
+
144
+ | Time | What | Deliverable |
145
+ |---|---|---|
146
+ | 3:00 - 3:30 | Configure `runImprovementLoop` with `gepaDriver` (3 generations, population 2) + `defaultProductionGate` | Improvement run completes |
147
+ | 3:30 - 3:50 | Walk the partner through the gate decision + lift per scenario | Report artifact |
148
+ | 3:50 - 4:00 | Capture: was the lift real? Would they ship the winner? Will they keep using the lib? | **Go/no-go signal for Phase D** |
149
+
150
+ If we're tracking ahead at any hour, use the slack to deepen — add a
151
+ red-team battery, swap the judge model, run more generations. If we're
152
+ behind, cut the scenario set to 6 and ship.
153
+
154
+ ---
155
+
156
+ ## Success criteria — what counts as Phase B passed
157
+
158
+ For us to greenlight Phase D (hosted orchestrator + metered billing),
159
+ we need ALL of the following for a hard pass (partial scores are graded below):
160
+
161
+ 1. **Real lift.** Held-out winner score > baseline by ≥ 0.05 composite
162
+ points (or the partner's chosen threshold). Not just train; held-out.
163
+ 2. **Partner-validated lift.** The partner reads the winner output on
164
+ 3+ held-out scenarios and confirms it's actually better.
165
+ 3. **Integration time ≤ 1 day.** Discovery + wiring + judge took ≤ 4
166
+ hours for the pairing; partner could reach the same point solo in
167
+ ≤ 1 day from the quickstart doc.
168
+ 4. **Public commitment.** Partner agrees to a public reference (case
169
+ study / quote / logo) OR commits to running the LAND tier in their
170
+ own product within 2 weeks.
171
+
172
+ 3-of-4 = soft pass (revisit Phase D scope but proceed). 4-of-4 = hard
173
+ pass (build Phase D). ≤ 2 = fail (back to substrate iteration).
174
+
175
+ ---
176
+
177
+ ## What we don't ask for
178
+
179
+ - Your code. Wire `Dispatch` around your existing API; we never see the
180
+ source.
181
+ - Your customer data. Use synthetic scenarios or anonymized real ones —
182
+ whichever you prefer.
183
+ - Your model keys. You bring your own. Routing through Tangle Router is
183
+ optional; if you do, calls pass through us, so use your own endpoint if nothing may leave your network.
185
+ - Exclusivity, commitment, or contract. Walk away whenever.
186
+
187
+ The point is to learn if the substrate works for someone we didn't
188
+ build it for. That's it.
@@ -0,0 +1,176 @@
1
+ # Phase-B runbook (internal)
2
+
3
+ How we drive a design-partner pairing. Goes alongside
4
+ [`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md) (the partner-facing
5
+ materials) — this file is for us.
6
+
7
+ ---
8
+
9
+ ## Before the pairing
10
+
11
+ - **24-48h prior:** send discovery questions from
12
+ [`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md). Don't run the
13
+ pairing without answers in hand. The pairing fails when we discover
14
+ the partner's quality bar live; we don't have time to interview AND
15
+ build in 4 hours.
16
+ - **48h prior:** run the canonical demo (`pnpm tsx
17
+ examples/marketing-agent-canonical/index.ts`) end-to-end against the
18
+ partner's preferred model. Confirms the substrate + their LLM tier
19
+ compose. If it errors, fix the substrate before the pairing.
20
+ - **24h prior:** mirror the partner's stack locally. If they're on
21
+ Cloudflare Workers, run a Worker. On LangChain, install `@langchain/*`.
22
+ Don't debug their tooling on the call.
23
+ - **1h prior:** open the pairing kit, the agent-eval repo, the partner's
24
+ agent code/endpoint, and a shared doc; have a screenshare ready.
25
+
26
+ ## During the pairing
27
+
28
+ ### Driving principles
29
+
30
+ - **Talk less, ship more.** The partner is paying with their time and
31
+ attention; every minute we talk we aren't shipping their lift.
32
+ - **They write the judge.** We start with our strawman so they have
33
+ something to react to, but the judge that ends up running is theirs.
34
+ This is the most-discussed seam — they should own it.
35
+ - **No invented features.** Don't promise capabilities that don't exist
36
+ ("we have a hosted ingest for this") unless they actually exist.
37
+ Phase B is honesty's purest test.
38
+ - **Capture verbatim.** Write down their exact words on what's broken /
39
+ what would change their mind. The wedge-gate evidence is qualitative
40
+ too.
41
+
42
+ ### When to escalate to Drew
43
+
44
+ - Partner wants something Phase D would have (hosted dashboard, multi-
45
+ tenant, billing). **Escalate same day** — this is the GTM signal we're
46
+ hunting for; Drew should hear it directly.
47
+ - Partner is the wrong fit (technical or business) and the pairing
48
+ would burn both sides' time. **Pause the pairing**, debrief with Drew,
49
+ reschedule with a better-fit partner.
50
+ - Substrate breaks in a way that requires a published bump. **Pause
51
+ the pairing**, ship the fix in a focused PR, resume.
52
+
53
+ ### What to capture for the wedge gate
54
+
55
+ Per [`docs/design/external-agent-wedge.md`](./design/external-agent-wedge.md),
56
+ the gate decision hinges on Phase B evidence. We capture:
57
+
58
+ 1. **Quantitative lift** — held-out winner composite vs baseline, per
59
+ scenario + overall. Auto-generated in the report artifact by the
60
+ canonical demo (`.phase-b-runs/<ts>/phase-b-report.md`).
61
+ 2. **Qualitative partner-validation** — partner read 3+ winner outputs
62
+ and confirmed they're better. Capture as a 1-paragraph quote.
63
+ 3. **Integration friction** — minutes spent on each pairing phase. Were
64
+ any > 2x estimated? What broke?
65
+ 4. **Judge-design surprise** — which dimensions the partner added or
66
+ killed vs our strawman. Strong signal about what the substrate's
67
+ default judge templates are missing for adjacent domains.
68
+ 5. **Soft commitments** — would they reference us? Would they
69
+ self-serve from the quickstart doc? Would they pay for hosted?
70
+
71
+ Capture into a single `phase-b-debrief.md` per partner. We don't
72
+ publish these; they feed the next substrate iteration + the wedge
73
+ go/no-go.
74
+
75
+ ---
76
+
77
+ ## Failure modes — what we do NOT do
78
+
79
+ ### "We'll just optimize on the train set"
80
+
81
+ Hard no. The held-out gate is the entire point. A win that doesn't
82
+ generalize is worse than no win — it's evidence that the substrate
83
+ overfits, which is the failure mode the wedge tier rewards.
84
+
85
+ If the holdout lift is < threshold but train looks great:
86
+
87
+ 1. Show the partner the gap. Explain what overfitting means here.
88
+ 2. Try raising `maxGenerations` to 5 (gives gepa more search budget).
89
+ 3. Try widening `populationSize` to 3 (more diverse mutations per gen).
90
+ 4. If still no lift on holdout: **report the result honestly**. A
91
+ negative finding is real evidence for us too — tells us this surface
92
+ isn't amenable to prompt-only mutation, and the partner needs Phase
93
+ C (code-tier optimization) or a different approach.
94
+
95
+ ### "The judge is too noisy"
96
+
97
+ A judge whose two-run variance > 0.1 on the same artifact is broken.
98
+ Fixes, in order:
99
+
100
+ 1. Lower temperature to 0.0 (the canonical judge uses 0.2, which is
101
+ already low).
102
+ 2. Use a stronger model than the agent (the default is the same model;
103
+ bump the judge to GPT-5.5 / Claude Opus).
104
+ 3. Add anchors to each dimension ("0.0 = X, 0.5 = Y, 1.0 = Z").
105
+ 4. If still noisy: collapse to fewer, simpler dimensions. 3 unambiguous
106
+ dimensions beat 6 squishy ones.
107
+
108
+ ### "We can't decide what the partner's judge should be"
109
+
110
+ Then we don't have Phase B. The judge IS the partner's quality bar.
111
+ If they can't articulate it in 45 minutes of pairing, we're in the
112
+ wrong pairing — they need to do the interview-themselves work first.
113
+
114
+ **Pause the pairing, send the discovery doc again, regroup in a week.**
115
+
116
+ ### "Their agent is slow / expensive"
117
+
118
+ Set `maxConcurrency: 1` and reduce scenarios to 6. Cost scales linearly;
119
+ time scales as `(scenarios × reps × generations × population) /
120
+ concurrency`. Tune until the loop completes in ≤ 30 min.
121
+
122
+ If the per-call cost is > $1, talk to Drew before the pairing — we
123
+ might want to subsidize the partner's first run.
124
+
125
+ ### "They want to share their secrets through Tangle Router"
126
+
127
+ Fine — `OPENAI_BASE_URL=https://router.tangle.tools/v1` works. Make
128
+ sure they understand: every call routes through us; the prompts and
129
+ responses are visible to whatever observability we have on the router.
130
+ If they want zero data leaving their network, point at their own
131
+ endpoint, not Tangle Router.
132
+
133
+ ---
134
+
135
+ ## After the pairing
136
+
137
+ ### Same day
138
+
139
+ - Save the `phase-b-report.md` artifact + the partner's debrief notes
140
+ to `~/company/design-partners/<partner>/<date>/`.
141
+ - Send the partner a thank-you with the winner artifact + the next-
142
+ steps doc. Whether or not we proceed to Phase D, leave them with
143
+ something concrete they can ship in their product.
144
+ - Slack Drew the verdict against the [success criteria](./phase-b-pairing-kit.md#success-criteria--what-counts-as-phase-b-passed).
145
+
146
+ ### Within a week
147
+
148
+ - If Phase B passed: open the Phase D RFC. Reuse the partner-validated
149
+ judge dimensions + scenarios as the spec for what the hosted tier
150
+ needs to support out of the box.
151
+ - If Phase B failed: substrate iteration ticket(s). Specific gaps the
152
+ pairing surfaced (judge dim defaults, doc clarity, missing helper).
153
+ - Either way: update the wedge doc (`docs/design/external-agent-wedge.md`)
154
+ with the partner-name redacted + the qualitative signal.
155
+
156
+ ### Within a month (regardless of go/no-go)
157
+
158
+ - Follow up with the partner. If they're still using the lib, capture a
159
+ metric. If they stopped, find out why. Both data points feed product.
160
+
161
+ ---
162
+
163
+ ## The canonical demo as a forcing function
164
+
165
+ `examples/marketing-agent-canonical/` is the demo we open the pairing
166
+ with. It does three things at once:
167
+
168
+ 1. **Proves the substrate works** — they see a real lift on a real-
169
+ feeling agent before we touch their code.
170
+ 2. **Sets the bar for the judge conversation** — they react to concrete
171
+ dimensions, not abstract questions.
172
+ 3. **Trains us** — running the canonical demo before the pairing
173
+ surfaces substrate bugs on the partner's preferred model BEFORE the
174
+ partner is watching. We hit those bugs first.
175
+
176
+ Run the canonical demo before every Phase-B pairing. It's not optional.
@@ -13,12 +13,51 @@ Tangle sandbox, no Tangle account, and no hosted infrastructure.
13
13
  ## Install
14
14
 
15
15
  ```sh
16
- npm i @tangle-network/agent-eval@^0.44.0
16
+ npm i @tangle-network/agent-eval@^0.46.0
17
17
  ```
18
18
 
19
- The package's `@tangle-network/sandbox` peer is `optional` (as of
20
- 0.44.0). Foreign consumers can install agent-eval and run the full LAND
21
- tier without our sandbox or its dependencies.
19
+ The package's `@tangle-network/sandbox` peer is `optional`. Foreign
20
+ consumers install agent-eval and run the full LAND tier without our
21
+ sandbox or its dependencies.
22
+
23
+ ## The one-shot happy path
24
+
25
+ If you don't want to learn the substrate, the entire LAND tier reduces
26
+ to one function call:
27
+
28
+ ```ts
29
+ import { selfImprove } from '@tangle-network/agent-eval/contract'
30
+
31
+ const result = await selfImprove({
32
+ agent: (surface, scenario, ctx) =>
33
+ runYourAgent({ systemPrompt: surface as string, scenario, signal: ctx.signal }),
34
+ scenarios,
35
+ judge,
36
+ baselineSurface: 'You are a senior copywriter…',
37
+ budget: { dollars: 10, generations: 3 },
38
+ })
39
+
40
+ console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
41
+ if (result.gateDecision === 'ship') {
42
+ // result.winner.surface is the optimized prompt
43
+ }
44
+ ```
45
+
46
+ That's the LAND happy path. Smart defaults pick: in-memory storage,
47
+ `gepaDriver` with copywriting-flavored mutation primitives,
48
+ `defaultProductionGate` with `deltaThreshold: 0.05`, and a 25% deterministic
49
+ train/holdout split.
50
+
51
+ Every escape hatch the substrate exposes is reachable from
52
+ `selfImprove` — custom `driver`, custom `gate`, distributed-driver
53
+ `cellPlacement`, `onProgress` streaming callback, `autoOnPromote: 'pr'`
54
+ to open a GitHub PR with the winner. See the type signatures in
55
+ [`src/contract/self-improve.ts`](../src/contract/self-improve.ts) for
56
+ the full surface.
57
+
58
+ The sections below are the lower-level path — useful when you want
59
+ fine-grained control over each piece. Read those next if `selfImprove`
60
+ isn't enough.
22
61
 
23
62
  ## Five types, four functions
24
63
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.45.0",
3
+ "version": "0.46.0",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {