@tangle-network/agent-eval 0.45.0 → 0.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +11 -4
- package/dist/adapters/http.js.map +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/campaign/index.d.ts +3 -3
- package/dist/contract/index.d.ts +199 -2
- package/dist/contract/index.js +126 -1
- package/dist/contract/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/{run-improvement-loop-pJ4yrx4X.d.ts → run-improvement-loop-Bfam3MT1.d.ts} +2 -2
- package/dist/{types-BURGZ8Ug.d.ts → types-8u72Gc76.d.ts} +1 -1
- package/docs/design/external-agent-wedge.md +2 -2
- package/docs/phase-b-pairing-kit.md +188 -0
- package/docs/phase-b-runbook.md +176 -0
- package/docs/quickstart-external.md +43 -4
- package/package.json +1 -1
package/dist/adapters/http.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario,
|
|
1
|
+
import { S as Scenario, g as DispatchFn, D as DispatchContext } from '../types-8u72Gc76.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
|
package/dist/adapters/http.js
CHANGED
|
@@ -37,7 +37,9 @@ function httpDispatch(opts) {
|
|
|
37
37
|
method: "POST",
|
|
38
38
|
headers: {
|
|
39
39
|
"Content-Type": "application/json",
|
|
40
|
-
...authValue ? {
|
|
40
|
+
...authValue ? {
|
|
41
|
+
Authorization: authValue.startsWith("Bearer ") ? authValue : `Bearer ${authValue}`
|
|
42
|
+
} : {},
|
|
41
43
|
...opts.headers
|
|
42
44
|
},
|
|
43
45
|
body: JSON.stringify(body),
|
|
@@ -67,12 +69,15 @@ function httpDispatch(opts) {
|
|
|
67
69
|
function sleep(ms) {
|
|
68
70
|
return new Promise((resolve) => {
|
|
69
71
|
const t = setTimeout(resolve, ms);
|
|
70
|
-
if (typeof t.unref === "function")
|
|
72
|
+
if (typeof t.unref === "function")
|
|
73
|
+
t.unref();
|
|
71
74
|
});
|
|
72
75
|
}
|
|
73
76
|
async function runDispatchServer(opts) {
|
|
74
77
|
if (opts.auth === void 0) {
|
|
75
|
-
throw new Error(
|
|
78
|
+
throw new Error(
|
|
79
|
+
"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment)."
|
|
80
|
+
);
|
|
76
81
|
}
|
|
77
82
|
const path = opts.path ?? "/dispatch";
|
|
78
83
|
const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024;
|
|
@@ -113,7 +118,9 @@ async function runDispatchServer(opts) {
|
|
|
113
118
|
}
|
|
114
119
|
chunks.push(buf);
|
|
115
120
|
}
|
|
116
|
-
const body = JSON.parse(
|
|
121
|
+
const body = JSON.parse(
|
|
122
|
+
Buffer.concat(chunks).toString("utf8")
|
|
123
|
+
);
|
|
117
124
|
cellId = body.cellId;
|
|
118
125
|
const ctx = opts.contextFactory ? await opts.contextFactory(body, aborter.signal) : {
|
|
119
126
|
cellId: body.cellId,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url = opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue ? { Authorization: authValue.startsWith('Bearer ') ? authValue : `Bearer ${authValue}` } : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function') (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (req: HttpDispatchRequestBody<TScenario>, signal: AbortSignal) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\")\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth = typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers['authorization']\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(Buffer.concat(chunks).toString('utf8')) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MAAM,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACnG,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YAAY,EAAE,eAAe,UAAU,WAAW,SAAS,IAAI,YAAY,UAAU,SAAS,GAAG,IAAI,CAAC;AAAA,YAC1G,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;AAEA,gBAAM,M
AAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU,WAAY,CAAC,EAA4B,MAAM;AAAA,EACpG,CAAC;AACH;AA6DA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI,MAAM,uIAAuI;AAAA,EACzJ;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eAAe,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKvG,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ,eAAe;AACvC,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK,MAAM,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM,CAAC;AAC9D,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAAa;AACjB,YAAI,IAAI,gBAAgB;AACxB
;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/adapters/http.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.\n *\n * Decouples driver and worker. The driver (running `runImprovementLoop` or\n * `runCampaign`) can live anywhere — your VPC, a dev laptop, a cron VM. The\n * workers (running the actual agent) can live anywhere else — different\n * regions, different clouds, different boxes — as long as they speak HTTP.\n *\n * Both sides:\n *\n * - **`httpDispatch({ url | resolveUrl, ... })`** — client. Returns a\n * `Dispatch` that POSTs `{ scenario, ctx }` to a worker URL and parses\n * the artifact back. AbortSignal-aware, retries on idempotent errors,\n * bounded timeout per call.\n * - **`runDispatchServer({ dispatch, port, ... })`** — server. Wraps your\n * local `Dispatch` as an HTTP endpoint. Handles auth, JSON parsing,\n * error mapping, and cancellation when the client aborts.\n *\n * # Topology examples\n *\n * **Single-worker:** driver on box A, worker on box B. Set\n * `httpDispatch({ url: 'https://box-b/dispatch' })`.\n *\n * **Multi-region:** N workers across regions. Use `httpDispatch({ resolveUrl })`\n * with a function that picks the URL per cell from `ctx.placement`. 
Combined\n * with `cellPlacement` on `RunCampaignOptions`, the substrate fans cells\n * across geographies in parallel.\n *\n * **Driver-as-a-service:** driver runs as a long-lived process or service\n * (holds optimization state across generations); workers are stateless\n * HTTP services that can scale horizontally per cell.\n */\n\nimport type { Dispatch, DispatchContext, Scenario } from '../contract'\n\n// ── Client ───────────────────────────────────────────────────────────\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars -- TArtifact is unused\n// in this options interface but kept as a parameter so callers can write\n// `HttpDispatchOptions<MyScenario, MyArtifact>` symmetrically with\n// `Dispatch<MyScenario, MyArtifact>`. Marking it unused at the position\n// where it bites.\nexport interface HttpDispatchOptions<TScenario extends Scenario, _TArtifact> {\n /** Static endpoint URL. Mutually exclusive with `resolveUrl`. */\n url?: string\n /**\n * Dynamic per-cell URL resolver. Receives the scenario + the substrate\n * placement key (from `RunCampaignOptions.cellPlacement`) and returns the\n * worker URL to invoke. Mutually exclusive with `url`.\n */\n resolveUrl?: (input: { scenario: TScenario; placement?: string; cellId: string }) => string\n /** Bearer token or static auth string set as `Authorization`. */\n auth?: string | (() => string | Promise<string>)\n /** Extra headers merged into every request. */\n headers?: Record<string, string>\n /** Per-call timeout in ms. Default 5 minutes. */\n timeoutMs?: number\n /** How many idempotent retries on 5xx / network errors. Default 2. */\n retries?: number\n /** Optional fetch override (auth wrappers, custom agent, mocks). 
*/\n fetchImpl?: typeof fetch\n}\n\nexport interface HttpDispatchRequestBody<TScenario extends Scenario> {\n scenario: TScenario\n cellId: string\n rep: number\n generation?: number\n seed: number\n placement?: string\n cycleId?: string\n}\n\nexport interface HttpDispatchResponseBody<TArtifact> {\n artifact: TArtifact\n}\n\nfunction resolveAuth(auth: HttpDispatchOptions<Scenario, unknown>['auth']): Promise<string | null> {\n if (!auth) return Promise.resolve(null)\n if (typeof auth === 'string') return Promise.resolve(auth)\n return Promise.resolve(auth())\n}\n\n/**\n * Wrap a remote HTTP endpoint as a `Dispatch`. The remote side should run\n * `runDispatchServer` (or any service that speaks the same wire shape).\n *\n * Cancellation: the substrate's per-cell `AbortSignal` is forwarded; the\n * server's `runDispatchServer` translates the resulting `AbortError` into\n * a 499 (client-closed) so the client doesn't retry.\n */\nexport function httpDispatch<TScenario extends Scenario, TArtifact>(\n opts: HttpDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n if (!opts.url && !opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`.')\n }\n if (opts.url && opts.resolveUrl) {\n throw new Error('httpDispatch: pass exactly one of `url` or `resolveUrl`, not both.')\n }\n const timeoutMs = opts.timeoutMs ?? 5 * 60 * 1000\n const maxRetries = opts.retries ?? 2\n const f: typeof fetch = opts.fetchImpl ?? ((...args) => fetch(...args))\n\n return async (scenario, ctx) => {\n const url =\n opts.url ?? 
opts.resolveUrl!({ scenario, placement: ctx.placement, cellId: ctx.cellId })\n const authValue = await resolveAuth(opts.auth)\n const body: HttpDispatchRequestBody<TScenario> = {\n scenario,\n cellId: ctx.cellId,\n rep: ctx.rep,\n generation: ctx.generation,\n seed: ctx.seed,\n placement: ctx.placement,\n cycleId: ctx.cycleId,\n }\n\n let lastError: unknown\n for (let attempt = 0; attempt <= maxRetries; attempt++) {\n // Compose the request signal: caller's signal OR our timeout.\n const ourTimeout = AbortSignal.timeout(timeoutMs)\n const combinedSignal = AbortSignal.any([ctx.signal, ourTimeout])\n try {\n const res = await f(url, {\n method: 'POST',\n headers: {\n 'Content-Type': 'application/json',\n ...(authValue\n ? {\n Authorization: authValue.startsWith('Bearer ')\n ? authValue\n : `Bearer ${authValue}`,\n }\n : {}),\n ...opts.headers,\n },\n body: JSON.stringify(body),\n signal: combinedSignal,\n })\n if (!res.ok) {\n // 4xx is non-retryable (caller error, auth, bad scenario shape).\n // 5xx / 408 / 429 / 502 / 503 / 504 are retryable.\n const retryable = res.status >= 500 || res.status === 408 || res.status === 429\n if (!retryable || attempt === maxRetries) {\n const text = await res.text().catch(() => '')\n throw new Error(`httpDispatch ${url} failed (${res.status}): ${text.slice(0, 500)}`)\n }\n // exponential backoff with jitter\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n continue\n }\n const parsed = (await res.json()) as HttpDispatchResponseBody<TArtifact>\n return parsed.artifact\n } catch (err) {\n // Caller-driven abort is terminal — never retry.\n if (ctx.signal.aborted) throw err\n lastError = err\n if (attempt === maxRetries) throw err\n await sleep(2 ** attempt * 200 + Math.random() * 200)\n }\n }\n throw lastError ?? 
new Error('httpDispatch exhausted retries')\n }\n}\n\nfunction sleep(ms: number): Promise<void> {\n return new Promise((resolve) => {\n const t = setTimeout(resolve, ms)\n // Don't keep node process alive purely for backoff sleeps.\n if (typeof (t as { unref?: () => void }).unref === 'function')\n (t as { unref: () => void }).unref()\n })\n}\n\n// ── Server ───────────────────────────────────────────────────────────\n\nexport interface RunDispatchServerOptions<TScenario extends Scenario, TArtifact> {\n /** The Dispatch this server exposes — what runs when a request lands. */\n dispatch: Dispatch<TScenario, TArtifact>\n /** TCP port to bind. */\n port: number\n /** Optional bind host; defaults to 0.0.0.0. */\n host?: string\n /** Required for any non-test deployment: the bearer token clients must\n * send. The substrate refuses to start without auth unless `auth: false`\n * is set explicitly (intended ONLY for closed-network/internal testing). */\n auth: string | false\n /** Path the server listens on. Default `/dispatch`. */\n path?: string\n /**\n * Per-request handler that wraps `dispatch` with whatever context the\n * worker side needs to construct a `DispatchContext` — typically the\n * trace writer, artifact writer, and cost meter. The substrate provides\n * synthetic-but-typed defaults if not supplied; production deployments\n * should wire real ones (e.g. ship traces to your OTel collector).\n */\n contextFactory?: (\n req: HttpDispatchRequestBody<TScenario>,\n signal: AbortSignal,\n ) => Promise<DispatchContext>\n /** Optional max payload size for the request body (bytes). Default 10 MB. */\n maxBodyBytes?: number\n /** Hook for observability — called on every successful or failed turn. */\n onRequest?: (event: {\n cellId: string\n durationMs: number\n success: boolean\n error?: unknown\n }) => void\n}\n\nexport interface DispatchServerHandle {\n /** The actual bound port (useful when `port: 0` requests an ephemeral port). 
*/\n port: number\n /** Stop accepting new connections and drain existing ones. */\n close: () => Promise<void>\n}\n\n/**\n * Start an HTTP server exposing a local `Dispatch` over the wire. Pair with\n * `httpDispatch` on the driver side.\n *\n * Wire shape:\n *\n * POST /dispatch\n * Authorization: Bearer <token>\n * Body: HttpDispatchRequestBody\n * 200 OK: HttpDispatchResponseBody\n * 401: missing/invalid auth\n * 408: per-request timeout exceeded\n * 499: client aborted before completion\n * 500: dispatch threw\n *\n * The server is `node:http`-based to keep the runtime dependency surface\n * minimal — works in plain Node, sandbox, or any container.\n */\nexport async function runDispatchServer<TScenario extends Scenario, TArtifact>(\n opts: RunDispatchServerOptions<TScenario, TArtifact>,\n): Promise<DispatchServerHandle> {\n if (opts.auth === undefined) {\n throw new Error(\n \"runDispatchServer: 'auth' is required (pass a bearer-token string, or `auth: false` explicitly for a closed-network test deployment).\",\n )\n }\n const path = opts.path ?? '/dispatch'\n const maxBytes = opts.maxBodyBytes ?? 10 * 1024 * 1024\n const expectedAuth =\n typeof opts.auth === 'string' ? `Bearer ${opts.auth.replace(/^Bearer\\s+/, '')}` : null\n\n // Lazy-import node:http so the file is usable from non-Node bundlers\n // that import the client side only (e.g. an edge driver shipping\n // httpDispatch alone). 
Server side is opt-in by calling this function.\n const { createServer } = await import('node:http')\n\n const server = createServer(async (req, res) => {\n const start = Date.now()\n let cellId = 'unknown'\n let success = false\n let errCaught: unknown\n\n try {\n if (req.method !== 'POST' || req.url?.split('?')[0] !== path) {\n res.statusCode = 404\n res.end('not found')\n return\n }\n if (expectedAuth) {\n const got = req.headers['authorization']\n if (got !== expectedAuth) {\n res.statusCode = 401\n res.end('unauthorized')\n return\n }\n }\n\n // Read body up to maxBytes\n const chunks: Buffer[] = []\n let totalBytes = 0\n const aborter = new AbortController()\n req.on('close', () => {\n if (!res.writableEnded) aborter.abort()\n })\n\n for await (const chunk of req) {\n const buf = chunk as Buffer\n totalBytes += buf.length\n if (totalBytes > maxBytes) {\n res.statusCode = 413\n res.end('payload too large')\n return\n }\n chunks.push(buf)\n }\n\n const body = JSON.parse(\n Buffer.concat(chunks).toString('utf8'),\n ) as HttpDispatchRequestBody<TScenario>\n cellId = body.cellId\n\n const ctx: DispatchContext = opts.contextFactory\n ? 
await opts.contextFactory(body, aborter.signal)\n : {\n cellId: body.cellId,\n rep: body.rep,\n generation: body.generation,\n seed: body.seed,\n signal: aborter.signal,\n placement: body.placement,\n cycleId: body.cycleId,\n trace: NOOP_TRACE,\n artifacts: NOOP_ARTIFACTS,\n cost: NOOP_COST,\n }\n\n const artifact = await opts.dispatch(body.scenario, ctx)\n const responseBody: HttpDispatchResponseBody<TArtifact> = { artifact }\n\n res.statusCode = 200\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify(responseBody))\n success = true\n } catch (err) {\n errCaught = err\n // Client-cancelled — they don't care about the result.\n if ((err as Error)?.name === 'AbortError') {\n res.statusCode = 499\n res.end('client aborted')\n return\n }\n res.statusCode = 500\n res.setHeader('content-type', 'application/json')\n res.end(JSON.stringify({ error: err instanceof Error ? err.message : String(err) }))\n } finally {\n opts.onRequest?.({\n cellId,\n durationMs: Date.now() - start,\n success,\n error: errCaught,\n })\n }\n })\n\n await new Promise<void>((resolve, reject) => {\n server.once('error', reject)\n server.listen(opts.port, opts.host ?? '0.0.0.0', () => resolve())\n })\n\n const addr = server.address()\n const boundPort = typeof addr === 'object' && addr ? addr.port : opts.port\n\n return {\n port: boundPort,\n close: () =>\n new Promise<void>((resolve, reject) => {\n server.close((err) => (err ? 
reject(err) : resolve()))\n }),\n }\n}\n\n// ── No-op default ctx machinery (worker can replace via contextFactory) ──\n\nconst NOOP_TRACE = {\n span: () => ({\n end: () => {},\n setAttribute: () => {},\n setStatus: () => {},\n recordException: () => {},\n addEvent: () => {},\n }),\n} as unknown as DispatchContext['trace']\n\nconst NOOP_ARTIFACTS = {\n write: async () => undefined,\n read: async () => undefined,\n list: async () => [],\n} as unknown as DispatchContext['artifacts']\n\nconst NOOP_COST = {\n record: () => {},\n total: () => 0,\n} as unknown as DispatchContext['cost']\n"],"mappings":";;;AA6EA,SAAS,YAAY,MAA8E;AACjG,MAAI,CAAC,KAAM,QAAO,QAAQ,QAAQ,IAAI;AACtC,MAAI,OAAO,SAAS,SAAU,QAAO,QAAQ,QAAQ,IAAI;AACzD,SAAO,QAAQ,QAAQ,KAAK,CAAC;AAC/B;AAUO,SAAS,aACd,MACgC;AAChC,MAAI,CAAC,KAAK,OAAO,CAAC,KAAK,YAAY;AACjC,UAAM,IAAI,MAAM,0DAA0D;AAAA,EAC5E;AACA,MAAI,KAAK,OAAO,KAAK,YAAY;AAC/B,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AACA,QAAM,YAAY,KAAK,aAAa,IAAI,KAAK;AAC7C,QAAM,aAAa,KAAK,WAAW;AACnC,QAAM,IAAkB,KAAK,cAAc,IAAI,SAAS,MAAM,GAAG,IAAI;AAErE,SAAO,OAAO,UAAU,QAAQ;AAC9B,UAAM,MACJ,KAAK,OAAO,KAAK,WAAY,EAAE,UAAU,WAAW,IAAI,WAAW,QAAQ,IAAI,OAAO,CAAC;AACzF,UAAM,YAAY,MAAM,YAAY,KAAK,IAAI;AAC7C,UAAM,OAA2C;AAAA,MAC/C;AAAA,MACA,QAAQ,IAAI;AAAA,MACZ,KAAK,IAAI;AAAA,MACT,YAAY,IAAI;AAAA,MAChB,MAAM,IAAI;AAAA,MACV,WAAW,IAAI;AAAA,MACf,SAAS,IAAI;AAAA,IACf;AAEA,QAAI;AACJ,aAAS,UAAU,GAAG,WAAW,YAAY,WAAW;AAEtD,YAAM,aAAa,YAAY,QAAQ,SAAS;AAChD,YAAM,iBAAiB,YAAY,IAAI,CAAC,IAAI,QAAQ,UAAU,CAAC;AAC/D,UAAI;AACF,cAAM,MAAM,MAAM,EAAE,KAAK;AAAA,UACvB,QAAQ;AAAA,UACR,SAAS;AAAA,YACP,gBAAgB;AAAA,YAChB,GAAI,YACA;AAAA,cACE,eAAe,UAAU,WAAW,SAAS,IACzC,YACA,UAAU,SAAS;AAAA,YACzB,IACA,CAAC;AAAA,YACL,GAAG,KAAK;AAAA,UACV;AAAA,UACA,MAAM,KAAK,UAAU,IAAI;AAAA,UACzB,QAAQ;AAAA,QACV,CAAC;AACD,YAAI,CAAC,IAAI,IAAI;AAGX,gBAAM,YAAY,IAAI,UAAU,OAAO,IAAI,WAAW,OAAO,IAAI,WAAW;AAC5E,cAAI,CAAC,aAAa,YAAY,YAAY;AACxC,kBAAM,OAAO,MAAM,IAAI,KAAK,EAAE,MAAM,MAAM,EAAE;AAC5C,kBAAM,IAAI,MAAM,gBAAgB,GAAG,YAAY,IAAI,MAAM,MAAM,KAAK,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,UACrF;A
AEA,gBAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AACpD;AAAA,QACF;AACA,cAAM,SAAU,MAAM,IAAI,KAAK;AAC/B,eAAO,OAAO;AAAA,MAChB,SAAS,KAAK;AAEZ,YAAI,IAAI,OAAO,QAAS,OAAM;AAC9B,oBAAY;AACZ,YAAI,YAAY,WAAY,OAAM;AAClC,cAAM,MAAM,KAAK,UAAU,MAAM,KAAK,OAAO,IAAI,GAAG;AAAA,MACtD;AAAA,IACF;AACA,UAAM,aAAa,IAAI,MAAM,gCAAgC;AAAA,EAC/D;AACF;AAEA,SAAS,MAAM,IAA2B;AACxC,SAAO,IAAI,QAAQ,CAAC,YAAY;AAC9B,UAAM,IAAI,WAAW,SAAS,EAAE;AAEhC,QAAI,OAAQ,EAA6B,UAAU;AACjD,MAAC,EAA4B,MAAM;AAAA,EACvC,CAAC;AACH;AAgEA,eAAsB,kBACpB,MAC+B;AAC/B,MAAI,KAAK,SAAS,QAAW;AAC3B,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,WAAW,KAAK,gBAAgB,KAAK,OAAO;AAClD,QAAM,eACJ,OAAO,KAAK,SAAS,WAAW,UAAU,KAAK,KAAK,QAAQ,cAAc,EAAE,CAAC,KAAK;AAKpF,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,MAAW;AAEjD,QAAM,SAAS,aAAa,OAAO,KAAK,QAAQ;AAC9C,UAAM,QAAQ,KAAK,IAAI;AACvB,QAAI,SAAS;AACb,QAAI,UAAU;AACd,QAAI;AAEJ,QAAI;AACF,UAAI,IAAI,WAAW,UAAU,IAAI,KAAK,MAAM,GAAG,EAAE,CAAC,MAAM,MAAM;AAC5D,YAAI,aAAa;AACjB,YAAI,IAAI,WAAW;AACnB;AAAA,MACF;AACA,UAAI,cAAc;AAChB,cAAM,MAAM,IAAI,QAAQ,eAAe;AACvC,YAAI,QAAQ,cAAc;AACxB,cAAI,aAAa;AACjB,cAAI,IAAI,cAAc;AACtB;AAAA,QACF;AAAA,MACF;AAGA,YAAM,SAAmB,CAAC;AAC1B,UAAI,aAAa;AACjB,YAAM,UAAU,IAAI,gBAAgB;AACpC,UAAI,GAAG,SAAS,MAAM;AACpB,YAAI,CAAC,IAAI,cAAe,SAAQ,MAAM;AAAA,MACxC,CAAC;AAED,uBAAiB,SAAS,KAAK;AAC7B,cAAM,MAAM;AACZ,sBAAc,IAAI;AAClB,YAAI,aAAa,UAAU;AACzB,cAAI,aAAa;AACjB,cAAI,IAAI,mBAAmB;AAC3B;AAAA,QACF;AACA,eAAO,KAAK,GAAG;AAAA,MACjB;AAEA,YAAM,OAAO,KAAK;AAAA,QAChB,OAAO,OAAO,MAAM,EAAE,SAAS,MAAM;AAAA,MACvC;AACA,eAAS,KAAK;AAEd,YAAM,MAAuB,KAAK,iBAC9B,MAAM,KAAK,eAAe,MAAM,QAAQ,MAAM,IAC9C;AAAA,QACE,QAAQ,KAAK;AAAA,QACb,KAAK,KAAK;AAAA,QACV,YAAY,KAAK;AAAA,QACjB,MAAM,KAAK;AAAA,QACX,QAAQ,QAAQ;AAAA,QAChB,WAAW,KAAK;AAAA,QAChB,SAAS,KAAK;AAAA,QACd,OAAO;AAAA,QACP,WAAW;AAAA,QACX,MAAM;AAAA,MACR;AAEJ,YAAM,WAAW,MAAM,KAAK,SAAS,KAAK,UAAU,GAAG;AACvD,YAAM,eAAoD,EAAE,SAAS;AAErE,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,YAAY,CAAC;AACpC,gBAAU;AAAA,IACZ,SAAS,KAAK;AACZ,kBAAY;AAEZ,UAAK,KAAe,SAAS,cAAc;AACzC,YAAI,aAA
a;AACjB,YAAI,IAAI,gBAAgB;AACxB;AAAA,MACF;AACA,UAAI,aAAa;AACjB,UAAI,UAAU,gBAAgB,kBAAkB;AAChD,UAAI,IAAI,KAAK,UAAU,EAAE,OAAO,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,EAAE,CAAC,CAAC;AAAA,IACrF,UAAE;AACA,WAAK,YAAY;AAAA,QACf;AAAA,QACA,YAAY,KAAK,IAAI,IAAI;AAAA,QACzB;AAAA,QACA,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AAAA,EACF,CAAC;AAED,QAAM,IAAI,QAAc,CAAC,SAAS,WAAW;AAC3C,WAAO,KAAK,SAAS,MAAM;AAC3B,WAAO,OAAO,KAAK,MAAM,KAAK,QAAQ,WAAW,MAAM,QAAQ,CAAC;AAAA,EAClE,CAAC;AAED,QAAM,OAAO,OAAO,QAAQ;AAC5B,QAAM,YAAY,OAAO,SAAS,YAAY,OAAO,KAAK,OAAO,KAAK;AAEtE,SAAO;AAAA,IACL,MAAM;AAAA,IACN,OAAO,MACL,IAAI,QAAc,CAAC,SAAS,WAAW;AACrC,aAAO,MAAM,CAAC,QAAS,MAAM,OAAO,GAAG,IAAI,QAAQ,CAAE;AAAA,IACvD,CAAC;AAAA,EACL;AACF;AAIA,IAAM,aAAa;AAAA,EACjB,MAAM,OAAO;AAAA,IACX,KAAK,MAAM;AAAA,IAAC;AAAA,IACZ,cAAc,MAAM;AAAA,IAAC;AAAA,IACrB,WAAW,MAAM;AAAA,IAAC;AAAA,IAClB,iBAAiB,MAAM;AAAA,IAAC;AAAA,IACxB,UAAU,MAAM;AAAA,IAAC;AAAA,EACnB;AACF;AAEA,IAAM,iBAAiB;AAAA,EACrB,OAAO,YAAY;AAAA,EACnB,MAAM,YAAY;AAAA,EAClB,MAAM,YAAY,CAAC;AACrB;AAEA,IAAM,YAAY;AAAA,EAChB,QAAQ,MAAM;AAAA,EAAC;AAAA,EACf,OAAO,MAAM;AACf;","names":[]}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, n as JudgeScore,
|
|
1
|
+
import { S as Scenario, n as JudgeScore, g as DispatchFn, J as JudgeConfig } from '../types-8u72Gc76.js';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
|
package/dist/campaign/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, m as OpenAutoPrResult,
|
|
2
|
-
import { L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, f as CodeSurface } from '../types-
|
|
3
|
-
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter,
|
|
1
|
+
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, m as OpenAutoPrResult, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, R as RunImprovementLoopResult, n as RunOptimizationOptions, o as RunOptimizationResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, p as openAutoPr, r as runCampaign, k as runEval, l as runImprovementLoop, q as runOptimization, s as surfaceHash } from '../run-improvement-loop-Bfam3MT1.js';
|
|
2
|
+
import { L as LabeledScenarioStore, q as LabeledScenarioWrite, r as LabeledScenarioSampleArgs, s as LabeledScenarioRecord, f as CodeSurface } from '../types-8u72Gc76.js';
|
|
3
|
+
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, D as DispatchContext, g as DispatchFn, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, u as LabeledScenarioSource, M as MutableSurface, o as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario, v as ScenarioAggregate, p as SessionScript, T as TraceSpan } from '../types-8u72Gc76.js';
|
|
4
4
|
import '../llm-client-BXVRUZyX.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
|
6
6
|
import '../raw-provider-sink-C46HDghv.js';
|
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
export { C as
|
|
1
|
+
import { S as Scenario, M as MutableSurface, D as DispatchContext, J as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-8u72Gc76.js';
|
|
2
|
+
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, g as Dispatch, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, m as JudgeDimension, n as JudgeScore, o as Mutator, O as OptimizerConfig, p as SessionScript } from '../types-8u72Gc76.js';
|
|
3
|
+
import { C as CampaignStorage, R as RunImprovementLoopResult } from '../run-improvement-loop-Bfam3MT1.js';
|
|
4
|
+
export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, a as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-Bfam3MT1.js';
|
|
3
5
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
|
|
4
6
|
import '../llm-client-BXVRUZyX.js';
|
|
5
7
|
import '../errors-mje_cKOs.js';
|
|
@@ -8,3 +10,198 @@ import '@tangle-network/agent-runtime';
|
|
|
8
10
|
import '../red-team-30II1T4o.js';
|
|
9
11
|
import '../dataset-BlwAtYYf.js';
|
|
10
12
|
import '../store-Db2Bv8Cf.js';
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* # `selfImprove()` — the LAND-tier one-shot.
|
|
16
|
+
*
|
|
17
|
+
* The cheapest possible call site to run a real closed-loop self-
|
|
18
|
+
* improvement over your agent. Wraps `runImprovementLoop` with smart
|
|
19
|
+
* defaults and a budget-shaped options API; every escape hatch the
|
|
20
|
+
* substrate exposes is reachable from here without losing the
|
|
21
|
+
* one-function feel.
|
|
22
|
+
*
|
|
23
|
+
* Defaults picked to match the LAND-tier story:
|
|
24
|
+
* - In-memory storage (no filesystem touch).
|
|
25
|
+
* - `gepaDriver` reflective mutation with copywriting-flavored primitives
|
|
26
|
+
* (override `driver` or `mutationPrimitives` for any domain).
|
|
27
|
+
* - `defaultProductionGate` with `deltaThreshold: 0.05`.
|
|
28
|
+
* - Held-out split = 25% of scenarios, deterministic by id hash.
|
|
29
|
+
* - 3 generations × population 2 (raise via `budget` for more search).
|
|
30
|
+
* - `autoOnPromote: 'none'` (we don't open PRs unless you ask).
|
|
31
|
+
*
|
|
32
|
+
* Want one-click? Provide `agent` + `scenarios` + `judge`. Done.
|
|
33
|
+
* Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed
|
|
34
|
+
* agent. Want a code-tier surface? Pass a `MutableSurface` + your own
|
|
35
|
+
* `driver`. Same function.
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
interface SelfImproveBudget {
|
|
39
|
+
/** Hard $ ceiling across all cells in baseline + every generation. Cells
|
|
40
|
+
* beyond the ceiling are skipped (cost-aware, not aborted). */
|
|
41
|
+
dollars?: number;
|
|
42
|
+
/** How many improvement generations to explore. Default 3. Set 0 to
|
|
43
|
+
* skip improvement entirely (selfImprove becomes a baseline-only run). */
|
|
44
|
+
generations?: number;
|
|
45
|
+
/** Candidates the driver proposes per generation. Default 2. */
|
|
46
|
+
populationSize?: number;
|
|
47
|
+
/** Max concurrent cells across the loop. Default 2. */
|
|
48
|
+
maxConcurrency?: number;
|
|
49
|
+
/** Fraction of `scenarios` held out from training, used for the gate.
|
|
50
|
+
* Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */
|
|
51
|
+
holdoutFraction?: number;
|
|
52
|
+
/** Explicit held-out scenarios; overrides `holdoutFraction`. */
|
|
53
|
+
holdoutScenarios?: Scenario[];
|
|
54
|
+
}
|
|
55
|
+
interface SelfImproveLlm {
|
|
56
|
+
/** Endpoint base URL. Default Tangle Router. */
|
|
57
|
+
baseUrl?: string;
|
|
58
|
+
/** Bearer token. Default `process.env.OPENAI_API_KEY`, falling back to an empty string when that variable is unset. */
|
|
59
|
+
apiKey?: string;
|
|
60
|
+
/** Model id used by `gepaDriver` reflection. Default
|
|
61
|
+
* `anthropic/claude-sonnet-4.6`. */
|
|
62
|
+
model?: string;
|
|
63
|
+
}
|
|
64
|
+
type SelfImproveProgressEvent = {
|
|
65
|
+
kind: 'baseline.started';
|
|
66
|
+
scenarios: number;
|
|
67
|
+
} | {
|
|
68
|
+
kind: 'baseline.completed';
|
|
69
|
+
compositeMean: number;
|
|
70
|
+
durationMs: number;
|
|
71
|
+
} | {
|
|
72
|
+
kind: 'generation.started';
|
|
73
|
+
index: number;
|
|
74
|
+
populationSize: number;
|
|
75
|
+
} | {
|
|
76
|
+
kind: 'generation.completed';
|
|
77
|
+
index: number;
|
|
78
|
+
bestComposite: number;
|
|
79
|
+
durationMs: number;
|
|
80
|
+
} | {
|
|
81
|
+
kind: 'gate.decided';
|
|
82
|
+
decision: string;
|
|
83
|
+
lift: number;
|
|
84
|
+
};
|
|
85
|
+
interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {
|
|
86
|
+
/**
|
|
87
|
+
* Your agent — a function that takes the current `MutableSurface`
|
|
88
|
+
* (typically a system prompt the loop is optimizing) plus the
|
|
89
|
+
* scenario + cell ctx, and returns the artifact your judge scores.
|
|
90
|
+
*
|
|
91
|
+
* Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a
|
|
92
|
+
* plain `Dispatch` if you don't have a surface seam:
|
|
93
|
+
*
|
|
94
|
+
* agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)
|
|
95
|
+
*
|
|
96
|
+
* That mode evaluates without mutating any surface — useful as a
|
|
97
|
+
* baseline-only run (set `budget.generations = 0`).
|
|
98
|
+
*/
|
|
99
|
+
agent: (surface: MutableSurface, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
|
|
100
|
+
/** Scenarios to evaluate against. Train/holdout split is computed from
|
|
101
|
+
* these unless `budget.holdoutScenarios` is set explicitly. */
|
|
102
|
+
scenarios: TScenario[];
|
|
103
|
+
/** Judge that scores artifacts. Bring your own; use `langchainJudge`
|
|
104
|
+
* from `/adapters/langchain` for a Runnable-shaped one. */
|
|
105
|
+
judge: JudgeConfig<TArtifact, TScenario>;
|
|
106
|
+
/** Starting surface — system prompt, JSON config, anything `MutableSurface`
|
|
107
|
+
* accepts. The driver mutates this each generation. */
|
|
108
|
+
baselineSurface: MutableSurface;
|
|
109
|
+
/** Budget + loop shape. All fields optional; defaults pick the LAND-tier
|
|
110
|
+
* story. */
|
|
111
|
+
budget?: SelfImproveBudget;
|
|
112
|
+
/** Custom driver. Default is `gepaDriver` configured from `llm` +
|
|
113
|
+
* `mutationPrimitives`. */
|
|
114
|
+
driver?: ImprovementDriver;
|
|
115
|
+
/** Default-driver overrides — used when `driver` is unset. */
|
|
116
|
+
mutationPrimitives?: string[];
|
|
117
|
+
driverTarget?: string;
|
|
118
|
+
/** Custom gate. Default is `defaultProductionGate` with
|
|
119
|
+
* `deltaThreshold: 0.05` on the held-out split. */
|
|
120
|
+
gate?: Gate<TArtifact, TScenario>;
|
|
121
|
+
/** LLM config consumed by the default `gepaDriver`. Ignored if you pass
|
|
122
|
+
* your own `driver`. */
|
|
123
|
+
llm?: SelfImproveLlm;
|
|
124
|
+
/** Storage backend. Default `inMemoryCampaignStorage()` — nothing
|
|
125
|
+
* persists past the call. Pass `fsCampaignStorage()` to write to disk. */
|
|
126
|
+
storage?: CampaignStorage;
|
|
127
|
+
/** Run directory (logical for in-memory storage, real path for fs).
|
|
128
|
+
* Default `mem://selfImprove-<timestamp>`. */
|
|
129
|
+
runDir?: string;
|
|
130
|
+
/** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.
|
|
131
|
+
* Returns an opaque placement key the substrate forwards to your agent
|
|
132
|
+
* as `ctx.placement`. Combined with `httpDispatch` from
|
|
133
|
+
* `/adapters/http`, fans cells across regions. */
|
|
134
|
+
cellPlacement?: (input: {
|
|
135
|
+
scenario: TScenario;
|
|
136
|
+
rep: number;
|
|
137
|
+
generation?: number;
|
|
138
|
+
}) => string | undefined;
|
|
139
|
+
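/** Progress hook — fires `baseline.started` before the loop runs, then `baseline.completed` and `gate.decided` once the loop has returned (the `generation.*` events are declared but not emitted by this release).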
|
|
140
|
+
* Consumer routes events wherever (UI, dashboard, logs). */
|
|
141
|
+
onProgress?: (event: SelfImproveProgressEvent) => void;
|
|
142
|
+
/** Auto-promotion behavior on a ship decision. Default `'none'` — we
|
|
143
|
+
* return the winner; you ship it however you ship. `'pr'` opens a
|
|
144
|
+
* GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */
|
|
145
|
+
autoOnPromote?: 'pr' | 'none';
|
|
146
|
+
ghOwner?: string;
|
|
147
|
+
ghRepo?: string;
|
|
148
|
+
}
|
|
149
|
+
interface SelfImproveResult<TScenario extends Scenario, TArtifact> {
|
|
150
|
+
/** Composite mean on the held-out set, baseline run (derived from `raw.baselineOnHoldout`). */
|
|
151
|
+
baseline: {
|
|
152
|
+
compositeMean: number;
|
|
153
|
+
perScenario: Record<string, number>;
|
|
154
|
+
};
|
|
155
|
+
/** Composite mean on the held-out set, winner run. */
|
|
156
|
+
winner: {
|
|
157
|
+
compositeMean: number;
|
|
158
|
+
perScenario: Record<string, number>;
|
|
159
|
+
surface: MutableSurface;
|
|
160
|
+
};
|
|
161
|
+
/** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive
|
|
162
|
+
* means the gate observed improvement. */
|
|
163
|
+
lift: number;
|
|
164
|
+
/** Decision returned by the gate — your custom `gate` when supplied, otherwise `defaultProductionGate`. */
|
|
165
|
+
gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
|
|
166
|
+
/** Number of generations actually explored (may be less than the
|
|
167
|
+
* budget if the driver gave up early). */
|
|
168
|
+
generationsExplored: number;
|
|
169
|
+
/** Wall-clock total. */
|
|
170
|
+
durationMs: number;
|
|
171
|
+
/** Total cost across baseline + every generation. */
|
|
172
|
+
totalCostUsd: number;
|
|
173
|
+
/**
|
|
174
|
+
* Raw substrate result for advanced inspection — full per-generation
|
|
175
|
+
* candidates, full campaign artifacts, all judge scores. Useful for
|
|
176
|
+
* debugging or reporting beyond the summary.
|
|
177
|
+
*/
|
|
178
|
+
raw: RunImprovementLoopResult<TArtifact, TScenario>;
|
|
179
|
+
}
|
|
180
|
+
/**
|
|
181
|
+
* One-shot self-improvement loop. See module docstring for defaults +
|
|
182
|
+
* extension points.
|
|
183
|
+
*
|
|
184
|
+
* @example Minimum (LAND tier):
|
|
185
|
+
*
|
|
186
|
+
* const result = await selfImprove({
|
|
187
|
+
* agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),
|
|
188
|
+
* scenarios,
|
|
189
|
+
* judge,
|
|
190
|
+
* baselineSurface: DEFAULT_PROMPT,
|
|
191
|
+
* })
|
|
192
|
+
* console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
|
|
193
|
+
*
|
|
194
|
+
* @example Distributed (workers in three regions):
|
|
195
|
+
*
|
|
196
|
+
* await selfImprove({
|
|
197
|
+
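* agent: (_surface, scenario, ctx) => regionDispatch(scenario, ctx), // regionDispatch = httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }), created once outside the call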
|
|
198
|
+
* scenarios,
|
|
199
|
+
* judge,
|
|
200
|
+
* baselineSurface: DEFAULT_PROMPT,
|
|
201
|
+
* cellPlacement: ({ scenario }) => scenario.region,
|
|
202
|
+
* budget: { maxConcurrency: 12 },
|
|
203
|
+
* })
|
|
204
|
+
*/
|
|
205
|
+
declare function selfImprove<TScenario extends Scenario, TArtifact>(opts: SelfImproveOptions<TScenario, TArtifact>): Promise<SelfImproveResult<TScenario, TArtifact>>;
|
|
206
|
+
|
|
207
|
+
export { CampaignStorage, DispatchContext, Gate, ImprovementDriver, JudgeConfig, MutableSurface, RunImprovementLoopResult, Scenario, type SelfImproveBudget, type SelfImproveLlm, type SelfImproveOptions, type SelfImproveProgressEvent, type SelfImproveResult, selfImprove };
|
package/dist/contract/index.js
CHANGED
|
@@ -24,6 +24,130 @@ import "../chunk-VXNVVBZO.js";
|
|
|
24
24
|
import "../chunk-PC4UYEBM.js";
|
|
25
25
|
import "../chunk-QYJT52YW.js";
|
|
26
26
|
import "../chunk-NSBPE2FW.js";
|
|
27
|
+
|
|
28
|
+
// src/contract/self-improve.ts
|
|
29
|
+
/**
 * Deterministically partition `scenarios` into a holdout set and a train set.
 *
 * Scenarios are ordered by a 32-bit FNV-1a-style hash of `scenario.id`, so the
 * same set of ids splits the same way on every run. Hash collisions are
 * broken by comparing the ids themselves (code-unit order), which keeps the
 * result independent of the order the caller passed the scenarios in.
 *
 * The holdout size is `round(length * fraction)`, clamped to the range
 * `[1, length - 1]`; the lower bound wins when the two conflict, so a
 * single-scenario input yields one holdout scenario and an empty train set.
 *
 * @param scenarios Objects carrying a string `id`. The array is not mutated.
 * @param fraction Share of scenarios to hold out.
 * @returns `{ holdout, train }` — disjoint slices of the hash-ordered list.
 * @throws {RangeError} When `fraction` is NaN, which would otherwise produce
 *   an empty holdout without explanation.
 */
function splitTrainHoldout(scenarios, fraction) {
  if (Number.isNaN(fraction)) {
    throw new RangeError("splitTrainHoldout: fraction must be a number, got NaN.");
  }
  // Same hash as before (offset basis 2166136261, prime 16777619) so existing
  // splits are unchanged for collision-free id sets.
  function hash(s) {
    let h = 2166136261 >>> 0;
    for (let i = 0; i < s.length; i++) {
      h ^= s.charCodeAt(i);
      h = Math.imul(h, 16777619) >>> 0;
    }
    return h;
  }
  // Hash each id once up front instead of on every comparison.
  const keyed = scenarios.map((scenario) => ({ scenario, key: hash(scenario.id) }));
  keyed.sort((a, b) => {
    const byHash = a.key - b.key;
    if (byHash !== 0) return byHash;
    // Collision: fall back to the id so input order cannot change the split.
    if (a.scenario.id < b.scenario.id) return -1;
    if (a.scenario.id > b.scenario.id) return 1;
    return 0;
  });
  const sorted = keyed.map((entry) => entry.scenario);
  const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)));
  return {
    holdout: sorted.slice(0, nHoldout),
    train: sorted.slice(nHoldout)
  };
}
|
|
45
|
+
/**
 * Collapse per-scenario aggregates into a summary.
 *
 * @param byScenario Map of scenario id to an aggregate exposing `meanComposite`.
 * @returns `perScenario` (id → that scenario's `meanComposite`) and
 *   `compositeMean`, the unweighted average of those values, or 0 when
 *   `byScenario` has no entries.
 */
function meanComposite(byScenario) {
  const perScenario = {};
  let total = 0;
  let count = 0;
  Object.keys(byScenario).forEach((scenarioId) => {
    const score = byScenario[scenarioId].meanComposite;
    perScenario[scenarioId] = score;
    total += score;
    count += 1;
  });
  // Guard the empty case explicitly so the result is 0 rather than NaN.
  const compositeMean = count === 0 ? 0 : total / count;
  return { compositeMean, perScenario };
}
|
|
57
|
+
// Fallback `mutationPrimitives` handed to the default `gepaDriver` in
// `selfImprove` when the caller does not supply `opts.mutationPrimitives`.
var DEFAULT_MUTATION_PRIMITIVES = [
|
|
58
|
+
"Tighten the hook: lead with the specific user outcome.",
|
|
59
|
+
"Replace generic adjectives with specific verbs or proof numbers.",
|
|
60
|
+
"Anchor every claim in something the scenario's brief literally supports.",
|
|
61
|
+
"Honor the surface-shape constraint (length, register, audience vocabulary)."
|
|
62
|
+
];
|
|
63
|
+
/**
 * One-shot self-improvement run: resolves defaults, splits scenarios into
 * train/holdout, delegates to `runImprovementLoop`, and condenses the raw
 * result into a summary.
 *
 * Defaults applied here: 3 generations, population 2, max concurrency 2,
 * holdout fraction 0.25, `gepaDriver` (Tangle Router base URL,
 * `OPENAI_API_KEY` or an empty key, `anthropic/claude-sonnet-4.6`),
 * `defaultProductionGate` with `deltaThreshold: 0.05`, in-memory storage,
 * `mem://selfImprove-<startedAt>` run dir, and `autoOnPromote: "none"`.
 *
 * Progress events: `baseline.started` fires before the loop; `baseline.completed`
 * and `gate.decided` fire only after `runImprovementLoop` has returned, so the
 * reported `durationMs` covers the whole run. No `generation.*` events are
 * emitted from this function.
 *
 * @param opts See `SelfImproveOptions`. `agent`, `scenarios`, `judge` and
 *   `baselineSurface` are forwarded to the loop; everything else is optional.
 * @returns Summary with `baseline` and `winner` (both computed from the
 *   holdout evaluations), `lift` (winner minus baseline composite mean), the
 *   gate decision, generation count, wall-clock duration, summed cost of the
 *   baseline campaign plus every generation surface, and the raw loop result.
 * @throws {Error} When the train split or the holdout split is empty.
 */
async function selfImprove(opts) {
  const startedAt = Date.now();

  const budget = opts.budget ?? {};
  const generations = budget.generations ?? 3;
  const populationSize = budget.populationSize ?? 2;
  const maxConcurrency = budget.maxConcurrency ?? 2;
  const holdoutFraction = budget.holdoutFraction ?? 0.25;
  const costCeiling = budget.dollars;

  const explicitHoldout = budget.holdoutScenarios;
  let train;
  let holdout;
  if (explicitHoldout) {
    // Index the held-out ids once so the filter is linear rather than
    // scanning the holdout list for every scenario.
    const heldOutIds = new Set(explicitHoldout.map((h) => h.id));
    train = opts.scenarios.filter((s) => !heldOutIds.has(s.id));
    holdout = explicitHoldout;
  } else {
    ({ train, holdout } = splitTrainHoldout(opts.scenarios, holdoutFraction));
  }
  if (train.length === 0) {
    throw new Error("selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.");
  }
  if (holdout.length === 0) {
    throw new Error("selfImprove: holdout split is empty. Pass more scenarios.");
  }

  // A caller-supplied driver/gate wins; `llm`, `driverTarget` and
  // `mutationPrimitives` only shape the default driver.
  const driver = opts.driver ?? gepaDriver({
    llm: {
      baseUrl: opts.llm?.baseUrl ?? "https://router.tangle.tools/v1",
      apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? ""
    },
    model: opts.llm?.model ?? "anthropic/claude-sonnet-4.6",
    target: opts.driverTarget ?? "agent surface (system prompt or config) being optimized by selfImprove",
    mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES
  });
  const gate = opts.gate ?? defaultProductionGate({
    holdoutScenarios: holdout,
    deltaThreshold: 0.05
  });
  const storage = opts.storage ?? inMemoryCampaignStorage();
  const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`;

  if (opts.onProgress) {
    // Reports the full scenario count, i.e. train + holdout before the split.
    opts.onProgress({ kind: "baseline.started", scenarios: opts.scenarios.length });
  }

  const result = await runImprovementLoop({
    scenarios: train,
    baselineSurface: opts.baselineSurface,
    dispatchWithSurface: opts.agent,
    driver,
    judges: [opts.judge],
    populationSize,
    maxGenerations: generations,
    holdoutScenarios: holdout,
    gate,
    autoOnPromote: opts.autoOnPromote ?? "none",
    ghOwner: opts.ghOwner,
    ghRepo: opts.ghRepo,
    storage,
    runDir,
    maxConcurrency,
    cellPlacement: opts.cellPlacement,
    costCeiling
  });

  // Both summaries come from the holdout evaluations so `lift` compares
  // like with like.
  const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario);
  const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario);
  const lift = winnerStats.compositeMean - baseline.compositeMean;

  if (opts.onProgress) {
    opts.onProgress({
      kind: "baseline.completed",
      compositeMean: baseline.compositeMean,
      durationMs: Date.now() - startedAt
    });
    opts.onProgress({
      kind: "gate.decided",
      decision: result.gateResult.decision,
      lift
    });
  }

  // Baseline campaign cost plus the cost of every surface tried in every generation.
  const totalCost = result.baselineCampaign.aggregates.totalCostUsd + result.generations.reduce(
    (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
    0
  );

  return {
    baseline,
    winner: {
      ...winnerStats,
      surface: result.winnerSurface
    },
    lift,
    gateDecision: result.gateResult.decision,
    generationsExplored: result.generations.length,
    durationMs: Date.now() - startedAt,
    totalCostUsd: totalCost,
    raw: result
  };
}
|
|
27
151
|
export {
|
|
28
152
|
FileSystemOutcomeStore,
|
|
29
153
|
InMemoryOutcomeStore,
|
|
@@ -36,6 +160,7 @@ export {
|
|
|
36
160
|
inMemoryCampaignStorage,
|
|
37
161
|
runCampaign,
|
|
38
162
|
runEval,
|
|
39
|
-
runImprovementLoop
|
|
163
|
+
runImprovementLoop,
|
|
164
|
+
selfImprove
|
|
40
165
|
};
|
|
41
166
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
|
1
|
+
{"version":3,"sources":["../../src/contract/self-improve.ts"],"sourcesContent":["/**\n * # `selfImprove()` — the LAND-tier one-shot.\n *\n * The cheapest possible call site to run a real closed-loop self-\n * improvement over your agent. Wraps `runImprovementLoop` with smart\n * defaults and a budget-shaped options API; every escape hatch the\n * substrate exposes is reachable from here without losing the\n * one-function feel.\n *\n * Defaults picked to match the LAND-tier story:\n * - In-memory storage (no filesystem touch).\n * - `gepaDriver` reflective mutation with copywriting-flavored primitives\n * (override `driver` or `mutationPrimitives` for any domain).\n * - `defaultProductionGate` with `deltaThreshold: 0.05`.\n * - Held-out split = 25% of scenarios, deterministic by id hash.\n * - 3 generations × population 2 (raise via `budget` for more search).\n * - `autoOnPromote: 'none'` (we don't open PRs unless you ask).\n *\n * Want one-click? Provide `agent` + `scenarios` + `judge`. Done.\n * Want distributed? Pass `cellPlacement` + an `httpDispatch`-backed\n * agent. Want a code-tier surface? Pass a `MutableSurface` + your own\n * `driver`. Same function.\n */\n\nimport { runImprovementLoop, type RunImprovementLoopResult } from '../campaign/presets/run-improvement-loop'\nimport { gepaDriver } from '../campaign/drivers/gepa'\nimport { defaultProductionGate } from '../campaign/gates/default-production-gate'\nimport { type CampaignStorage, inMemoryCampaignStorage } from '../campaign/storage'\nimport type {\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n MutableSurface,\n Scenario,\n} from '../campaign/types'\n\nexport interface SelfImproveBudget {\n /** Hard $ ceiling across all cells in baseline + every generation. Cells\n * beyond the ceiling are skipped (cost-aware, not aborted). */\n dollars?: number\n /** How many improvement generations to explore. Default 3. Set 0 to\n * skip improvement entirely (selfImprove becomes a baseline-only run). 
*/\n generations?: number\n /** Candidates the driver proposes per generation. Default 2. */\n populationSize?: number\n /** Max concurrent cells across the loop. Default 2. */\n maxConcurrency?: number\n /** Fraction of `scenarios` held out from training, used for the gate.\n * Default 0.25. Ignored when `holdoutScenarios` is set explicitly. */\n holdoutFraction?: number\n /** Explicit held-out scenarios; overrides `holdoutFraction`. */\n holdoutScenarios?: Scenario[]\n}\n\nexport interface SelfImproveLlm {\n /** Endpoint base URL. Default Tangle Router. */\n baseUrl?: string\n /** Bearer token. Default `process.env.OPENAI_API_KEY`. */\n apiKey?: string\n /** Model id used by `gepaDriver` reflection. Default\n * `anthropic/claude-sonnet-4.6`. */\n model?: string\n}\n\nexport type SelfImproveProgressEvent =\n | { kind: 'baseline.started'; scenarios: number }\n | { kind: 'baseline.completed'; compositeMean: number; durationMs: number }\n | { kind: 'generation.started'; index: number; populationSize: number }\n | { kind: 'generation.completed'; index: number; bestComposite: number; durationMs: number }\n | { kind: 'gate.decided'; decision: string; lift: number }\n\nexport interface SelfImproveOptions<TScenario extends Scenario, TArtifact> {\n /**\n * Your agent — a function that takes the current `MutableSurface`\n * (typically a system prompt the loop is optimizing) plus the\n * scenario + cell ctx, and returns the artifact your judge scores.\n *\n * Same shape as `RunOptimizationOptions.dispatchWithSurface`. Wrap a\n * plain `Dispatch` if you don't have a surface seam:\n *\n * agent: (_surface, scenario, ctx) => yourPlainDispatch(scenario, ctx)\n *\n * That mode evaluates without mutating any surface — useful as a\n * baseline-only run (set `budget.generations = 0`).\n */\n agent: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: DispatchContext,\n ) => Promise<TArtifact>\n\n /** Scenarios to evaluate against. 
Train/holdout split is computed from\n * these unless `budget.holdoutScenarios` is set explicitly. */\n scenarios: TScenario[]\n\n /** Judge that scores artifacts. Bring your own; use `langchainJudge`\n * from `/adapters/langchain` for a Runnable-shaped one. */\n judge: JudgeConfig<TArtifact, TScenario>\n\n /** Starting surface — system prompt, JSON config, anything `MutableSurface`\n * accepts. The driver mutates this each generation. */\n baselineSurface: MutableSurface\n\n /** Budget + loop shape. All fields optional; defaults pick the LAND-tier\n * story. */\n budget?: SelfImproveBudget\n\n /** Custom driver. Default is `gepaDriver` configured from `llm` +\n * `mutationPrimitives`. */\n driver?: ImprovementDriver\n\n /** Default-driver overrides — used when `driver` is unset. */\n mutationPrimitives?: string[]\n driverTarget?: string\n\n /** Custom gate. Default is `defaultProductionGate` with\n * `deltaThreshold: 0.05` on the held-out split. */\n gate?: Gate<TArtifact, TScenario>\n\n /** LLM config consumed by the default `gepaDriver`. Ignored if you pass\n * your own `driver`. */\n llm?: SelfImproveLlm\n\n /** Storage backend. Default `inMemoryCampaignStorage()` — nothing\n * persists past the call. Pass `fsCampaignStorage()` to write to disk. */\n storage?: CampaignStorage\n\n /** Run directory (logical for in-memory storage, real path for fs).\n * Default `mem://selfImprove-<timestamp>`. */\n runDir?: string\n\n /** Distributed-driver seam — same as `RunCampaignOptions.cellPlacement`.\n * Returns an opaque placement key the substrate forwards to your agent\n * as `ctx.placement`. Combined with `httpDispatch` from\n * `/adapters/http`, fans cells across regions. */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n\n /** Streaming hook — fires on baseline + each generation + gate decision.\n * Consumer routes events wherever (UI, dashboard, logs). 
*/\n onProgress?: (event: SelfImproveProgressEvent) => void\n\n /** Auto-promotion behavior on a ship decision. Default `'none'` — we\n * return the winner; you ship it however you ship. `'pr'` opens a\n * GitHub PR via `openAutoPr`; requires `ghOwner` + `ghRepo`. */\n autoOnPromote?: 'pr' | 'none'\n ghOwner?: string\n ghRepo?: string\n}\n\nexport interface SelfImproveResult<TScenario extends Scenario, TArtifact> {\n /** Composite mean across all scenarios, baseline run. */\n baseline: {\n compositeMean: number\n perScenario: Record<string, number>\n }\n /** Composite mean on the held-out set, winner run. */\n winner: {\n compositeMean: number\n perScenario: Record<string, number>\n surface: MutableSurface\n }\n /** `winner.compositeMean - baselineOnHoldout.compositeMean`. Positive\n * means the gate observed improvement. */\n lift: number\n /** `defaultProductionGate.decide()` result. */\n gateDecision: 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'\n /** Number of generations actually explored (may be less than the\n * budget if the driver gave up early). */\n generationsExplored: number\n /** Wall-clock total. */\n durationMs: number\n /** Total cost across baseline + every generation. */\n totalCostUsd: number\n /**\n * Raw substrate result for advanced inspection — full per-generation\n * candidates, full campaign artifacts, all judge scores. 
Useful for\n * debugging or reporting beyond the summary.\n */\n raw: RunImprovementLoopResult<TArtifact, TScenario>\n}\n\n/**\n * Deterministic train/holdout split by a stable hash of `scenario.id`,\n * so the same scenario set always splits the same way across runs.\n */\nfunction splitTrainHoldout<TScenario extends Scenario>(\n scenarios: TScenario[],\n fraction: number,\n): { train: TScenario[]; holdout: TScenario[] } {\n // Stable fnv-1a-ish hash of the id for ordering.\n function hash(s: string): number {\n let h = 2166136261 >>> 0\n for (let i = 0; i < s.length; i++) {\n h ^= s.charCodeAt(i)\n h = Math.imul(h, 16777619) >>> 0\n }\n return h\n }\n const sorted = [...scenarios].sort((a, b) => hash(a.id) - hash(b.id))\n const nHoldout = Math.max(1, Math.min(sorted.length - 1, Math.round(sorted.length * fraction)))\n return {\n holdout: sorted.slice(0, nHoldout),\n train: sorted.slice(nHoldout),\n }\n}\n\nfunction meanComposite(\n byScenario: Record<string, { meanComposite: number }>,\n): { compositeMean: number; perScenario: Record<string, number> } {\n const perScenario: Record<string, number> = {}\n const values: number[] = []\n for (const [id, agg] of Object.entries(byScenario)) {\n perScenario[id] = agg.meanComposite\n values.push(agg.meanComposite)\n }\n return {\n compositeMean: values.length === 0 ? 0 : values.reduce((s, v) => s + v, 0) / values.length,\n perScenario,\n }\n}\n\nconst DEFAULT_MUTATION_PRIMITIVES = [\n 'Tighten the hook: lead with the specific user outcome.',\n 'Replace generic adjectives with specific verbs or proof numbers.',\n 'Anchor every claim in something the scenario\\'s brief literally supports.',\n 'Honor the surface-shape constraint (length, register, audience vocabulary).',\n]\n\n/**\n * One-shot self-improvement loop. 
See module docstring for defaults +\n * extension points.\n *\n * @example Minimum (LAND tier):\n *\n * const result = await selfImprove({\n * agent: (surface, scenario, ctx) => myAgent(surface, scenario, ctx.signal),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * })\n * console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)\n *\n * @example Distributed (workers in three regions):\n *\n * await selfImprove({\n * agent: httpDispatch({ resolveUrl: ({ placement }) => REGION_URLS[placement!] }),\n * scenarios,\n * judge,\n * baselineSurface: DEFAULT_PROMPT,\n * cellPlacement: ({ scenario }) => scenario.region,\n * budget: { maxConcurrency: 12 },\n * })\n */\nexport async function selfImprove<TScenario extends Scenario, TArtifact>(\n opts: SelfImproveOptions<TScenario, TArtifact>,\n): Promise<SelfImproveResult<TScenario, TArtifact>> {\n const startedAt = Date.now()\n\n const budget = opts.budget ?? {}\n const generations = budget.generations ?? 3\n const populationSize = budget.populationSize ?? 2\n const maxConcurrency = budget.maxConcurrency ?? 2\n const holdoutFraction = budget.holdoutFraction ?? 0.25\n const costCeiling = budget.dollars\n\n const explicitHoldout = budget.holdoutScenarios\n const { train, holdout } = explicitHoldout\n ? {\n train: opts.scenarios.filter((s) => !explicitHoldout.some((h) => h.id === s.id)),\n holdout: explicitHoldout as TScenario[],\n }\n : splitTrainHoldout(opts.scenarios, holdoutFraction)\n\n if (train.length === 0) {\n throw new Error('selfImprove: train split is empty. Reduce holdoutFraction or pass more scenarios.')\n }\n if (holdout.length === 0) {\n throw new Error('selfImprove: holdout split is empty. Pass more scenarios.')\n }\n\n const driver: ImprovementDriver =\n opts.driver ??\n gepaDriver({\n llm: {\n baseUrl: opts.llm?.baseUrl ?? 'https://router.tangle.tools/v1',\n apiKey: opts.llm?.apiKey ?? process.env.OPENAI_API_KEY ?? '',\n },\n model: opts.llm?.model ?? 
'anthropic/claude-sonnet-4.6',\n target: opts.driverTarget ?? 'agent surface (system prompt or config) being optimized by selfImprove',\n mutationPrimitives: opts.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES,\n })\n\n const gate: Gate<TArtifact, TScenario> =\n opts.gate ??\n defaultProductionGate<TArtifact, TScenario>({\n holdoutScenarios: holdout,\n deltaThreshold: 0.05,\n })\n\n const storage = opts.storage ?? inMemoryCampaignStorage()\n const runDir = opts.runDir ?? `mem://selfImprove-${startedAt}`\n\n if (opts.onProgress) {\n opts.onProgress({ kind: 'baseline.started', scenarios: opts.scenarios.length })\n }\n\n const result = await runImprovementLoop<TScenario, TArtifact>({\n scenarios: train,\n baselineSurface: opts.baselineSurface,\n dispatchWithSurface: opts.agent,\n driver,\n judges: [opts.judge],\n populationSize,\n maxGenerations: generations,\n holdoutScenarios: holdout,\n gate,\n autoOnPromote: opts.autoOnPromote ?? 'none',\n ghOwner: opts.ghOwner,\n ghRepo: opts.ghRepo,\n storage,\n runDir,\n maxConcurrency,\n cellPlacement: opts.cellPlacement,\n costCeiling,\n })\n\n const baseline = meanComposite(result.baselineOnHoldout.aggregates.byScenario)\n const winnerStats = meanComposite(result.winnerOnHoldout.aggregates.byScenario)\n\n if (opts.onProgress) {\n opts.onProgress({\n kind: 'baseline.completed',\n compositeMean: baseline.compositeMean,\n durationMs: Date.now() - startedAt,\n })\n opts.onProgress({\n kind: 'gate.decided',\n decision: result.gateResult.decision,\n lift: winnerStats.compositeMean - baseline.compositeMean,\n })\n }\n\n const totalCost =\n result.baselineCampaign.aggregates.totalCostUsd +\n result.generations.reduce(\n (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),\n 0,\n )\n\n return {\n baseline,\n winner: {\n ...winnerStats,\n surface: result.winnerSurface,\n },\n lift: winnerStats.compositeMean - baseline.compositeMean,\n gateDecision: result.gateResult.decision,\n 
generationsExplored: result.generations.length,\n durationMs: Date.now() - startedAt,\n totalCostUsd: totalCost,\n raw: result,\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;AA8LA,SAAS,kBACP,WACA,UAC8C;AAE9C,WAAS,KAAK,GAAmB;AAC/B,QAAI,IAAI,eAAe;AACvB,aAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAK,EAAE,WAAW,CAAC;AACnB,UAAI,KAAK,KAAK,GAAG,QAAQ,MAAM;AAAA,IACjC;AACA,WAAO;AAAA,EACT;AACA,QAAM,SAAS,CAAC,GAAG,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,KAAK,EAAE,EAAE,IAAI,KAAK,EAAE,EAAE,CAAC;AACpE,QAAM,WAAW,KAAK,IAAI,GAAG,KAAK,IAAI,OAAO,SAAS,GAAG,KAAK,MAAM,OAAO,SAAS,QAAQ,CAAC,CAAC;AAC9F,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,GAAG,QAAQ;AAAA,IACjC,OAAO,OAAO,MAAM,QAAQ;AAAA,EAC9B;AACF;AAEA,SAAS,cACP,YACgE;AAChE,QAAM,cAAsC,CAAC;AAC7C,QAAM,SAAmB,CAAC;AAC1B,aAAW,CAAC,IAAI,GAAG,KAAK,OAAO,QAAQ,UAAU,GAAG;AAClD,gBAAY,EAAE,IAAI,IAAI;AACtB,WAAO,KAAK,IAAI,aAAa;AAAA,EAC/B;AACA,SAAO;AAAA,IACL,eAAe,OAAO,WAAW,IAAI,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO;AAAA,IACpF;AAAA,EACF;AACF;AAEA,IAAM,8BAA8B;AAAA,EAClC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AA2BA,eAAsB,YACpB,MACkD;AAClD,QAAM,YAAY,KAAK,IAAI;AAE3B,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,cAAc,OAAO,eAAe;AAC1C,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,iBAAiB,OAAO,kBAAkB;AAChD,QAAM,kBAAkB,OAAO,mBAAmB;AAClD,QAAM,cAAc,OAAO;AAE3B,QAAM,kBAAkB,OAAO;AAC/B,QAAM,EAAE,OAAO,QAAQ,IAAI,kBACvB;AAAA,IACE,OAAO,KAAK,UAAU,OAAO,CAAC,MAAM,CAAC,gBAAgB,KAAK,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC;AAAA,IAC/E,SAAS;AAAA,EACX,IACA,kBAAkB,KAAK,WAAW,eAAe;AAErD,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,mFAAmF;AAAA,EACrG;AACA,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,MAAM,2DAA2D;AAAA,EAC7E;AAEA,QAAM,SACJ,KAAK,UACL,WAAW;AAAA,IACT,KAAK;AAAA,MACH,SAAS,KAAK,KAAK,WAAW;AAAA,MAC9B,QAAQ,KAAK,KAAK,UAAU,QAAQ,IAAI,kBAAkB;AAAA,IAC5D;AAAA,IACA,OAAO,KAAK,KAAK,SAAS;AAAA,IAC1B,QAAQ,KAAK,gBAAgB;AAAA,IAC7B,oBAAoB,KAAK,sBAAsB;AAAA,EACjD,CAAC;AAEH,QAAM,OACJ,KAAK,QACL,sBAA4C;AAAA,IAC1C,kBAAkB;AAAA,IAClB,gBAAgB;AAAA,EAClB,CAAC;AAEH,QAAM,UAAU,KAAK,WAAW,wBAAwB;AACxD,QAAM,SAAS,KAAK,UAAU,qBAAqB,SAAS;AAE5D,MAAI,KAAK,YAAY;AACnB,S
AAK,WAAW,EAAE,MAAM,oBAAoB,WAAW,KAAK,UAAU,OAAO,CAAC;AAAA,EAChF;AAEA,QAAM,SAAS,MAAM,mBAAyC;AAAA,IAC5D,WAAW;AAAA,IACX,iBAAiB,KAAK;AAAA,IACtB,qBAAqB,KAAK;AAAA,IAC1B;AAAA,IACA,QAAQ,CAAC,KAAK,KAAK;AAAA,IACnB;AAAA,IACA,gBAAgB;AAAA,IAChB,kBAAkB;AAAA,IAClB;AAAA,IACA,eAAe,KAAK,iBAAiB;AAAA,IACrC,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA,eAAe,KAAK;AAAA,IACpB;AAAA,EACF,CAAC;AAED,QAAM,WAAW,cAAc,OAAO,kBAAkB,WAAW,UAAU;AAC7E,QAAM,cAAc,cAAc,OAAO,gBAAgB,WAAW,UAAU;AAE9E,MAAI,KAAK,YAAY;AACnB,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,eAAe,SAAS;AAAA,MACxB,YAAY,KAAK,IAAI,IAAI;AAAA,IAC3B,CAAC;AACD,SAAK,WAAW;AAAA,MACd,MAAM;AAAA,MACN,UAAU,OAAO,WAAW;AAAA,MAC5B,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC7C,CAAC;AAAA,EACH;AAEA,QAAM,YACJ,OAAO,iBAAiB,WAAW,eACnC,OAAO,YAAY;AAAA,IACjB,CAAC,KAAK,QAAQ,MAAM,IAAI,SAAS,OAAO,CAAC,GAAG,OAAO,IAAI,GAAG,SAAS,WAAW,cAAc,CAAC;AAAA,IAC7F;AAAA,EACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ;AAAA,MACN,GAAG;AAAA,MACH,SAAS,OAAO;AAAA,IAClB;AAAA,IACA,MAAM,YAAY,gBAAgB,SAAS;AAAA,IAC3C,cAAc,OAAO,WAAW;AAAA,IAChC,qBAAqB,OAAO,YAAY;AAAA,IACxC,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.45.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { d as CampaignResult } from './types-
|
|
2
|
+
import { d as CampaignResult } from './types-8u72Gc76.js';
|
|
3
3
|
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
|
|
4
4
|
export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
5
5
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate,
|
|
1
|
+
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, g as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-8u72Gc76.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
3
|
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
4
4
|
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
@@ -414,4 +414,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
|
|
|
414
414
|
}
|
|
415
415
|
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
416
416
|
|
|
417
|
-
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type
|
|
417
|
+
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunImprovementLoopResult as R, type RunCampaignOptions as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, composeGate as d, defaultProductionGate as e, evolutionaryDriver as f, fsCampaignStorage as g, gepaDriver as h, heldOutGate as i, inMemoryCampaignStorage as j, runEval as k, runImprovementLoop as l, type OpenAutoPrResult as m, type RunOptimizationOptions as n, type RunOptimizationResult as o, openAutoPr as p, runOptimization as q, runCampaign as r, surfaceHash as s };
|
|
@@ -372,4 +372,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
372
372
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
373
373
|
}
|
|
374
374
|
|
|
375
|
-
export type { CampaignAggregates as C,
|
|
375
|
+
export type { CampaignAggregates as C, DispatchContext as D, Gate as G, ImprovementDriver as I, JudgeConfig as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, CampaignArtifactWriter as a, CampaignCellResult as b, CampaignCostMeter as c, CampaignResult as d, CampaignTraceWriter as e, CodeSurface as f, DispatchFn as g, GateContext as h, GateDecision as i, GateResult as j, GenerationCandidate as k, GenerationRecord as l, JudgeDimension as m, JudgeScore as n, Mutator as o, SessionScript as p, LabeledScenarioWrite as q, LabeledScenarioSampleArgs as r, LabeledScenarioRecord as s, JudgeAggregate as t, LabeledScenarioSource as u, ScenarioAggregate as v };
|
|
@@ -32,11 +32,11 @@ So adoption is *graduated*, and the builder picks the depth: (1) **trace-analysi
|
|
|
32
32
|
|
|
33
33
|
| Tier | What they do | What they get | Billing |
|
|
34
34
|
|---|---|---|---|
|
|
35
|
-
| **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) |
|
|
35
|
+
| **LAND** (exists today) | `npm i @tangle-network/agent-eval`, wrap their agent behind one `dispatch` seam, bring a judge | Full self-improvement loop + **local** trace/eval artifacts. Any infra, no sandbox. | Free (lib) — **with optional Tangle Router as a $0-friction inference upsell.** When a builder points `OPENAI_BASE_URL` at `router.tangle.tools/v1`, every campaign call (agent + judge + reflective mutation) routes through us; we earn the routing margin. Same code, opt-in monetization vector that ships today. |
|
|
36
36
|
| **EXPAND** (the build) | Route trace/eval/labeled-scenario data to our orchestrator | Hosted dashboards, cross-run intelligence, the capture flywheel as a service | **Metered** — composes with existing sandbox Stripe + cost-ledger |
|
|
37
37
|
| **PLATFORM** (the carrot) | Move execution into our sandbox (agent-dev-container) | Substrate + orchestrator data/intelligence pre-wired, batteries included | Sandbox usage |
|
|
38
38
|
|
|
39
|
-
The free lib casts the widest possible net at near-zero cost (it's already published).
|
|
39
|
+
The free lib casts the widest possible net at near-zero cost (it's already published). LAND is **not actually zero-revenue** — pointing the loop at Tangle Router is a one-line config change with no other code differences, so we monetize inference for any LAND-tier adopter who opts in. The wedge ladder is therefore four steps: no-revenue install → router routing margin (LAND with router) → metered data hosting (EXPAND) → sandbox usage (PLATFORM). Each step is a one-line config change, never a rewrite. Value capture concentrates at EXPAND (hosting their data/intelligence is the biggest billable surface), but LAND-with-router is the immediate upsell available from day one.
|
|
40
40
|
|
|
41
41
|
## Plan & gates — land-first, validate, then build
|
|
42
42
|
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# Phase-B partner pairing kit
|
|
2
|
+
|
|
3
|
+
Everything we hand a design partner — the pitch, the discovery doc,
|
|
4
|
+
the judge worksheet, the 4-hour pairing agenda, the success criteria.
|
|
5
|
+
|
|
6
|
+
> This file is **partner-facing**. The internal driving runbook is in
|
|
7
|
+
> [`phase-b-runbook.md`](./phase-b-runbook.md).
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## The pitch (one-pager)
|
|
12
|
+
|
|
13
|
+
You have a working agent. You don't have evals. You don't have a
|
|
14
|
+
self-improvement loop. You don't know which prompt change actually
|
|
15
|
+
made the agent better last week.
|
|
16
|
+
|
|
17
|
+
We have all of that on a shelf — same engine our six internal product
|
|
18
|
+
agents use in production. It's open source, free at the LAND tier, and
|
|
19
|
+
sandbox-free if you don't want our sandbox.
|
|
20
|
+
|
|
21
|
+
**The Phase-B offer:** in one 4-hour pairing, we wrap your agent
|
|
22
|
+
behind our `Dispatch`, author your domain-specific judge with you,
|
|
23
|
+
and run one real campaign + improvement loop on **your actual use
|
|
24
|
+
case**. You walk away with:
|
|
25
|
+
|
|
26
|
+
- A reproducible eval harness against scenarios you control.
|
|
27
|
+
- A judge that scores your outputs on dimensions you defined.
|
|
28
|
+
- One measurable lift on your real product, with a held-out gate.
|
|
29
|
+
- Trace artifacts you own (locally on disk; nothing leaves your
|
|
30
|
+
network unless you point at our hosted tier).
|
|
31
|
+
|
|
32
|
+
What we get: design-partner evidence the substrate works on a foreign
|
|
33
|
+
agent we did not build. That validates the wedge for us. Nothing else
|
|
34
|
+
changes hands.
|
|
35
|
+
|
|
36
|
+
**Cost to you:** 4 hours of pairing + your LLM bill for the campaign
|
|
37
|
+
run (typically $5-$50 depending on model + scenario count). No
|
|
38
|
+
commitment, no contract, no exclusivity. We don't take your code, your
|
|
39
|
+
data, or your secrets.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Discovery questions (15 min, before the pairing)
|
|
44
|
+
|
|
45
|
+
Send these to the partner ahead of the pairing so they walk in with
|
|
46
|
+
their answers.
|
|
47
|
+
|
|
48
|
+
### About the agent
|
|
49
|
+
|
|
50
|
+
1. What does your agent **do** — one paragraph, end-user perspective?
|
|
51
|
+
2. What's the **input** it accepts and the **output** it produces?
|
|
52
|
+
(Schemas help; English is fine.)
|
|
53
|
+
3. What framework / stack? (LangChain / Mastra / OpenAI Agents SDK /
|
|
54
|
+
bespoke / something else.)
|
|
55
|
+
4. Where does it run? (Local node / serverless / your sandbox /
|
|
56
|
+
browser / mobile / other.)
|
|
57
|
+
5. What model(s) does it use today? Any model-routing layer
|
|
58
|
+
(OpenRouter, Portkey, your own)?
|
|
59
|
+
|
|
60
|
+
### About quality
|
|
61
|
+
|
|
62
|
+
6. How do you currently know your agent is good? (Eyeballing /
|
|
63
|
+
user feedback / metrics / nothing yet — all fine answers.)
|
|
64
|
+
7. What does a **bad** output look like for you? Give 2-3 concrete
|
|
65
|
+
examples. Be specific.
|
|
66
|
+
8. What does a **good** output look like? Same.
|
|
67
|
+
9. Are there outputs that are *technically correct but feel wrong*?
|
|
68
|
+
What's the signal?
|
|
69
|
+
10. How would a senior person on your team **score** an output, if
|
|
70
|
+
they had to give it a 1-10? Walk us through the rubric they'd
|
|
71
|
+
use, even informally.
|
|
72
|
+
|
|
73
|
+
### About the loop
|
|
74
|
+
|
|
75
|
+
11. If we could improve one thing about the agent in 4 hours, what
|
|
76
|
+
would move the needle the most for you?
|
|
77
|
+
12. Are there *prompt* changes you've wanted to try but haven't had
|
|
78
|
+
the loop to validate?
|
|
79
|
+
13. Anything you've explicitly tried that **didn't** work? (Saves us
|
|
80
|
+
suggesting it.)
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Judge-design worksheet (45 min into the pairing)
|
|
85
|
+
|
|
86
|
+
The judge is the most under-discussed piece of an eval system. Most
|
|
87
|
+
projects fail at the judge, not the agent.
|
|
88
|
+
|
|
89
|
+
We start with a **strawman** — the 6 dimensions in our canonical
|
|
90
|
+
marketing-quality judge:
|
|
91
|
+
|
|
92
|
+
| Dim | What it measures |
|
|
93
|
+
|---|---|
|
|
94
|
+
| hook_strength | Opens with concrete user outcome, not category |
|
|
95
|
+
| voice_match | Reads human-written; no AI slop |
|
|
96
|
+
| cta_clarity | Next step unambiguous for the audience |
|
|
97
|
+
| factual_grounding | Only claims things the brief supports |
|
|
98
|
+
| surface_fit | Length + register correct for medium |
|
|
99
|
+
| audience_specificity | Vocabulary the audience actually responds to |
|
|
100
|
+
|
|
101
|
+
**Your job in this 45 min:** rip this apart. We expect:
|
|
102
|
+
|
|
103
|
+
- **2-3 of these are wrong for you.** Replace them.
|
|
104
|
+
- **2-3 dimensions are missing.** Add them. (E.g., "tone matches our
|
|
105
|
+
brand book" or "safety-critical claim has a citation" or "answer is
|
|
106
|
+
decisive — no hedging when the user wants a recommendation".)
|
|
107
|
+
- **Weights are wrong.** For your use case some dims matter 5x more.
|
|
108
|
+
|
|
109
|
+
The deliverable: a judge with 4-8 dimensions, each scored 0.0 - 1.0,
|
|
110
|
+
each unambiguous enough that two independent humans would score the
|
|
111
|
+
same artifact within 0.1.
|
|
112
|
+
|
|
113
|
+
If a dimension is squishy, throw it out. A noisy judge poisons the
|
|
114
|
+
loop.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## The 4-hour pairing agenda
|
|
119
|
+
|
|
120
|
+
### Hour 1 — Discovery + Dispatch wiring
|
|
121
|
+
|
|
122
|
+
| Time | What | Deliverable |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| 0:00 - 0:15 | Review discovery answers, align on scope | Shared doc with goals + constraints |
|
|
125
|
+
| 0:15 - 0:45 | Wire `Dispatch` around their agent — typically 1 function | Working `Dispatch<TScenario, TArtifact>` |
|
|
126
|
+
| 0:45 - 1:00 | Run 1-2 scenarios through `Dispatch` manually; see real artifacts | Confirmed wire shape |
|
|
127
|
+
|
|
128
|
+
### Hour 2 — Judge calibration
|
|
129
|
+
|
|
130
|
+
| Time | What | Deliverable |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| 1:00 - 1:45 | Walk through the strawman judge; redesign dimensions with the partner | Final `JudgeConfig` for their domain |
|
|
133
|
+
| 1:45 - 2:00 | Calibrate judge against the 2 manual outputs from Hour 1 | Confirmed judge gives same scores a human would |
|
|
134
|
+
|
|
135
|
+
### Hour 3 — First campaign + tuning
|
|
136
|
+
|
|
137
|
+
| Time | What | Deliverable |
|
|
138
|
+
|---|---|---|
|
|
139
|
+
| 2:00 - 2:30 | Define 8-15 scenarios with the partner (or use ours as a template) | Scenario set with train + holdout split |
|
|
140
|
+
| 2:30 - 3:00 | Run `runEval` for baseline; review per-scenario scores | Baseline score + identified failure modes |
|
|
141
|
+
|
|
142
|
+
### Hour 4 — Improvement loop + go/no-go
|
|
143
|
+
|
|
144
|
+
| Time | What | Deliverable |
|
|
145
|
+
|---|---|---|
|
|
146
|
+
| 3:00 - 3:30 | Configure `runImprovementLoop` with `gepaDriver` (3 generations, population 2) + `defaultProductionGate` | Improvement run completes |
|
|
147
|
+
| 3:30 - 3:50 | Walk the partner through the gate decision + lift per scenario | Report artifact |
|
|
148
|
+
| 3:50 - 4:00 | Capture: was the lift real? Would they ship the winner? Will they keep using the lib? | **Go/no-go signal for Phase D** |
|
|
149
|
+
|
|
150
|
+
If we're tracking ahead at any hour, use the slack to deepen — add a
|
|
151
|
+
red-team battery, swap the judge model, run more generations. If we're
|
|
152
|
+
behind, cut the scenario set to 6 and ship.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Success criteria — what counts as Phase B passed
|
|
157
|
+
|
|
158
|
+
For us to greenlight Phase D (hosted orchestrator + metered billing),
|
|
159
|
+
we need ALL of:
|
|
160
|
+
|
|
161
|
+
1. **Real lift.** Held-out winner score > baseline by ≥ 0.05 composite
|
|
162
|
+
points (or the partner's chosen threshold). Not just train; held-out.
|
|
163
|
+
2. **Partner-validated lift.** The partner reads the winner output on
|
|
164
|
+
3+ held-out scenarios and confirms it's actually better.
|
|
165
|
+
3. **Integration time ≤ 1 day.** Discovery + wiring + judge took ≤ 4
|
|
166
|
+
hours for the pairing; partner could reach the same point solo in
|
|
167
|
+
≤ 1 day from the quickstart doc.
|
|
168
|
+
4. **Public commitment.** Partner agrees to a public reference (case
|
|
169
|
+
study / quote / logo) OR commits to running the LAND tier in their
|
|
170
|
+
own product within 2 weeks.
|
|
171
|
+
|
|
172
|
+
3-of-4 = soft pass (revisit Phase D scope but proceed). 4-of-4 = hard
|
|
173
|
+
pass (build Phase D). ≤ 2 = fail (back to substrate iteration).
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## What we don't ask for
|
|
178
|
+
|
|
179
|
+
- Your code. Wire `Dispatch` around your existing API; we never see the
|
|
180
|
+
source.
|
|
181
|
+
- Your customer data. Use synthetic scenarios or anonymized real ones —
|
|
182
|
+
whichever you prefer.
|
|
183
|
+
- Your model keys. You bring your own. Routing through Tangle Router is
|
|
184
|
+
optional; if you use it, prompts and responses do pass through our router.
|
|
185
|
+
- Exclusivity, commitment, or contract. Walk away whenever.
|
|
186
|
+
|
|
187
|
+
The point is to learn if the substrate works for someone we didn't
|
|
188
|
+
build it for. That's it.
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# Phase-B runbook (internal)
|
|
2
|
+
|
|
3
|
+
How we drive a design-partner pairing. Goes alongside
|
|
4
|
+
[`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md) (the partner-facing
|
|
5
|
+
materials) — this file is for us.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Before the pairing
|
|
10
|
+
|
|
11
|
+
- **24-48h prior:** send discovery questions from
|
|
12
|
+
[`phase-b-pairing-kit.md`](./phase-b-pairing-kit.md). Don't run the
|
|
13
|
+
pairing without answers in hand. The pairing fails when we discover
|
|
14
|
+
the partner's quality bar live; we don't have time to interview AND
|
|
15
|
+
build in 4 hours.
|
|
16
|
+
- **48h prior:** run the canonical demo (`pnpm tsx
|
|
17
|
+
examples/marketing-agent-canonical/index.ts`) end-to-end against the
|
|
18
|
+
partner's preferred model. Confirms the substrate + their LLM tier
|
|
19
|
+
compose. If it errors, fix the substrate before the pairing.
|
|
20
|
+
- **24h prior:** mirror the partner's stack locally. If they're on
|
|
21
|
+
Cloudflare Workers, run a Worker. On LangChain, install `@langchain/*`.
|
|
22
|
+
Don't debug their tooling on the call.
|
|
23
|
+
- **1h prior:** open the pairing kit, the agent-eval repo, the partner's
|
|
24
|
+
agent code/endpoint, and a shared doc; have a screenshare ready.
|
|
25
|
+
|
|
26
|
+
## During the pairing
|
|
27
|
+
|
|
28
|
+
### Driving principles
|
|
29
|
+
|
|
30
|
+
- **Talk less, ship more.** The partner is paying with their time and
|
|
31
|
+
attention; every minute we talk we aren't shipping their lift.
|
|
32
|
+
- **They write the judge.** We start with our strawman so they have
|
|
33
|
+
something to react to, but the judge that ends up running is theirs.
|
|
34
|
+
This is the most-discussed seam — they should own it.
|
|
35
|
+
- **No invented features.** Don't promise capabilities that don't exist
|
|
36
|
+
("we have a hosted ingest for this") unless they actually exist.
|
|
37
|
+
Phase B is honesty's purest test.
|
|
38
|
+
- **Capture verbatim.** Write down their exact words on what's broken /
|
|
39
|
+
what would change their mind. The wedge-gate evidence is qualitative
|
|
40
|
+
too.
|
|
41
|
+
|
|
42
|
+
### When to escalate to Drew
|
|
43
|
+
|
|
44
|
+
- Partner wants something Phase D would have (hosted dashboard, multi-
|
|
45
|
+
tenant, billing). **Escalate same day** — this is the GTM signal we're
|
|
46
|
+
hunting for; Drew should hear it directly.
|
|
47
|
+
- Partner is the wrong fit (technical or business) and the pairing
|
|
48
|
+
would burn both sides' time. **Pause the pairing**, debrief with Drew,
|
|
49
|
+
reschedule with a better-fit partner.
|
|
50
|
+
- Substrate breaks in a way that requires a published bump. **Pause
|
|
51
|
+
the pairing**, ship the fix in a focused PR, resume.
|
|
52
|
+
|
|
53
|
+
### What to capture for the wedge gate
|
|
54
|
+
|
|
55
|
+
Per [`docs/design/external-agent-wedge.md`](./design/external-agent-wedge.md),
|
|
56
|
+
the gate decision hinges on Phase B evidence. We capture:
|
|
57
|
+
|
|
58
|
+
1. **Quantitative lift** — held-out winner composite vs baseline, per
|
|
59
|
+
scenario + overall. Auto-generated in the report artifact by the
|
|
60
|
+
canonical demo (`.phase-b-runs/<ts>/phase-b-report.md`).
|
|
61
|
+
2. **Qualitative partner-validation** — partner read 3+ winner outputs
|
|
62
|
+
and confirmed they're better. Capture as a 1-paragraph quote.
|
|
63
|
+
3. **Integration friction** — minutes spent on each pairing phase. Were
|
|
64
|
+
any > 2x estimated? What broke?
|
|
65
|
+
4. **Judge-design surprise** — which dimensions the partner added or
|
|
66
|
+
killed vs our strawman. Strong signal about what the substrate's
|
|
67
|
+
default judge templates are missing for adjacent domains.
|
|
68
|
+
5. **Soft commitments** — would they reference us? Would they
|
|
69
|
+
self-serve from the quickstart doc? Would they pay for hosted?
|
|
70
|
+
|
|
71
|
+
Capture into a single `phase-b-debrief.md` per partner. We don't
|
|
72
|
+
publish these; they feed the next substrate iteration + the wedge
|
|
73
|
+
go/no-go.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Failure modes — what we do NOT do
|
|
78
|
+
|
|
79
|
+
### "We'll just optimize on the train set"
|
|
80
|
+
|
|
81
|
+
Hard no. The held-out gate is the entire point. A win that doesn't
|
|
82
|
+
generalize is worse than no win — it's evidence that the substrate
|
|
83
|
+
overfits, which is the failure mode the wedge tier rewards.
|
|
84
|
+
|
|
85
|
+
If the holdout lift is < threshold but train looks great:
|
|
86
|
+
|
|
87
|
+
1. Show the partner the gap. Explain what overfitting means here.
|
|
88
|
+
2. Try raising `maxGenerations` to 5 (gives gepa more search budget).
|
|
89
|
+
3. Try widening `populationSize` to 3 (more diverse mutations per gen).
|
|
90
|
+
4. If still no lift on holdout: **report the result honestly**. A
|
|
91
|
+
negative finding is real evidence for us too — tells us this surface
|
|
92
|
+
isn't amenable to prompt-only mutation, and the partner needs Phase
|
|
93
|
+
C (code-tier optimization) or a different approach.
|
|
94
|
+
|
|
95
|
+
### "The judge is too noisy"
|
|
96
|
+
|
|
97
|
+
A judge whose two-run variance > 0.1 on the same artifact is broken.
|
|
98
|
+
Fixes, in order:
|
|
99
|
+
|
|
100
|
+
1. Lower temperature to 0.0 (the canonical judge uses 0.2, which is
|
|
101
|
+
already low).
|
|
102
|
+
2. Use a stronger model than the agent (the default is the same model;
|
|
103
|
+
bump the judge to GPT-5.5 / Claude Opus).
|
|
104
|
+
3. Add anchors to each dimension ("0.0 = X, 0.5 = Y, 1.0 = Z").
|
|
105
|
+
4. If still noisy: collapse to fewer, simpler dimensions. 3 unambiguous
|
|
106
|
+
dimensions beat 6 squishy ones.
|
|
107
|
+
|
|
108
|
+
### "We can't decide what the partner's judge should be"
|
|
109
|
+
|
|
110
|
+
Then we don't have Phase B. The judge IS the partner's quality bar.
|
|
111
|
+
If they can't articulate it in 45 minutes of pairing, we're in the
|
|
112
|
+
wrong pairing — they need to do the interview-themselves work first.
|
|
113
|
+
|
|
114
|
+
**Pause the pairing, send the discovery doc again, regroup in a week.**
|
|
115
|
+
|
|
116
|
+
### "Their agent is slow / expensive"
|
|
117
|
+
|
|
118
|
+
Set `maxConcurrency: 1` and reduce scenarios to 6. Cost scales linearly;
|
|
119
|
+
time scales as `(scenarios × reps × generations × population) /
|
|
120
|
+
concurrency`. Tune until the loop completes in ≤ 30 min.
|
|
121
|
+
|
|
122
|
+
If the per-call cost is > $1, talk to Drew before the pairing — we
|
|
123
|
+
might want to subsidize the partner's first run.
|
|
124
|
+
|
|
125
|
+
### "They want to share their secrets through Tangle Router"
|
|
126
|
+
|
|
127
|
+
Fine — `OPENAI_BASE_URL=https://router.tangle.tools/v1` works. Make
|
|
128
|
+
sure they understand: every call routes through us; the prompts and
|
|
129
|
+
responses are visible to whatever observability we have on the router.
|
|
130
|
+
If they want zero data leaving their network, point at their own
|
|
131
|
+
endpoint, not Tangle Router.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## After the pairing
|
|
136
|
+
|
|
137
|
+
### Same day
|
|
138
|
+
|
|
139
|
+
- Save the `phase-b-report.md` artifact + the partner's debrief notes
|
|
140
|
+
to `~/company/design-partners/<partner>/<date>/`.
|
|
141
|
+
- Send the partner a thank-you with the winner artifact + the next-
|
|
142
|
+
steps doc. Whether or not we proceed to Phase D, leave them with
|
|
143
|
+
something concrete they can ship in their product.
|
|
144
|
+
- Slack Drew the verdict against the [success criteria](./phase-b-pairing-kit.md#success-criteria--what-counts-as-phase-b-passed).
|
|
145
|
+
|
|
146
|
+
### Within a week
|
|
147
|
+
|
|
148
|
+
- If Phase B passed: open the Phase D RFC. Reuse the partner-validated
|
|
149
|
+
judge dimensions + scenarios as the spec for what the hosted tier
|
|
150
|
+
needs to support out of the box.
|
|
151
|
+
- If Phase B failed: substrate iteration ticket(s). Specific gaps the
|
|
152
|
+
pairing surfaced (judge dim defaults, doc clarity, missing helper).
|
|
153
|
+
- Either way: update the wedge doc (`docs/design/external-agent-wedge.md`)
|
|
154
|
+
with the partner name redacted + the qualitative signal.
|
|
155
|
+
|
|
156
|
+
### Within a month (regardless of go/no-go)
|
|
157
|
+
|
|
158
|
+
- Follow up with the partner. If they're still using the lib, capture a
|
|
159
|
+
metric. If they stopped, find out why. Both data points feed product.
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## The canonical demo as a forcing function
|
|
164
|
+
|
|
165
|
+
`examples/marketing-agent-canonical/` is the demo we open the pairing
|
|
166
|
+
with. It does three things at once:
|
|
167
|
+
|
|
168
|
+
1. **Proves the substrate works** — they see a real lift on a real-
|
|
169
|
+
feeling agent before we touch their code.
|
|
170
|
+
2. **Sets the bar for the judge conversation** — they react to concrete
|
|
171
|
+
dimensions, not abstract questions.
|
|
172
|
+
3. **Trains us** — running the canonical demo before the pairing
|
|
173
|
+
surfaces substrate bugs on the partner's preferred model BEFORE the
|
|
174
|
+
partner is watching. We hit those bugs first.
|
|
175
|
+
|
|
176
|
+
Run the canonical demo before every Phase-B pairing. It's not optional.
|
|
@@ -13,12 +13,51 @@ Tangle sandbox, no Tangle account, and no hosted infrastructure.
|
|
|
13
13
|
## Install
|
|
14
14
|
|
|
15
15
|
```sh
|
|
16
|
-
npm i @tangle-network/agent-eval@^0.
|
|
16
|
+
npm i @tangle-network/agent-eval@^0.46.0
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
The package's `@tangle-network/sandbox` peer is `optional
|
|
20
|
-
|
|
21
|
-
|
|
19
|
+
The package's `@tangle-network/sandbox` peer is `optional`. Foreign
|
|
20
|
+
consumers install agent-eval and run the full LAND tier without our
|
|
21
|
+
sandbox or its dependencies.
|
|
22
|
+
|
|
23
|
+
## The one-shot happy path
|
|
24
|
+
|
|
25
|
+
If you don't want to learn the substrate, the entire LAND tier reduces
|
|
26
|
+
to one function call:
|
|
27
|
+
|
|
28
|
+
```ts
|
|
29
|
+
import { selfImprove } from '@tangle-network/agent-eval/contract'
|
|
30
|
+
|
|
31
|
+
const result = await selfImprove({
|
|
32
|
+
agent: (surface, scenario, ctx) =>
|
|
33
|
+
runYourAgent({ systemPrompt: surface as string, scenario, signal: ctx.signal }),
|
|
34
|
+
scenarios,
|
|
35
|
+
judge,
|
|
36
|
+
baselineSurface: 'You are a senior copywriter…',
|
|
37
|
+
budget: { dollars: 10, generations: 3 },
|
|
38
|
+
})
|
|
39
|
+
|
|
40
|
+
console.log(`lift: ${result.lift.toFixed(3)} (${result.gateDecision})`)
|
|
41
|
+
if (result.gateDecision === 'ship') {
|
|
42
|
+
// result.winner.surface is the optimized prompt
|
|
43
|
+
}
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
That's the LAND happy path. Smart defaults pick: in-memory storage,
|
|
47
|
+
`gepaDriver` with copywriting-flavored mutation primitives,
|
|
48
|
+
`defaultProductionGate` with `deltaThreshold: 0.05`, 25% deterministic
|
|
49
|
+
train/holdout split.
|
|
50
|
+
|
|
51
|
+
Every escape hatch the substrate exposes is reachable from
|
|
52
|
+
`selfImprove` — custom `driver`, custom `gate`, distributed-driver
|
|
53
|
+
`cellPlacement`, `onProgress` streaming callback, `autoOnPromote: 'pr'`
|
|
54
|
+
to open a GitHub PR with the winner. See the type signatures in
|
|
55
|
+
[`src/contract/self-improve.ts`](../src/contract/self-improve.ts) for
|
|
56
|
+
the full surface.
|
|
57
|
+
|
|
58
|
+
The sections below are the lower-level path — useful when you want
|
|
59
|
+
fine-grained control over each piece. Read those next if `selfImprove`
|
|
60
|
+
isn't enough.
|
|
22
61
|
|
|
23
62
|
## Five types, four functions
|
|
24
63
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.46.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|