@tangle-network/agent-eval 0.44.0 → 0.45.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/campaign/run-campaign.ts","../src/campaign/storage.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport { type CampaignStorage, fsCampaignStorage } from './storage'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. 
*/\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n /** Storage backend for run/cell dirs, the resumability cache, artifacts,\n * and trace spans. Default: the Node filesystem (`fsCampaignStorage`).\n * Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime\n * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still\n * produced; artifacts/traces just aren't persisted to disk. */\n storage?: CampaignStorage\n /**\n * Optional per-cell placement strategy. Returns an opaque string the\n * substrate forwards as `ctx.placement` to the Dispatch — placement-aware\n * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route\n * each cell to the right worker, region, or sandbox. When unset, every\n * cell receives `ctx.placement = undefined` and behaves identically to\n * the in-process case.\n *\n * @example\n * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 
'eu-west' : 'us-east'\n */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? []\n const storage = opts.storage ?? fsCampaignStorage()\n\n storage.ensureDir(opts.runDir)\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). 
See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n storage,\n buildTraceWriter: opts.buildTraceWriter ?? defaultBuildTraceWriter(storage),\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? 
err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n storage: CampaignStorage\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const storage = args.storage\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n storage.ensureDir(cellDir)\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable) {\n const raw = storage.read(cachePath)\n if (raw !== undefined) {\n try {\n const cached = JSON.parse(raw) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n 
const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n storage.ensureDir(join(fullPath, '..'))\n storage.write(fullPath, content)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n current() {\n return costSoFar\n },\n }\n\n const placement = args.opts.cellPlacement?.({\n scenario: args.slot.scenario,\n rep: args.slot.rep,\n })\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n placement,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? err.message : String(err)\n }\n\n // Run judges (only if we have an artifact). A judge that throws invalidates\n // the cell — recorded as `error`, NOT folded into a fake composite:0 (a fake\n // zero is indistinguishable from a real zero and poisons every aggregate).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n judgeScores[judge.name] = await runJudgeCell(judge, {\n artifact,\n scenario: args.slot.scenario,\n signal: args.signal,\n })\n } catch (err) {\n errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? 
err.message : String(err)}`\n break\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n storage.write(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n judge: JudgeConfig<TArtifact, TScenario>,\n input: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal },\n): Promise<JudgeScore> {\n return judge.score(input)\n}\n\nfunction defaultBuildTraceWriter(\n storage: CampaignStorage,\n): (cellId: string, dir: string) => CampaignTraceWriter {\n return (cellId, dir) => {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n storage.write(join(dir, 'spans.jsonl'), spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n }\n}\n","/**\n * @experimental\n *\n * `CampaignStorage` — the filesystem seam `runCampaign` writes through\n * (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).\n *\n * The default (`fsCampaignStorage`) is the Node filesystem — identical\n * behavior to the inline `node:fs` calls it replaces, so existing CLI\n * consumers are unaffected. 
`inMemoryCampaignStorage` keeps everything in a\n * `Map`, so the substrate runs in environments WITHOUT a filesystem\n * (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign\n * still produces its `CampaignResult` (cells + aggregates) in memory;\n * artifacts/traces simply aren't persisted to disk.\n *\n * Paths are opaque keys to the in-memory adapter — it does not parse them,\n * so the same `join(...)`-built paths work unchanged across both adapters.\n */\nexport interface CampaignStorage {\n /** Ensure a directory exists (recursive). No-op for in-memory. */\n ensureDir(dir: string): void\n /** Does this path exist (as a written file or an ensured dir)? */\n exists(path: string): boolean\n /** Read a UTF-8 file; `undefined` when missing or unreadable. */\n read(path: string): string | undefined\n /** Write a file (string or bytes). Parent dir is assumed ensured. */\n write(path: string, content: string | Uint8Array): void\n}\n\n/** Node-filesystem storage — the default. Lazily requires `node:fs` so the\n * module imports cleanly in non-Node runtimes (where the caller passes\n * `inMemoryCampaignStorage` instead and never constructs this). */\nexport function fsCampaignStorage(): CampaignStorage {\n const { existsSync, mkdirSync, readFileSync, writeFileSync } =\n require('node:fs') as typeof import('node:fs')\n return {\n ensureDir(dir) {\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n },\n exists(path) {\n return existsSync(path)\n },\n read(path) {\n try {\n return readFileSync(path, 'utf8')\n } catch {\n return undefined\n }\n },\n write(path, content) {\n writeFileSync(path, content as Uint8Array)\n },\n }\n}\n\n/** In-memory storage for filesystem-less runtimes. Artifacts + trace spans\n * live in a `Map` for the duration of the run; the `CampaignResult` is\n * fully populated, but nothing is persisted to disk. 
*/\nexport function inMemoryCampaignStorage(): CampaignStorage {\n const files = new Map<string, string | Uint8Array>()\n const dirs = new Set<string>()\n return {\n ensureDir(dir) {\n dirs.add(dir)\n },\n exists(path) {\n return files.has(path) || dirs.has(path)\n },\n read(path) {\n const value = files.get(path)\n if (value === undefined) return undefined\n return typeof value === 'string' ? value : new TextDecoder().decode(value)\n },\n write(path, content) {\n files.set(path, content)\n },\n }\n}\n"],"mappings":";;;;;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY;;;ACiBd,SAAS,oBAAqC;AACnD,QAAM,EAAE,YAAY,WAAW,cAAc,cAAc,IACzD,UAAQ,IAAS;AACnB,SAAO;AAAA,IACL,UAAU,KAAK;AACb,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AAAA,IAC1D;AAAA,IACA,OAAO,MAAM;AACX,aAAO,WAAW,IAAI;AAAA,IACxB;AAAA,IACA,KAAK,MAAM;AACT,UAAI;AACF,eAAO,aAAa,MAAM,MAAM;AAAA,MAClC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,oBAAc,MAAM,OAAqB;AAAA,IAC3C;AAAA,EACF;AACF;AAKO,SAAS,0BAA2C;AACzD,QAAM,QAAQ,oBAAI,IAAiC;AACnD,QAAM,OAAO,oBAAI,IAAY;AAC7B,SAAO;AAAA,IACL,UAAU,KAAK;AACb,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,IACA,OAAO,MAAM;AACX,aAAO,MAAM,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI;AAAA,IACzC;AAAA,IACA,KAAK,MAAM;AACT,YAAM,QAAQ,MAAM,IAAI,IAAI;AAC5B,UAAI,UAAU,OAAW,QAAO;AAChC,aAAO,OAAO,UAAU,WAAW,QAAQ,IAAI,YAAY,EAAE,OAAO,KAAK;AAAA,IAC3E;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,YAAM,IAAI,MAAM,OAAO;AAAA,IACzB;AAAA,EACF;AACF;;;ADeA,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,UAAU,KAAK,WAAW,kBAAkB;AAElD,UAAQ,UAAU,KAAK,MAAM;AAE7B,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,
OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB,wBAAwB,OAAO;AAAA,YAC1E,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAeA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK;AACrB,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,UAAQ,UAAU,OAAO;AAGzB,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,WAAW;AAClB,UAAM,MAAM,QAAQ,KAAK,SAAS;AAClC,QAAI,QAAQ,QAAW;AACrB,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,GAAG;AAC7B,YAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,iBAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,QAClE;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IA
AI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,cAAQ,UAAU,KAAK,UAAU,IAAI,CAAC;AACtC,cAAQ,MAAM,UAAU,OAAO;AAC/B,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,EACF;AAEA,QAAM,YAAY,KAAK,KAAK,gBAAgB;AAAA,IAC1C,UAAU,KAAK,KAAK;AAAA,IACpB,KAAK,KAAK,KAAK;AAAA,EACjB,CAAC;AAED,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAKA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,oBAAY,MAAM,IAAI,IAAI,MAAM,aAAa,OAAO;AAAA,UAClD;AAAA,UACA,UAAU,KAAK,KAAK;AAAA,UACpB,QAAQ,KAAK;AAAA,QACf,CAAC;AAAA,MACH,SAAS,KAAK;AACZ,uBAAe,UAAU,MAAM,IAAI,aAAa,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAChG;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,YAAQ,MAAM,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,OACA,OACqB;AACrB,SAAO,MAAM,MAAM,KAAK;AAC1B;AAEA,SAAS,wBACP,SACsD;AACtD,SAAO,CAAC,QAAQ,QAAQ;AACtB,UAAM,QAAwC,CAAC;AAC/C,WAAO;AAAA,MACL,KAAK,MAAM,YAAY;AACrB,cAAM,UAAU,KAAK,IAAI;AACzB,cAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc
,CAAC,EAAG;AACvF,cAAM,SAAoB;AAAA,UACxB,IAAI,UAAU;AACZ,mBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,gBAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,kBAAM,KAAK,MAAM;AAAA,UACnB;AAAA,UACA,aAAa,KAAK,OAAO;AACvB,mBAAO,GAAG,IAAI;AAAA,UAChB;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,MACA,MAAM,QAAQ;AACZ,gBAAQ,MAAM,KAAK,KAAK,aAAa,GAAG,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,MACxF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAA
E;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
@@ -1,5 +1,5 @@
1
- export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-DToGONFA.js';
2
- export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-CbilHQAb.js';
1
+ export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-BURGZ8Ug.js';
2
+ export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-pJ4yrx4X.js';
3
3
  export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
4
4
  import '../llm-client-BXVRUZyX.js';
5
5
  import '../errors-mje_cKOs.js';
@@ -6,12 +6,12 @@ import {
6
6
  heldOutGate,
7
7
  runEval,
8
8
  runImprovementLoop
9
- } from "../chunk-H5BGRSN4.js";
9
+ } from "../chunk-HRKOCLQA.js";
10
10
  import {
11
11
  fsCampaignStorage,
12
12
  inMemoryCampaignStorage,
13
13
  runCampaign
14
- } from "../chunk-RXK7FXLV.js";
14
+ } from "../chunk-J3EIOI3O.js";
15
15
  import "../chunk-N4SBKEPJ.js";
16
16
  import "../chunk-YV7J7X5N.js";
17
17
  import {
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.43.2",
5
+ "version": "0.44.1",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
package/dist/rl.d.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
2
- import { d as CampaignResult } from './types-DToGONFA.js';
2
+ import { d as CampaignResult } from './types-BURGZ8Ug.js';
3
3
  import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
4
4
  export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
5
5
  import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
@@ -1,10 +1,10 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-RXK7FXLV.js";
3
+ } from "./chunk-J3EIOI3O.js";
4
4
  import "./chunk-WP7SY7AI.js";
5
5
  import "./chunk-QYJT52YW.js";
6
6
  import "./chunk-NSBPE2FW.js";
7
7
  export {
8
8
  runCampaign
9
9
  };
10
- //# sourceMappingURL=run-campaign-GNDO66B4.js.map
10
+ //# sourceMappingURL=run-campaign-6UEVBPP3.js.map
@@ -1,4 +1,4 @@
1
- import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-DToGONFA.js';
1
+ import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-BURGZ8Ug.js';
2
2
  import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
3
3
  import { RunRecord } from '@tangle-network/agent-runtime';
4
4
  import { R as RedTeamCase } from './red-team-30II1T4o.js';
@@ -267,6 +267,22 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
267
267
  * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
268
268
  * produced; artifacts/traces just aren't persisted to disk. */
269
269
  storage?: CampaignStorage;
270
+ /**
271
+ * Optional per-cell placement strategy. Returns an opaque string the
272
+ * substrate forwards as `ctx.placement` to the Dispatch — placement-aware
273
+ * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route
274
+ * each cell to the right worker, region, or sandbox. When unset, every
275
+ * cell receives `ctx.placement = undefined` and behaves identically to
276
+ * the in-process case.
277
+ *
278
+ * @example
279
+ * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'
280
+ */
281
+ cellPlacement?: (input: {
282
+ scenario: TScenario;
283
+ rep: number;
284
+ generation?: number;
285
+ }) => string | undefined;
270
286
  }
271
287
  declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
272
288
 
@@ -40,6 +40,14 @@ interface DispatchContext {
40
40
  cycleId?: string;
41
41
  /** Populated when the substrate resumed from a prior cache hit. */
42
42
  resumedFrom?: string;
43
+ /**
44
+ * Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.
45
+ * The substrate forwards it through unchanged; placement-aware Dispatch
46
+ * implementations (e.g. `httpDispatch` from `/adapters/http`) read it to
47
+ * route the cell to the right worker / region / sandbox. `undefined`
48
+ * when no placement strategy is configured.
49
+ */
50
+ placement?: string;
43
51
  }
44
52
  /** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
45
53
  * whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
@@ -0,0 +1,121 @@
1
+ # Composing agent-eval with your observability stack
2
+
3
+ `@tangle-network/agent-eval` ships its own OpenTelemetry pipeline
4
+ (`@tangle-network/agent-eval/telemetry`) that emits spans for every
5
+ cell, judge invocation, mutator proposal, and gate decision. **It's
6
+ just OTel** — same protocol as Langfuse SDK, OpenLLMetry, Arize
7
+ Phoenix, TraceAI, and the OpenTelemetry GenAI semantic conventions.
8
+
9
+ That means: if you already instrument your agent with any OTel-native
10
+ observability tool, the two compose **for free at the protocol layer**.
11
+ This doc shows the composition pattern; no agent-eval-specific adapter
12
+ code required.
13
+
14
+ ## TL;DR — one OTel context, two emitters
15
+
16
+ 1. Set up a shared OTel tracer provider in your process (or service mesh).
17
+ 2. Configure your observability tool (TraceAI / Langfuse / OpenLLMetry /
18
+ Phoenix) to register its instrumentations against that provider.
19
+ 3. Configure agent-eval's `/telemetry` exporter against the same provider.
20
+ 4. Run a campaign. Both sets of spans land at your OTel collector.
21
+ 5. Filter / route / fan-out at the collector layer — Jaeger, Tempo,
22
+ Phoenix, Langfuse cloud, your private collector, whatever.
23
+
24
+ The Tangle substrate doesn't compete with the observability tool;
25
+ they're orthogonal. The tool tells you *what your agent did*; the
26
+ substrate tells you *what the campaign / judge / mutator decided about
27
+ it*. Unified at the trace level, you see both as one timeline per cell.
28
+
29
+ ## Per-tool notes
30
+
31
+ ### TraceAI (Future-AGI)
32
+
33
+ - TS SDK auto-instruments OpenAI/Anthropic SDKs + LangChain.
34
+ - Compatible with the OpenTelemetry GenAI semantic conventions.
35
+ - Compose: register TraceAI's instrumentations on the global tracer
36
+ provider, then either point both at your OTLP collector or at
37
+ TraceAI's hosted backend if you want their UI.
38
+
39
+ ### Langfuse SDK
40
+
41
+ - Larger installed base; has its own hosted product + OSS self-host.
42
+ - Their OpenTelemetry-compatible mode ships LLM call spans with
43
+ Langfuse-specific attributes preserved.
44
+ - Compose: register Langfuse as an OTel processor; agent-eval's
45
+ campaign/judge/mutator spans appear alongside the LLM calls in their
46
+ UI.
47
+
48
+ ### OpenLLMetry (Traceloop)
49
+
50
+ - OSS auto-instrumentation library; OTel-native by design.
51
+ - Wide framework coverage (LangChain, LlamaIndex, Haystack, OpenAI,
52
+ Anthropic).
53
+ - Compose: set up Traceloop's exporter; agent-eval's exporter shares
54
+ the same trace context per cell.
55
+
56
+ ### Arize Phoenix
57
+
58
+ - OSS observability backend; strong in the eval-tooling community.
59
+ - OTel-native ingest; renders trace + span attributes per the GenAI
60
+ semantic conventions.
61
+ - Compose: point both exporters at your local Phoenix instance. Phoenix
62
+ becomes the unified UI for both LLM-call traces and campaign spans.
63
+
64
+ ## Wiring pattern (reference)
65
+
66
+ ```ts
67
+ import { trace } from '@opentelemetry/api'
68
+ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'
69
+ import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
70
+ import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'
71
+
72
+ // 1. One shared tracer provider for the process.
73
+ const provider = new NodeTracerProvider()
74
+ provider.addSpanProcessor(new SimpleSpanProcessor(
75
+ new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' }),
76
+ ))
77
+ provider.register()
78
+
79
+ // 2. Your observability tool registers against the global provider.
80
+ // Example for TraceAI / OpenLLMetry / Langfuse — call their init.
81
+ // (See each tool's docs.)
82
+
83
+ // 3. agent-eval is already OTel-native; it picks up the same global
84
+ // provider. Just ensure `@tangle-network/agent-eval/telemetry` is
85
+ // initialized for the campaign:
86
+ import { setOtelExporter } from '@tangle-network/agent-eval/telemetry'
87
+ setOtelExporter({ kind: 'otel-global' }) // use the global provider
88
+
89
+ // 4. Run your campaign — both sets of spans land at the collector.
90
+ import { runEval } from '@tangle-network/agent-eval/contract'
91
+ await runEval({ /* ... */ })
92
+ ```
93
+
94
+ That's it. No new adapter needs to ship — the libraries are already
95
+ designed to live in the same OTel ecosystem.
96
+
97
+ ## When you'd want a deeper, code-level adapter
98
+
99
+ The two cases where a thin adapter would add value beyond the
100
+ OTel-protocol composition:
101
+
102
+ 1. **Cost-aware judging.** Your observability tool's auto-instrumented
103
+ spans carry token counts + cost. A custom `JudgeConfig` can read
104
+ them via the OTel context and refuse to score artifacts that
105
+ exceeded a per-call budget. Easy to write yourself; we'll ship a
106
+ reference helper (`costAwareJudgeFromOtel`) when a partner pulls on
107
+ this.
108
+ 2. **Tool-aware judging.** Your instrumentation captures the tool-call
109
+ sequence (`langchain.tool.invoked`, `openai.function.called`, etc.).
110
+ A judge that scores "did the agent use the right tool" reads those
111
+ spans directly. Also straightforward; helper ships when needed.
112
+
113
+ Both of these are L1-tier ergonomic helpers; the underlying composition
114
+ works today without them.
115
+
116
+ ## What this does NOT install
117
+
118
+ No new dependencies. No new peer deps. No `@traceai/*`, no
119
+ `@langfuse/*`, no `@opentelemetry/*` in our manifest. You bring the
120
+ observability stack you want; agent-eval just emits OTel and respects
121
+ whatever provider is registered.
@@ -0,0 +1,173 @@
1
+ # Distributed driver — driver-on-A, workers-on-B (and C, D, E…)
2
+
3
+ The driver (running `runCampaign` / `runImprovementLoop` / `gepaDriver`)
4
+ and the worker (running your actual agent) **do not have to live in the
5
+ same process, machine, region, or cloud.** `Dispatch` is just a
6
+ function: scenario in, artifact out. Whatever returns the artifact is
7
+ the worker — local, remote, sandboxed, or fanned out across a fleet.
8
+
9
+ ## Why you'd want this
10
+
11
+ | Pattern | Reason |
12
+ |---|---|
13
+ | **Driver on your VPC, workers on our sandbox fleet** | Driver holds secrets, training data, prompt corpus; workers stay stateless and scale horizontally |
14
+ | **Multi-region campaigns** | Each cell runs in the region closest to its target API (latency, compliance, data residency) |
15
+ | **Driver-as-a-service** | Long-running optimization process, reused across many short-lived worker invocations |
16
+ | **Heterogeneous workers** | One cell on a CPU container, another on a GPU box, another against a third-party API — same Dispatch shape, different placement |
17
+ | **Budget-isolated workers** | Worker boxes get scoped, time-bounded credentials; driver never holds production keys |
18
+
19
+ ## Three new pieces in 0.45.0
20
+
21
+ | Where | What |
22
+ |---|---|
23
+ | **`DispatchContext.placement?: string`** | Opaque placement key the substrate forwards to the Dispatch. |
24
+ | **`RunCampaignOptions.cellPlacement?(input) → string \| undefined`** | Strategy function the substrate calls per cell to compute the placement key. |
25
+ | **`@tangle-network/agent-eval/adapters/http`** | `httpDispatch` (client) + `runDispatchServer` (server) — wire shape for HTTP-based remote workers. |
26
+
27
+ Both ends of the wire are in the same package; no peer dep, no separate
28
+ install. The substrate doesn't strategy-pick; you provide the
29
+ `cellPlacement` function, the substrate forwards its result, the
30
+ Dispatch reads it. Clean seam, no policy baked in.
31
+
32
+ ## The three reference topologies
33
+
34
+ ### 1. In-process (the default — what you already have)
35
+
36
+ ```ts
37
+ await runCampaign({
38
+ scenarios,
39
+ dispatch, // runs in-process
40
+ judges: [judge],
41
+ storage,
42
+ runDir,
43
+ })
44
+ ```
45
+
46
+ `ctx.placement` is `undefined`; nothing changes for existing consumers.
47
+ This shipped in 0.40.
48
+
49
+ ### 2. Single remote worker
50
+
51
+ Driver-on-A talks to one worker-on-B over HTTP.
52
+
53
+ **Driver side (machine A):**
54
+
55
+ ```ts
56
+ import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
57
+
58
+ const dispatch = httpDispatch<MyScenario, MyArtifact>({
59
+ url: 'https://worker.your-infra.com/dispatch',
60
+ auth: process.env.WORKER_TOKEN,
61
+ timeoutMs: 5 * 60 * 1000,
62
+ retries: 2,
63
+ })
64
+
65
+ await runImprovementLoop({ scenarios, baselineSurface, dispatchWithSurface: (surface, s, ctx) =>
66
+ dispatch(s, { ...ctx, /* pass the surface through your own protocol */ }),
67
+ /* ... */ })
68
+ ```
69
+
70
+ **Worker side (machine B):**
71
+
72
+ ```ts
73
+ import { runDispatchServer } from '@tangle-network/agent-eval/adapters/http'
74
+
75
+ const handle = await runDispatchServer<MyScenario, MyArtifact>({
76
+ dispatch: async (scenario, ctx) => {
77
+ // your agent — call OpenAI, LangChain, your sandbox, anything.
78
+ const artifact = await runMyAgent(scenario, ctx.signal)
79
+ return artifact
80
+ },
81
+ port: 8080,
82
+ auth: process.env.WORKER_TOKEN, // required; `false` only for closed networks
83
+ })
84
+ console.log(`worker listening on ${handle.port}`)
85
+ ```
86
+
87
+ Cancellation, retries on 5xx / 408 / 429, bounded timeouts, optional
88
+ custom auth headers, optional `fetchImpl` override — all there.
89
+
90
+ ### 3. Multi-region fan-out
91
+
92
+ Driver picks a region per cell; the same `httpDispatch` routes to
93
+ different worker URLs based on placement.
94
+
95
+ ```ts
96
+ import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
97
+
98
+ const REGION_URLS: Record<string, string> = {
99
+ 'us-east': 'https://worker-use1.your-infra.com/dispatch',
100
+ 'eu-west': 'https://worker-euw1.your-infra.com/dispatch',
101
+ 'ap-south': 'https://worker-aps1.your-infra.com/dispatch',
102
+ }
103
+
104
+ const dispatch = httpDispatch<MyScenario, MyArtifact>({
105
+ resolveUrl: ({ placement }) => REGION_URLS[placement ?? 'us-east'],
106
+ auth: process.env.WORKER_TOKEN,
107
+ })
108
+
109
+ await runCampaign({
110
+ scenarios,
111
+ dispatch,
112
+ judges: [judge],
113
+ storage,
114
+ runDir,
115
+ cellPlacement: ({ scenario }) => {
116
+ if (scenario.tags?.includes('eu')) return 'eu-west'
117
+ if (scenario.tags?.includes('ap')) return 'ap-south'
118
+ return 'us-east'
119
+ },
120
+ maxConcurrency: 8, // 8 cells fan across regions in parallel
121
+ })
122
+ ```
123
+
124
+ `cellPlacement` is a pure function the substrate calls per cell — no
125
+ state. Use whatever signal you want (tags, hash of scenario id,
126
+ round-robin, region-affinity from a previous run, scheduling table).
127
+
128
+ ## What's preserved across the wire
129
+
130
+ | Concern | How |
131
+ |---|---|
132
+ | **Cancellation** | Driver's `AbortSignal` forwards into the HTTP request; server translates `AbortError` → `499` so client doesn't retry. |
133
+ | **Timeouts** | Per-call `timeoutMs` on the client; server can layer its own. |
134
+ | **Retries** | Idempotent retries on 5xx / 408 / 429 with exponential backoff + jitter. Driver-aborts never retry. |
135
+ | **Auth** | Bearer token on `Authorization`; pluggable via `auth: string \| (() => string \| Promise<string>)` for rotation/refresh. |
136
+ | **Payload size** | Server enforces `maxBodyBytes` (default 10 MB). |
137
+ | **Traces** | Both ends emit OTel — if both point at the same OTLP collector, you get a unified trace per cell. See `docs/adapters-observability.md`. |
138
+ | **Cost** | Worker's `ctx.cost.observe(usd, source)` is local to the worker process. Roll up server-side and attach to your worker-side telemetry; we don't (yet) forward cost back to the driver. Tracked as follow-up. |
139
+
140
+ ## Running the reference example
141
+
142
+ See `examples/distributed-driver/`:
143
+
144
+ ```sh
145
+ # Terminal 1 — worker
146
+ pnpm tsx examples/distributed-driver/worker.ts
147
+
148
+ # Terminal 2 — driver
149
+ WORKER_URL=http://localhost:8080/dispatch \
150
+ WORKER_TOKEN=dev-token \
151
+ pnpm tsx examples/distributed-driver/driver.ts
152
+ ```
153
+
154
+ Two processes, one local TCP loopback, full self-improvement loop end
155
+ to end. To scale out, point `WORKER_URL` at a non-loopback hostname
156
+ and use `cellPlacement` to fan out across many workers.
157
+
158
+ ## Known gaps + follow-ups
159
+
160
+ - **Cost roll-up across the wire** — worker-side `ctx.cost` observations
161
+ stay on the worker. We need to forward them in the response body so
162
+ `defaultProductionGate`'s `budgetUsd` ceiling reflects total spend, not
163
+ driver-side spend. Tracked as a 0.45.x follow-up.
164
+ - **Per-cell artifact streaming** — when the worker writes intermediate
165
+ artifacts via `ctx.artifacts.write`, those land on the worker's
166
+ storage. For multi-worker campaigns you'll want a shared object store
167
+ (S3/GCS) reachable from both sides; today consumers wire that as a
168
+ `CampaignStorage` impl. A reference S3-backed storage is on the
169
+ roadmap.
170
+ - **gRPC / NATS / Temporal transports** — the wire is HTTP today by
171
+ default because everything speaks HTTP. Other transports can ship as
172
+ additional adapters; the `Dispatch` interface itself is
173
+ transport-agnostic.
@@ -0,0 +1,190 @@
1
+ # Quickstart — self-improvement loop for any agent (15 minutes)
2
+
3
+ This is the standalone walkthrough mirroring
4
+ `examples/foreign-agent-quickstart/`. Read this first; copy the runnable
5
+ example second.
6
+
7
+ ## What you get
8
+
9
+ After 15 minutes you have a closed self-improvement loop running
10
+ against your agent — measured, gated, and reproducible — with no
11
+ Tangle sandbox, no Tangle account, and no hosted infrastructure.
12
+
13
+ ## Install
14
+
15
+ ```sh
16
+ npm i @tangle-network/agent-eval@^0.44.0
17
+ ```
18
+
19
+ The package's `@tangle-network/sandbox` peer is `optional` (as of
20
+ 0.44.0). Foreign consumers can install agent-eval and run the full LAND
21
+ tier without our sandbox or its dependencies.
22
+
23
+ ## Five types, four functions
24
+
25
+ ```ts
26
+ import {
27
+ // Types
28
+ type Scenario, // what you evaluate against (id + kind + your fields)
29
+ type Dispatch, // your agent, wrapped as one function
30
+ type JudgeConfig, // pluggable dimensional scorer
31
+ type Mutator, // proposes a next surface
32
+ type Gate, // promotion guard
33
+
34
+ // Functions
35
+ runEval,
36
+ runCampaign,
37
+ runImprovementLoop,
38
+ defaultProductionGate,
39
+
40
+ // Storage
41
+ fsCampaignStorage,
42
+ inMemoryCampaignStorage,
43
+ } from '@tangle-network/agent-eval/contract'
44
+ ```
45
+
46
+ Every export above is committed under semver. New minors only ADD;
47
+ nothing here changes shape in a 0.x minor.
48
+
49
+ ## Three steps to wire your agent
50
+
51
+ ### 1. Scenarios
52
+
53
+ ```ts
54
+ interface MarketingScenario extends Scenario {
55
+ blurb: string
56
+ surface: 'landing-hero' | 'tweet' | 'email-subject'
57
+ audience: string
58
+ }
59
+
60
+ const scenarios: MarketingScenario[] = [
61
+ { id: 's1', kind: 'marketing-rewrite', blurb: '...', surface: 'tweet', audience: '...' },
62
+ // ...
63
+ ]
64
+ ```
65
+
66
+ ### 2. Wrap your agent as `Dispatch`
67
+
68
+ ```ts
69
+ const dispatch: Dispatch<MarketingScenario, MarketingArtifact> = async (scenario, ctx) => {
70
+ const rewrite = await callYourAgent(scenario, { signal: ctx.signal })
71
+ return { rewrite, modelUsed: '...' }
72
+ }
73
+ ```
74
+
75
+ `ctx` carries `signal` (cancellation), `trace` (write spans), `artifacts`
76
+ (write blobs), `cost` (token + $ meter). Use them or ignore them.
77
+
78
+ ### 3. Bring a judge
79
+
80
+ ```ts
81
+ const judge: JudgeConfig<MarketingArtifact, MarketingScenario> = {
82
+ name: 'marketing-quality',
83
+ dimensions: [
84
+ { key: 'hook_strength', description: '...' },
85
+ { key: 'voice_match', description: '...' },
86
+ { key: 'cta_clarity', description: '...' },
87
+ { key: 'factual_grounding', description: '...' },
88
+ ],
89
+ async score({ artifact, scenario, signal }) {
90
+ // LLM call, heuristic, ensemble — anything. Return JudgeScore.
91
+ return { dimensions: { ... }, composite: 0.72, notes: '...' }
92
+ },
93
+ }
94
+ ```
95
+
96
+ Throw on failure; the substrate records it as a failed cell. No silent
97
+ zeros.
98
+
99
+ ## Baseline
100
+
101
+ ```ts
102
+ const baseline = await runEval({
103
+ scenarios,
104
+ dispatch,
105
+ judges: [judge],
106
+ storage: inMemoryCampaignStorage(),
107
+ runDir: 'mem://my-baseline',
108
+ })
109
+
110
+ const score = Object.values(baseline.aggregates.byScenario)
111
+ .reduce((sum, s) => sum + s.meanComposite, 0) / scenarios.length
112
+
113
+ console.log(`Baseline composite: ${score.toFixed(3)}`)
114
+ ```
115
+
116
+ ## Self-improvement loop
117
+
118
+ ```ts
119
+ import { gepaDriver, defaultProductionGate } from '@tangle-network/agent-eval/contract'
120
+
121
+ const result = await runImprovementLoop({
122
+ scenarios: trainScenarios,
123
+ baselineSurface,
124
+ dispatchWithSurface: (surface, scenario, ctx) =>
125
+ runYourAgent({ systemPrompt: surface as string }, scenario, ctx),
126
+ driver: gepaDriver({
127
+ llm: { apiKey: process.env.OPENAI_API_KEY, baseUrl: '...' },
128
+ model: 'gpt-4o-mini',
129
+ target: 'marketing copywriting system prompt',
130
+ mutationPrimitives: [
131
+ 'Tighten the hook: lead with the concrete user outcome.',
132
+ 'Replace generic adjectives with specific verbs.',
133
+ // ...
134
+ ],
135
+ }),
136
+ judges: [judge],
137
+ populationSize: 2,
138
+ maxGenerations: 3,
139
+ holdoutScenarios,
140
+ gate: defaultProductionGate({
141
+ holdoutScenarios,
142
+ deltaThreshold: 0.05,
143
+ }),
144
+ autoOnPromote: 'none',
145
+ storage: inMemoryCampaignStorage(),
146
+ runDir: 'mem://my-improve',
147
+ })
148
+
149
+ if (result.gateResult.decision === 'ship') {
150
+ // Deploy result.winnerSurface — we don't push it for you.
151
+ }
152
+ ```
153
+
154
+ The gate decision is `'ship'` | `'hold'` | `'need_more_work'` |
155
+ `'model_ceiling'` | `'arch_ceiling'`. You define what each means in
156
+ your deploy pipeline.
157
+
158
+ ## What you control
159
+
160
+ - The agent (any framework, any model, any backend).
161
+ - The judge (LLM, heuristic, ensemble; we don't pick).
162
+ - The mutation strategy (`gepaDriver` for reflective LLM mutation,
163
+ `evolutionaryDriver({ mutator })` for population search, or
164
+ implement `ImprovementDriver` directly).
165
+ - The gate (compose `defaultProductionGate` with custom checks via
166
+ `composeGate`).
167
+ - The deploy step (`autoOnPromote: 'pr'` opens a GitHub PR with the
168
+ winner; `'none'` returns the surface and you ship however you ship).
169
+
170
+ ## What this does NOT install
171
+
172
+ - No `@tangle-network/sandbox` — nothing runs in a Tangle sandbox.
173
+ - No hosted orchestrator — traces, artifacts, judge scores stay on
174
+ your machine (or in `inMemoryCampaignStorage` for Workers/edge).
175
+ - No daemons — `runEval` and `runImprovementLoop` complete in-process
176
+ and return.
177
+
178
+ ## When you want more
179
+
180
+ The wedge doc (`docs/design/external-agent-wedge.md`) lays out three
181
+ graduated tiers:
182
+
183
+ | Tier | What you do | What you get |
184
+ |---|---|---|
185
+ | **LAND** (this quickstart) | `npm i @tangle-network/agent-eval`, wrap dispatch + judge, run loops | Local artifacts; full self-improvement; no Tangle infra |
186
+ | **EXPAND** | Point trace/eval data at our hosted orchestrator | Hosted dashboards, cross-run intelligence, billing on data routed to us |
187
+ | **PLATFORM** | Move execution into our sandbox | Substrate + orchestrator data pre-wired; sandbox usage billing |
188
+
189
+ Each tier is opt-in. EXPAND and PLATFORM build on the same primitives;
190
+ upgrading is adding configuration, not rewriting your wiring.