@tangle-network/agent-eval 0.44.0 → 0.45.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +138 -0
- package/dist/adapters/http.js +196 -0
- package/dist/adapters/http.js.map +1 -0
- package/dist/adapters/langchain.d.ts +91 -0
- package/dist/adapters/langchain.js +34 -0
- package/dist/adapters/langchain.js.map +1 -0
- package/dist/campaign/index.d.ts +3 -3
- package/dist/campaign/index.js +2 -2
- package/dist/{chunk-H5BGRSN4.js → chunk-HRKOCLQA.js} +3 -3
- package/dist/{chunk-RXK7FXLV.js → chunk-J3EIOI3O.js} +7 -2
- package/dist/chunk-J3EIOI3O.js.map +1 -0
- package/dist/contract/index.d.ts +2 -2
- package/dist/contract/index.js +2 -2
- package/dist/openapi.json +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/{run-campaign-GNDO66B4.js → run-campaign-6UEVBPP3.js} +2 -2
- package/dist/{run-improvement-loop-CbilHQAb.d.ts → run-improvement-loop-pJ4yrx4X.d.ts} +17 -1
- package/dist/{types-DToGONFA.d.ts → types-BURGZ8Ug.d.ts} +8 -0
- package/docs/adapters-observability.md +121 -0
- package/docs/distributed-driver.md +173 -0
- package/docs/quickstart-external.md +190 -0
- package/package.json +11 -1
- package/dist/chunk-RXK7FXLV.js.map +0 -1
- /package/dist/{chunk-H5BGRSN4.js.map → chunk-HRKOCLQA.js.map} +0 -0
- /package/dist/{run-campaign-GNDO66B4.js.map → run-campaign-6UEVBPP3.js.map} +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/campaign/run-campaign.ts","../src/campaign/storage.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport { type CampaignStorage, fsCampaignStorage } from './storage'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. 
*/\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n /** Storage backend for run/cell dirs, the resumability cache, artifacts,\n * and trace spans. Default: the Node filesystem (`fsCampaignStorage`).\n * Pass `inMemoryCampaignStorage()` to run in a filesystem-less runtime\n * (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still\n * produced; artifacts/traces just aren't persisted to disk. */\n storage?: CampaignStorage\n /**\n * Optional per-cell placement strategy. Returns an opaque string the\n * substrate forwards as `ctx.placement` to the Dispatch — placement-aware\n * Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route\n * each cell to the right worker, region, or sandbox. When unset, every\n * cell receives `ctx.placement = undefined` and behaves identically to\n * the in-process case.\n *\n * @example\n * cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 
'eu-west' : 'us-east'\n */\n cellPlacement?: (input: {\n scenario: TScenario\n rep: number\n generation?: number\n }) => string | undefined\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? []\n const storage = opts.storage ?? fsCampaignStorage()\n\n storage.ensureDir(opts.runDir)\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). 
See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n storage,\n buildTraceWriter: opts.buildTraceWriter ?? defaultBuildTraceWriter(storage),\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? 
err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n storage: CampaignStorage\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const storage = args.storage\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n storage.ensureDir(cellDir)\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable) {\n const raw = storage.read(cachePath)\n if (raw !== undefined) {\n try {\n const cached = JSON.parse(raw) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n 
const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n storage.ensureDir(join(fullPath, '..'))\n storage.write(fullPath, content)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n current() {\n return costSoFar\n },\n }\n\n const placement = args.opts.cellPlacement?.({\n scenario: args.slot.scenario,\n rep: args.slot.rep,\n })\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n placement,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? err.message : String(err)\n }\n\n // Run judges (only if we have an artifact). A judge that throws invalidates\n // the cell — recorded as `error`, NOT folded into a fake composite:0 (a fake\n // zero is indistinguishable from a real zero and poisons every aggregate).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n judgeScores[judge.name] = await runJudgeCell(judge, {\n artifact,\n scenario: args.slot.scenario,\n signal: args.signal,\n })\n } catch (err) {\n errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? 
err.message : String(err)}`\n break\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n storage.write(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n judge: JudgeConfig<TArtifact, TScenario>,\n input: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal },\n): Promise<JudgeScore> {\n return judge.score(input)\n}\n\nfunction defaultBuildTraceWriter(\n storage: CampaignStorage,\n): (cellId: string, dir: string) => CampaignTraceWriter {\n return (cellId, dir) => {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n storage.write(join(dir, 'spans.jsonl'), spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n }\n}\n","/**\n * @experimental\n *\n * `CampaignStorage` — the filesystem seam `runCampaign` writes through\n * (run/cell dirs, the resumability cache, per-cell artifacts, trace spans).\n *\n * The default (`fsCampaignStorage`) is the Node filesystem — identical\n * behavior to the inline `node:fs` calls it replaces, so existing CLI\n * consumers are unaffected. 
`inMemoryCampaignStorage` keeps everything in a\n * `Map`, so the substrate runs in environments WITHOUT a filesystem\n * (Cloudflare Workers, Deno Deploy, other edge runtimes) — the campaign\n * still produces its `CampaignResult` (cells + aggregates) in memory;\n * artifacts/traces simply aren't persisted to disk.\n *\n * Paths are opaque keys to the in-memory adapter — it does not parse them,\n * so the same `join(...)`-built paths work unchanged across both adapters.\n */\nexport interface CampaignStorage {\n /** Ensure a directory exists (recursive). No-op for in-memory. */\n ensureDir(dir: string): void\n /** Does this path exist (as a written file or an ensured dir)? */\n exists(path: string): boolean\n /** Read a UTF-8 file; `undefined` when missing or unreadable. */\n read(path: string): string | undefined\n /** Write a file (string or bytes). Parent dir is assumed ensured. */\n write(path: string, content: string | Uint8Array): void\n}\n\n/** Node-filesystem storage — the default. Lazily requires `node:fs` so the\n * module imports cleanly in non-Node runtimes (where the caller passes\n * `inMemoryCampaignStorage` instead and never constructs this). */\nexport function fsCampaignStorage(): CampaignStorage {\n const { existsSync, mkdirSync, readFileSync, writeFileSync } =\n require('node:fs') as typeof import('node:fs')\n return {\n ensureDir(dir) {\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n },\n exists(path) {\n return existsSync(path)\n },\n read(path) {\n try {\n return readFileSync(path, 'utf8')\n } catch {\n return undefined\n }\n },\n write(path, content) {\n writeFileSync(path, content as Uint8Array)\n },\n }\n}\n\n/** In-memory storage for filesystem-less runtimes. Artifacts + trace spans\n * live in a `Map` for the duration of the run; the `CampaignResult` is\n * fully populated, but nothing is persisted to disk. 
*/\nexport function inMemoryCampaignStorage(): CampaignStorage {\n const files = new Map<string, string | Uint8Array>()\n const dirs = new Set<string>()\n return {\n ensureDir(dir) {\n dirs.add(dir)\n },\n exists(path) {\n return files.has(path) || dirs.has(path)\n },\n read(path) {\n const value = files.get(path)\n if (value === undefined) return undefined\n return typeof value === 'string' ? value : new TextDecoder().decode(value)\n },\n write(path, content) {\n files.set(path, content)\n },\n }\n}\n"],"mappings":";;;;;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY;;;ACiBd,SAAS,oBAAqC;AACnD,QAAM,EAAE,YAAY,WAAW,cAAc,cAAc,IACzD,UAAQ,IAAS;AACnB,SAAO;AAAA,IACL,UAAU,KAAK;AACb,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AAAA,IAC1D;AAAA,IACA,OAAO,MAAM;AACX,aAAO,WAAW,IAAI;AAAA,IACxB;AAAA,IACA,KAAK,MAAM;AACT,UAAI;AACF,eAAO,aAAa,MAAM,MAAM;AAAA,MAClC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,oBAAc,MAAM,OAAqB;AAAA,IAC3C;AAAA,EACF;AACF;AAKO,SAAS,0BAA2C;AACzD,QAAM,QAAQ,oBAAI,IAAiC;AACnD,QAAM,OAAO,oBAAI,IAAY;AAC7B,SAAO;AAAA,IACL,UAAU,KAAK;AACb,WAAK,IAAI,GAAG;AAAA,IACd;AAAA,IACA,OAAO,MAAM;AACX,aAAO,MAAM,IAAI,IAAI,KAAK,KAAK,IAAI,IAAI;AAAA,IACzC;AAAA,IACA,KAAK,MAAM;AACT,YAAM,QAAQ,MAAM,IAAI,IAAI;AAC5B,UAAI,UAAU,OAAW,QAAO;AAChC,aAAO,OAAO,UAAU,WAAW,QAAQ,IAAI,YAAY,EAAE,OAAO,KAAK;AAAA,IAC3E;AAAA,IACA,MAAM,MAAM,SAAS;AACnB,YAAM,IAAI,MAAM,OAAO;AAAA,IACzB;AAAA,EACF;AACF;;;ADeA,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAC/B,QAAM,UAAU,KAAK,WAAW,kBAAkB;AAElD,UAAQ,UAAU,KAAK,MAAM;AAE7B,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,
OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB,wBAAwB,OAAO;AAAA,YAC1E,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAeA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK;AACrB,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,UAAQ,UAAU,OAAO;AAGzB,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,WAAW;AAClB,UAAM,MAAM,QAAQ,KAAK,SAAS;AAClC,QAAI,QAAQ,QAAW;AACrB,UAAI;AACF,cAAM,SAAS,KAAK,MAAM,GAAG;AAC7B,YAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,iBAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,QAClE;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IA
AI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,cAAQ,UAAU,KAAK,UAAU,IAAI,CAAC;AACtC,cAAQ,MAAM,UAAU,OAAO;AAC/B,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,EACF;AAEA,QAAM,YAAY,KAAK,KAAK,gBAAgB;AAAA,IAC1C,UAAU,KAAK,KAAK;AAAA,IACpB,KAAK,KAAK,KAAK;AAAA,EACjB,CAAC;AAED,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAKA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,oBAAY,MAAM,IAAI,IAAI,MAAM,aAAa,OAAO;AAAA,UAClD;AAAA,UACA,UAAU,KAAK,KAAK;AAAA,UACpB,QAAQ,KAAK;AAAA,QACf,CAAC;AAAA,MACH,SAAS,KAAK;AACZ,uBAAe,UAAU,MAAM,IAAI,aAAa,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAChG;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,YAAQ,MAAM,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,OACA,OACqB;AACrB,SAAO,MAAM,MAAM,KAAK;AAC1B;AAEA,SAAS,wBACP,SACsD;AACtD,SAAO,CAAC,QAAQ,QAAQ;AACtB,UAAM,QAAwC,CAAC;AAC/C,WAAO;AAAA,MACL,KAAK,MAAM,YAAY;AACrB,cAAM,UAAU,KAAK,IAAI;AACzB,cAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc
,CAAC,EAAG;AACvF,cAAM,SAAoB;AAAA,UACxB,IAAI,UAAU;AACZ,mBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,gBAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,kBAAM,KAAK,MAAM;AAAA,UACnB;AAAA,UACA,aAAa,KAAK,OAAO;AACvB,mBAAO,GAAG,IAAI;AAAA,UAChB;AAAA,QACF;AACA,eAAO;AAAA,MACT;AAAA,MACA,MAAM,QAAQ;AACZ,gBAAQ,MAAM,KAAK,KAAK,aAAa,GAAG,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,MACxF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAA
E;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
|
package/dist/contract/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-
|
|
2
|
-
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-
|
|
1
|
+
export { C as CampaignAggregates, a as CampaignArtifactWriter, b as CampaignCellResult, c as CampaignCostMeter, d as CampaignResult, e as CampaignTraceWriter, f as CodeSurface, D as Dispatch, g as DispatchContext, G as Gate, h as GateContext, i as GateDecision, j as GateResult, k as GenerationCandidate, l as GenerationRecord, I as ImprovementDriver, J as JudgeConfig, m as JudgeDimension, n as JudgeScore, M as MutableSurface, o as Mutator, O as OptimizerConfig, S as Scenario, p as SessionScript } from '../types-BURGZ8Ug.js';
|
|
2
|
+
export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, a as RunEvalOptions, b as RunImprovementLoopOptions, c as RunImprovementLoopResult, d as composeGate, e as defaultProductionGate, f as evolutionaryDriver, g as fsCampaignStorage, h as gepaDriver, i as heldOutGate, j as inMemoryCampaignStorage, r as runCampaign, k as runEval, l as runImprovementLoop } from '../run-improvement-loop-pJ4yrx4X.js';
|
|
3
3
|
export { D as DeploymentOutcome, F as FileSystemOutcomeStore, a as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-BxJ3DQKJ.js';
|
|
4
4
|
import '../llm-client-BXVRUZyX.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
package/dist/contract/index.js
CHANGED
|
@@ -6,12 +6,12 @@ import {
|
|
|
6
6
|
heldOutGate,
|
|
7
7
|
runEval,
|
|
8
8
|
runImprovementLoop
|
|
9
|
-
} from "../chunk-
|
|
9
|
+
} from "../chunk-HRKOCLQA.js";
|
|
10
10
|
import {
|
|
11
11
|
fsCampaignStorage,
|
|
12
12
|
inMemoryCampaignStorage,
|
|
13
13
|
runCampaign
|
|
14
|
-
} from "../chunk-
|
|
14
|
+
} from "../chunk-J3EIOI3O.js";
|
|
15
15
|
import "../chunk-N4SBKEPJ.js";
|
|
16
16
|
import "../chunk-YV7J7X5N.js";
|
|
17
17
|
import {
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.44.1",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/rl.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { d as CampaignResult } from './types-
|
|
2
|
+
import { d as CampaignResult } from './types-BURGZ8Ug.js';
|
|
3
3
|
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-CoJMs2Iz.js';
|
|
4
4
|
export { r as runEvalCampaign } from './researcher-CoJMs2Iz.js';
|
|
5
5
|
import { S as Span, T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-J3EIOI3O.js";
|
|
4
4
|
import "./chunk-WP7SY7AI.js";
|
|
5
5
|
import "./chunk-QYJT52YW.js";
|
|
6
6
|
import "./chunk-NSBPE2FW.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-
|
|
10
|
+
//# sourceMappingURL=run-campaign-6UEVBPP3.js.map
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-
|
|
1
|
+
import { S as Scenario, d as CampaignResult, j as GateResult, o as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, J as JudgeConfig, L as LabeledScenarioStore, e as CampaignTraceWriter, M as MutableSurface, l as GenerationRecord } from './types-BURGZ8Ug.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
3
|
import { RunRecord } from '@tangle-network/agent-runtime';
|
|
4
4
|
import { R as RedTeamCase } from './red-team-30II1T4o.js';
|
|
@@ -267,6 +267,22 @@ interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
|
|
|
267
267
|
* (Cloudflare Workers, Deno, edge) — the `CampaignResult` is still
|
|
268
268
|
* produced; artifacts/traces just aren't persisted to disk. */
|
|
269
269
|
storage?: CampaignStorage;
|
|
270
|
+
/**
|
|
271
|
+
* Optional per-cell placement strategy. Returns an opaque string the
|
|
272
|
+
* substrate forwards as `ctx.placement` to the Dispatch — placement-aware
|
|
273
|
+
* Dispatches (e.g. `httpDispatch` from `/adapters/http`) use it to route
|
|
274
|
+
* each cell to the right worker, region, or sandbox. When unset, every
|
|
275
|
+
* cell receives `ctx.placement = undefined` and behaves identically to
|
|
276
|
+
* the in-process case.
|
|
277
|
+
*
|
|
278
|
+
* @example
|
|
279
|
+
* cellPlacement: ({ scenario }) => scenario.tags?.includes('eu') ? 'eu-west' : 'us-east'
|
|
280
|
+
*/
|
|
281
|
+
cellPlacement?: (input: {
|
|
282
|
+
scenario: TScenario;
|
|
283
|
+
rep: number;
|
|
284
|
+
generation?: number;
|
|
285
|
+
}) => string | undefined;
|
|
270
286
|
}
|
|
271
287
|
declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
|
|
272
288
|
|
|
@@ -40,6 +40,14 @@ interface DispatchContext {
|
|
|
40
40
|
cycleId?: string;
|
|
41
41
|
/** Populated when the substrate resumed from a prior cache hit. */
|
|
42
42
|
resumedFrom?: string;
|
|
43
|
+
/**
|
|
44
|
+
* Opaque placement key supplied by `RunCampaignOptions.cellPlacement`.
|
|
45
|
+
* The substrate forwards it through unchanged; placement-aware Dispatch
|
|
46
|
+
* implementations (e.g. `httpDispatch` from `/adapters/http`) read it to
|
|
47
|
+
* route the cell to the right worker / region / sandbox. `undefined`
|
|
48
|
+
* when no placement strategy is configured.
|
|
49
|
+
*/
|
|
50
|
+
placement?: string;
|
|
43
51
|
}
|
|
44
52
|
/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
|
|
45
53
|
* whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Composing agent-eval with your observability stack
|
|
2
|
+
|
|
3
|
+
`@tangle-network/agent-eval` ships its own OpenTelemetry pipeline
|
|
4
|
+
(`@tangle-network/agent-eval/telemetry`) that emits spans for every
|
|
5
|
+
cell, judge invocation, mutator proposal, and gate decision. **It's
|
|
6
|
+
just OTel** — same protocol as Langfuse SDK, OpenLLMetry, Arize
|
|
7
|
+
Phoenix, TraceAI, and the OpenTelemetry GenAI semantic conventions.
|
|
8
|
+
|
|
9
|
+
That means: if you already instrument your agent with any OTel-native
|
|
10
|
+
observability tool, the two compose **for free at the protocol layer**.
|
|
11
|
+
This doc shows the composition pattern; no agent-eval-specific adapter
|
|
12
|
+
code required.
|
|
13
|
+
|
|
14
|
+
## TL;DR — one OTel context, two emitters
|
|
15
|
+
|
|
16
|
+
1. Set up a shared OTel tracer provider in your process (or service mesh).
|
|
17
|
+
2. Configure your observability tool (TraceAI / Langfuse / OpenLLMetry /
|
|
18
|
+
Phoenix) to register its instrumentations against that provider.
|
|
19
|
+
3. Configure agent-eval's `/telemetry` exporter against the same provider.
|
|
20
|
+
4. Run a campaign. Both sets of spans land at your OTel collector.
|
|
21
|
+
5. Filter / route / fan-out at the collector layer — Jaeger, Tempo,
|
|
22
|
+
Phoenix, Langfuse cloud, your private collector, whatever.
|
|
23
|
+
|
|
24
|
+
The Tangle substrate doesn't compete with the observability tool;
|
|
25
|
+
they're orthogonal. The tool tells you *what your agent did*; the
|
|
26
|
+
substrate tells you *what the campaign / judge / mutator decided about
|
|
27
|
+
it*. Unified at the trace level, you see both as one timeline per cell.
|
|
28
|
+
|
|
29
|
+
## Per-tool notes
|
|
30
|
+
|
|
31
|
+
### TraceAI (Future-AGI)
|
|
32
|
+
|
|
33
|
+
- TS SDK auto-instruments OpenAI/Anthropic SDKs + LangChain.
|
|
34
|
+
- Compatible with the OpenTelemetry GenAI semantic conventions.
|
|
35
|
+
- Compose: register TraceAI's instrumentations on the global tracer
|
|
36
|
+
provider, then either point both at your OTLP collector or at
|
|
37
|
+
TraceAI's hosted backend if you want their UI.
|
|
38
|
+
|
|
39
|
+
### Langfuse SDK
|
|
40
|
+
|
|
41
|
+
- Larger installed base; has its own hosted product + OSS self-host.
|
|
42
|
+
- Their OpenTelemetry-compatible mode ships LLM call spans with
|
|
43
|
+
Langfuse-specific attributes preserved.
|
|
44
|
+
- Compose: register Langfuse as an OTel processor; agent-eval's
|
|
45
|
+
campaign/judge/mutator spans appear alongside the LLM calls in their
|
|
46
|
+
UI.
|
|
47
|
+
|
|
48
|
+
### OpenLLMetry (Traceloop)
|
|
49
|
+
|
|
50
|
+
- OSS auto-instrumentation library; OTel-native by design.
|
|
51
|
+
- Wide framework coverage (LangChain, LlamaIndex, Haystack, OpenAI,
|
|
52
|
+
Anthropic).
|
|
53
|
+
- Compose: set up Traceloop's exporter; agent-eval's exporter shares
|
|
54
|
+
the same trace context per cell.
|
|
55
|
+
|
|
56
|
+
### Arize Phoenix
|
|
57
|
+
|
|
58
|
+
- OSS observability backend; strong in the eval-tooling community.
|
|
59
|
+
- OTel-native ingest; renders trace + span attributes per the GenAI
|
|
60
|
+
semantic conventions.
|
|
61
|
+
- Compose: point both exporters at your local Phoenix instance. Phoenix
|
|
62
|
+
becomes the unified UI for both LLM-call traces and campaign spans.
|
|
63
|
+
|
|
64
|
+
## Wiring pattern (reference)
|
|
65
|
+
|
|
66
|
+
```ts
|
|
67
|
+
import { trace } from '@opentelemetry/api'
|
|
68
|
+
import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'
|
|
69
|
+
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
|
|
70
|
+
import { SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'
|
|
71
|
+
|
|
72
|
+
// 1. One shared tracer provider for the process.
|
|
73
|
+
const provider = new NodeTracerProvider()
|
|
74
|
+
provider.addSpanProcessor(new SimpleSpanProcessor(
|
|
75
|
+
new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' }),
|
|
76
|
+
))
|
|
77
|
+
provider.register()
|
|
78
|
+
|
|
79
|
+
// 2. Your observability tool registers against the global provider.
|
|
80
|
+
// Example for TraceAI / OpenLLMetry / Langfuse — call their init.
|
|
81
|
+
// (See each tool's docs.)
|
|
82
|
+
|
|
83
|
+
// 3. agent-eval is already OTel-native; it picks up the same global
|
|
84
|
+
// provider. Just ensure `@tangle-network/agent-eval/telemetry` is
|
|
85
|
+
// initialized for the campaign:
|
|
86
|
+
import { setOtelExporter } from '@tangle-network/agent-eval/telemetry'
|
|
87
|
+
setOtelExporter({ kind: 'otel-global' }) // use the global provider
|
|
88
|
+
|
|
89
|
+
// 4. Run your campaign — both sets of spans land at the collector.
|
|
90
|
+
import { runEval } from '@tangle-network/agent-eval/contract'
|
|
91
|
+
await runEval({ /* ... */ })
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
That's it. No new adapter needs to ship — the libraries are already
|
|
95
|
+
designed to live in the same OTel ecosystem.
|
|
96
|
+
|
|
97
|
+
## When you'd want a deeper, code-level adapter
|
|
98
|
+
|
|
99
|
+
The two cases where a thin adapter would add value beyond the
|
|
100
|
+
OTel-protocol composition:
|
|
101
|
+
|
|
102
|
+
1. **Cost-aware judging.** Your observability tool's auto-instrumented
|
|
103
|
+
spans carry token counts + cost. A custom `JudgeConfig` can read
|
|
104
|
+
them via the OTel context and refuse to score artifacts that
|
|
105
|
+
exceeded a per-call budget. Easy to write yourself; we'll ship a
|
|
106
|
+
reference helper (`costAwareJudgeFromOtel`) when a partner pulls on
|
|
107
|
+
this.
|
|
108
|
+
2. **Tool-aware judging.** Your instrumentation captures the tool-call
|
|
109
|
+
sequence (`langchain.tool.invoked`, `openai.function.called`, etc.).
|
|
110
|
+
A judge that scores "did the agent use the right tool" reads those
|
|
111
|
+
spans directly. Also straightforward; helper ships when needed.
|
|
112
|
+
|
|
113
|
+
Both of these are L1-tier ergonomic helpers; the underlying composition
|
|
114
|
+
works today without them.
|
|
115
|
+
|
|
116
|
+
## What this does NOT install
|
|
117
|
+
|
|
118
|
+
No new dependencies. No new peer deps. No `@traceai/*`, no
|
|
119
|
+
`@langfuse/*`, no `@opentelemetry/*` in our manifest. You bring the
|
|
120
|
+
observability stack you want; agent-eval just emits OTel and respects
|
|
121
|
+
whatever provider is registered.
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Distributed driver — driver-on-A, workers-on-B (and C, D, E…)
|
|
2
|
+
|
|
3
|
+
The driver (running `runCampaign` / `runImprovementLoop` / `gepaDriver`)
|
|
4
|
+
and the worker (running your actual agent) **do not have to live in the
|
|
5
|
+
same process, machine, region, or cloud.** `Dispatch` is just a
|
|
6
|
+
function: scenario in, artifact out. Whatever returns the artifact is
|
|
7
|
+
the worker — local, remote, sandboxed, or fanned out across a fleet.
|
|
8
|
+
|
|
9
|
+
## Why you'd want this
|
|
10
|
+
|
|
11
|
+
| Pattern | Reason |
|
|
12
|
+
|---|---|
|
|
13
|
+
| **Driver on your VPC, workers on our sandbox fleet** | Driver holds secrets, training data, prompt corpus; workers stay stateless and scale horizontally |
|
|
14
|
+
| **Multi-region campaigns** | Each cell runs in the region closest to its target API (latency, compliance, data residency) |
|
|
15
|
+
| **Driver-as-a-service** | Long-running optimization process, reused across many short-lived worker invocations |
|
|
16
|
+
| **Heterogeneous workers** | One cell on a CPU container, another on a GPU box, another against a third-party API — same Dispatch shape, different placement |
|
|
17
|
+
| **Budget-isolated workers** | Worker boxes get scoped, time-bounded credentials; driver never holds production keys |
|
|
18
|
+
|
|
19
|
+
## Three new pieces in 0.45.0
|
|
20
|
+
|
|
21
|
+
| Where | What |
|
|
22
|
+
|---|---|
|
|
23
|
+
| **`DispatchContext.placement?: string`** | Opaque placement key the substrate forwards to the Dispatch. |
|
|
24
|
+
| **`RunCampaignOptions.cellPlacement?(input) → string \| undefined`** | Strategy function the substrate calls per cell to compute the placement key. |
|
|
25
|
+
| **`@tangle-network/agent-eval/adapters/http`** | `httpDispatch` (client) + `runDispatchServer` (server) — wire shape for HTTP-based remote workers. |
|
|
26
|
+
|
|
27
|
+
Both ends of the wire are in the same package; no peer dep, no separate
|
|
28
|
+
install. The substrate doesn't pick a strategy; you provide the
|
|
29
|
+
`cellPlacement` function, the substrate forwards its result, the
|
|
30
|
+
Dispatch reads it. Clean seam, no policy baked in.
|
|
31
|
+
|
|
32
|
+
## The three reference topologies
|
|
33
|
+
|
|
34
|
+
### 1. In-process (the default — what you already have)
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
await runCampaign({
|
|
38
|
+
scenarios,
|
|
39
|
+
dispatch, // runs in-process
|
|
40
|
+
judges: [judge],
|
|
41
|
+
storage,
|
|
42
|
+
runDir,
|
|
43
|
+
})
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`ctx.placement` is `undefined`; nothing changes for existing consumers.
|
|
47
|
+
This shipped in 0.40.
|
|
48
|
+
|
|
49
|
+
### 2. Single remote worker
|
|
50
|
+
|
|
51
|
+
Driver-on-A talks to one worker-on-B over HTTP.
|
|
52
|
+
|
|
53
|
+
**Driver side (machine A):**
|
|
54
|
+
|
|
55
|
+
```ts
|
|
56
|
+
import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
|
|
57
|
+
|
|
58
|
+
const dispatch = httpDispatch<MyScenario, MyArtifact>({
|
|
59
|
+
url: 'https://worker.your-infra.com/dispatch',
|
|
60
|
+
auth: process.env.WORKER_TOKEN,
|
|
61
|
+
timeoutMs: 5 * 60 * 1000,
|
|
62
|
+
retries: 2,
|
|
63
|
+
})
|
|
64
|
+
|
|
65
|
+
await runImprovementLoop({ scenarios, baselineSurface, dispatchWithSurface: (surface, s, ctx) =>
|
|
66
|
+
dispatch(s, { ...ctx, /* pass the surface through your own protocol */ }),
|
|
67
|
+
/* ... */ })
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Worker side (machine B):**
|
|
71
|
+
|
|
72
|
+
```ts
|
|
73
|
+
import { runDispatchServer } from '@tangle-network/agent-eval/adapters/http'
|
|
74
|
+
|
|
75
|
+
const handle = await runDispatchServer<MyScenario, MyArtifact>({
|
|
76
|
+
dispatch: async (scenario, ctx) => {
|
|
77
|
+
// your agent — call OpenAI, LangChain, your sandbox, anything.
|
|
78
|
+
const artifact = await runMyAgent(scenario, ctx.signal)
|
|
79
|
+
return artifact
|
|
80
|
+
},
|
|
81
|
+
port: 8080,
|
|
82
|
+
auth: process.env.WORKER_TOKEN, // required; `false` only for closed networks
|
|
83
|
+
})
|
|
84
|
+
console.log(`worker listening on ${handle.port}`)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Cancellation, retries on 5xx / 408 / 429, bounded timeouts, optional
|
|
88
|
+
custom auth headers, optional `fetchImpl` override — all there.
|
|
89
|
+
|
|
90
|
+
### 3. Multi-region fan-out
|
|
91
|
+
|
|
92
|
+
Driver picks a region per cell; the same `httpDispatch` routes to
|
|
93
|
+
different worker URLs based on placement.
|
|
94
|
+
|
|
95
|
+
```ts
|
|
96
|
+
import { httpDispatch } from '@tangle-network/agent-eval/adapters/http'
|
|
97
|
+
|
|
98
|
+
const REGION_URLS: Record<string, string> = {
|
|
99
|
+
'us-east': 'https://worker-use1.your-infra.com/dispatch',
|
|
100
|
+
'eu-west': 'https://worker-euw1.your-infra.com/dispatch',
|
|
101
|
+
'ap-south': 'https://worker-aps1.your-infra.com/dispatch',
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
const dispatch = httpDispatch<MyScenario, MyArtifact>({
|
|
105
|
+
resolveUrl: ({ placement }) => REGION_URLS[placement ?? 'us-east'],
|
|
106
|
+
auth: process.env.WORKER_TOKEN,
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
await runCampaign({
|
|
110
|
+
scenarios,
|
|
111
|
+
dispatch,
|
|
112
|
+
judges: [judge],
|
|
113
|
+
storage,
|
|
114
|
+
runDir,
|
|
115
|
+
cellPlacement: ({ scenario }) => {
|
|
116
|
+
if (scenario.tags?.includes('eu')) return 'eu-west'
|
|
117
|
+
if (scenario.tags?.includes('ap')) return 'ap-south'
|
|
118
|
+
return 'us-east'
|
|
119
|
+
},
|
|
120
|
+
maxConcurrency: 8, // 8 cells fan across regions in parallel
|
|
121
|
+
})
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`cellPlacement` is a pure function the substrate calls per cell — no
|
|
125
|
+
state. Use whatever signal you want (tags, hash of scenario id,
|
|
126
|
+
round-robin, region-affinity from a previous run, scheduling table).
|
|
127
|
+
|
|
128
|
+
## What's preserved across the wire
|
|
129
|
+
|
|
130
|
+
| Concern | How |
|
|
131
|
+
|---|---|
|
|
132
|
+
| **Cancellation** | Driver's `AbortSignal` forwards into the HTTP request; server translates `AbortError` → `499` so client doesn't retry. |
|
|
133
|
+
| **Timeouts** | Per-call `timeoutMs` on the client; server can layer its own. |
|
|
134
|
+
| **Retries** | Idempotent retries on 5xx / 408 / 429 with exponential backoff + jitter. Driver-aborts never retry. |
|
|
135
|
+
| **Auth** | Bearer token on `Authorization`; pluggable via `auth: string \| () => string \| Promise<string>` for rotation/refresh. |
|
|
136
|
+
| **Payload size** | Server enforces `maxBodyBytes` (default 10 MB). |
|
|
137
|
+
| **Traces** | Both ends emit OTel — if both point at the same OTLP collector, you get a unified trace per cell. See `docs/adapters-observability.md`. |
|
|
138
|
+
| **Cost** | Worker's `ctx.cost.observe(usd, source)` is local to the worker process. Roll up server-side and attach to your worker-side telemetry; we don't (yet) forward cost back to the driver. Tracked as follow-up. |
|
|
139
|
+
|
|
140
|
+
## Running the reference example
|
|
141
|
+
|
|
142
|
+
See `examples/distributed-driver/`:
|
|
143
|
+
|
|
144
|
+
```sh
|
|
145
|
+
# Terminal 1 — worker
|
|
146
|
+
pnpm tsx examples/distributed-driver/worker.ts
|
|
147
|
+
|
|
148
|
+
# Terminal 2 — driver
|
|
149
|
+
WORKER_URL=http://localhost:8080/dispatch \
|
|
150
|
+
WORKER_TOKEN=dev-token \
|
|
151
|
+
pnpm tsx examples/distributed-driver/driver.ts
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Two processes, one local TCP loopback, full self-improvement loop end
|
|
155
|
+
to end. Scaling out means pointing `WORKER_URL` at a non-loopback hostname
|
|
156
|
+
and using `cellPlacement` to fan across many of them.
|
|
157
|
+
|
|
158
|
+
## Known gaps + follow-ups
|
|
159
|
+
|
|
160
|
+
- **Cost roll-up across the wire** — worker-side `ctx.cost` observations
|
|
161
|
+
stay on the worker. We need to forward them in the response body so
|
|
162
|
+
`defaultProductionGate`'s `budgetUsd` ceiling reflects total spend, not
|
|
163
|
+
driver-side spend. Tracked as a 0.45.x follow-up.
|
|
164
|
+
- **Per-cell artifact streaming** — when the worker writes intermediate
|
|
165
|
+
artifacts via `ctx.artifacts.write`, those land on the worker's
|
|
166
|
+
storage. For multi-worker campaigns you'll want a shared object store
|
|
167
|
+
(S3/GCS) reachable from both sides; today consumers wire that as a
|
|
168
|
+
`CampaignStorage` impl. A reference S3-backed storage is on the
|
|
169
|
+
roadmap.
|
|
170
|
+
- **gRPC / NATS / Temporal transports** — the wire is HTTP today by
|
|
171
|
+
default because everything speaks HTTP. Other transports can ship as
|
|
172
|
+
additional adapters; the `Dispatch` interface itself is
|
|
173
|
+
transport-agnostic.
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# Quickstart — self-improvement loop for any agent (15 minutes)
|
|
2
|
+
|
|
3
|
+
This is the standalone walkthrough mirroring
|
|
4
|
+
`examples/foreign-agent-quickstart/`. Read this first; copy the runnable
|
|
5
|
+
example second.
|
|
6
|
+
|
|
7
|
+
## What you get
|
|
8
|
+
|
|
9
|
+
After 15 minutes you have a closed self-improvement loop running
|
|
10
|
+
against your agent — measured, gated, and reproducible — with no
|
|
11
|
+
Tangle sandbox, no Tangle account, and no hosted infrastructure.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
npm i @tangle-network/agent-eval@^0.44.0
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
The package's `@tangle-network/sandbox` peer is `optional` (as of
|
|
20
|
+
0.44.0). Foreign consumers can install agent-eval and run the full LAND
|
|
21
|
+
tier without our sandbox or its dependencies.
|
|
22
|
+
|
|
23
|
+
## Five types, four functions
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
import {
|
|
27
|
+
// Types
|
|
28
|
+
type Scenario, // what you evaluate against (id + kind + your fields)
|
|
29
|
+
type Dispatch, // your agent, wrapped as one function
|
|
30
|
+
type JudgeConfig, // pluggable dimensional scorer
|
|
31
|
+
type Mutator, // proposes a next surface
|
|
32
|
+
type Gate, // promotion guard
|
|
33
|
+
|
|
34
|
+
// Functions
|
|
35
|
+
runEval,
|
|
36
|
+
runCampaign,
|
|
37
|
+
runImprovementLoop,
|
|
38
|
+
defaultProductionGate,
|
|
39
|
+
|
|
40
|
+
// Storage
|
|
41
|
+
fsCampaignStorage,
|
|
42
|
+
inMemoryCampaignStorage,
|
|
43
|
+
} from '@tangle-network/agent-eval/contract'
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Every export above is committed under semver. New minors only ADD;
|
|
47
|
+
nothing here changes shape in a 0.x minor.
|
|
48
|
+
|
|
49
|
+
## Three steps to wire your agent
|
|
50
|
+
|
|
51
|
+
### 1. Scenarios
|
|
52
|
+
|
|
53
|
+
```ts
|
|
54
|
+
interface MarketingScenario extends Scenario {
|
|
55
|
+
blurb: string
|
|
56
|
+
surface: 'landing-hero' | 'tweet' | 'email-subject'
|
|
57
|
+
audience: string
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const scenarios: MarketingScenario[] = [
|
|
61
|
+
{ id: 's1', kind: 'marketing-rewrite', blurb: '...', surface: 'tweet', audience: '...' },
|
|
62
|
+
// ...
|
|
63
|
+
]
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Wrap your agent as `Dispatch`
|
|
67
|
+
|
|
68
|
+
```ts
|
|
69
|
+
const dispatch: Dispatch<MarketingScenario, MarketingArtifact> = async (scenario, ctx) => {
|
|
70
|
+
const rewrite = await callYourAgent(scenario, { signal: ctx.signal })
|
|
71
|
+
return { rewrite, modelUsed: '...' }
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
`ctx` carries `signal` (cancellation), `trace` (write spans), `artifacts`
|
|
76
|
+
(write blobs), `cost` (token + $ meter). Use them or ignore them.
|
|
77
|
+
|
|
78
|
+
### 3. Bring a judge
|
|
79
|
+
|
|
80
|
+
```ts
|
|
81
|
+
const judge: JudgeConfig<MarketingArtifact, MarketingScenario> = {
|
|
82
|
+
name: 'marketing-quality',
|
|
83
|
+
dimensions: [
|
|
84
|
+
{ key: 'hook_strength', description: '...' },
|
|
85
|
+
{ key: 'voice_match', description: '...' },
|
|
86
|
+
{ key: 'cta_clarity', description: '...' },
|
|
87
|
+
{ key: 'factual_grounding', description: '...' },
|
|
88
|
+
],
|
|
89
|
+
async score({ artifact, scenario, signal }) {
|
|
90
|
+
// LLM call, heuristic, ensemble — anything. Return JudgeScore.
|
|
91
|
+
return { dimensions: { ... }, composite: 0.72, notes: '...' }
|
|
92
|
+
},
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Throw on failure; the substrate records it as a failed cell. No silent
|
|
97
|
+
zeros.
|
|
98
|
+
|
|
99
|
+
## Baseline
|
|
100
|
+
|
|
101
|
+
```ts
|
|
102
|
+
const baseline = await runEval({
|
|
103
|
+
scenarios,
|
|
104
|
+
dispatch,
|
|
105
|
+
judges: [judge],
|
|
106
|
+
storage: inMemoryCampaignStorage(),
|
|
107
|
+
runDir: 'mem://my-baseline',
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
const score = Object.values(baseline.aggregates.byScenario)
|
|
111
|
+
.reduce((sum, s) => sum + s.meanComposite, 0) / scenarios.length
|
|
112
|
+
|
|
113
|
+
console.log(`Baseline composite: ${score.toFixed(3)}`)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Self-improvement loop
|
|
117
|
+
|
|
118
|
+
```ts
|
|
119
|
+
import { gepaDriver, defaultProductionGate } from '@tangle-network/agent-eval/contract'
|
|
120
|
+
|
|
121
|
+
const result = await runImprovementLoop({
|
|
122
|
+
scenarios: trainScenarios,
|
|
123
|
+
baselineSurface,
|
|
124
|
+
dispatchWithSurface: (surface, scenario, ctx) =>
|
|
125
|
+
runYourAgent({ systemPrompt: surface as string }, scenario, ctx),
|
|
126
|
+
driver: gepaDriver({
|
|
127
|
+
llm: { apiKey: process.env.OPENAI_API_KEY, baseUrl: '...' },
|
|
128
|
+
model: 'gpt-4o-mini',
|
|
129
|
+
target: 'marketing copywriting system prompt',
|
|
130
|
+
mutationPrimitives: [
|
|
131
|
+
'Tighten the hook: lead with the concrete user outcome.',
|
|
132
|
+
'Replace generic adjectives with specific verbs.',
|
|
133
|
+
// ...
|
|
134
|
+
],
|
|
135
|
+
}),
|
|
136
|
+
judges: [judge],
|
|
137
|
+
populationSize: 2,
|
|
138
|
+
maxGenerations: 3,
|
|
139
|
+
holdoutScenarios,
|
|
140
|
+
gate: defaultProductionGate({
|
|
141
|
+
holdoutScenarios,
|
|
142
|
+
deltaThreshold: 0.05,
|
|
143
|
+
}),
|
|
144
|
+
autoOnPromote: 'none',
|
|
145
|
+
storage: inMemoryCampaignStorage(),
|
|
146
|
+
runDir: 'mem://my-improve',
|
|
147
|
+
})
|
|
148
|
+
|
|
149
|
+
if (result.gateResult.decision === 'ship') {
|
|
150
|
+
// Deploy result.winnerSurface — we don't push it for you.
|
|
151
|
+
}
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
The gate decision is `'ship'` | `'hold'` | `'need_more_work'` |
|
|
155
|
+
`'model_ceiling'` | `'arch_ceiling'`. You define what each means in
|
|
156
|
+
your deploy pipeline.
|
|
157
|
+
|
|
158
|
+
## What you control
|
|
159
|
+
|
|
160
|
+
- The agent (any framework, any model, any backend).
|
|
161
|
+
- The judge (LLM, heuristic, ensemble; we don't pick).
|
|
162
|
+
- The mutation strategy (`gepaDriver` for reflective LLM mutation,
|
|
163
|
+
`evolutionaryDriver({ mutator })` for population search, or
|
|
164
|
+
implement `ImprovementDriver` directly).
|
|
165
|
+
- The gate (compose `defaultProductionGate` with custom checks via
|
|
166
|
+
`composeGate`).
|
|
167
|
+
- The deploy step (`autoOnPromote: 'pr'` opens a GitHub PR with the
|
|
168
|
+
winner; `'none'` returns the surface and you ship however you ship).
|
|
169
|
+
|
|
170
|
+
## What this does NOT install
|
|
171
|
+
|
|
172
|
+
- No `@tangle-network/sandbox` — nothing runs in a Tangle sandbox.
|
|
173
|
+
- No hosted orchestrator — traces, artifacts, judge scores stay on
|
|
174
|
+
your machine (or in `inMemoryCampaignStorage` for Workers/edge).
|
|
175
|
+
- No daemons — `runEval` and `runImprovementLoop` complete in-process
|
|
176
|
+
and return.
|
|
177
|
+
|
|
178
|
+
## When you want more
|
|
179
|
+
|
|
180
|
+
The wedge doc (`docs/design/external-agent-wedge.md`) lays out three
|
|
181
|
+
graduated tiers:
|
|
182
|
+
|
|
183
|
+
| Tier | What you do | What you get |
|
|
184
|
+
|---|---|---|
|
|
185
|
+
| **LAND** (this quickstart) | `npm i @tangle-network/agent-eval`, wrap dispatch + judge, run loops | Local artifacts; full self-improvement; no Tangle infra |
|
|
186
|
+
| **EXPAND** | Point trace/eval data at our hosted orchestrator | Hosted dashboards, cross-run intelligence, billing on data routed to us |
|
|
187
|
+
| **PLATFORM** | Move execution into our sandbox | Substrate + orchestrator data pre-wired; sandbox usage billing |
|
|
188
|
+
|
|
189
|
+
Each tier is opt-in. EXPAND and PLATFORM build on the same primitives;
|
|
190
|
+
upgrading is adding configuration, not rewriting your wiring.
|