@tangle-network/agent-eval 0.40.3 → 0.40.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +12 -9
- package/dist/campaign/index.js +2 -2
- package/dist/{chunk-TMXPFWC7.js → chunk-YNMCYUWT.js} +10 -10
- package/dist/chunk-YNMCYUWT.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/{run-campaign-JYJXYHHL.js → run-campaign-KEJK5KFT.js} +2 -2
- package/docs/design/phase4-consumer-migration.md +70 -0
- package/docs/design/primitives-integration-spec.md +393 -0
- package/docs/design/product-self-improvement-loop.md +146 -0
- package/package.json +1 -1
- package/dist/chunk-TMXPFWC7.js.map +0 -1
- package/dist/{run-campaign-JYJXYHHL.js.map → run-campaign-KEJK5KFT.js.map} +0 -0
package/dist/campaign/index.d.ts
CHANGED
|
@@ -69,21 +69,24 @@ interface JudgeDimension {
|
|
|
69
69
|
/** Description shown in the judge's user prompt. */
|
|
70
70
|
description: string;
|
|
71
71
|
}
|
|
72
|
-
/** @experimental Pluggable dimensional scorer.
|
|
73
|
-
*
|
|
74
|
-
*
|
|
72
|
+
/** @experimental Pluggable dimensional scorer. `score` is the contract:
|
|
73
|
+
* given an artifact + scenario, return a `JudgeScore`. This is deliberately a
|
|
74
|
+
* function, not a fixed LLM-prompt shape — real consumers judge with
|
|
75
|
+
* ensembles, deterministic checks, or a single LLM call, and the substrate
|
|
76
|
+
* must not constrain that. The `llmJudge()` helper builds a `score` that does
|
|
77
|
+
* one LLM call for the common case. `appliesTo` lets a judge run only on
|
|
78
|
+
* scenarios that match (e.g. a legal-citation judge only on legal scenarios). */
|
|
75
79
|
interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
|
|
76
80
|
name: string;
|
|
77
|
-
model?: string;
|
|
78
81
|
dimensions: JudgeDimension[];
|
|
79
|
-
|
|
80
|
-
|
|
82
|
+
/** Score one artifact. Throw on failure — a thrown judge is recorded as a
|
|
83
|
+
* failed cell, never silently folded into a zero. */
|
|
84
|
+
score(input: {
|
|
81
85
|
artifact: TArtifact;
|
|
82
86
|
scenario: TScenario;
|
|
83
|
-
|
|
87
|
+
signal: AbortSignal;
|
|
88
|
+
}): JudgeScore | Promise<JudgeScore>;
|
|
84
89
|
appliesTo?: (scenario: TScenario) => boolean;
|
|
85
|
-
apiKey?: string;
|
|
86
|
-
baseUrl?: string;
|
|
87
90
|
}
|
|
88
91
|
interface JudgeScore {
|
|
89
92
|
dimensions: Record<string, number>;
|
package/dist/campaign/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "../chunk-TMXPFWC7.js";
|
|
3
|
+
} from "../chunk-YNMCYUWT.js";
|
|
4
4
|
import {
|
|
5
5
|
runCanaries,
|
|
6
6
|
scoreRedTeamOutput
|
|
@@ -655,7 +655,7 @@ async function runImprovementLoop(opts) {
|
|
|
655
655
|
throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
|
|
656
656
|
}
|
|
657
657
|
const optimization = await runOptimization(opts);
|
|
658
|
-
const { runCampaign: runCampaign2 } = await import("../run-campaign-JYJXYHHL.js");
|
|
658
|
+
const { runCampaign: runCampaign2 } = await import("../run-campaign-KEJK5KFT.js");
|
|
659
659
|
const baselineOnHoldout = await runCampaign2({
|
|
660
660
|
...opts,
|
|
661
661
|
scenarios: opts.holdoutScenarios,
|
|
@@ -164,14 +164,14 @@ async function executeCell(args) {
|
|
|
164
164
|
for (const judge of args.opts.judges ?? []) {
|
|
165
165
|
if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue;
|
|
166
166
|
try {
|
|
167
|
-
|
|
168
|
-
|
|
167
|
+
judgeScores[judge.name] = await runJudgeCell(judge, {
|
|
168
|
+
artifact,
|
|
169
|
+
scenario: args.slot.scenario,
|
|
170
|
+
signal: args.signal
|
|
171
|
+
});
|
|
169
172
|
} catch (err) {
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
composite: 0,
|
|
173
|
-
notes: `judge failed: ${err instanceof Error ? err.message : String(err)}`
|
|
174
|
-
};
|
|
173
|
+
errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? err.message : String(err)}`;
|
|
174
|
+
break;
|
|
175
175
|
}
|
|
176
176
|
}
|
|
177
177
|
}
|
|
@@ -193,8 +193,8 @@ async function executeCell(args) {
|
|
|
193
193
|
}
|
|
194
194
|
return { cell, artifactsByPath };
|
|
195
195
|
}
|
|
196
|
-
async function runJudgeCell(
|
|
197
|
-
return
|
|
196
|
+
async function runJudgeCell(judge, input) {
|
|
197
|
+
return judge.score(input);
|
|
198
198
|
}
|
|
199
199
|
function defaultBuildTraceWriter(cellId, dir) {
|
|
200
200
|
const spans = [];
|
|
@@ -302,4 +302,4 @@ function aggregate(samples, seed) {
|
|
|
302
302
|
export {
|
|
303
303
|
runCampaign
|
|
304
304
|
};
|
|
305
|
-
//# sourceMappingURL=chunk-TMXPFWC7.js.map
|
|
305
|
+
//# sourceMappingURL=chunk-YNMCYUWT.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/campaign/run-campaign.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. 
*/\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? 
[]\n\n if (!existsSync(opts.runDir)) mkdirSync(opts.runDir, { recursive: true })\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n buildTraceWriter: opts.buildTraceWriter ?? 
defaultBuildTraceWriter,\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: 
CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n if (!existsSync(cellDir)) mkdirSync(cellDir, { recursive: true })\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable && existsSync(cachePath)) {\n try {\n const cached = JSON.parse(readFileSync(cachePath, 'utf8')) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n const dir = join(fullPath, '..')\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n writeFileSync(fullPath, content as Uint8Array)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n current() {\n return costSoFar\n },\n }\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? err.message : String(err)\n }\n\n // Run judges (only if we have an artifact). 
A judge that throws invalidates\n // the cell — recorded as `error`, NOT folded into a fake composite:0 (a fake\n // zero is indistinguishable from a real zero and poisons every aggregate).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n judgeScores[judge.name] = await runJudgeCell(judge, {\n artifact,\n scenario: args.slot.scenario,\n signal: args.signal,\n })\n } catch (err) {\n errorMessage = `judge '${judge.name}' failed: ${err instanceof Error ? err.message : String(err)}`\n break\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n writeFileSync(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n judge: JudgeConfig<TArtifact, TScenario>,\n input: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal },\n): Promise<JudgeScore> {\n return judge.score(input)\n}\n\nfunction defaultBuildTraceWriter(cellId: string, dir: string): CampaignTraceWriter {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n const path = join(dir, 'spans.jsonl')\n writeFileSync(path, spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n 
}\n}\n"],"mappings":";;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAsDrB,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAE/B,MAAI,CAAC,WAAW,KAAK,MAAM,EAAG,WAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAExE,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB;AAAA,YAC3C,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,
YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAcA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,MAAI,CAAC,WAAW,OAAO,EAAG,WAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AAGhE,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,aAAa,WAAW,SAAS,GAAG;AAC3C,QAAI;AACF,YAAM,SAAS,KAAK,MAAM,aAAa,WAAW,MAAM,CAAC;AACzD,UAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,eAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,MAClE;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,YAAM,MAAM,KAAK,UAAU,IAAI;AAC/B,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AACxD,oBAAc,UAAU,OAAqB;AAC7C,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,EACF;AAEA,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAKA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,oBAAY,MAAM,IAAI,IAAI,MAAM,aAAa,OAAO;AAAA,UAClD;AAAA,UACA,UAAU,KAAK,KAAK;AAAA,UACpB,QAAQ,KAAK;AAAA,QACf,CAAC;AAAA,MACH,SAAS,KAAK;AACZ,uBAAe,UAAU,MAAM,IAAI,aAAa,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAChG;AAAA,MACF;AAAA,IACF;AAAA,EA
CF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,kBAAc,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,OACA,OACqB;AACrB,SAAO,MAAM,MAAM,KAAK;AAC1B;AAEA,SAAS,wBAAwB,QAAgB,KAAkC;AACjF,QAAM,QAAwC,CAAC;AAC/C,SAAO;AAAA,IACL,KAAK,MAAM,YAAY;AACrB,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc,CAAC,EAAG;AACvF,YAAM,SAAoB;AAAA,QACxB,IAAI,UAAU;AACZ,iBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,cAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,gBAAM,KAAK,MAAM;AAAA,QACnB;AAAA,QACA,aAAa,KAAK,OAAO;AACvB,iBAAO,GAAG,IAAI;AAAA,QAChB;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,IACA,MAAM,QAAQ;AACZ,YAAM,OAAO,KAAK,KAAK,aAAa;AACpC,oBAAc,MAAM,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACpE;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAA
K,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.40.3",
|
|
5
|
+
"version": "0.40.4",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-TMXPFWC7.js";
|
|
3
|
+
} from "./chunk-YNMCYUWT.js";
|
|
4
4
|
import "./chunk-WP7SY7AI.js";
|
|
5
5
|
import "./chunk-QYJT52YW.js";
|
|
6
6
|
import "./chunk-PZ5AY32C.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-JYJXYHHL.js.map
|
|
10
|
+
//# sourceMappingURL=run-campaign-KEJK5KFT.js.map
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Phase 4 — consumer migration tracking
|
|
2
|
+
|
|
3
|
+
Migrate the product repos off their duplicated eval / prompt-evolution
|
|
4
|
+
orchestration onto the published substrate (`@tangle-network/agent-eval@^0.40.3`
|
|
5
|
+
+ `@tangle-network/agent-runtime@^0.25.0`). Integration contract:
|
|
6
|
+
[`primitives-integration-spec.md`](./primitives-integration-spec.md).
|
|
7
|
+
|
|
8
|
+
**Strategy:** prove **gtm end-to-end first** (the canonical consumer), then fan
|
|
9
|
+
the proven migration pattern out to the rest via parallel subagents, each briefed
|
|
10
|
+
with the gtm reference diff + the spec's forbidden-anti-patterns list. Each
|
|
11
|
+
migration is its own reviewable, rollback-able PR.
|
|
12
|
+
|
|
13
|
+
## Status board
|
|
14
|
+
|
|
15
|
+
| Repo | Deletable orchestration (LOC est.) | Dispatch seam | Status | PR |
|
|
16
|
+
|---|---|---|---|---|
|
|
17
|
+
| gtm-agent | ~2,420 | `runChatThroughRuntime` | **IN PROGRESS** | — |
|
|
18
|
+
| legal-agent | tbd | tbd | queued | — |
|
|
19
|
+
| tax-agent | tbd | tbd | queued | — |
|
|
20
|
+
| creative-agent | tbd | tbd | queued | — |
|
|
21
|
+
| agent-builder | tbd | tbd | queued | — |
|
|
22
|
+
| blueprint-agent | tbd | tbd | queued (Drew dispatching via spec) | — |
|
|
23
|
+
| physim | tbd (MultiLayerVerifier adapter) | tbd | queued | — |
|
|
24
|
+
|
|
25
|
+
## Per-repo migration checklist
|
|
26
|
+
|
|
27
|
+
For each repo, in order:
|
|
28
|
+
|
|
29
|
+
- [ ] **Survey** — inventory eval + prompt-evolution wrappers (file:line + LOC).
|
|
30
|
+
Identify the dispatch seam, scenarios, judges, mutation strategy.
|
|
31
|
+
- [ ] **Bump deps** — `@tangle-network/agent-eval` → `^0.40.3`,
|
|
32
|
+
`@tangle-network/agent-runtime` → `^0.25.0`; `pnpm update`; baseline
|
|
33
|
+
typecheck green.
|
|
34
|
+
- [ ] **Rewire seams** — `dispatch`/`dispatchWithSurface`, `judges`,
|
|
35
|
+
`scenarios` extracted from the existing wrappers (KEEP domain logic).
|
|
36
|
+
- [ ] **Replace orchestration** — swap the local generation/population/scorecard
|
|
37
|
+
loop for `runImprovementLoop` (or `runCampaign` for eval-only). DELETE the
|
|
38
|
+
wrapper body.
|
|
39
|
+
- [ ] **Gate** — compose domain gates with `defaultProductionGate`.
|
|
40
|
+
- [ ] **Dataset** — wire `FsLabeledScenarioStore` with correct `captureSource`.
|
|
41
|
+
- [ ] **Tests** — port wrapper contract tests to assert the substrate wiring;
|
|
42
|
+
keep judge/scenario tests. Suite green.
|
|
43
|
+
- [ ] **Prove** — one real eval/improve run end-to-end; confirm scorecard +
|
|
44
|
+
(if applicable) a PR opens on a shipping gate.
|
|
45
|
+
- [ ] **Anti-pattern sweep** — no silent fallbacks, no reimplemented loop, no
|
|
46
|
+
train/holdout conflation, tracing on, dispatch named.
|
|
47
|
+
- [ ] **PR** — open, independent-review, merge.
|
|
48
|
+
|
|
49
|
+
## gtm-agent — migration map (from survey)
|
|
50
|
+
|
|
51
|
+
- **Branch base:** off the repo's working branch (`feat/gtm-rich-chat-actions`)
|
|
52
|
+
or main — confirm before starting.
|
|
53
|
+
- **Dispatch seam:** `runChatThroughRuntime(ctx)`
|
|
54
|
+
(`src/lib/.server/agent-runtime/chat.ts`) — prompt variant + scenario → real
|
|
55
|
+
agent run → artifact + events + token usage.
|
|
56
|
+
- **Scenarios:** `src/lib/.server/production-loop/scenarios.ts` (3 holdout) +
|
|
57
|
+
`eval/business-owner/personas.json` (canonical personas).
|
|
58
|
+
- **Judges:** `src/lib/.server/production-loop/judges.ts` (`runEnsembleJudge`,
|
|
59
|
+
3-model ensemble) + canonical 12-dimension judges in `eval/canonical.ts`.
|
|
60
|
+
- **Delete (~2,420 LOC orchestration):** the generation/population/reps loop in
|
|
61
|
+
`src/lib/.server/production-loop/index.ts` (~450), the checkpoint loop in
|
|
62
|
+
`eval/canonical.ts` (~600), `eval/run-prompt-evolution.ts` wrapper (~800),
|
|
63
|
+
`eval/analyst-loop.ts` wrapper (~300), `eval/optimization-campaign.ts` (~170),
|
|
64
|
+
`scripts/evals/run-optimization-campaign.ts` (~100 scaffold).
|
|
65
|
+
- **Rewire:** `buildHoldoutRunner` → `dispatchWithSurface`; `buildScorer` →
|
|
66
|
+
`judges`; `buildMutator` → `evolutionaryDriver({ mutator })`;
|
|
67
|
+
`runProductionLoop` → `runImprovementLoop`.
|
|
68
|
+
- **Keep:** judges, scenarios, persona data + reactive driver, deterministic
|
|
69
|
+
anti-slop/brief checks, GitHub PR wiring, feedback/trace ingestion.
|
|
70
|
+
- **Net:** ~1,400–1,600 LOC reduction.
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
# Self-improvement primitives — integration spec
|
|
2
|
+
|
|
3
|
+
**Audience:** an engineer (or agent) wiring a product onto the Tangle
|
|
4
|
+
self-improvement stack. This is the authoritative "how to use the primitives"
|
|
5
|
+
reference. It is exact: every signature, every seam, every forbidden pattern.
|
|
6
|
+
|
|
7
|
+
**Packages (published):**
|
|
8
|
+
- `@tangle-network/agent-eval@^0.40.3` — measurement + improvement loop +
|
|
9
|
+
worktree adapter + gates + dataset store. The leaf; depends on nothing
|
|
10
|
+
upstream. Import the loop surface from `@tangle-network/agent-eval/campaign`.
|
|
11
|
+
- `@tangle-network/agent-runtime@^0.25.0` — the runtime-side improvement
|
|
12
|
+
driver (`improvementDriver`) + generators (`reflectiveGenerator`,
|
|
13
|
+
`agenticGenerator`). Import from `@tangle-network/agent-runtime/improvement`.
|
|
14
|
+
|
|
15
|
+
Read [`loop-taxonomy.md`](./loop-taxonomy.md) (vocabulary) and
|
|
16
|
+
[`self-improvement-engine.md`](./self-improvement-engine.md) (phases) first.
|
|
17
|
+
This doc is the contract-level detail under them.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 0. The one-paragraph model
|
|
22
|
+
|
|
23
|
+
A **measurement** (`runCampaign`) runs your agent (behind a `dispatch` seam)
|
|
24
|
+
over `scenarios`, judges the outputs, and returns a scorecard with confidence
|
|
25
|
+
intervals. An **improvement loop** (`runImprovementLoop`) drives an
|
|
26
|
+
`ImprovementDriver` to propose candidate **surfaces** (a prompt string, or a
|
|
27
|
+
`CodeSurface` = a git worktree of code edits), measures each on a **holdout**,
|
|
28
|
+
runs a release **gate**, and opens a **PR** for the winner. Every run feeds a
|
|
29
|
+
**dataset** (`LabeledScenarioStore`) — the same corpus the optimizer learns
|
|
30
|
+
from. Three roles, fixed meaning: **driver** decides what's next; **worker** =
|
|
31
|
+
the agent in a sandbox (invoked behind `dispatch`); **measurement** runs the
|
|
32
|
+
worker and scores it.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 1. The seams you implement (everything else is substrate)
|
|
37
|
+
|
|
38
|
+
You implement exactly three things. The substrate owns the rest.
|
|
39
|
+
|
|
40
|
+
| Seam | Type | What it is |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| `dispatch` | `(scenario, ctx) => Promise<TArtifact>` | invoke YOUR agent on one scenario → the artifact judges score. Topology-opaque: one LLM call, or a driver↔workers-in-a-sandbox loop — substrate doesn't care. |
|
|
43
|
+
| `judges` | `JudgeConfig<TArtifact, TScenario>[]` | score an artifact on named dimensions → composite. Your rubrics. |
|
|
44
|
+
| `scenarios` | `Scenario[]` | the inputs (`{ id, kind, ... }`). Your eval set. |
|
|
45
|
+
|
|
46
|
+
If you are also improving a surface, you additionally provide:
|
|
47
|
+
|
|
48
|
+
| Seam | Type | What it is |
|
|
49
|
+
|---|---|---|
|
|
50
|
+
| `dispatchWithSurface` | `(surface, scenario, ctx) => Promise<TArtifact>` | like `dispatch`, but takes the candidate surface (prompt string or `CodeSurface`) — swap it into your agent before running. |
|
|
51
|
+
| a **driver** | `ImprovementDriver` | how candidates are proposed (see §5–§6). Use a shipped one; don't hand-roll. |
|
|
52
|
+
| a **gate** | `Gate` | ship/hold decision (use `defaultProductionGate`). |
|
|
53
|
+
|
|
54
|
+
**You never implement:** generation loops, population/top-K selection, seed
|
|
55
|
+
propagation, manifest hashing, cell caching, bootstrap CIs, worktree git
|
|
56
|
+
plumbing, PR-opening, or trace capture. Reimplementing any of these is the
|
|
57
|
+
anti-pattern this whole stack exists to delete.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 2. `runCampaign` — the measurement primitive
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
import { runCampaign, type RunCampaignOptions } from '@tangle-network/agent-eval/campaign'
|
|
65
|
+
|
|
66
|
+
const result = await runCampaign<MyScenario, MyArtifact>({
|
|
67
|
+
scenarios, // MyScenario[]
|
|
68
|
+
dispatch, // (scenario, ctx) => Promise<MyArtifact>
|
|
69
|
+
judges, // JudgeConfig<MyArtifact, MyScenario>[] (optional)
|
|
70
|
+
runDir: '/abs/run/dir', // REQUIRED — where artifacts + traces land
|
|
71
|
+
seed: 42, // default 42 — reproducibility
|
|
72
|
+
reps: 1, // per-scenario replicates; raise to 5+ for tight CIs
|
|
73
|
+
maxConcurrency: 2, // parallel cells
|
|
74
|
+
costCeiling: 5.0, // optional USD soft-abort
|
|
75
|
+
tracing: 'on', // default on; 'off' refused by improvement loop w/ a driver
|
|
76
|
+
labeledStore: store, // optional capture (see §8); 'off' to disable
|
|
77
|
+
captureSource: 'eval-run', // provenance for captured rows
|
|
78
|
+
})
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Returns `CampaignResult<TArtifact, TScenario>`:
|
|
82
|
+
```ts
|
|
83
|
+
{
|
|
84
|
+
manifestHash: string // sha256(scenarios, judges, dispatch ref, seed, reps) — run identity
|
|
85
|
+
seed: number
|
|
86
|
+
startedAt, endedAt, durationMs
|
|
87
|
+
cells: CampaignCellResult[] // one per scenario×rep: { cellId, scenarioId, rep, artifact, judgeScores, costUsd, cached, error? }
|
|
88
|
+
aggregates: {
|
|
89
|
+
byJudge: Record<string, JudgeAggregate> // { mean, stdev, ci95:[lo,hi], n } — bootstrap CIs
|
|
90
|
+
byScenario: Record<string, ScenarioAggregate>
|
|
91
|
+
totalCostUsd, cellsExecuted, cellsSkipped, cellsCached, cellsFailed
|
|
92
|
+
}
|
|
93
|
+
runDir, artifactsByPath, scenarios
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Rules:**
|
|
98
|
+
- `dispatch` must be a *named* function (`dispatch.name` feeds the manifest hash
|
|
99
|
+
— anonymous arrows weaken reproducibility identity).
|
|
100
|
+
- Inspect `cell.error` before trusting `cell.artifact`. Cells fail-soft
|
|
101
|
+
individually (one bad scenario doesn't kill the run) but the error is
|
|
102
|
+
recorded, never swallowed.
|
|
103
|
+
- Re-running the same `runDir` with `resumable: true` (default) skips cached
|
|
104
|
+
cells by `(manifestHash, scenarioId, rep, generation)`.
|
|
105
|
+
|
|
106
|
+
`runEval(opts)` is a thin alias for the scorecard-only case (no improvement).
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 3. `JudgeConfig`, `Scenario` — the domain types you own
|
|
111
|
+
|
|
112
|
+
```ts
|
|
113
|
+
interface Scenario { id: string; kind: string; /* + your fields */ }
|
|
114
|
+
|
|
115
|
+
interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
|
|
116
|
+
name: string
|
|
117
|
+
dimensions: { key: string; weight?: number }[]
|
|
118
|
+
appliesTo?: (scenario: TScenario) => boolean // scope a judge to some scenarios
|
|
119
|
+
score(args: { artifact: TArtifact; scenario: TScenario; signal: AbortSignal })
|
|
120
|
+
: Promise<JudgeScore> | JudgeScore
|
|
121
|
+
}
|
|
122
|
+
interface JudgeScore { composite: number; dimensions: Record<string, number>; notes: string }
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Judges are where your rubric lives. They MUST fail loud: if the judge LLM call
|
|
126
|
+
fails, throw — do not return a `composite: 0` (a fake zero is indistinguishable
|
|
127
|
+
from a real zero and silently corrupts every aggregate downstream).
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## 4. The improvement loop — `runImprovementLoop`
|
|
132
|
+
|
|
133
|
+
```ts
|
|
134
|
+
import {
|
|
135
|
+
runImprovementLoop, defaultProductionGate, evolutionaryDriver,
|
|
136
|
+
} from '@tangle-network/agent-eval/campaign'
|
|
137
|
+
|
|
138
|
+
const result = await runImprovementLoop({
|
|
139
|
+
// --- measurement config (same as runCampaign, minus dispatch) ---
|
|
140
|
+
scenarios: trainScenarios,
|
|
141
|
+
judges,
|
|
142
|
+
runDir,
|
|
143
|
+
// --- surface improvement ---
|
|
144
|
+
baselineSurface, // string | CodeSurface — current best
|
|
145
|
+
dispatchWithSurface, // (surface, scenario, ctx) => artifact
|
|
146
|
+
driver, // ImprovementDriver — see §5/§6
|
|
147
|
+
populationSize: 4, // BREADTH: candidates per generation
|
|
148
|
+
maxGenerations: 3,
|
|
149
|
+
promoteTopK: 2,
|
|
150
|
+
maxImprovementShots: 3, // DEPTH: forwarded to the driver's propose()
|
|
151
|
+
// --- gated promotion ---
|
|
152
|
+
holdoutScenarios, // NEVER in the training pool — gate scores on these
|
|
153
|
+
gate: defaultProductionGate({ holdoutScenarios, deltaThreshold: 0.02 }),
|
|
154
|
+
autoOnPromote: 'pr', // 'pr' | 'none' (NO 'config' in v0.40 — throws)
|
|
155
|
+
ghOwner: 'tangle-network',
|
|
156
|
+
ghRepo: 'gtm-agent', // required when autoOnPromote: 'pr'
|
|
157
|
+
})
|
|
158
|
+
// → { winnerSurface, winnerSurfaceHash, generations, baselineOnHoldout,
|
|
159
|
+
// winnerOnHoldout, gateResult, prResult? }
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
`runOptimization(opts)` is the loop body without the gate/holdout/PR (use it
|
|
163
|
+
when you want candidates + a winner but will gate yourself).
|
|
164
|
+
|
|
165
|
+
**Hard refusals (by design — these throw):**
|
|
166
|
+
- `autoOnPromote: 'config'` → deferred to a later pass (live self-mutation
|
|
167
|
+
needs the full safety stack). Use `'pr'` or `'none'`.
|
|
168
|
+
- `tracing: 'off'` while a `driver` is wired → an improvement loop that doesn't
|
|
169
|
+
feed the dataset is unattributable.
|
|
170
|
+
- `autoOnPromote: 'pr'` without `ghOwner`/`ghRepo`.
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## 5. `ImprovementDriver` + `ProposeContext` — the contract
|
|
175
|
+
|
|
176
|
+
```ts
|
|
177
|
+
interface ImprovementDriver<TFindings = unknown> {
|
|
178
|
+
kind: string
|
|
179
|
+
propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]> // PLAN
|
|
180
|
+
decide?(args: { history: GenerationRecord[] }): { stop: boolean; reason?: string }
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
interface ProposeContext<TFindings = unknown> {
|
|
184
|
+
currentSurface: MutableSurface
|
|
185
|
+
history: GenerationRecord[] // prior generations + scores
|
|
186
|
+
findings: TFindings[]
|
|
187
|
+
populationSize: number // how many candidates to return
|
|
188
|
+
generation: number
|
|
189
|
+
signal: AbortSignal
|
|
190
|
+
report?: unknown // Phase-2 research report (analyst findings + diff)
|
|
191
|
+
dataset?: LabeledScenarioStore // handle to all captured data
|
|
192
|
+
maxImprovementShots?: number // DEPTH knob
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
type MutableSurface = string | CodeSurface
|
|
196
|
+
interface CodeSurface { kind: 'code'; worktreeRef: string; baseRef?: string; summary?: string }
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
`propose()` returns candidates; it does NOT measure (the loop measures). For a
|
|
200
|
+
code-tier driver, `propose()` may itself be agentic (spawn a harness, write a
|
|
201
|
+
worktree) — that's the recursion. Pick a shipped driver from §6 below.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## 6. The shipped drivers (use these; don't hand-roll)
|
|
206
|
+
|
|
207
|
+
### `evolutionaryDriver` (agent-eval) — prompt mutation, no sandbox
|
|
208
|
+
```ts
|
|
209
|
+
import { evolutionaryDriver } from '@tangle-network/agent-eval/campaign'
|
|
210
|
+
|
|
211
|
+
const driver = evolutionaryDriver({
|
|
212
|
+
mutator: { // YOUR Mutator (the only domain bit)
|
|
213
|
+
kind: 'reflection',
|
|
214
|
+
async mutate({ currentSurface, populationSize, findings, signal }) {
|
|
215
|
+
// return N prompt-string variants of currentSurface
|
|
216
|
+
return [...]
|
|
217
|
+
},
|
|
218
|
+
},
|
|
219
|
+
})
|
|
220
|
+
```
|
|
221
|
+
Use when the surface is a **prompt string** and you have a mutation strategy
|
|
222
|
+
(reflection, GEPA, AxGEPA). Cheap, deterministic-friendly.
|
|
223
|
+
|
|
224
|
+
### `improvementDriver` + generators (agent-runtime) — one driver, a cost dial
|
|
225
|
+
```ts
|
|
226
|
+
import {
|
|
227
|
+
improvementDriver, reflectiveGenerator, agenticGenerator,
|
|
228
|
+
} from '@tangle-network/agent-runtime/improvement'
|
|
229
|
+
import { gitWorktreeAdapter } from '@tangle-network/agent-eval/campaign'
|
|
230
|
+
|
|
231
|
+
const worktree = gitWorktreeAdapter({ repoRoot: '/abs/repo' })
|
|
232
|
+
|
|
233
|
+
// cheap, no sandbox: drafts patches from findings, applies them
|
|
234
|
+
const cheap = improvementDriver({
|
|
235
|
+
worktree,
|
|
236
|
+
generator: reflectiveGenerator({ improvementAdapter }), // wraps proposeFromFindings
|
|
237
|
+
baseRef: 'main',
|
|
238
|
+
})
|
|
239
|
+
|
|
240
|
+
// full agentic: a real coding harness edits the worktree, retries up to maxImprovementShots
|
|
241
|
+
const deep = improvementDriver({
|
|
242
|
+
worktree,
|
|
243
|
+
generator: agenticGenerator({ harness: 'claude' }), // claude | codex | opencode
|
|
244
|
+
baseRef: 'main',
|
|
245
|
+
})
|
|
246
|
+
```
|
|
247
|
+
One driver; the generator is the cost dial. Both emit `CodeSurface`s the loop
|
|
248
|
+
measures + gates. `agenticGenerator.generate()` runs the harness with
|
|
249
|
+
`cwd = worktree`, trusts the **git diff** (not harness stdout) to decide
|
|
250
|
+
"applied", and retries up to `maxImprovementShots` on a clean tree.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## 7. Gates — `defaultProductionGate`, `composeGate`, `heldOutGate`
|
|
255
|
+
|
|
256
|
+
```ts
|
|
257
|
+
import { defaultProductionGate, composeGate, heldOutGate } from '@tangle-network/agent-eval/campaign'
|
|
258
|
+
|
|
259
|
+
// opinionated default: heldout-delta + budget + red-team + reward-hacking + canary
|
|
260
|
+
const gate = defaultProductionGate({
|
|
261
|
+
holdoutScenarios,
|
|
262
|
+
deltaThreshold: 0.02, // winner must beat baseline by this on holdout
|
|
263
|
+
budgetUsd: 5, // optional cost ceiling
|
|
264
|
+
redTeamBattery: [...], // optional adversarial probes
|
|
265
|
+
})
|
|
266
|
+
|
|
267
|
+
// compose your own: ALL must ship, else the worst verdict wins
|
|
268
|
+
const custom = composeGate(heldOutGate({ scenarios: holdoutScenarios, deltaThreshold: 0.02 }), myDomainGate)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
`Gate.decide(ctx) → GateResult` with a 5-valued verdict:
|
|
272
|
+
`GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling'`.
|
|
273
|
+
`composeGate` returns `ship` only if all sub-gates ship; otherwise the
|
|
274
|
+
precedence is `arch_ceiling > model_ceiling > hold > need_more_work`. Use the
|
|
275
|
+
non-ship verdicts to route: `need_more_work` → more data, `model_ceiling` →
|
|
276
|
+
try a stronger model, `arch_ceiling` → the surface can't fix it.
|
|
277
|
+
|
|
278
|
+
`openAutoPr({ result, gate, promotedDiff, ghOwner, ghRepo })` opens the PR —
|
|
279
|
+
**refuses unless `gate.decision === 'ship'`**, dry-runs without a GH token.
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## 8. The dataset flywheel — `FsLabeledScenarioStore`
|
|
284
|
+
|
|
285
|
+
```ts
|
|
286
|
+
import { FsLabeledScenarioStore } from '@tangle-network/agent-eval/campaign'
|
|
287
|
+
|
|
288
|
+
const store = new FsLabeledScenarioStore({ root: '/abs/dataset', maxWritesPerMinutePerBucket: 60 })
|
|
289
|
+
// pass to runCampaign({ labeledStore: store, captureSource: 'production-trace' })
|
|
290
|
+
```
|
|
291
|
+
Every campaign cell captures `(scenario, artifact, judgeScore, source)`. This
|
|
292
|
+
corpus IS the optimizer's training set. Discipline enforced at the store:
|
|
293
|
+
- **provenance required** on every write (source / sourceVersionHash /
|
|
294
|
+
capturedAt / redactionStatus).
|
|
295
|
+
- **temporal split**: `sample()` requires explicit `split` + `capturedBefore`.
|
|
296
|
+
- **`production-trace` is excluded from the train split by default** (no
|
|
297
|
+
contamination of the holdout it's judged against).
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## 9. The migration recipe (what to DELETE / KEEP / REWIRE)
|
|
302
|
+
|
|
303
|
+
For a product that already has eval + prompt-evolution wrappers:
|
|
304
|
+
|
|
305
|
+
**DELETE (orchestration the substrate now owns):**
|
|
306
|
+
- generation/population/top-K loops, trial-matrix construction, frontier
|
|
307
|
+
tracking, seed plumbing, manifest hashing, cell caching, scorecard
|
|
308
|
+
aggregation, CI math, PR-opening scaffolding, worktree git commands.
|
|
309
|
+
- any local `runProductionLoop` / `runPromptEvolution` / `runAnalystLoop`
|
|
310
|
+
wrapper whose body is a loop over generations × candidates × reps.
|
|
311
|
+
|
|
312
|
+
**KEEP (domain logic — it does not move):**
|
|
313
|
+
- scenarios (your eval inputs) → become `scenarios`.
|
|
314
|
+
- judges/rubrics/dimension weights → become `judges`.
|
|
315
|
+
- the agent-invocation function → becomes `dispatch` / `dispatchWithSurface`.
|
|
316
|
+
- the mutation strategy (reflection prompt) → becomes a `Mutator` or a
|
|
317
|
+
generator's `buildPrompt`.
|
|
318
|
+
- domain gates (e.g. anti-fabrication) → compose with `defaultProductionGate`.
|
|
319
|
+
|
|
320
|
+
**REWIRE:**
|
|
321
|
+
- `buildHoldoutRunner()` → `dispatchWithSurface`.
|
|
322
|
+
- `buildScorer()` → `judges`.
|
|
323
|
+
- `buildMutator()` → `evolutionaryDriver({ mutator })`.
|
|
324
|
+
- `runProductionLoop(...)` → `runImprovementLoop(...)`.
|
|
325
|
+
- `runPromptEvolution(...)` → `runImprovementLoop` (surface = prompt string).
|
|
326
|
+
- `runAnalystLoop(...)` improvement step → `improvementDriver` + a generator;
|
|
327
|
+
its findings-ledger + knowledge-graph writes stay.
|
|
328
|
+
|
|
329
|
+
Net for a typical consumer: ~2,400 LOC of orchestration deleted, ~800 LOC
|
|
330
|
+
rewired into the three seams.
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## 10. Forbidden anti-patterns (a review will reject these)
|
|
335
|
+
|
|
336
|
+
1. **No silent fallbacks.** No `catch { return null }`, no `?? 0` on a judge
|
|
337
|
+
composite, no returning `false`/empty on an error you can't interpret.
|
|
338
|
+
External-boundary calls return typed outcomes or throw. A git/LLM/subprocess
|
|
339
|
+
failure is a *throw*, never a fold-into-a-default.
|
|
340
|
+
2. **Don't reimplement the loop.** If you write a `for (gen of generations)`
|
|
341
|
+
that mutates + scores + selects, you've rebuilt the substrate. Stop; call
|
|
342
|
+
`runImprovementLoop`.
|
|
343
|
+
3. **Don't conflate train and holdout.** Holdout scenarios never enter the
|
|
344
|
+
training pool. The gate scores on holdout only.
|
|
345
|
+
4. **Don't trust harness stdout.** For code edits, the git diff is the truth,
|
|
346
|
+
not what the agent says it did.
|
|
347
|
+
5. **Account for every worktree.** A created worktree is finalized into a
|
|
348
|
+
surface or discarded — never leaked, even on throw (the shipped
|
|
349
|
+
`improvementDriver` already guarantees this; preserve it if you extend).
|
|
350
|
+
6. **Don't auto-deploy.** Promotion opens a PR (`autoOnPromote: 'pr'`). Live
|
|
351
|
+
self-mutation (`'config'`) is deferred behind the full safety stack.
|
|
352
|
+
7. **Tracing stays on when improving.** The loop refuses `tracing: 'off'` with
|
|
353
|
+
a driver wired — the dataset must be fed.
|
|
354
|
+
8. **Name your `dispatch`.** Anonymous dispatch weakens the manifest-hash
|
|
355
|
+
reproducibility identity.
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## 11. Minimal end-to-end skeleton
|
|
360
|
+
|
|
361
|
+
```ts
|
|
362
|
+
import {
|
|
363
|
+
runImprovementLoop, defaultProductionGate, evolutionaryDriver,
|
|
364
|
+
FsLabeledScenarioStore,
|
|
365
|
+
} from '@tangle-network/agent-eval/campaign'
|
|
366
|
+
|
|
367
|
+
const store = new FsLabeledScenarioStore({ root: '.dataset' })
|
|
368
|
+
|
|
369
|
+
async function dispatchWithSurface(surface: string, scenario: MyScenario) {
|
|
370
|
+
return runMyAgent({ systemPrompt: surface, input: scenario }) // → MyArtifact
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
const judges = [{
|
|
374
|
+
name: 'quality',
|
|
375
|
+
dimensions: [{ key: 'grounding' }, { key: 'actionability' }],
|
|
376
|
+
async score({ artifact, scenario }) { /* → JudgeScore, throw on failure */ },
|
|
377
|
+
}]
|
|
378
|
+
|
|
379
|
+
const result = await runImprovementLoop<MyScenario, MyArtifact>({
|
|
380
|
+
scenarios: train, holdoutScenarios: holdout, judges,
|
|
381
|
+
baselineSurface: CURRENT_PROMPT,
|
|
382
|
+
dispatchWithSurface,
|
|
383
|
+
driver: evolutionaryDriver({ mutator: myReflectionMutator }),
|
|
384
|
+
populationSize: 4, maxGenerations: 3, promoteTopK: 2,
|
|
385
|
+
gate: defaultProductionGate({ holdoutScenarios: holdout, deltaThreshold: 0.02 }),
|
|
386
|
+
autoOnPromote: 'pr', ghOwner: 'tangle-network', ghRepo: 'my-agent',
|
|
387
|
+
runDir: '.runs/improve', labeledStore: store, captureSource: 'eval-run',
|
|
388
|
+
})
|
|
389
|
+
|
|
390
|
+
if (result.gateResult.decision === 'ship') console.log('PR:', result.prResult?.prUrl)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
That is the whole integration. Everything not in this skeleton is substrate.
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# The product self-improvement loop — the finish-line target
|
|
2
|
+
|
|
3
|
+
This is the **end state** every Tangle product agent (gtm, legal, tax, creative,
|
|
4
|
+
agent-builder, blueprint, physim) converges to. It is the target the consumer
|
|
5
|
+
migrations build toward — not a 1:1 port of whatever eval/improvement code a
|
|
6
|
+
product has today.
|
|
7
|
+
|
|
8
|
+
**Thesis.** A product agent is *one closed, automated self-improvement loop*
|
|
9
|
+
that makes the agent measurably better over time while humans only approve
|
|
10
|
+
PRs. A product should NOT have a "production loop" *and* a pile of `eval/*`
|
|
11
|
+
CLIs *and* bespoke optimization orchestration. It has **one** loop, composed
|
|
12
|
+
from the substrate. Everything else is deleted.
|
|
13
|
+
|
|
14
|
+
Primitives reference: [`primitives-integration-spec.md`](./primitives-integration-spec.md).
|
|
15
|
+
Engine internals: [`self-improvement-engine.md`](./self-improvement-engine.md).
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## The loop (7 steps, exact substrate composition)
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
1. SAMPLE the eval matrix
|
|
23
|
+
scenarios = cartesian(
|
|
24
|
+
profileVariants, // the surface(s) under test: baseline + candidates
|
|
25
|
+
productScenarios, // the hard product tasks (gtm: attribution honesty, …)
|
|
26
|
+
personas, // simulated users / drivers
|
|
27
|
+
) ∪ productionFailures // real failures pulled from the LabeledScenarioStore
|
|
28
|
+
// (the flywheel: prod traces become eval scenarios)
|
|
29
|
+
|
|
30
|
+
2. MEASURE — runCampaign
|
|
31
|
+
dispatch(scenario) = runMultishot({ // the multi-turn challenging flow
|
|
32
|
+
persona: scenario.persona, // driver = simulated user
|
|
33
|
+
profile: productAgentProfile(surface), // worker = the agent under test
|
|
34
|
+
shape: scenario.flow, // the real task, many turns
|
|
35
|
+
tools: productTools, // real tools, real side-effects
|
|
36
|
+
}) → transcript artifact
|
|
37
|
+
judges = product ensemble (domain dimensions) → scorecard + bootstrap CIs
|
|
38
|
+
labeledStore: capture EVERY cell (scenario, artifact, score, source) → the dataset
|
|
39
|
+
|
|
40
|
+
3. ANALYZE — trace analysts (runAnalystLoop / AnalystRegistry)
|
|
41
|
+
read the campaign traces → a research report (failure modes, why, where).
|
|
42
|
+
This REPLACES bespoke "failure clustering": the analyst is the richer,
|
|
43
|
+
LLM-driven version of "what should we improve and why".
|
|
44
|
+
|
|
45
|
+
4. IMPROVE — runImprovementLoop( improvementDriver + agenticGenerator )
|
|
46
|
+
driver.propose({ report, dataset, … }) → candidate surfaces.
|
|
47
|
+
The agentic generator runs a coding harness in a worktree, reading the
|
|
48
|
+
report + the codebase, making REAL product changes — prompt, tools, AND
|
|
49
|
+
code — not just an addendum string. Each candidate is measured on a
|
|
50
|
+
HELD-OUT slice of the matrix.
|
|
51
|
+
|
|
52
|
+
5. GATE — defaultProductionGate (+ domain gates, composed)
|
|
53
|
+
heldout-delta + budget + red-team + reward-hacking + canary, plus any
|
|
54
|
+
product-specific gate (e.g. anti-fabrication) and an overfit-gap check.
|
|
55
|
+
Verdict ∈ ship | hold | need_more_work | model_ceiling | arch_ceiling.
|
|
56
|
+
|
|
57
|
+
6. PROMOTE — openAutoPr
|
|
58
|
+
the winning worktree → a PR against the product repo. Human approves → ships.
|
|
59
|
+
(autoOnPromote: 'pr'. Live self-mutation is deferred behind the full safety
|
|
60
|
+
stack.)
|
|
61
|
+
|
|
62
|
+
7. LOOP
|
|
63
|
+
the shipped, improved agent runs in production → emits traces → the dataset
|
|
64
|
+
grows → back to (1). The loop is scheduled (cron) and/or triggered when the
|
|
65
|
+
analyst report crosses a severity threshold. Autonomous between PR approvals.
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**One entry point, no new abstractions.** A product exposes a single
|
|
69
|
+
`run<Product>ImprovementCycle()` that *composes* the substrate primitives
|
|
70
|
+
above. It does NOT define `runFooPromptEvolution`, `FooOptimizer`,
|
|
71
|
+
`FooProductionLoop`, etc. The substrate carries every name; the product only
|
|
72
|
+
wires its domain pieces into the seams.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## What each product OWNS vs DELETES vs COMPOSES
|
|
77
|
+
|
|
78
|
+
**OWNS (domain — stays, this is the product's value):**
|
|
79
|
+
- `productScenarios` — the hard tasks the agent must handle.
|
|
80
|
+
- `personas` — the simulated users that drive the multi-shot flows.
|
|
81
|
+
- `judges` / rubrics / dimension weights — how "good" is defined.
|
|
82
|
+
- `productTools` — the real tools the agent uses.
|
|
83
|
+
- deterministic checks (anti-slop, format, forbidden-claim) — fast pre-judges.
|
|
84
|
+
- domain gates (e.g. anti-fabrication) — composed into the gate.
|
|
85
|
+
|
|
86
|
+
**DELETES (orchestration the substrate now owns):**
|
|
87
|
+
- every `for (gen of generations)` mutate→score→select loop.
|
|
88
|
+
- bespoke prompt-evolution / production-loop / analyst-loop wrappers.
|
|
89
|
+
- trial-matrix construction, frontier tracking, seed plumbing, manifest
|
|
90
|
+
hashing, cell caching, scorecard aggregation, CI math.
|
|
91
|
+
- PR-opening scaffolding, worktree git plumbing.
|
|
92
|
+
- parallel `eval/*` CLIs that each re-implement a slice of the above.
|
|
93
|
+
|
|
94
|
+
**COMPOSES (the substrate, in the one cycle):**
|
|
95
|
+
- `runCampaign` (matrix measurement) · `runMultishot` (the dispatch flow) ·
|
|
96
|
+
`FsLabeledScenarioStore` (dataset) · analysts (report) ·
|
|
97
|
+
`runImprovementLoop` + `improvementDriver` + `agenticGenerator` (improve) ·
|
|
98
|
+
`defaultProductionGate` + `composeGate` (gate) · `openAutoPr` (promote).
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Definition of done (a product is "at the finish line" when)
|
|
103
|
+
|
|
104
|
+
1. **One cycle, one entry.** A single `run<Product>ImprovementCycle()` composes
|
|
105
|
+
the substrate; the old eval/improvement systems are deleted, not coexisting.
|
|
106
|
+
2. **Matrix eval is real.** `dispatch` runs genuine multi-shot persona↔agent
|
|
107
|
+
flows with real tools — not single-turn projections, not stubbed workers
|
|
108
|
+
(non-zero token usage is asserted).
|
|
109
|
+
3. **The dataset is fed.** Every cell captures to `LabeledScenarioStore` with
|
|
110
|
+
correct provenance; production failures flow back in as scenarios.
|
|
111
|
+
4. **Improvement is code-real.** The agentic generator produces worktree
|
|
112
|
+
changes (prompt/tools/code), measured on holdout — not just addendum-string
|
|
113
|
+
mutation.
|
|
114
|
+
5. **The gate is honest.** Composed `defaultProductionGate` + domain gates +
|
|
115
|
+
overfit-gap; fails closed; holdout never overlaps train.
|
|
116
|
+
6. **Promotion is a PR.** `openAutoPr` opens it; a human approves; nothing
|
|
117
|
+
auto-deploys.
|
|
118
|
+
7. **It's scheduled + triggered.** Runs on cadence and/or when the analyst
|
|
119
|
+
report crosses severity; autonomous between approvals.
|
|
120
|
+
8. **Tests + a real proof run.** Contract tests assert the wiring; one real
|
|
121
|
+
end-to-end cycle produces a scorecard and (on a shipping gate) a PR.
|
|
122
|
+
|
|
123
|
+
Anything short of this is mid-migration, not done.
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## gtm-agent — the worked instantiation (first reference build)
|
|
128
|
+
|
|
129
|
+
| Loop step | gtm wiring |
|
|
130
|
+
|---|---|
|
|
131
|
+
| SAMPLE | profile variants of `OPERATOR_CEO_SYSTEM_PROMPT` + addendum; `GTM_LOOP_HOLDOUT_SCENARIOS` + `eval/business-owner/personas.json`; production failures from the trace store |
|
|
132
|
+
| MEASURE | `dispatch` = `runMultishot(persona ↔ gtm-agent via runChatThroughRuntime, real tools)`; judges = the 3-model ensemble (`attribution_honesty`, `proposal_grounding`) + canonical 12-dim |
|
|
133
|
+
| ANALYZE | trace analysts over the campaign traces → report (supersedes `FailureClusterConfig` clustering) |
|
|
134
|
+
| IMPROVE | `improvementDriver` + `agenticGenerator` (claude harness) edits prompt/tools/code in a worktree, fed the report |
|
|
135
|
+
| GATE | `composeGate(defaultProductionGate, antiFabricationGate, overfitGapGate)` |
|
|
136
|
+
| PROMOTE | `openAutoPr` → PR against `tangle-network/gtm-agent` |
|
|
137
|
+
|
|
138
|
+
**Deleted:** `eval/run-prompt-evolution.ts`, `eval/analyst-loop.ts`,
|
|
139
|
+
`eval/optimization-campaign.ts`, `scripts/evals/*`, the orchestration body of
|
|
140
|
+
`production-loop/index.ts` and `eval/canonical.ts`.
|
|
141
|
+
**Kept:** scenarios, personas, judges, tools, deterministic checks, the
|
|
142
|
+
`composeProductionLoopSystemPrompt` wiring.
|
|
143
|
+
**Result:** one `runGtmImprovementCycle()`; ~3–4k LOC of scattered orchestration
|
|
144
|
+
gone, replaced by a substrate composition.
|
|
145
|
+
|
|
146
|
+
This gtm build is the reference the other six products copy.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.40.3",
|
|
3
|
+
"version": "0.40.4",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/campaign/run-campaign.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates\n * scenarios → dispatch → artifacts → judges → aggregates, with full\n * reproducibility (seed + manifest hash), cell-level resumability, bootstrap\n * CIs, and the `LabeledScenarioStore` capture flywheel.\n *\n * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this\n * primitive but live in `presets/run-improvement-loop.ts`. This file keeps\n * the core orchestrator minimal — Phase 1 of the Pass A track.\n */\n\nimport { createHash } from 'node:crypto'\nimport { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'\nimport { join } from 'node:path'\nimport { confidenceInterval } from '../statistics'\nimport type {\n CampaignAggregates,\n CampaignArtifactWriter,\n CampaignCellResult,\n CampaignCostMeter,\n CampaignResult,\n CampaignTraceWriter,\n DispatchContext,\n DispatchFn,\n JudgeAggregate,\n JudgeConfig,\n JudgeScore,\n LabeledScenarioStore,\n Scenario,\n ScenarioAggregate,\n TraceSpan,\n} from './types'\n\nexport interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {\n scenarios: TScenario[]\n dispatch: DispatchFn<TScenario, TArtifact>\n judges?: JudgeConfig<TArtifact, TScenario>[]\n /** Required for reproducibility. Default 42. */\n seed?: number\n /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for\n * bootstrap-tight intervals on critical eval. */\n reps?: number\n /** When true (default), completed cells are cached by\n * (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */\n resumable?: boolean\n /** Optional store — when present, every artifact + judge score is captured\n * with the configured `captureSource`. Capture is default ON; pass `'off'`\n * to disable. 
*/\n labeledStore?: LabeledScenarioStore | 'off'\n captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic'\n captureSourceVersionHash?: string\n /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */\n costCeiling?: number\n /** Max concurrent cells. Default 2. */\n maxConcurrency?: number\n /** Required: where artifacts + traces land. */\n runDir: string\n /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted\n * at `<runDir>/traces/`. `'off'` disables capture entirely — substrate\n * refuses this when the caller wires `autoOnPromote !== 'none'`. */\n tracing?: 'on' | 'off'\n /** Test seam — override the wall clock for deterministic tests. */\n now?: () => Date\n /** Test seam — override per-cell trace writer factory. */\n buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter\n}\n\nexport async function runCampaign<TScenario extends Scenario, TArtifact>(\n opts: RunCampaignOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n const seed = opts.seed ?? 42\n const reps = opts.reps ?? 1\n const resumable = opts.resumable ?? true\n const maxConcurrency = opts.maxConcurrency ?? 2\n const now = opts.now ?? (() => new Date())\n const judges = opts.judges ?? 
[]\n\n if (!existsSync(opts.runDir)) mkdirSync(opts.runDir, { recursive: true })\n\n const manifestHash = computeManifestHash({\n scenarios: opts.scenarios,\n judges: judges as unknown as JudgeConfig<unknown>[],\n dispatchRef: opts.dispatch.name || 'anonymous',\n seed,\n reps,\n })\n\n const startedAt = now()\n const cells: CampaignCellResult<TArtifact>[] = []\n const artifactsByPath: Record<string, string> = {}\n\n // Build the cell schedule (scenario × rep).\n const schedule: Array<{ scenario: TScenario; rep: number; cellId: string; cellSeed: number }> = []\n let cellIndex = 0\n for (const scenario of opts.scenarios) {\n for (let rep = 0; rep < reps; rep++) {\n const cellId = `${scenario.id}:${rep}`\n const cellSeed = seed + cellIndex\n schedule.push({ scenario, rep, cellId, cellSeed })\n cellIndex += 1\n }\n }\n\n // Concurrency-limited execution.\n let totalCostUsd = 0\n let costCeilingReached = false\n const abortController = new AbortController()\n // Concurrency lanes that drain the cell schedule. Named \"lanes\" — not\n // \"workers\" — to avoid clashing with the taxonomy's worker (= the agent\n // harness in a sandbox, invoked behind `dispatch`). See loop-taxonomy.md.\n const lanes: Promise<void>[] = []\n let nextIdx = 0\n const cellsRef = cells\n\n for (let i = 0; i < maxConcurrency; i++) {\n lanes.push(\n (async () => {\n while (true) {\n const myIdx = nextIdx++\n if (myIdx >= schedule.length) return\n const slot = schedule[myIdx]!\n if (costCeilingReached) {\n cellsRef.push(skippedCell(slot, 'cost_ceiling_reached'))\n continue\n }\n const result = await executeCell({\n slot,\n opts,\n manifestHash,\n resumable,\n now,\n buildTraceWriter: opts.buildTraceWriter ?? 
defaultBuildTraceWriter,\n signal: abortController.signal,\n })\n cellsRef.push(result.cell)\n totalCostUsd += result.cell.costUsd\n Object.assign(artifactsByPath, result.artifactsByPath)\n if (opts.costCeiling !== undefined && totalCostUsd >= opts.costCeiling) {\n costCeilingReached = true\n }\n // Capture into LabeledScenarioStore unless explicitly disabled.\n if (opts.labeledStore && opts.labeledStore !== 'off' && !result.cell.error) {\n await captureToStore({\n store: opts.labeledStore,\n cell: result.cell,\n scenario: slot.scenario,\n opts,\n now,\n }).catch((err) => {\n // Capture failures are non-fatal — log but don't crash the campaign.\n // (Trace would normally land here.)\n console.warn(\n `[runCampaign] capture failed for ${result.cell.cellId}: ${err instanceof Error ? err.message : String(err)}`,\n )\n })\n }\n }\n })(),\n )\n }\n await Promise.all(lanes)\n\n const endedAt = now()\n cellsRef.sort((a, b) => a.cellId.localeCompare(b.cellId))\n\n const aggregates = computeAggregates(\n cellsRef,\n judges as unknown as JudgeConfig<TArtifact>[],\n seed,\n )\n\n return {\n manifestHash,\n seed,\n startedAt: startedAt.toISOString(),\n endedAt: endedAt.toISOString(),\n durationMs: endedAt.getTime() - startedAt.getTime(),\n cells: cellsRef,\n aggregates,\n runDir: opts.runDir,\n artifactsByPath,\n scenarios: opts.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n }\n}\n\n// ── Internals ─────────────────────────────────────────────────────────\n\ninterface ExecuteCellArgs<TScenario extends Scenario, TArtifact> {\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number }\n opts: RunCampaignOptions<TScenario, TArtifact>\n manifestHash: string\n resumable: boolean\n now: () => Date\n buildTraceWriter: (cellId: string, dir: string) => CampaignTraceWriter\n signal: AbortSignal\n}\n\nasync function executeCell<TScenario extends Scenario, TArtifact>(\n args: ExecuteCellArgs<TScenario, TArtifact>,\n): Promise<{ cell: 
CampaignCellResult<TArtifact>; artifactsByPath: Record<string, string> }> {\n const cellDir = join(args.opts.runDir, args.slot.cellId.replace(/[^a-zA-Z0-9_-]/g, '_'))\n if (!existsSync(cellDir)) mkdirSync(cellDir, { recursive: true })\n\n // Resumability: cache key = (manifestHash, scenarioId, rep)\n const cachePath = join(cellDir, 'cached-result.json')\n if (args.resumable && existsSync(cachePath)) {\n try {\n const cached = JSON.parse(readFileSync(cachePath, 'utf8')) as CampaignCellResult<TArtifact>\n if (cached.cellId === args.slot.cellId) {\n return { cell: { ...cached, cached: true }, artifactsByPath: {} }\n }\n } catch {\n // Corrupt cache — fall through to re-run.\n }\n }\n\n const startMs = Date.now()\n const trace = args.buildTraceWriter(args.slot.cellId, cellDir)\n const artifactsByPath: Record<string, string> = {}\n const artifacts: CampaignArtifactWriter = {\n async write(path, content) {\n const fullPath = join(cellDir, path)\n const dir = join(fullPath, '..')\n if (!existsSync(dir)) mkdirSync(dir, { recursive: true })\n writeFileSync(fullPath, content as Uint8Array)\n artifactsByPath[`${args.slot.cellId}/${path}`] = fullPath\n return fullPath\n },\n async writeJson(path, value) {\n return artifacts.write(path, JSON.stringify(value, null, 2))\n },\n }\n let costSoFar = 0\n const cost: CampaignCostMeter = {\n observe(amount, source) {\n costSoFar += amount\n trace.span(`cost.${source}`, { amountUsd: amount }).end()\n },\n current() {\n return costSoFar\n },\n }\n\n const ctx: DispatchContext = {\n cellId: args.slot.cellId,\n rep: args.slot.rep,\n seed: args.slot.cellSeed,\n signal: args.signal,\n trace,\n artifacts,\n cost,\n }\n\n let artifact: TArtifact | undefined\n let errorMessage: string | undefined\n try {\n artifact = await args.opts.dispatch(args.slot.scenario, ctx)\n } catch (err) {\n errorMessage = err instanceof Error ? 
err.message : String(err)\n }\n\n // Run judges (only if we have an artifact).\n const judgeScores: Record<string, JudgeScore> = {}\n if (artifact !== undefined) {\n for (const judge of args.opts.judges ?? []) {\n if (judge.appliesTo && !judge.appliesTo(args.slot.scenario)) continue\n try {\n const score = await runJudgeCell(judge, { artifact, scenario: args.slot.scenario })\n judgeScores[judge.name] = score\n } catch (err) {\n judgeScores[judge.name] = {\n dimensions: {},\n composite: 0,\n notes: `judge failed: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n }\n }\n\n await trace.flush()\n\n const cell: CampaignCellResult<TArtifact> = {\n cellId: args.slot.cellId,\n scenarioId: args.slot.scenario.id,\n rep: args.slot.rep,\n artifact: (artifact ?? null) as TArtifact,\n judgeScores,\n costUsd: costSoFar,\n durationMs: Date.now() - startMs,\n seed: args.slot.cellSeed,\n cached: false,\n error: errorMessage,\n }\n\n if (!errorMessage && args.resumable) {\n writeFileSync(cachePath, JSON.stringify(cell))\n }\n\n return { cell, artifactsByPath }\n}\n\nasync function runJudgeCell<TArtifact, TScenario extends Scenario>(\n _judge: JudgeConfig<TArtifact, TScenario>,\n _input: { artifact: TArtifact; scenario: TScenario },\n): Promise<JudgeScore> {\n // Phase 1 stub — wires to the existing 0.38 runJudge in Phase 2.\n // Returns a zero-score for now; consumer wiring + preset uses this.\n return { dimensions: {}, composite: 0, notes: 'phase-1-stub' }\n}\n\nfunction defaultBuildTraceWriter(cellId: string, dir: string): CampaignTraceWriter {\n const spans: Array<Record<string, unknown>> = []\n return {\n span(name, attributes) {\n const startMs = Date.now()\n const record: Record<string, unknown> = { name, cellId, startMs, ...(attributes ?? 
{}) }\n const finish: TraceSpan = {\n end(endAttrs) {\n record.durationMs = Date.now() - startMs\n if (endAttrs) Object.assign(record, endAttrs)\n spans.push(record)\n },\n setAttribute(key, value) {\n record[key] = value\n },\n }\n return finish\n },\n async flush() {\n const path = join(dir, 'spans.jsonl')\n writeFileSync(path, spans.map((s) => JSON.stringify(s)).join('\\n'))\n },\n }\n}\n\nfunction skippedCell<TScenario extends Scenario, TArtifact>(\n slot: { scenario: TScenario; rep: number; cellId: string; cellSeed: number },\n reason: string,\n): CampaignCellResult<TArtifact> {\n return {\n cellId: slot.cellId,\n scenarioId: slot.scenario.id,\n rep: slot.rep,\n artifact: null as unknown as TArtifact,\n judgeScores: {},\n costUsd: 0,\n durationMs: 0,\n seed: slot.cellSeed,\n cached: false,\n error: `skipped: ${reason}`,\n }\n}\n\ninterface CaptureArgs<TScenario extends Scenario, TArtifact> {\n store: LabeledScenarioStore\n cell: CampaignCellResult<TArtifact>\n scenario: TScenario\n opts: RunCampaignOptions<TScenario, TArtifact>\n now: () => Date\n}\n\nasync function captureToStore<TScenario extends Scenario, TArtifact>(\n args: CaptureArgs<TScenario, TArtifact>,\n): Promise<void> {\n await args.store.observe({\n scenario: args.scenario,\n artifact: args.cell.artifact,\n judgeScores: args.cell.judgeScores,\n source: args.opts.captureSource ?? 'eval-run',\n sourceVersionHash: args.opts.captureSourceVersionHash ?? 
'unknown',\n capturedAt: args.now().toISOString(),\n redactionStatus: 'raw',\n })\n}\n\n// ── Aggregates + manifest hash ────────────────────────────────────────\n\nfunction computeManifestHash(input: {\n scenarios: Scenario[]\n judges: JudgeConfig<unknown>[]\n dispatchRef: string\n seed: number\n reps: number\n}): string {\n const canonical = {\n scenarios: input.scenarios.map((s) => ({ id: s.id, kind: s.kind })),\n judges: input.judges.map((j) => ({ name: j.name, dims: j.dimensions.map((d) => d.key) })),\n dispatch: input.dispatchRef,\n seed: input.seed,\n reps: input.reps,\n }\n return createHash('sha256').update(JSON.stringify(canonical)).digest('hex')\n}\n\nfunction computeAggregates<TArtifact>(\n cells: CampaignCellResult<TArtifact>[],\n judges: JudgeConfig<TArtifact>[],\n seed: number,\n): CampaignAggregates {\n const byJudge: Record<string, JudgeAggregate> = {}\n for (const judge of judges) {\n const scores: number[] = []\n for (const cell of cells) {\n const s = cell.judgeScores[judge.name]\n if (s !== undefined) scores.push(s.composite)\n }\n byJudge[judge.name] = aggregate(scores, seed)\n }\n const byScenario: Record<string, ScenarioAggregate> = {}\n const scenarioGroups = new Map<string, number[]>()\n for (const cell of cells) {\n const composites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (composites.length === 0) continue\n const mean = composites.reduce((a, b) => a + b, 0) / composites.length\n const arr = scenarioGroups.get(cell.scenarioId) ?? 
[]\n arr.push(mean)\n scenarioGroups.set(cell.scenarioId, arr)\n }\n for (const [scenarioId, samples] of scenarioGroups) {\n const ag = aggregate(samples, seed)\n byScenario[scenarioId] = { meanComposite: ag.mean, ci95: ag.ci95, n: ag.n }\n }\n return {\n byJudge,\n byScenario,\n totalCostUsd: cells.reduce((a, c) => a + c.costUsd, 0),\n cellsExecuted: cells.filter((c) => !c.error).length,\n cellsSkipped: cells.filter((c) => c.error?.startsWith('skipped:')).length,\n cellsCached: cells.filter((c) => c.cached).length,\n cellsFailed: cells.filter((c) => c.error && !c.error.startsWith('skipped:')).length,\n }\n}\n\n// Percentile bootstrap CI95 via seeded resampling. Deterministic for a given\n// seed — same campaign re-run produces identical CI bands. Falls back to\n// degenerate intervals at n<=1 (the bootstrap is undefined there).\nfunction aggregate(samples: number[], seed: number): JudgeAggregate {\n const n = samples.length\n if (n === 0) return { mean: 0, stdev: 0, ci95: [0, 0], n: 0 }\n const mean = samples.reduce((a, b) => a + b, 0) / n\n const variance = samples.reduce((a, b) => a + (b - mean) ** 2, 0) / Math.max(1, n - 1)\n const stdev = Math.sqrt(variance)\n const ci = confidenceInterval(samples, 0.95, { seed, resamples: 1000 })\n return { mean, stdev, ci95: [ci.lower, ci.upper], n 
}\n}\n"],"mappings":";;;;;AAaA,SAAS,kBAAkB;AAC3B,SAAS,YAAY,WAAW,cAAc,qBAAqB;AACnE,SAAS,YAAY;AAsDrB,eAAsB,YACpB,MAC+C;AAC/C,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,QAAM,MAAM,KAAK,QAAQ,MAAM,oBAAI,KAAK;AACxC,QAAM,SAAS,KAAK,UAAU,CAAC;AAE/B,MAAI,CAAC,WAAW,KAAK,MAAM,EAAG,WAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAExE,QAAM,eAAe,oBAAoB;AAAA,IACvC,WAAW,KAAK;AAAA,IAChB;AAAA,IACA,aAAa,KAAK,SAAS,QAAQ;AAAA,IACnC;AAAA,IACA;AAAA,EACF,CAAC;AAED,QAAM,YAAY,IAAI;AACtB,QAAM,QAAyC,CAAC;AAChD,QAAM,kBAA0C,CAAC;AAGjD,QAAM,WAA0F,CAAC;AACjG,MAAI,YAAY;AAChB,aAAW,YAAY,KAAK,WAAW;AACrC,aAAS,MAAM,GAAG,MAAM,MAAM,OAAO;AACnC,YAAM,SAAS,GAAG,SAAS,EAAE,IAAI,GAAG;AACpC,YAAM,WAAW,OAAO;AACxB,eAAS,KAAK,EAAE,UAAU,KAAK,QAAQ,SAAS,CAAC;AACjD,mBAAa;AAAA,IACf;AAAA,EACF;AAGA,MAAI,eAAe;AACnB,MAAI,qBAAqB;AACzB,QAAM,kBAAkB,IAAI,gBAAgB;AAI5C,QAAM,QAAyB,CAAC;AAChC,MAAI,UAAU;AACd,QAAM,WAAW;AAEjB,WAAS,IAAI,GAAG,IAAI,gBAAgB,KAAK;AACvC,UAAM;AAAA,OACH,YAAY;AACX,eAAO,MAAM;AACX,gBAAM,QAAQ;AACd,cAAI,SAAS,SAAS,OAAQ;AAC9B,gBAAM,OAAO,SAAS,KAAK;AAC3B,cAAI,oBAAoB;AACtB,qBAAS,KAAK,YAAY,MAAM,sBAAsB,CAAC;AACvD;AAAA,UACF;AACA,gBAAM,SAAS,MAAM,YAAY;AAAA,YAC/B;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA;AAAA,YACA,kBAAkB,KAAK,oBAAoB;AAAA,YAC3C,QAAQ,gBAAgB;AAAA,UAC1B,CAAC;AACD,mBAAS,KAAK,OAAO,IAAI;AACzB,0BAAgB,OAAO,KAAK;AAC5B,iBAAO,OAAO,iBAAiB,OAAO,eAAe;AACrD,cAAI,KAAK,gBAAgB,UAAa,gBAAgB,KAAK,aAAa;AACtE,iCAAqB;AAAA,UACvB;AAEA,cAAI,KAAK,gBAAgB,KAAK,iBAAiB,SAAS,CAAC,OAAO,KAAK,OAAO;AAC1E,kBAAM,eAAe;AAAA,cACnB,OAAO,KAAK;AAAA,cACZ,MAAM,OAAO;AAAA,cACb,UAAU,KAAK;AAAA,cACf;AAAA,cACA;AAAA,YACF,CAAC,EAAE,MAAM,CAAC,QAAQ;AAGhB,sBAAQ;AAAA,gBACN,oCAAoC,OAAO,KAAK,MAAM,KAAK,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,cAC7G;AAAA,YACF,CAAC;AAAA,UACH;AAAA,QACF;AAAA,MACF,GAAG;AAAA,IACL;AAAA,EACF;AACA,QAAM,QAAQ,IAAI,KAAK;AAEvB,QAAM,UAAU,IAAI;AACpB,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,OAAO,cAAc,EAAE,MAAM,CAAC;AAExD,QAAM,aAAa;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,WAAW,UAAU,
YAAY;AAAA,IACjC,SAAS,QAAQ,YAAY;AAAA,IAC7B,YAAY,QAAQ,QAAQ,IAAI,UAAU,QAAQ;AAAA,IAClD,OAAO;AAAA,IACP;AAAA,IACA,QAAQ,KAAK;AAAA,IACb;AAAA,IACA,WAAW,KAAK,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,EACnE;AACF;AAcA,eAAe,YACb,MAC2F;AAC3F,QAAM,UAAU,KAAK,KAAK,KAAK,QAAQ,KAAK,KAAK,OAAO,QAAQ,mBAAmB,GAAG,CAAC;AACvF,MAAI,CAAC,WAAW,OAAO,EAAG,WAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AAGhE,QAAM,YAAY,KAAK,SAAS,oBAAoB;AACpD,MAAI,KAAK,aAAa,WAAW,SAAS,GAAG;AAC3C,QAAI;AACF,YAAM,SAAS,KAAK,MAAM,aAAa,WAAW,MAAM,CAAC;AACzD,UAAI,OAAO,WAAW,KAAK,KAAK,QAAQ;AACtC,eAAO,EAAE,MAAM,EAAE,GAAG,QAAQ,QAAQ,KAAK,GAAG,iBAAiB,CAAC,EAAE;AAAA,MAClE;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AAEA,QAAM,UAAU,KAAK,IAAI;AACzB,QAAM,QAAQ,KAAK,iBAAiB,KAAK,KAAK,QAAQ,OAAO;AAC7D,QAAM,kBAA0C,CAAC;AACjD,QAAM,YAAoC;AAAA,IACxC,MAAM,MAAM,MAAM,SAAS;AACzB,YAAM,WAAW,KAAK,SAAS,IAAI;AACnC,YAAM,MAAM,KAAK,UAAU,IAAI;AAC/B,UAAI,CAAC,WAAW,GAAG,EAAG,WAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AACxD,oBAAc,UAAU,OAAqB;AAC7C,sBAAgB,GAAG,KAAK,KAAK,MAAM,IAAI,IAAI,EAAE,IAAI;AACjD,aAAO;AAAA,IACT;AAAA,IACA,MAAM,UAAU,MAAM,OAAO;AAC3B,aAAO,UAAU,MAAM,MAAM,KAAK,UAAU,OAAO,MAAM,CAAC,CAAC;AAAA,IAC7D;AAAA,EACF;AACA,MAAI,YAAY;AAChB,QAAM,OAA0B;AAAA,IAC9B,QAAQ,QAAQ,QAAQ;AACtB,mBAAa;AACb,YAAM,KAAK,QAAQ,MAAM,IAAI,EAAE,WAAW,OAAO,CAAC,EAAE,IAAI;AAAA,IAC1D;AAAA,IACA,UAAU;AACR,aAAO;AAAA,IACT;AAAA,EACF;AAEA,QAAM,MAAuB;AAAA,IAC3B,QAAQ,KAAK,KAAK;AAAA,IAClB,KAAK,KAAK,KAAK;AAAA,IACf,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ,KAAK;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,MAAI;AACJ,MAAI;AACJ,MAAI;AACF,eAAW,MAAM,KAAK,KAAK,SAAS,KAAK,KAAK,UAAU,GAAG;AAAA,EAC7D,SAAS,KAAK;AACZ,mBAAe,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAAA,EAChE;AAGA,QAAM,cAA0C,CAAC;AACjD,MAAI,aAAa,QAAW;AAC1B,eAAW,SAAS,KAAK,KAAK,UAAU,CAAC,GAAG;AAC1C,UAAI,MAAM,aAAa,CAAC,MAAM,UAAU,KAAK,KAAK,QAAQ,EAAG;AAC7D,UAAI;AACF,cAAM,QAAQ,MAAM,aAAa,OAAO,EAAE,UAAU,UAAU,KAAK,KAAK,SAAS,CAAC;AAClF,oBAAY,MAAM,IAAI,IAAI;AAAA,MAC5B,SAAS,KAAK;AACZ,oBAAY,MAAM,IAAI,IAAI;AAAA,UACxB,YAAY,CAAC;AAAA,UACb,WAAW;AAAA,UACX,OAAO,iBAAiB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;
AAAA,QAC1E;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,MAAM,MAAM;AAElB,QAAM,OAAsC;AAAA,IAC1C,QAAQ,KAAK,KAAK;AAAA,IAClB,YAAY,KAAK,KAAK,SAAS;AAAA,IAC/B,KAAK,KAAK,KAAK;AAAA,IACf,UAAW,YAAY;AAAA,IACvB;AAAA,IACA,SAAS;AAAA,IACT,YAAY,KAAK,IAAI,IAAI;AAAA,IACzB,MAAM,KAAK,KAAK;AAAA,IAChB,QAAQ;AAAA,IACR,OAAO;AAAA,EACT;AAEA,MAAI,CAAC,gBAAgB,KAAK,WAAW;AACnC,kBAAc,WAAW,KAAK,UAAU,IAAI,CAAC;AAAA,EAC/C;AAEA,SAAO,EAAE,MAAM,gBAAgB;AACjC;AAEA,eAAe,aACb,QACA,QACqB;AAGrB,SAAO,EAAE,YAAY,CAAC,GAAG,WAAW,GAAG,OAAO,eAAe;AAC/D;AAEA,SAAS,wBAAwB,QAAgB,KAAkC;AACjF,QAAM,QAAwC,CAAC;AAC/C,SAAO;AAAA,IACL,KAAK,MAAM,YAAY;AACrB,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAkC,EAAE,MAAM,QAAQ,SAAS,GAAI,cAAc,CAAC,EAAG;AACvF,YAAM,SAAoB;AAAA,QACxB,IAAI,UAAU;AACZ,iBAAO,aAAa,KAAK,IAAI,IAAI;AACjC,cAAI,SAAU,QAAO,OAAO,QAAQ,QAAQ;AAC5C,gBAAM,KAAK,MAAM;AAAA,QACnB;AAAA,QACA,aAAa,KAAK,OAAO;AACvB,iBAAO,GAAG,IAAI;AAAA,QAChB;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,IACA,MAAM,QAAQ;AACZ,YAAM,OAAO,KAAK,KAAK,aAAa;AACpC,oBAAc,MAAM,MAAM,IAAI,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACpE;AAAA,EACF;AACF;AAEA,SAAS,YACP,MACA,QAC+B;AAC/B,SAAO;AAAA,IACL,QAAQ,KAAK;AAAA,IACb,YAAY,KAAK,SAAS;AAAA,IAC1B,KAAK,KAAK;AAAA,IACV,UAAU;AAAA,IACV,aAAa,CAAC;AAAA,IACd,SAAS;AAAA,IACT,YAAY;AAAA,IACZ,MAAM,KAAK;AAAA,IACX,QAAQ;AAAA,IACR,OAAO,YAAY,MAAM;AAAA,EAC3B;AACF;AAUA,eAAe,eACb,MACe;AACf,QAAM,KAAK,MAAM,QAAQ;AAAA,IACvB,UAAU,KAAK;AAAA,IACf,UAAU,KAAK,KAAK;AAAA,IACpB,aAAa,KAAK,KAAK;AAAA,IACvB,QAAQ,KAAK,KAAK,iBAAiB;AAAA,IACnC,mBAAmB,KAAK,KAAK,4BAA4B;AAAA,IACzD,YAAY,KAAK,IAAI,EAAE,YAAY;AAAA,IACnC,iBAAiB;AAAA,EACnB,CAAC;AACH;AAIA,SAAS,oBAAoB,OAMlB;AACT,QAAM,YAAY;AAAA,IAChB,WAAW,MAAM,UAAU,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,MAAM,EAAE,KAAK,EAAE;AAAA,IAClE,QAAQ,MAAM,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,MAAM,EAAE,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;AAAA,IACxF,UAAU,MAAM;AAAA,IAChB,MAAM,MAAM;AAAA,IACZ,MAAM,MAAM;AAAA,EACd;AACA,SAAO,WAAW,QAAQ,EAAE,OAAO,KAAK,UAAU,SAAS,CAAC,EAAE,OAAO,KAAK;AAC5E;AAEA,SAAS,kBACP,OACA,QACA,MACoB;AACpB,QAAM,UAA0C,CAAC;AACjD,aAAW,SAAS,QAAQ
;AAC1B,UAAM,SAAmB,CAAC;AAC1B,eAAW,QAAQ,OAAO;AACxB,YAAM,IAAI,KAAK,YAAY,MAAM,IAAI;AACrC,UAAI,MAAM,OAAW,QAAO,KAAK,EAAE,SAAS;AAAA,IAC9C;AACA,YAAQ,MAAM,IAAI,IAAI,UAAU,QAAQ,IAAI;AAAA,EAC9C;AACA,QAAM,aAAgD,CAAC;AACvD,QAAM,iBAAiB,oBAAI,IAAsB;AACjD,aAAW,QAAQ,OAAO;AACxB,UAAM,aAAa,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzE,QAAI,WAAW,WAAW,EAAG;AAC7B,UAAM,OAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAChE,UAAM,MAAM,eAAe,IAAI,KAAK,UAAU,KAAK,CAAC;AACpD,QAAI,KAAK,IAAI;AACb,mBAAe,IAAI,KAAK,YAAY,GAAG;AAAA,EACzC;AACA,aAAW,CAAC,YAAY,OAAO,KAAK,gBAAgB;AAClD,UAAM,KAAK,UAAU,SAAS,IAAI;AAClC,eAAW,UAAU,IAAI,EAAE,eAAe,GAAG,MAAM,MAAM,GAAG,MAAM,GAAG,GAAG,EAAE;AAAA,EAC5E;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,SAAS,CAAC;AAAA,IACrD,eAAe,MAAM,OAAO,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE;AAAA,IAC7C,cAAc,MAAM,OAAO,CAAC,MAAM,EAAE,OAAO,WAAW,UAAU,CAAC,EAAE;AAAA,IACnE,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,MAAM,EAAE;AAAA,IAC3C,aAAa,MAAM,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,WAAW,UAAU,CAAC,EAAE;AAAA,EAC/E;AACF;AAKA,SAAS,UAAU,SAAmB,MAA8B;AAClE,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,EAAE;AAC5D,QAAM,OAAO,QAAQ,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,WAAW,QAAQ,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,SAAS,GAAG,CAAC,IAAI,KAAK,IAAI,GAAG,IAAI,CAAC;AACrF,QAAM,QAAQ,KAAK,KAAK,QAAQ;AAChC,QAAM,KAAK,mBAAmB,SAAS,MAAM,EAAE,MAAM,WAAW,IAAK,CAAC;AACtE,SAAO,EAAE,MAAM,OAAO,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,GAAG,EAAE;AACtD;","names":[]}
|
|
File without changes
|