npm - @tangle-network/agent-eval - Versions diffs - 0.14.2 → 0.16.2 - Mend

@tangle-network/agent-eval 0.14.2 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +45 -0
package/dist/chunk-PZ5AY32C.js +10 -0
package/dist/chunk-PZ5AY32C.js.map +1 -0
package/dist/cli.js +1 -0
package/dist/cli.js.map +1 -1
package/dist/index.d.ts +963 -4
package/dist/index.js +1457 -138
package/dist/index.js.map +1 -1
package/dist/telemetry/file.js +2 -0
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +2 -0
package/dist/telemetry/index.js.map +1 -1
package/dist/wire/index.js +1 -0
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -6,6 +6,9 @@ import {
   probeLlm,
   stripFencedJson
 } from "./chunk-ITN4YOZY.js";
+import {
+  __export
+} from "./chunk-PZ5AY32C.js";
 // src/client.ts
 var ProductClient = class {
@@ -265,12 +268,7 @@ ${codeText}`
 };
 var coherenceJudge = async (tc, { scenario, turns }) => {
   if (turns.length < 2) {
-    return [{
-      judgeName: "coherence",
-      dimension: "coherence",
-      score: 5,
-      reasoning: "Single-turn scenario \u2014 coherence not fully testable."
-    }];
+    return [];
   }
   const conversation = turns.map(
     (t, i) => `Turn ${i + 1}:
@@ -396,36 +394,36 @@ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
   "false_confidence",
   "worst_failure"
 ]);
-function normalizeScores(scores) {
-  return scores.map((s) => {
+function normalizeScores(scores2) {
+  return scores2.map((s) => {
     if (INVERTED_DIMENSIONS.has(s.dimension)) {
       return s;
     }
     return s;
   });
 }
-function weightedMean(scores) {
-  if (scores.length === 0) return 0;
+function weightedMean(scores2) {
+  if (scores2.length === 0) return 0;
   let totalWeight = 0;
   let weightedSum = 0;
-  for (const { score, weight } of scores) {
+  for (const { score, weight } of scores2) {
     const w = weight ?? 1;
     weightedSum += score * w;
     totalWeight += w;
   }
   return totalWeight > 0 ? weightedSum / totalWeight : 0;
 }
-function confidenceInterval(scores, confidence = 0.95) {
-  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
-  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
-  const n = scores.length;
-  const mean7 = scores.reduce((a, b) => a + b, 0) / n;
+function confidenceInterval(scores2, confidence = 0.95) {
+  if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
+  if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
+  const n = scores2.length;
+  const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
     let sum2 = 0;
     for (let j = 0; j < n; j++) {
-      sum2 += scores[Math.floor(Math.random() * n)];
+      sum2 += scores2[Math.floor(Math.random() * n)];
     }
     bootstrapMeans.push(sum2 / n);
   }
@@ -434,7 +432,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean7,
+    mean: mean9,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -522,11 +520,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
+  const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
-  const t = mean7 / se;
+  if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
+  const t = mean9 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -544,15 +542,15 @@ function wilcoxonSignedRank(before, after) {
   while (i < n) {
     let j = i;
     while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
-    const avg = (i + 1 + j) / 2;
-    for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
+    const avg2 = (i + 1 + j) / 2;
+    for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg2;
     i = j;
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean7 = n * (n + 1) / 4;
+  const mean9 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean7) / Math.sqrt(variance2);
+  const z = (wPlus - mean9) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -753,8 +751,8 @@ async function executeScenario(tc, scenario, config) {
           console.log(`    judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
           await new Promise((r) => setTimeout(r, wait));
         }
-        const scores = await judge(tc, judgeInput);
-        judgeResults.push(scores);
+        const scores2 = await judge(tc, judgeInput);
+        judgeResults.push(scores2);
         await new Promise((r) => setTimeout(r, 3e3));
         break;
       } catch (err) {
@@ -847,8 +845,8 @@ var BenchmarkRunner = class {
         byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
       }
       for (const [name, data] of Object.entries(byJudge)) {
-        const avg = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
-        console.log(`    ${name.padEnd(16)} avg=${avg}  [${data.dimensions.join(", ")}]`);
+        const avg2 = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
+        console.log(`    ${name.padEnd(16)} avg=${avg2}  [${data.dimensions.join(", ")}]`);
       }
       console.log(`  OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
       console.log();
@@ -2270,7 +2268,7 @@ var PromptOptimizer = class {
         });
       }
     }
-    const scores = config.variants.map((variant) => {
+    const scores2 = config.variants.map((variant) => {
       const scenarioMap = rawScores.get(variant.id);
       const allSamples = [];
       const perScenario = {};
@@ -2293,10 +2291,10 @@ var PromptOptimizer = class {
       };
     });
     const rawPairs = [];
-    for (let i = 0; i < scores.length; i++) {
-      for (let j = i + 1; j < scores.length; j++) {
-        const a = scores[i];
-        const b = scores[j];
+    for (let i = 0; i < scores2.length; i++) {
+      for (let j = i + 1; j < scores2.length; j++) {
+        const a = scores2[i];
+        const b = scores2[j];
         const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
         rawPairs.push({ a, b, p });
       }
@@ -2310,7 +2308,7 @@ var PromptOptimizer = class {
       significant: qValues[idx] < alpha,
       meanDelta: r.b.mean - r.a.mean
     }));
-    const sorted = scores.slice().sort((x, y) => y.mean - x.mean);
+    const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
     const winner = sorted[0];
     const second = sorted[1];
     const winnerComparisons = pairwise2.filter(
@@ -2324,7 +2322,7 @@ var PromptOptimizer = class {
         significant: significantOverAll,
         ciLowerBoundExceedsSecondMean
       },
-      scores,
+      scores: scores2,
       pairwise: pairwise2,
       config: {
         trialsPerScenario: trials,
@@ -2870,20 +2868,20 @@ async function mapLimit(items, limit, fn) {
 function mean(values) {
   return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
 }
-function meanRunScore(scores) {
+function meanRunScore(scores2) {
   return {
-    success: mean(scores.map((s) => s.success)),
-    goalProgress: mean(scores.map((s) => s.goalProgress)),
-    repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
-    driftPenalty: mean(scores.map((s) => s.driftPenalty)),
-    toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
-    patchQuality: mean(scores.map((s) => s.patchQuality)),
-    testReality: mean(scores.map((s) => s.testReality)),
-    finalGate: mean(scores.map((s) => s.finalGate)),
-    reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
-    costUsd: mean(scores.map((s) => s.costUsd)),
-    wallSeconds: mean(scores.map((s) => s.wallSeconds)),
-    notes: scores.flatMap((s) => s.notes ?? [])
+    success: mean(scores2.map((s) => s.success)),
+    goalProgress: mean(scores2.map((s) => s.goalProgress)),
+    repoGroundedness: mean(scores2.map((s) => s.repoGroundedness)),
+    driftPenalty: mean(scores2.map((s) => s.driftPenalty)),
+    toolUseQuality: mean(scores2.map((s) => s.toolUseQuality)),
+    patchQuality: mean(scores2.map((s) => s.patchQuality)),
+    testReality: mean(scores2.map((s) => s.testReality)),
+    finalGate: mean(scores2.map((s) => s.finalGate)),
+    reviewerBlockers: mean(scores2.map((s) => s.reviewerBlockers)),
+    costUsd: mean(scores2.map((s) => s.costUsd)),
+    wallSeconds: mean(scores2.map((s) => s.wallSeconds)),
+    notes: scores2.flatMap((s) => s.notes ?? [])
   };
 }
@@ -3339,12 +3337,12 @@ var SubprocessSandboxDriver = class {
     this.defaultEnv = options.env;
   }
   async exec(phase, command, config) {
-    const { spawn } = await import("child_process");
+    const { spawn: spawn2 } = await import("child_process");
     const start = Date.now();
     const effectiveCwd = config.cwd ?? this.defaultCwd;
     const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
     return await new Promise((resolve) => {
-      const child = spawn(command, {
+      const child = spawn2(command, {
         shell: true,
         cwd: effectiveCwd,
         env: effectiveEnv
@@ -5392,10 +5390,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
+  const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
+  const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -5416,7 +5414,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -6012,9 +6010,9 @@ function calibrateJudge(golden, candidate) {
   const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
   return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
 }
-function positionalBias(scores) {
+function positionalBias(scores2) {
   const pairs = /* @__PURE__ */ new Map();
-  for (const s of scores) {
+  for (const s of scores2) {
     const slot = pairs.get(s.itemId) ?? {};
     if (s.positionOfAInput === "first") slot.first = s.score;
     else if (s.positionOfAInput === "second") slot.second = s.score;
@@ -6165,12 +6163,12 @@ function renderMarkdownReport(reports) {
 async function aggregateRunMetrics(runs, store) {
   if (runs.length === 0) return {};
   const durations = [];
-  const scores = [];
+  const scores2 = [];
   const passes = [];
   const costs = [];
   for (const r of runs) {
     if (r.endedAt) durations.push(r.endedAt - r.startedAt);
-    if (r.outcome?.score !== void 0) scores.push(r.outcome.score);
+    if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
     passes.push(r.outcome?.pass === true ? 1 : 0);
     const llm = await llmSpans(store, r.runId);
     costs.push(aggregateLlm(llm).costUsd);
@@ -6179,7 +6177,7 @@ async function aggregateRunMetrics(runs, store) {
     provisionMs: average(durations),
     firstTokenMs: average(durations),
     wallMs: average(durations),
-    overallScore: average(scores),
+    overallScore: average(scores2),
     passRate: average(passes),
     costUsd: average(costs)
   };
@@ -6242,7 +6240,7 @@ async function toLangfuseEnvelope(store, runId) {
     },
     metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
   }));
-  const scores = judges.map((j) => ({
+  const scores2 = judges.map((j) => ({
     id: j.spanId,
     traceId: run.runId,
     observationId: j.targetSpanId,
@@ -6250,7 +6248,7 @@ async function toLangfuseEnvelope(store, runId) {
     value: j.score,
     comment: j.rationale
   }));
-  return { traceId: run.runId, generations, scores };
+  return { traceId: run.runId, generations, scores: scores2 };
 }
 async function toPrometheusText(store) {
   const runs = await store.listRuns();
@@ -6344,12 +6342,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
+  const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
+  const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -6684,8 +6682,8 @@ function ranks(xs) {
   for (let i = 0; i < indexed.length; i++) {
     let j = i;
     while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
-    const avg = (i + j + 2) / 2;
-    for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
+    const avg2 = (i + j + 2) / 2;
+    for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
     i = j;
   }
   return r;
@@ -6929,8 +6927,8 @@ function ranks2(xs) {
   for (let i = 0; i < indexed.length; i++) {
     let j = i;
     while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
-    const avg = (i + j + 2) / 2;
-    for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
+    const avg2 = (i + j + 2) / 2;
+    for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
     i = j;
   }
   return r;
@@ -7270,8 +7268,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
+  const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -7293,8 +7291,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
+  const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -7672,15 +7670,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
     const rejected = [];
     const surviving = [];
     for (const candidate of proposed) {
-      const scores = await scorer.scoreCandidate(candidate, targets);
-      if (scores.length < 2) {
+      const scores2 = await scorer.scoreCandidate(candidate, targets);
+      if (scores2.length < 2) {
         rejected.push({ candidate, reason: "scorer returned <2 results" });
         continue;
       }
-      const values = scores.map((s) => s.score);
+      const values = scores2.map((s) => s.score);
       const spread = Math.max(...values) - Math.min(...values);
       const maxScore = Math.max(...values);
-      scored.push({ candidate, scores, spread });
+      scored.push({ candidate, scores: scores2, spread });
       if (maxScore < floor) {
         rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
         continue;
@@ -7822,10 +7820,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
   }
   for (const s of scenarios) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
-    const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
-    if (scores.length < 3) continue;
-    const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
-    const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
+    const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
+    if (scores2.length < 3) continue;
+    const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
+    const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -8580,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
   let durationMs = 0;
   const reasonParts = [];
   const diagnostics = {};
-  for (const { adapter, result } of perAdapter) {
+  for (const { adapter: adapter4, result } of perAdapter) {
     status = worst(status, result.status);
     if (typeof result.score === "number") {
       weightedScoreSum += result.score;
       weightCount += 1;
     }
     durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
-    reasonParts.push(`${adapter}: ${result.status}`);
+    reasonParts.push(`${adapter4}: ${result.status}`);
     for (const f of result.findings) {
       findings.push({
         ...f,
         layer: name,
-        message: prefix ? `${prefix(adapter)} ${f.message}` : f.message,
-        detail: { ...f.detail ?? {}, adapter }
+        message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
+        detail: { ...f.detail ?? {}, adapter: adapter4 }
       });
     }
     for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -8612,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
     reason: reasonParts.join(" \xB7 "),
     diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
     detail: {
-      adapters: perAdapter.map(({ adapter, result }) => ({
-        adapter,
+      adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
+        adapter: adapter4,
         status: result.status,
         score: result.score ?? null
       })),
@@ -8639,10 +8637,10 @@ function multiToolchainLayer(config) {
           reason: "no adapters detected"
         };
       }
-      const runOne = async (adapter) => {
-        const adapterName = config.adapterName(adapter);
+      const runOne = async (adapter4) => {
+        const adapterName = config.adapterName(adapter4);
         try {
-          const r = await config.run(adapter, ctx);
+          const r = await config.run(adapter4, ctx);
           return { adapter: adapterName, result: r };
         } catch (err) {
           return {
@@ -9345,6 +9343,57 @@ function viteDeployRunner(input) {
     }
   };
 }
+function wranglerDeployRunner(input) { // Returns a runner whose run() checks for a wrangler config, runs the build command, then the dry-run command, and reports the first failure.
+  return {
+    run: async () => {
+      const start = Date.now(); // every result's durationMs is measured from here
+      const buildCmd = input.buildCommand ?? "npm run build";
+      const dryCmd = input.dryRunCommand ?? "npx wrangler deploy --dry-run --outdir dist";
+      const timeoutMs = input.timeoutMs ?? 12e4; // default 120000 ms, passed to each exec call separately
+      const hasToml = await input.exists("wrangler.toml");
+      const hasJsonc = hasToml ? false : await input.exists("wrangler.jsonc"); // only probed when wrangler.toml is absent
+      if (!hasToml && !hasJsonc) { // no config: fail before running any command
+        return {
+          ok: false,
+          output: "no wrangler config found (wrangler.toml / wrangler.jsonc absent)",
+          durationMs: Date.now() - start,
+          artifactDir: "dist",
+          artifactValid: false
+        };
+      }
+      const build = await input.exec(buildCmd, { cwd: input.workdir, timeoutMs });
+      if (build.exitCode !== 0) { // build failed: the dry-run command is never started
+        const tail2 = ((build.stderr || build.stdout) ?? "").slice(-1500); // prefer stderr, fall back to stdout; keep the last 1500 characters
+        return {
+          ok: false,
+          output: `build failed: ${tail2}`,
+          durationMs: Date.now() - start,
+          artifactDir: "dist",
+          artifactValid: false
+        };
+      }
+      const dry = await input.exec(dryCmd, { cwd: input.workdir, timeoutMs });
+      if (dry.exitCode !== 0) {
+        const tail2 = ((dry.stderr || dry.stdout) ?? "").slice(-1500); // same stderr-first tail as the build failure path
+        return {
+          ok: false,
+          output: `wrangler dry-run failed: ${tail2}`,
+          durationMs: Date.now() - start,
+          artifactDir: "dist",
+          artifactValid: false
+        };
+      }
+      const tail = ((dry.stdout || dry.stderr) ?? "").slice(-1500); // on success stdout is preferred over stderr
+      return {
+        ok: true,
+        output: tail,
+        durationMs: Date.now() - start,
+        artifactDir: "dist", // NOTE(review): fixed "dist" even when a custom dryRunCommand writes elsewhere — confirm
+        artifactValid: true // set from the exit codes alone; the output directory is not inspected here
+      };
+    }
+  };
+}
 // src/keyword-coverage-judge.ts
 function htmlContainsElement(html, selector) {
@@ -9712,15 +9761,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
   const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
   const matchStrategy = options.matchStrategy ?? "reference-order";
   const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
-  const scores = scenarios.filter((scenario) => {
+  const scores2 = scenarios.filter((scenario) => {
     const split = scenario.split ?? "train";
     if (split === "holdout" && !options.includeHoldout) return false;
     return allowedSplits.has(split);
   }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
   return {
-    scenarios: scores,
-    aggregate: aggregateScenarioScores(scores),
-    bySplit: aggregateBySplit(scores)
+    scenarios: scores2,
+    aggregate: aggregateScenarioScores(scores2),
+    bySplit: aggregateBySplit(scores2)
   };
 }
 function compareReferenceReplay(baseline, candidate) {
@@ -9935,20 +9984,20 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
     matches: matches2
   };
 }
-function aggregateBySplit(scores) {
+function aggregateBySplit(scores2) {
   const out = {};
   for (const split of ALL_SPLITS) {
-    const scoped = scores.filter((score) => score.split === split);
+    const scoped = scores2.filter((score) => score.split === split);
     if (scoped.length > 0) out[split] = aggregateScenarioScores(scoped);
   }
   return out;
 }
-function aggregateScenarioScores(scores) {
-  const matched = sum(scores.map((score) => score.matched));
-  const total = sum(scores.map((score) => score.total));
-  const falsePositives = sum(scores.map((score) => score.falsePositives));
-  const matchedWeight = sum(scores.map((score) => score.matchedWeight));
-  const totalWeight = sum(scores.map((score) => score.totalWeight));
+function aggregateScenarioScores(scores2) {
+  const matched = sum(scores2.map((score) => score.matched));
+  const total = sum(scores2.map((score) => score.total));
+  const falsePositives = sum(scores2.map((score) => score.falsePositives));
+  const matchedWeight = sum(scores2.map((score) => score.matchedWeight));
+  const totalWeight = sum(scores2.map((score) => score.totalWeight));
   const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
@@ -10027,8 +10076,8 @@ function formatPct(value) {
 function bySplitOrder(a, b) {
   return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
 }
-function runAdapter(adapter, scenario, context) {
-  return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
+function runAdapter(adapter4, scenario, context) {
+  return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
 }
 function throwIfAborted(signal) {
   if (!signal?.aborted) return;
@@ -10066,6 +10115,1258 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
   "which"
 ]);
+// src/paired-stats.ts
+function pairedBootstrap(before, after, opts = {}) { // Bootstrap interval for the paired deltas after[i] - before[i]; throws on unequal lengths or a confidence outside (0,1).
+  if (before.length !== after.length) {
+    throw new Error(
+      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
+    );
+  }
+  const confidence = opts.confidence ?? 0.95;
+  const resamples = opts.resamples ?? 2e3; // NOTE(review): not validated — 0 or a non-integer value would break the sample array and indices below; confirm callers
+  const statistic = opts.statistic ?? "median"; // any value other than "mean" takes the median path
+  if (confidence <= 0 || confidence >= 1) {
+    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
+  }
+  const n = before.length;
+  const deltas = before.map((b, i) => after[i] - b);
+  if (n === 0) { // empty input: all-zero result, no resampling
+    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
+  }
+  if (n === 1) { // single pair: the interval collapses onto the one delta
+    const d = deltas[0];
+    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
+  }
+  const rng = makeRng(opts.seed); // seeded generator when opts.seed is given, otherwise Math.random
+  const samples = new Array(resamples);
+  for (let b = 0; b < resamples; b++) { // one resample (n draws with replacement) per iteration
+    let acc = null; // scratch buffer, only allocated on the median path
+    if (statistic === "mean") {
+      let sum2 = 0;
+      for (let k = 0; k < n; k++) {
+        sum2 += deltas[Math.floor(rng() * n)];
+      }
+      samples[b] = sum2 / n;
+    } else {
+      acc = new Array(n);
+      for (let k = 0; k < n; k++) {
+        acc[k] = deltas[Math.floor(rng() * n)];
+      }
+      samples[b] = medianInPlace(acc);
+    }
+  }
+  samples.sort((a, b) => a - b); // ascending, so the interval bounds can be read by index
+  const alpha = 1 - confidence;
+  const lowIdx = Math.floor(alpha / 2 * resamples);
+  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1); // clamped to the last valid index
+  return {
+    n,
+    median: medianInPlace([...deltas]), // copy first: medianInPlace sorts its argument
+    mean: deltas.reduce((s, x) => s + x, 0) / n, // median and mean describe the observed deltas, not the resamples
+    low: samples[lowIdx],
+    high: samples[Math.max(highIdx, lowIdx)], // never read below the lower bound's index
+    confidence,
+    resamples
+  };
+}
+function pairedWilcoxon(before, after) { // Thin alias: forwards both arrays unchanged to wilcoxonSignedRank.
+  return wilcoxonSignedRank(before, after);
+}
+function bhAdjust(pValues, fdr = 0.05) { // Thin alias for benjaminiHochberg; fdr defaults to 0.05 when omitted.
+  return benjaminiHochberg(pValues, fdr);
+}
+function medianInPlace(xs) { // Median of xs, or 0 for an empty array; sorts xs in place, so the caller's array is mutated.
+  if (xs.length === 0) return 0;
+  xs.sort((a, b) => a - b); // numeric ascending sort
+  const mid = Math.floor(xs.length / 2);
+  return xs.length % 2 === 0 ? (xs[mid - 1] + xs[mid]) / 2 : xs[mid]; // even length: mean of the two middle values
+}
+function makeRng(seed) { // Returns a () => number generator: Math.random when seed is undefined, otherwise a deterministic generator derived from seed.
+  if (seed === void 0) return Math.random;
+  let s = seed | 0 || 2654435769; // coerce to int32; a seed that coerces to 0 falls back to this constant
+  return () => { // each call advances s; the steps below look like the mulberry32 PRNG
+    s = s + 1831565813 | 0;
+    let t = s;
+    t = Math.imul(t ^ t >>> 15, t | 1);
+    t ^= t + Math.imul(t ^ t >>> 7, t | 61);
+    return ((t ^ t >>> 14) >>> 0) / 4294967296; // unsigned 32-bit value divided by 2^32, so the result lies in [0, 1)
+  };
+}
+// src/run-record.ts
+var MANDATORY_TOP_LEVEL = [ // keys validateRunRecord requires to be present on every run record
+  "runId",
+  "experimentId",
+  "candidateId",
+  "seed",
+  "model",
+  "promptHash",
+  "configHash",
+  "commitSha",
+  "wallMs",
+  "costUsd",
+  "tokenUsage",
+  "outcome",
+  "splitTag"
+];
+var SPLIT_TAGS = ["search", "dev", "holdout"]; // the only values validateRunRecord accepts for splitTag
+var RunRecordValidationError = class extends Error { // Error type thrown by the run-record validators; carries the path of the offending field.
+  path; // field path such as "tokenUsage.input"; empty string when no path was given
+  constructor(message, path = "") {
+    super(path ? `${message} (at ${path})` : message); // a non-empty path is appended to the message text
+    this.name = "RunRecordValidationError";
+    this.path = path;
+  }
+};
+function validateRunRecord(input) { // Checks the shape of a run record; throws RunRecordValidationError on the first problem found, otherwise returns input.
+  if (input === null || typeof input !== "object") {
+    throw new RunRecordValidationError("expected object");
+  }
+  const obj = input;
+  for (const key of MANDATORY_TOP_LEVEL) {
+    if (!(key in obj)) { // presence check only ("in" also sees inherited keys); value types are checked below
+      throw new RunRecordValidationError(`missing mandatory field "${key}"`);
+    }
+  }
+  expectString(obj.runId, "runId");
+  expectString(obj.experimentId, "experimentId");
+  expectString(obj.candidateId, "candidateId");
+  expectFiniteNumber(obj.seed, "seed");
+  expectString(obj.model, "model");
+  expectString(obj.promptHash, "promptHash");
+  expectString(obj.configHash, "configHash");
+  expectString(obj.commitSha, "commitSha");
+  expectFiniteNumber(obj.wallMs, "wallMs");
+  if (obj.queueMs !== void 0) expectFiniteNumber(obj.queueMs, "queueMs"); // optional field, validated only when present
+  expectFiniteNumber(obj.costUsd, "costUsd");
+  if (!modelHasSnapshot(obj.model)) { // the model id must carry a snapshot marker
+    throw new RunRecordValidationError(
+      `model "${obj.model}" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,
+      "model"
+    );
+  }
+  const tu = obj.tokenUsage;
+  if (tu === null || typeof tu !== "object") {
+    throw new RunRecordValidationError("tokenUsage must be an object", "tokenUsage");
+  }
+  const tuRec = tu;
+  expectFiniteNumber(tuRec.input, "tokenUsage.input");
+  expectFiniteNumber(tuRec.output, "tokenUsage.output");
+  if (tuRec.cached !== void 0) expectFiniteNumber(tuRec.cached, "tokenUsage.cached"); // cached is optional
+  if (obj.judgeMetadata !== void 0) { // optional block, but when present all four fields are required
+    const jm = obj.judgeMetadata;
+    if (jm === null || typeof jm !== "object") {
+      throw new RunRecordValidationError("judgeMetadata must be an object", "judgeMetadata");
+    }
+    const jmRec = jm;
+    expectString(jmRec.model, "judgeMetadata.model");
+    expectString(jmRec.promptVersion, "judgeMetadata.promptVersion");
+    expectFiniteNumber(jmRec.confidence, "judgeMetadata.confidence");
+    if (typeof jmRec.fallback !== "boolean") {
+      throw new RunRecordValidationError("judgeMetadata.fallback must be boolean", "judgeMetadata.fallback");
+    }
+  }
+  const out = obj.outcome;
+  if (out === null || typeof out !== "object") {
+    throw new RunRecordValidationError("outcome must be an object", "outcome");
+  }
+  const outRec = out;
+  if (outRec.searchScore !== void 0) expectFiniteNumber(outRec.searchScore, "outcome.searchScore");
+  if (outRec.holdoutScore !== void 0) expectFiniteNumber(outRec.holdoutScore, "outcome.holdoutScore");
+  if (outRec.searchScore === void 0 && outRec.holdoutScore === void 0) { // at least one of the two scores must be defined
+    throw new RunRecordValidationError(
+      "outcome must define searchScore or holdoutScore (or both)",
+      "outcome"
+    );
+  }
+  const raw = outRec.raw; // raw is effectively required: undefined fails the object check below
+  if (raw === null || typeof raw !== "object") {
+    throw new RunRecordValidationError("outcome.raw must be an object", "outcome.raw");
+  }
+  for (const [k, v] of Object.entries(raw)) { // every entry of raw must be a finite number
+    expectFiniteNumber(v, `outcome.raw.${k}`);
+  }
+  if (obj.failureMode !== void 0) expectString(obj.failureMode, "failureMode"); // optional, but must be a non-empty string when present
+  if (typeof obj.splitTag !== "string" || !SPLIT_TAGS.includes(obj.splitTag)) {
+    throw new RunRecordValidationError(
+      `splitTag must be one of ${SPLIT_TAGS.join(", ")}, got ${String(obj.splitTag)}`,
+      "splitTag"
+    );
+  }
+  return input; // the same reference that was passed in, not a copy
+}
/**
 * Boolean form of `validateRunRecord`.
 *
 * @param {unknown} input - Value to check.
 * @returns {boolean} `true` when `validateRunRecord(input)` completes without
 *   throwing; `false` when it throws for any reason.
 */
function isRunRecord(input) {
  let valid = true;
  try {
    validateRunRecord(input);
  } catch {
    valid = false;
  }
  return valid;
}
/**
 * Non-throwing wrapper around `validateRunRecord`.
 *
 * @param {unknown} input - Value to validate.
 * @returns {{ok: true, value: unknown} | {ok: false, error: Error}} A success
 *   result carrying the validated value, or a failure result carrying the
 *   `RunRecordValidationError`.
 * @throws Re-throws anything that is not a `RunRecordValidationError`.
 */
function parseRunRecordSafe(input) {
  let value;
  try {
    value = validateRunRecord(input);
  } catch (err) {
    // Only validation failures are converted to a result; anything else is
    // unexpected and propagates unchanged.
    if (!(err instanceof RunRecordValidationError)) throw err;
    return { ok: false, error: err };
  }
  return { ok: true, value };
}
/**
 * Serialises `record` to JSON, parses it back, and validates the parsed copy.
 *
 * @param {unknown} record - Record to round-trip.
 * @returns {unknown} Whatever `validateRunRecord` returns for the parsed copy.
 */
function roundTripRunRecord(record) {
  const revived = JSON.parse(JSON.stringify(record));
  return validateRunRecord(revived);
}
/**
 * Asserts that `value` is a string with at least one character.
 *
 * @param {unknown} value - Value to check.
 * @param {string} path - Field path passed to the error as its second argument.
 * @throws {RunRecordValidationError} When `value` is not a non-empty string.
 */
function expectString(value, path) {
  const isNonEmptyString = typeof value === "string" && value.length > 0;
  if (!isNonEmptyString) {
    throw new RunRecordValidationError("expected non-empty string", path);
  }
}
/**
 * Asserts that `value` is a number and is neither NaN nor ±Infinity.
 *
 * @param {unknown} value - Value to check.
 * @param {string} path - Field path passed to the error as its second argument.
 * @throws {RunRecordValidationError} When `value` is not a finite number.
 */
function expectFiniteNumber(value, path) {
  if (typeof value === "number" && Number.isFinite(value)) return;
  throw new RunRecordValidationError("expected finite number", path);
}
/**
 * Reports whether a model identifier carries one of the recognised snapshot
 * markers: an `@` anywhere, a trailing `-YYYYMMDD` (eight digits), a trailing
 * `-YYYY-MM-DD`, or the substring `:date-`.
 *
 * @param {string} model - Model identifier.
 * @returns {boolean} `true` when any marker is present.
 */
function modelHasSnapshot(model) {
  const snapshotMarkers = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return model.includes("@") || snapshotMarkers.some((marker) => marker.test(model));
}
+// src/held-out-gate.ts
/**
 * Promotion gate that compares a candidate's runs against baseline runs on
 * paired holdout scores. A candidate is promoted only when there are enough
 * pairs, the lower bound of the paired bootstrap interval clears the delta
 * threshold, and its search-vs-holdout gap does not exceed the baseline's gap
 * by more than the configured margin.
 */
var HeldOutGate = class {
  minProductiveRuns;
  pairedDeltaThreshold;
  overfitGapThreshold;
  baselineKey;
  confidence;
  resamples;
  seed;
  /**
   * @param {object} config
   * @param {string} config.baselineKey - Required; reported as `baselineId`.
   * @param {number} [config.minProductiveRuns=3]
   * @param {number} [config.pairedDeltaThreshold=0]
   * @param {number} [config.overfitGapThreshold=0.15]
   * @param {number} [config.confidence=0.95]
   * @param {number} [config.bootstrapResamples=2000]
   * @param {*} [config.seed] - Forwarded to `pairedBootstrap`.
   * @throws {Error} When `baselineKey` is missing or falsy.
   */
  constructor(config) {
    const baselineKey = config.baselineKey;
    if (!baselineKey) {
      throw new Error("HeldOutGate: baselineKey is required");
    }
    this.minProductiveRuns = config.minProductiveRuns ?? 3;
    this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
    this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
    this.baselineKey = baselineKey;
    this.confidence = config.confidence ?? 0.95;
    this.resamples = config.bootstrapResamples ?? 2e3;
    this.seed = config.seed;
  }
  /**
   * Decides whether `candidate` should replace `baseline`.
   *
   * Candidate holdout runs are matched to baseline holdout runs through
   * `pairKey`; a candidate run without a holdout score or without a baseline
   * counterpart contributes no pair.
   *
   * @param {Array<object>} candidate - Candidate runs.
   * @param {Iterable<object>} baseline - Baseline runs.
   * @returns {object} Verdict with `promote`, `candidateId`, `baselineId`,
   *   `evidence`, `reason` and `rejectionCode` (`"few_runs"`,
   *   `"negative_delta"`, `"overfit_gap"`, or `null` on promotion).
   */
  evaluate(candidate, baseline) {
    const candidateId = inferCandidateId(candidate, this.baselineKey);
    const baselineId = this.baselineKey;

    // Build the paired before/after holdout samples.
    const baselineScoreByKey = indexHoldoutByKey(baseline);
    const beforeHoldout = [];
    const afterHoldout = [];
    for (const run of candidate) {
      if (run.splitTag !== "holdout") continue;
      const after = run.outcome.holdoutScore;
      if (after === undefined) continue;
      const before = baselineScoreByKey.get(pairKey(run));
      if (before === undefined) continue;
      beforeHoldout.push(before);
      afterHoldout.push(after);
    }
    const productiveRuns = beforeHoldout.length;

    // Unpaired per-split means feed the overfit-gap comparison.
    const splitMean = (runs, field, split) => mean5(scores(runs, field, split));
    const candidateSearchMean = splitMean(candidate, "searchScore", "search");
    const candidateHoldoutMean = splitMean(candidate, "holdoutScore", "holdout");
    const baselineSearchMean = splitMean(baseline, "searchScore", "search");
    const baselineHoldoutMean = splitMean(baseline, "holdoutScore", "holdout");
    const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
    const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);

    const gapEvidence = {
      searchScore: candidateSearchMean,
      holdoutScore: candidateHoldoutMean,
      overfitGap,
      baselineOverfitGap
    };
    const verdict = (promote, evidence, reason, rejectionCode) => ({
      promote,
      candidateId,
      baselineId,
      evidence,
      reason,
      rejectionCode
    });

    if (productiveRuns < this.minProductiveRuns) {
      // Too few pairs for the bootstrap: report placeholder interval/p-value.
      const sparseEvidence = {
        productiveRuns,
        medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
        pairedCI: { low: 0, high: 0 },
        pairedPValue: 1,
        ...gapEvidence
      };
      return verdict(
        false,
        sparseEvidence,
        `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
        "few_runs"
      );
    }

    const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
      confidence: this.confidence,
      resamples: this.resamples,
      statistic: "median",
      seed: this.seed
    });
    const wilcoxon = pairedWilcoxon(beforeHoldout, afterHoldout);
    const evidence = {
      productiveRuns,
      medianPairedDelta: ci.median,
      pairedCI: { low: ci.low, high: ci.high },
      pairedPValue: wilcoxon.p,
      ...gapEvidence
    };

    // Written as a negated `>` so a NaN lower bound also rejects.
    const clearsThreshold = ci.low > this.pairedDeltaThreshold;
    if (!clearsThreshold) {
      return verdict(
        false,
        evidence,
        `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
        "negative_delta"
      );
    }

    // The gap check only applies when both gaps are computable.
    const gapsComparable = Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap);
    if (gapsComparable && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
      return verdict(
        false,
        evidence,
        `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
        "overfit_gap"
      );
    }

    return verdict(
      true,
      evidence,
      `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
      null
    );
  }
};
/**
 * Picks the identifier to report for a candidate run set: the first truthy
 * `candidateId` that differs from `baselineKey`, otherwise the first run's
 * `candidateId`, otherwise the literal "(unknown candidate)".
 *
 * @param {Array<object>} candidate - Candidate runs.
 * @param {string} baselineKey - Identifier of the baseline.
 * @returns {string} The inferred candidate identifier.
 */
function inferCandidateId(candidate, baselineKey) {
  for (const { candidateId } of candidate) {
    const isDistinct = Boolean(candidateId) && candidateId !== baselineKey;
    if (isDistinct) return candidateId;
  }
  const fallback = candidate[0]?.candidateId;
  return fallback ?? "(unknown candidate)";
}
/**
 * Maps each holdout run's pairing key to its holdout score. Runs from other
 * splits, and holdout runs whose score is `undefined`, are ignored; when two
 * runs share a key the later one wins.
 *
 * @param {Iterable<object>} runs - Runs to index.
 * @returns {Map<string, number>} Pairing key to holdout score.
 */
function indexHoldoutByKey(runs) {
  const scoreByKey = new Map();
  for (const run of runs) {
    if (run.splitTag !== "holdout") continue;
    const score = run.outcome.holdoutScore;
    if (score !== undefined) scoreByKey.set(pairKey(run), score);
  }
  return scoreByKey;
}
/**
 * Builds the key used to pair runs: `experimentId` and `seed` joined by `::`.
 *
 * @param {{experimentId: *, seed: *}} r - Run to key.
 * @returns {string} The pairing key.
 */
function pairKey(r) {
  const { experimentId, seed } = r;
  return `${experimentId}::${seed}`;
}
/**
 * Collects the finite numeric values of `outcome[field]` from the runs whose
 * `splitTag` equals `splitFilter`, in input order.
 *
 * @param {Iterable<object>} runs - Runs to scan.
 * @param {string} field - Outcome property to read.
 * @param {string} splitFilter - Split tag a run must carry to be included.
 * @returns {number[]} The collected scores.
 */
function scores(runs, field, splitFilter) {
  const collected = [];
  for (const run of runs) {
    if (run.splitTag === splitFilter) {
      const value = run.outcome[field];
      const usable = typeof value === "number" && Number.isFinite(value);
      if (usable) collected.push(value);
    }
  }
  return collected;
}
/**
 * Arithmetic mean of `xs`.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or `NaN` for an empty array.
 */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  const total = xs.reduce((acc, value) => acc + value, 0);
  return total / count;
}
/**
 * Subtraction that yields `NaN` unless both operands are finite numbers.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number} `a - b`, or `NaN` when either operand is not finite.
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
/**
 * Median of the element-wise differences `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline values.
 * @param {number[]} after - Candidate values, index-aligned with `before`.
 * @returns {number} The median difference, or 0 when `before` is empty.
 */
function medianDelta(before, after) {
  const deltas = before.map((base, idx) => after[idx] - base);
  deltas.sort((x, y) => x - y);
  const count = deltas.length;
  if (count === 0) return 0;
  const mid = Math.floor(count / 2);
  if (count % 2 !== 0) return deltas[mid];
  return (deltas[mid - 1] + deltas[mid]) / 2;
}
/**
 * Formats a number with four decimal places; non-finite values are rendered
 * through `String` (e.g. "NaN", "Infinity").
 *
 * @param {number} x - Value to format.
 * @returns {string} The formatted value.
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
+// src/researcher.ts
/**
 * Placeholder researcher: every operation rejects with an error that names
 * the missing method, prefixed by a configurable hint.
 */
var NoopResearcher = class {
  hint;
  /**
   * @param {string} [hint] - Prefix used in every error message.
   */
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }
  /** Always rejects; `_runs` is ignored. */
  async inspectFailures(_runs) {
    const reason = `${this.hint} (inspectFailures not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects; `_failures` is ignored. */
  async proposeChange(_failures) {
    const reason = `${this.hint} (proposeChange not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects; both arguments are ignored. */
  async applyChange(_changes, _baseline) {
    const reason = `${this.hint} (applyChange not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects; `_plan` is ignored. */
  async evaluateChange(_plan) {
    const reason = `${this.hint} (evaluateChange not implemented)`;
    throw new Error(reason);
  }
};
+// src/summary-report.ts
/**
 * Builds a per-candidate summary of scores on one split.
 *
 * Each row carries the sample size, mean and confidence interval of the
 * candidate's scores. When a comparator is given, every other candidate also
 * gets Cohen's d against the comparator and, if at least six paired
 * observations exist, a Wilcoxon signed-rank p-value that is then adjusted
 * with Benjamini-Hochberg across candidates.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {object} [opts]
 * @param {string} [opts.split="holdout"] - Split to summarise; "holdout" reads
 *   `holdoutScore`, anything else reads `searchScore`.
 * @param {number} [opts.confidence=0.95] - Confidence level for the interval;
 *   also shown in the markdown column header.
 * @param {number} [opts.fdr=0.05] - Rate passed to `benjaminiHochberg`.
 * @param {string|null} [opts.comparator=null] - Candidate id to compare against.
 * @returns {{rows: object[], comparator: (string|null), split: string, markdown: string}}
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  // Group usable runs (right split, finite score) by candidate.
  const byCandidate = new Map();
  for (const r of runs) {
    if (r.splitTag !== split) continue;
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
    bucket.runs.push(r);
    bucket.scores.push(v);
    byCandidate.set(r.candidateId, bucket);
  }
  const candidateIds = [...byCandidate.keys()].sort();
  const compRuns = comparator ? byCandidate.get(comparator) : void 0;
  const tentative = [];
  for (const id of candidateIds) {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      // The signed-rank test is only attempted with at least six pairs.
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    tentative.push({
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      qValue: rawP,
      cohensD: d,
      rawP
    });
  }
  if (comparator) {
    // Adjust only the rows that actually produced a p-value.
    const idxs = [];
    const ps = [];
    for (let i = 0; i < tentative.length; i++) {
      const r = tentative[i];
      if (r.candidateId === comparator) continue;
      if (!Number.isFinite(r.rawP)) continue;
      idxs.push(i);
      ps.push(r.rawP);
    }
    if (ps.length > 0) {
      const { qValues } = benjaminiHochberg(ps, fdr);
      for (let k = 0; k < idxs.length; k++) {
        tentative[idxs[k]].qValue = qValues[k];
      }
    }
  }
  // `rawP` is internal bookkeeping and is dropped from the public rows.
  const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
  const markdown = renderSummaryTableMarkdown(rows, comparator, split, confidence);
  return { rows, comparator, split, markdown };
}
/**
 * Pairs candidate and baseline scores that share `experimentId::seed`.
 *
 * @param {Iterable<object>} candidate - Candidate runs.
 * @param {Iterable<object>} baseline - Baseline runs; a later run with the same
 *   key overwrites an earlier one.
 * @param {string} scoreField - Outcome property holding the score.
 * @returns {{before: number[], after: number[]}} Index-aligned baseline
 *   (`before`) and candidate (`after`) scores, in candidate order.
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const baseIdx = new Map();
  for (const r of baseline) {
    const v = r.outcome[scoreField];
    if (typeof v === "number" && Number.isFinite(v)) {
      baseIdx.set(`${r.experimentId}::${r.seed}`, v);
    }
  }
  const before = [];
  const after = [];
  for (const r of candidate) {
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const key = `${r.experimentId}::${r.seed}`;
    const b = baseIdx.get(key);
    if (b === void 0) continue;
    before.push(b);
    after.push(v);
  }
  return { before, after };
}
/**
 * Renders summary rows as a markdown table.
 *
 * @param {object[]} rows - Rows as produced by `summaryTable`.
 * @param {string|null} comparator - Comparator id shown in the title, if any.
 * @param {string} split - Split name shown in the title.
 * @param {number} [confidence=0.95] - Confidence level used for the interval;
 *   labels the CI column so the header matches the numbers in it.
 * @returns {string} The table, lines joined with "\n".
 */
function renderSummaryTableMarkdown(rows, comparator, split, confidence = 0.95) {
  const lines = [];
  const cmpLabel = comparator ? ` (vs ${comparator})` : "";
  // Round through toFixed so products such as 0.57 * 100 do not print as
  // 56.99999999999999; Number() then trims trailing zeros.
  const ciPercent = Number((confidence * 100).toFixed(2));
  lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
  lines.push("");
  lines.push(`| Candidate | N | Mean | ${ciPercent}% CI | q (BH) | Cohen's d |`);
  lines.push("|---|---:|---:|---|---:|---:|");
  for (const r of rows) {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
/**
 * Produces cost/quality points per candidate for one split and flags the
 * points that no other point dominates.
 *
 * @param {Iterable<object>} runs - Run records; only runs on the chosen split
 *   with a finite score contribute.
 * @param {object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads `holdoutScore`,
 *   anything else reads `searchScore`.
 * @param {Object<string, object>} [opts.gateDecisions] - Optional gate verdicts
 *   keyed by candidate id; when present for a candidate its label is attached.
 * @returns {{kind: string, split: string, axes: {x: string, y: string}, points: object[]}}
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // Accumulate per-candidate cost and quality samples.
  const samplesById = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (typeof score !== "number" || !Number.isFinite(score)) continue;
    const samples = samplesById.get(run.candidateId) ?? { cost: [], quality: [] };
    samples.cost.push(run.costUsd);
    samples.quality.push(score);
    samplesById.set(run.candidateId, samples);
  }

  const points = Array.from(samplesById, ([candidateId, samples]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(samples.cost),
      quality: avg(samples.quality),
      n: samples.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(decision) : undefined
    };
  });

  // A point is on the frontier when every other point fails to dominate it.
  for (const point of points) {
    point.onFrontier = points.every((other) => other === point || !dominates2(other, point));
  }

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
/**
 * Pareto dominance on (cost, quality): `a` dominates `b` when it is no more
 * expensive and no lower in quality, and strictly better on at least one axis.
 *
 * @param {{cost: number, quality: number}} a
 * @param {{cost: number, quality: number}} b
 * @returns {boolean} Whether `a` dominates `b`.
 */
function dominates2(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  if (!noWorse) return false;
  return a.cost < b.cost || a.quality > b.quality;
}
/**
 * Converts a gate verdict into a chart label.
 *
 * @param {{promote: *, rejectionCode: *}} d - Gate verdict.
 * @returns {string|null} "promote" for a promoting verdict, a `reject_*` label
 *   for the three known rejection codes, otherwise `null`.
 */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
/**
 * Histogram of paired score gains (candidate minus comparator) on one split,
 * with the sample median and a bootstrap interval for the median.
 *
 * Bins are laid out symmetrically around zero: the range is ±max(|min|, |max|)
 * of the observed gains, with a floor of 1e-6 so the width is never zero.
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} candidateId - Candidate whose gains are plotted.
 * @param {string} comparator - Candidate id used as the reference.
 * @param {object} [opts]
 * @param {string} [opts.split="holdout"]
 * @param {number} [opts.bins=11] - Number of bins; must be a finite number ≥ 1.
 * @param {number} [opts.confidence=0.95]
 * @param {number} [opts.resamples=2000]
 * @param {*} [opts.seed] - Forwarded to `pairedBootstrap`.
 * @returns {object} `{kind, candidateId, comparator, split, n, bins, median, ci}`;
 *   with no pairs, `n` is 0, `bins` is empty and the statistics are 0.
 * @throws {Error} When `bins` is NaN, infinite, or below 1.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  // NaN would slip past a plain `< 1` test and later index `bins[NaN]`;
  // Infinity would make the bin-building loop run forever.
  if (!Number.isFinite(binCount) || binCount < 1) {
    throw new Error("gainHistogram: bins must be \u2265 1");
  }
  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }
  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;
  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    // Clamp so the maximum gain lands in the last bin instead of one past it.
    let idx = Math.floor((d - lo) / width);
    if (idx < 0) idx = 0;
    if (idx >= binCount) idx = binCount - 1;
    bins[idx].count += 1;
  }
  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
/**
 * Arithmetic mean of `xs`.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or `NaN` for an empty array.
 */
function avg(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  const total = xs.reduce((acc, value) => acc + value, 0);
  return total / count;
}
/**
 * Median of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Sorted values.
 * @returns {number} The middle value, the mean of the two middle values for an
 *   even count, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
/**
 * Formats a number with four decimal places; non-finite values are rendered
 * through `String`.
 *
 * @param {number} x - Value to format.
 * @returns {string} The formatted value.
 */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
+// src/canary.ts
/**
 * Runs the canary detectors over `runs` and tallies their alerts by kind.
 *
 * The silent-fallback and calibration-drift detectors always run (with `{}`
 * when no options are supplied); the distribution-shift detector runs only
 * when `opts.distributionShift` is provided.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts]
 * @param {object} [opts.silentFallback]
 * @param {object} [opts.calibrationDrift]
 * @param {object} [opts.distributionShift]
 * @returns {{alerts: object[], counts: Object<string, number>}}
 */
function runCanaries(runs, opts = {}) {
  const alerts = [];
  alerts.push(...detectSilentFallback(runs, opts.silentFallback ?? {}));
  alerts.push(...detectCalibrationDrift(runs, opts.calibrationDrift ?? {}));
  if (opts.distributionShift) {
    alerts.push(...detectDistributionShift(runs, opts.distributionShift));
  }

  const counts = {
    silent_judge_fallback: 0,
    judge_calibration_drift: 0,
    distribution_shift: 0
  };
  alerts.forEach(({ kind }) => {
    counts[kind]++;
  });
  return { alerts, counts };
}
/**
 * Flags streaks of consecutive runs whose judge looks like it fell back to a
 * constant: `judgeMetadata.fallback === true`, or a confidence within
 * `epsilon` of the fallback constant.
 *
 * A run without `judgeMetadata`, or one that is not a fallback, ends the
 * current streak. Once a streak reaches the threshold, an alert is emitted for
 * that run and for every further run that extends the streak, so a streak of
 * length L yields max(0, L - threshold + 1) alerts.
 *
 * @param {Array<object>} runs - Run records, in chronological order.
 * @param {object} opts
 * @param {number} [opts.constant=0.3] - Confidence value treated as fallback.
 * @param {number} [opts.consecutiveThreshold=3] - Streak length that alerts.
 * @param {number} [opts.epsilon=1e-9] - Tolerance around `constant`.
 * @returns {object[]} Alerts of kind "silent_judge_fallback".
 */
function detectSilentFallback(runs, opts) {
  const constant = opts.constant ?? 0.3;
  const threshold = opts.consecutiveThreshold ?? 3;
  const eps = opts.epsilon ?? 1e-9;
  const alerts = [];
  let streak = 0;
  let streakStartRunId = null;
  let streakValues = [];
  const resetStreak = () => {
    streak = 0;
    streakStartRunId = null;
    streakValues = [];
  };
  for (let i = 0; i < runs.length; i++) {
    const run = runs[i];
    const meta = run.judgeMetadata;
    const isFallback = Boolean(meta) && (meta.fallback === true || Math.abs(meta.confidence - constant) <= eps);
    if (!isFallback) {
      resetStreak();
      continue;
    }
    streak += 1;
    if (streak === 1) streakStartRunId = run.runId;
    streakValues.push(meta.confidence);
    if (streak >= threshold) {
      alerts.push({
        kind: "silent_judge_fallback",
        severity: "error",
        message: `silent judge fallback: ${streak} consecutive run(s) at confidence\u2248${constant} or fallback=true`,
        evidence: {
          streakLength: streak,
          firstRunId: streakStartRunId,
          lastRunId: run.runId,
          // Only the ten most recent confidences are kept as evidence.
          confidences: streakValues.slice(-10),
          fallbackConstant: constant
        }
      });
    }
  }
  return alerts;
}
/**
 * Compares the most recent judge confidences against the preceding history
 * with a two-sample Kolmogorov-Smirnov statistic and warns when it exceeds the
 * large-sample critical value for the chosen alpha.
 *
 * Only runs with a finite `judgeMetadata.confidence` take part. The critical
 * coefficient is picked from fixed alpha tiers: 1.63 (alpha ≤ 0.01), 1.36
 * (≤ 0.05), 1.22 (≤ 0.1), otherwise 1.
 *
 * @param {Iterable<object>} runs - Run records, in chronological order.
 * @param {object} opts
 * @param {number} [opts.historyWindow=50] - Max historical samples used.
 * @param {number} [opts.recentWindow=20] - Max recent samples used.
 * @param {number} [opts.ksAlpha=0.05] - Significance level.
 * @param {number} [opts.minRecent=10] - Minimum size required of both samples.
 * @returns {object[]} Zero or one alert of kind "judge_calibration_drift".
 */
function detectCalibrationDrift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.ksAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;

  const confidences = [];
  for (const { judgeMetadata } of runs) {
    if (judgeMetadata && Number.isFinite(judgeMetadata.confidence)) {
      confidences.push(judgeMetadata.confidence);
    }
  }
  if (confidences.length < minRecent + 1) return [];

  // Recent = tail of the series; history = what precedes it, capped in size.
  const recent = confidences.slice(-Math.min(recentWindow, confidences.length));
  const historical = confidences.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  let coefficient = 1;
  if (alpha <= 0.01) coefficient = 1.63;
  else if (alpha <= 0.05) coefficient = 1.36;
  else if (alpha <= 0.1) coefficient = 1.22;

  const ks = ksTwoSample(recent, historical);
  const critical = coefficient * Math.sqrt((recent.length + historical.length) / (recent.length * historical.length));
  if (!(ks.d > critical)) return [];

  const alert = {
    kind: "judge_calibration_drift",
    severity: "warn",
    message: `judge calibration drift: KS D=${ks.d.toFixed(4)} exceeds critical=${critical.toFixed(4)} at alpha=${alpha} (recent n=${recent.length}, history n=${historical.length})`,
    evidence: {
      ksD: ks.d,
      critical,
      alpha,
      recentN: recent.length,
      historyN: historical.length,
      recentMean: mean6(recent),
      historyMean: mean6(historical)
    }
  };
  return [alert];
}
/**
 * Two-sample Kolmogorov-Smirnov statistic: the largest absolute difference
 * between the empirical CDFs of `a` and `b`.
 *
 * Both cursors are advanced past every value equal to the current one before
 * the CDFs are compared, so tied values (within or across samples) are
 * evaluated at a single point. Values that are not numbers, or are NaN, are
 * dropped first because they cannot be ordered and would stall the merge.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {{d: number}} The KS statistic; 0 when either sample is empty.
 */
function ksTwoSample(a, b) {
  const isOrderable = (v) => typeof v === "number" && !Number.isNaN(v);
  const ascending = (x, y) => x - y;
  const sortedA = [...a].filter(isOrderable).sort(ascending);
  const sortedB = [...b].filter(isOrderable).sort(ascending);
  const n1 = sortedA.length;
  const n2 = sortedB.length;
  let i = 0;
  let j = 0;
  let d = 0;
  while (i < n1 && j < n2) {
    // Step to the next distinct value and consume all of its occurrences in
    // both samples; at least one cursor always moves.
    const x = Math.min(sortedA[i], sortedB[j]);
    while (i < n1 && sortedA[i] <= x) i++;
    while (j < n2 && sortedB[j] <= x) j++;
    const diff = Math.abs(i / n1 - j / n2);
    if (diff > d) d = diff;
  }
  // Once one sample is exhausted its CDF is 1 and the other only rises toward
  // 1, so the gap cannot grow beyond what was last recorded.
  return { d };
}
/**
 * Chi-square check of whether the category mix of recent runs differs from
 * the mix in the preceding history.
 *
 * Each run is labelled by `opts.category`; runs whose label is not a non-empty
 * string are ignored. Expected recent counts are derived from the historical
 * proportions, and categories with an expected count below 1 are left out of
 * the statistic. Degrees of freedom are the number of contributing categories
 * minus one, floored at 1.
 *
 * @param {Iterable<object>} runs - Run records, in chronological order.
 * @param {object} opts
 * @param {function(object): *} opts.category - Labelling function.
 * @param {number} [opts.historyWindow=50]
 * @param {number} [opts.recentWindow=20]
 * @param {number} [opts.chiSquareAlpha=0.05]
 * @param {number} [opts.minRecent=10] - Minimum size required of both samples.
 * @returns {object[]} Zero or one alert of kind "distribution_shift".
 */
function detectDistributionShift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.chiSquareAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;
  const categorize = opts.category;

  const labelled = [];
  for (const run of runs) {
    const bucket = categorize(run);
    if (typeof bucket === "string" && bucket.length > 0) labelled.push({ run, bucket });
  }
  if (labelled.length < minRecent + 1) return [];

  const recent = labelled.slice(-Math.min(recentWindow, labelled.length));
  const historical = labelled.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  // Every category seen in either window, in sorted order.
  const bucketList = [...new Set([...recent, ...historical].map((entry) => entry.bucket))].sort();
  const tally = (sample) => {
    const counts = {};
    for (const name of bucketList) counts[name] = 0;
    for (const entry of sample) counts[entry.bucket] += 1;
    return counts;
  };
  const recentCounts = tally(recent);
  const histCounts = tally(historical);

  let chi = 0;
  let contributing = 0;
  for (const name of bucketList) {
    const expected = histCounts[name] / historical.length * recent.length;
    if (expected < 1) continue;
    const observed = recentCounts[name];
    chi += (observed - expected) ** 2 / expected;
    contributing += 1;
  }
  const df = Math.max(1, contributing - 1);
  const critical = chiSquareCritical(df, alpha);
  if (!(chi > critical)) return [];

  const alert = {
    kind: "distribution_shift",
    severity: "warn",
    message: `eval-set distribution shift: \u03C7\xB2=${chi.toFixed(2)} df=${df} exceeds critical=${critical.toFixed(2)} at alpha=${alpha}`,
    evidence: {
      chi,
      df,
      critical,
      alpha,
      recentCounts,
      historicalCounts: histCounts,
      recentN: recent.length,
      historyN: historical.length
    }
  };
  return [alert];
}
/**
 * Upper-tail chi-square critical value for `df` degrees of freedom.
 *
 * `alpha` is mapped to one of four tabulated columns (≥0.1, ≥0.05, ≥0.025,
 * otherwise the last column). Tabulated `df` values are returned directly,
 * `df` above 30 uses a cube-of-normal approximation, and a `df` lying between
 * two tabulated rows is linearly interpolated. A `df` outside every tabulated
 * span falls back to the df=10 row.
 *
 * @param {number} df - Degrees of freedom.
 * @param {number} alpha - Significance level.
 * @returns {number} The critical value.
 */
function chiSquareCritical(df, alpha) {
  // Columns: alpha = 0.10, 0.05, 0.025, 0.01.
  const TABLE = {
    1: [2.71, 3.84, 5.02, 6.63],
    2: [4.61, 5.99, 7.38, 9.21],
    3: [6.25, 7.81, 9.35, 11.34],
    4: [7.78, 9.49, 11.14, 13.28],
    5: [9.24, 11.07, 12.83, 15.09],
    6: [10.64, 12.59, 14.45, 16.81],
    7: [12.02, 14.07, 16.01, 18.48],
    8: [13.36, 15.51, 17.53, 20.09],
    9: [14.68, 16.92, 19.02, 21.67],
    10: [15.99, 18.31, 20.48, 23.21],
    15: [22.31, 25, 27.49, 30.58],
    20: [28.41, 31.41, 34.17, 37.57],
    25: [34.38, 37.65, 40.65, 44.31],
    30: [40.26, 43.77, 46.98, 50.89]
  };

  let idx = 3;
  if (alpha >= 0.1) idx = 0;
  else if (alpha >= 0.05) idx = 1;
  else if (alpha >= 0.025) idx = 2;

  const tabulated = TABLE[df];
  if (tabulated) return tabulated[idx];

  if (df > 30) {
    // Normal quantiles matching the four alpha columns.
    const z = [1.282, 1.645, 1.96, 2.326][idx];
    const term = 1 - 2 / (9 * df) + z * Math.sqrt(2 / (9 * df));
    return df * term ** 3;
  }

  const knots = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
  const upperAt = knots.findIndex((hi, i) => i > 0 && df >= knots[i - 1] && df <= hi);
  if (upperAt > 0) {
    const lo = knots[upperAt - 1];
    const hi = knots[upperAt];
    const t = (df - lo) / (hi - lo);
    return TABLE[lo][idx] * (1 - t) + TABLE[hi][idx] * t;
  }
  return TABLE[10][idx];
}
/**
 * Arithmetic mean of `xs`.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const total = xs.reduce((acc, value) => acc + value, 0);
  return total / count;
}
+// src/benchmarks/types.ts
/**
 * 32-bit FNV-1a hash of a string.
 *
 * Only the low 8 bits of each UTF-16 code unit are mixed in, so characters
 * that share a low byte hash identically.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash.
 */
function fnv1a32(input) {
  const FNV_PRIME = 16777619;
  let hash = 2166136261;
  for (let pos = 0; pos < input.length; pos += 1) {
    hash ^= input.charCodeAt(pos) & 255;
    // 32-bit wrapping multiply by the FNV prime, reinterpreted as unsigned.
    hash = Math.imul(hash, FNV_PRIME) >>> 0;
  }
  return hash >>> 0;
}
/** Default seed mixed into every split assignment. */
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
/**
 * Assigns an item to a split from a hash of `seed::itemId`, so the same
 * inputs always land in the same split. The hash is scaled into [0, 1):
 * below 0.6 is "search", below 0.8 is "dev", the rest is "holdout".
 *
 * @param {string} itemId - Item identifier.
 * @param {string} [seed=BENCHMARK_SPLIT_SEED] - Seed prefix.
 * @returns {"search"|"dev"|"holdout"} The assigned split.
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  const position = fnv1a32(`${seed}::${itemId}`) / 2 ** 32;
  if (position < 0.6) return "search";
  return position < 0.8 ? "dev" : "holdout";
}
+// src/benchmarks/index.ts
// Namespace object for the benchmarks module. `__export` (imported from the
// shared chunk at the top of the file) receives one zero-argument accessor per
// public name: the split seed, the split function, and the three per-benchmark
// namespace objects.
var benchmarks_exports = {};
__export(benchmarks_exports, {
  BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
  deterministicSplit: () => deterministicSplit,
  gsm8k: () => gsm8k_exports,
  routing: () => routing_exports,
  swebenchLite: () => swebench_lite_exports
});
+// src/benchmarks/gsm8k/index.ts
// Namespace object for the GSM8K benchmark: the adapter class, the three
// module-level functions bound to a shared adapter instance, and the answer
// parser. Each entry is a zero-argument accessor handed to `__export`.
var gsm8k_exports = {};
__export(gsm8k_exports, {
  Gsm8kAdapter: () => Gsm8kAdapter,
  assignSplit: () => assignSplit,
  evaluate: () => evaluate,
  loadDataset: () => loadDataset,
  parseGsm8kAnswer: () => parseGsm8kAnswer
});
+import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
/**
 * Benchmark adapter for GSM8K: loads items from a JSONL file named by the
 * `AGENT_EVAL_GSM8K_PATH` environment variable and grades a response by
 * comparing the number it contains with the reference answer's number.
 */
var Gsm8kAdapter = class {
  /**
   * Loads the items assigned to `split`.
   *
   * @param {string} split - Split to keep.
   * @returns {Promise<object[]>} Items whose id maps to `split`.
   * @throws {Error} When the environment variable is unset or the file is missing.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH;
    if (!datasetPath) {
      throw new Error(
        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
      );
    }
    if (!existsSync5(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`);
    }
    return parseJsonl(datasetPath).filter(({ id }) => assignSplitImpl(id) === split);
  }
  /**
   * Scores a response 1 when its parsed number is within 1e-6 of the
   * reference's parsed number, otherwise 0. A reference or response without a
   * parseable number scores 0 with a `reason` in `raw`.
   *
   * @param {{payload: {answer: string}}} item - Benchmark item.
   * @param {string} response - Model response text.
   * @returns {Promise<{score: number, raw: object}>}
   */
  async evaluate(item, response) {
    const reference = item.payload.answer;
    const expected = parseGsm8kAnswer(reference);
    const observed = parseGsm8kAnswer(response);
    if (expected === null) {
      return { score: 0, raw: { reason: "reference_not_numeric", expected: reference } };
    }
    if (observed === null) {
      return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
    }
    const exactMatch = Math.abs(expected - observed) < 1e-6;
    const score = exactMatch ? 1 : 0;
    return { score, raw: { expected, observed, exactMatch } };
  }
  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} The split the item belongs to.
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
/**
 * Split assignment for a GSM8K item; the id is prefixed with `gsm8k::` before
 * being handed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} The assigned split.
 */
function assignSplitImpl(itemId) {
  const namespacedId = `gsm8k::${itemId}`;
  return deterministicSplit(namespacedId);
}
/**
 * Reads a GSM8K JSONL file into benchmark items.
 *
 * Blank lines are skipped. Each remaining line must be a JSON object with
 * non-empty `question` and `answer`; a missing `id` defaults to
 * `gsm8k_<line number>`.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: {question: string, answer: string}}>}
 * @throws {Error} On a line that is not valid JSON (the parser's error is
 *   attached as `cause`), is not a JSON object, or lacks question/answer.
 */
function parseJsonl(path) {
  const raw = readFileSync5(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    // trim() also removes the "\r" left behind by CRLF line endings.
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null`, arrays and scalars are valid JSON but not records; reject them
    // here rather than failing on a property read below.
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`);
    }
    const id = String(row.id ?? `gsm8k_${lineNo}`);
    const question = String(row.question ?? "");
    const answer = String(row.answer ?? "");
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
    }
    out.push({ id, payload: { question, answer } });
  }
  return out;
}
/**
 * Extracts the final numeric answer from GSM8K-style text.
 *
 * The number following a `####` marker is preferred; otherwise the last
 * number appearing anywhere in the text is used. Thousands separators
 * (commas) are removed before conversion.
 *
 * @param {string} text - Reference answer or model response.
 * @returns {number|null} The parsed number, or `null` when the text is empty
 *   or contains no usable number.
 */
function parseGsm8kAnswer(text) {
  if (!text) return null;
  const toNumber = (token) => Number(token.replace(/,/g, ""));

  const marked = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
  if (marked) {
    const markedValue = toNumber(marked[1]);
    if (Number.isFinite(markedValue)) return markedValue;
  }

  const numerals = text.match(/-?\d[\d,]*\.?\d*/g);
  if (!numerals || numerals.length === 0) return null;
  const lastValue = toNumber(numerals[numerals.length - 1]);
  return Number.isFinite(lastValue) ? lastValue : null;
}
// Shared module-level GSM8K adapter. The three bindings below are its methods
// bound to this instance, so they can be called as plain functions.
var adapter = new Gsm8kAdapter();
var loadDataset = adapter.loadDataset.bind(adapter);
var evaluate = adapter.evaluate.bind(adapter);
var assignSplit = adapter.assignSplit.bind(adapter);
+// src/benchmarks/swebench-lite/index.ts
// Namespace object for the SWE-Bench Lite benchmark: the adapter class and the
// three module-level functions bound to a shared adapter instance. Each entry
// is a zero-argument accessor handed to `__export`.
var swebench_lite_exports = {};
__export(swebench_lite_exports, {
  SweBenchLiteAdapter: () => SweBenchLiteAdapter,
  assignSplit: () => assignSplit2,
  evaluate: () => evaluate2,
  loadDataset: () => loadDataset2
});
+import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
+import { spawn } from "child_process";
/**
 * Benchmark adapter for SWE-Bench Lite. Items come from a JSONL file named by
 * `AGENT_EVAL_SWEBENCH_PATH`; grading is delegated to an external command
 * named by `AGENT_EVAL_SWEBENCH_GRADER_CMD`.
 */
var SweBenchLiteAdapter = class {
  /**
   * Loads the items assigned to `split`.
   *
   * @param {string} split - Split to keep.
   * @returns {Promise<object[]>} Items whose id maps to `split`.
   * @throws {Error} When the environment variable is unset or the file is missing.
   */
  async loadDataset(split) {
    const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
    if (!path) {
      throw new Error(
        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
      );
    }
    if (!existsSync6(path)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
    }
    const all = parseJsonl2(path);
    return all.filter((it) => assignSplitImpl2(it.id) === split);
  }
  /**
   * Grades a patch by sending `{instance_id, patch}` as JSON to the grader
   * command's stdin and reading a JSON verdict from its stdout.
   *
   * @param {{payload: {instanceId: string}}} item - Benchmark item.
   * @param {string} response - Patch produced by the model.
   * @returns {Promise<{score: number, raw: object}>} Score 1 when the verdict's
   *   `passed` is truthy, otherwise 0; `raw` carries the boolean flags and up
   *   to 4000 characters of the grader log.
   * @throws {Error} When the grader is not configured, emits non-JSON stdout,
   *   or emits JSON that is not an object.
   */
  async evaluate(item, response) {
    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
    if (!cmd) {
      throw new Error(
        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
      );
    }
    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
    const result = await runGrader(cmd, stdinPayload);
    let parsed;
    try {
      parsed = JSON.parse(result.stdout);
    } catch (e) {
      throw new Error(
        `SWE-Bench grader emitted non-JSON stdout: ${e.message}
stdout=${result.stdout.slice(0, 400)}
stderr=${result.stderr.slice(0, 400)}`,
        { cause: e }
      );
    }
    // `null`, arrays and scalars parse as JSON but are not a verdict. Without
    // this check `null` fails on a property read and the others are silently
    // scored as a failing patch.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error(
        `SWE-Bench grader stdout is not a JSON object
stdout=${result.stdout.slice(0, 400)}
stderr=${result.stderr.slice(0, 400)}`
      );
    }
    const passed = Boolean(parsed.passed);
    return {
      score: passed ? 1 : 0,
      raw: {
        passed,
        failToPassPassed: Boolean(parsed.fail_to_pass_passed),
        passToPassPassed: Boolean(parsed.pass_to_pass_passed),
        graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
      }
    };
  }
  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} The split the item belongs to.
   */
  assignSplit(itemId) {
    return assignSplitImpl2(itemId);
  }
};
/**
 * Split assignment for a SWE-Bench Lite item; the id is prefixed with
 * `swebench-lite::` before being handed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} The assigned split.
 */
function assignSplitImpl2(itemId) {
  const namespacedId = `swebench-lite::${itemId}`;
  return deterministicSplit(namespacedId);
}
/**
 * Reads a SWE-Bench Lite JSONL file into benchmark items.
 *
 * Blank lines are skipped. Field names are accepted in both snake_case (and
 * upper-case `FAIL_TO_PASS` / `PASS_TO_PASS`) and camelCase; missing optional
 * fields become empty strings or empty arrays.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: object}>} Parsed items.
 * @throws {Error} On a line that is not valid JSON (the parser's error is
 *   attached as `cause`), is not a JSON object, or has no instance id. Every
 *   message names the offending line.
 */
function parseJsonl2(path) {
  const raw = readFileSync6(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Attach the line number so a bad record can be located in the file.
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`);
    }
    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
    }
    out.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
        repo: String(row.repo ?? ""),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
      }
    });
  }
  return out;
}
/**
 * Normalises a test-list field to an array of strings.
 *
 * - An array is filtered down to its string elements.
 * - A string holding a JSON array is parsed and filtered the same way.
 * - Any other string is treated as a single entry, whether or not it happens
 *   to be valid JSON on its own (e.g. "123" or "true").
 * - Everything else yields an empty array.
 *
 * @param {*} v - Raw field value.
 * @returns {string[]} The normalised list.
 */
function asStringArray(v) {
  const onlyStrings = (items) => items.filter((x) => typeof x === "string");
  if (Array.isArray(v)) return onlyStrings(v);
  if (typeof v !== "string") return [];
  let parsed;
  try {
    parsed = JSON.parse(v);
  } catch {
    // Not JSON: the string itself is the single entry.
    return [v];
  }
  // A JSON scalar or object is not a list; keep the original text as the one
  // entry instead of discarding it.
  return Array.isArray(parsed) ? onlyStrings(parsed) : [v];
}
/**
 * Runs the grader command, feeds it `stdin`, and collects its output.
 *
 * The command is split on whitespace into an executable and its arguments; no
 * shell is involved, so quoting is not interpreted.
 *
 * @param {string} cmd - Command line of the grader.
 * @param {string} stdin - Text written to the grader's standard input.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves with the
 *   captured output when the grader exits with code 0.
 * @throws {Error} Rejects when the command is empty, the process cannot be
 *   started, writing to its stdin fails for a reason other than the process
 *   having closed the pipe, or it exits with a non-zero code.
 */
function runGrader(cmd, stdin) {
  return new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise yield an empty
    // executable name as the first split element.
    const parts = cmd.trim().split(/\s+/);
    if (!parts[0]) {
      reject(new Error("grader command is empty"));
      return;
    }
    const child = spawn(parts[0], parts.slice(1), { stdio: ["pipe", "pipe", "pipe"] });
    // Decode on the stream so a multi-byte character split across two chunks
    // is reassembled instead of being corrupted by per-chunk conversion.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`));
        return;
      }
      resolve({ stdout, stderr });
    });
    // A grader that exits without reading its input closes the pipe; without
    // a listener that EPIPE would be an unhandled 'error' event. The exit code
    // reported by 'close' decides the outcome in that case.
    child.stdin.on("error", (err) => {
      if (err.code !== "EPIPE") reject(err);
    });
    child.stdin.end(stdin);
  });
}
// Shared module-level SWE-Bench Lite adapter. The three bindings below are its
// methods bound to this instance, so they can be called as plain functions.
var adapter2 = new SweBenchLiteAdapter();
var loadDataset2 = adapter2.loadDataset.bind(adapter2);
var evaluate2 = adapter2.evaluate.bind(adapter2);
var assignSplit2 = adapter2.assignSplit.bind(adapter2);
+// src/benchmarks/routing/index.ts
// Namespace object for the routing benchmark: the bundled dataset, the adapter
// class, the route-token extractor, and three module-level functions exported
// under the same names the other benchmark namespaces use. Each entry is a
// zero-argument accessor handed to `__export`.
var routing_exports = {};
__export(routing_exports, {
  ROUTING_DATASET: () => ROUTING_DATASET,
  RoutingAdapter: () => RoutingAdapter,
  assignSplit: () => assignSplit3,
  evaluate: () => evaluate3,
  extractRouteTokens: () => extractRouteTokens,
  loadDataset: () => loadDataset3
});
+// src/benchmarks/routing/dataset.ts
+// Routing benchmark items, written as compact rows of
+// [id, category, prompt, route, synonyms, hardNegatives]
+// and expanded into objects with exactly those keys, in that order.
+var ROUTING_DATASET = [
+  ["file_001", "file", "Save the meeting notes to /tmp/notes-2025-04.md as markdown.", "fs.write", ["filesystem.write", "write_file"], ["fs.read", "chat.reply"]],
+  ["file_002", "file", "Read the contents of /etc/hosts and summarize the entries.", "fs.read", ["filesystem.read", "read_file"], ["fs.write", "search.web"]],
+  ["file_003", "file", "List every Python file under src/ recursively.", "fs.list", ["filesystem.list", "list_files"], ["fs.read", "search.code"]],
+  ["file_004", "file", "Delete the cached build at .turbo/cache.", "fs.delete", ["filesystem.delete", "remove_file"], ["fs.write", "fs.list"]],
+  ["math_001", "math", "What is the integral of 3x^2 + 2x from 0 to 5?", "math.integral", ["calculator.integral", "math.solve"], ["math.derivative", "chat.reply"]],
+  ["math_002", "math", "Compute the derivative of sin(x) * cos(x).", "math.derivative", ["calculator.derivative", "math.solve"], ["math.integral", "math.algebra"]],
+  ["math_003", "math", "Solve 2x + 7 = 19 for x.", "math.algebra", ["calculator.algebra", "math.solve"], ["math.derivative", "math.integral"]],
+  ["math_004", "math", "What is the prime factorization of 360?", "math.numbertheory", ["calculator.factor", "math.solve"], ["math.algebra", "search.web"]],
+  ["search_001", "search", "Find recent papers on agent prompt optimization with held-out promotion gates.", "search.web", ["web.search", "search.papers"], ["search.code", "chat.reply"]],
+  ["search_002", "search", "Search the codebase for every call site of `runProposeReview`.", "search.code", ["code.search", "grep"], ["search.web", "fs.read"]],
+  ["search_003", "search", "What is the latest release of the Tangle network on GitHub?", "search.web", ["web.search", "github.releases"], ["search.code", "chat.reply"]],
+  ["search_004", "search", "Find all TODO comments in the agent-eval src tree.", "search.code", ["code.search", "grep"], ["search.web", "fs.list"]],
+  ["chat_001", "chat", "Hi there, how are you doing today?", "chat.reply", ["conversation.reply"], ["search.web", "fs.read"]],
+  ["chat_002", "chat", "Please explain the difference between an LLM and a foundation model.", "chat.reply", ["conversation.reply", "qa.answer"], ["search.web", "math.algebra"]],
+  ["chat_003", "chat", "Tell me a short joke about distributed systems.", "chat.reply", ["conversation.reply"], ["search.web", "fs.read"]],
+  ["chat_004", "chat", "Acknowledge my last message with a thumbs up.", "chat.reply", ["conversation.reply", "react"], ["fs.write", "search.web"]]
+].map(([id, category, prompt, route, synonyms, hardNegatives]) => ({
+  id,
+  category,
+  prompt,
+  route,
+  synonyms,
+  hardNegatives
+}));
+// src/benchmarks/routing/index.ts
+/**
+ * Benchmark adapter that scores whether a response names the expected route
+ * (or one of its accepted synonyms) for a routing prompt.
+ */
+var RoutingAdapter = class {
+  /**
+   * Load the routing items that belong to one split.
+   *
+   * @param {string} split - Split label, compared against `assignSplitImpl3(id)`.
+   * @returns {Promise<Array<{ id: string, payload: object }>>} Dataset rows
+   *   wrapped as `{ id, payload }`, restricted to the requested split.
+   */
+  async loadDataset(split) {
+    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl3(it.id) === split);
+  }
+  /**
+   * Score a response against one routing item. Matching is case-insensitive.
+   *
+   * Dotted tokens (`fs.write`) are checked first. If none is accepted, the
+   * response is scanned for a standalone undotted accepted name, because
+   * several synonyms (`grep`, `write_file`, `react`, ...) contain no dot and
+   * would otherwise never match.
+   *
+   * @param {{ payload: { route: string, synonyms: string[], hardNegatives: string[], category: string } }} item
+   *   Item as produced by `loadDataset`.
+   * @param {string} response - Raw model output.
+   * @returns {Promise<{ score: number, raw: object }>} `score` is 1 when an
+   *   accepted route was named, otherwise 0; `raw` carries the first dotted
+   *   token, the matched route text, and the first hard-negative hit.
+   */
+  async evaluate(item, response) {
+    const tokens2 = extractRouteTokens(response);
+    const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
+    const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
+    const firstMatch = tokens2.find((t) => correct.has(t.toLowerCase())) ?? findUndottedRouteName(response, correct);
+    const firstHardNeg = tokens2.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
+    const score = firstMatch ? 1 : 0;
+    return {
+      score,
+      raw: {
+        firstToken: tokens2[0] ?? null,
+        matchedRoute: firstMatch,
+        hitHardNegative: Boolean(firstHardNeg),
+        hardNegativeRoute: firstHardNeg,
+        category: item.payload.category
+      }
+    };
+  }
+  /**
+   * @param {string} itemId - Dataset item id.
+   * @returns {*} Whatever `assignSplitImpl3` returns for this id.
+   */
+  assignSplit(itemId) {
+    return assignSplitImpl3(itemId);
+  }
+};
+// Find the first standalone word in `response` that equals an undotted entry
+// of `accepted` (a Set of lower-cased names); returns the word, or null.
+function findUndottedRouteName(response, accepted) {
+  const undotted = [...accepted].filter((name) => !name.includes("."));
+  if (undotted.length === 0) return null;
+  // Whole identifiers only. The lookarounds skip the halves of dotted tokens
+  // (the `write` in `fs.write`) while still allowing a sentence-ending period.
+  const words = response.match(/(?<![a-z0-9_.])[a-z][a-z0-9_]*(?![a-z0-9_]|\.[a-z])/gi) ?? [];
+  return words.find((word) => undotted.includes(word.toLowerCase())) ?? null;
+}
+/**
+ * Pick the split for a routing item by handing its id, prefixed with
+ * `routing::`, to `deterministicSplit`.
+ *
+ * @param {string} itemId - Dataset item id.
+ * @returns {*} Whatever `deterministicSplit` returns for the prefixed key.
+ */
+function assignSplitImpl3(itemId) {
+  const splitKey = `routing::${itemId}`;
+  return deterministicSplit(splitKey);
+}
+/**
+ * Collect every dotted identifier pair (`name.name`) found in a response,
+ * in order of appearance and with its original casing.
+ *
+ * @param {string} response - Text to scan.
+ * @returns {string[]} The matched tokens; empty when there are none.
+ */
+function extractRouteTokens(response) {
+  const dottedPair = /[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi;
+  const found = response.match(dottedPair);
+  if (found === null || found === undefined) {
+    return [];
+  }
+  return found;
+}
+// Shared RoutingAdapter instance. Each method is bound to it so it can be
+// called as a standalone function without losing `this`.
+var adapter3 = new RoutingAdapter();
+var loadDataset3 = adapter3.loadDataset.bind(adapter3);
+var evaluate3 = adapter3.evaluate.bind(adapter3);
+var assignSplit3 = adapter3.assignSplit.bind(adapter3);
 // src/reference-replay-steering.ts
 function referenceReplayRunsToSteeringRows(runs, options = {}) {
   const rows = [];
@@ -10257,9 +11558,9 @@ function aggregateTrials(population, scenarioIds, trials) {
       return {
         variantId: variant.id,
         scenarioId: sid,
-        meanScore: mean5(gradedTrials.map((t) => t.score)),
-        meanCost: mean5(gradedTrials.map((t) => t.cost ?? 0)),
-        meanDurationMs: mean5(gradedTrials.map((t) => t.durationMs ?? 0)),
+        meanScore: mean7(gradedTrials.map((t) => t.score)),
+        meanCost: mean7(gradedTrials.map((t) => t.cost ?? 0)),
+        meanDurationMs: mean7(gradedTrials.map((t) => t.durationMs ?? 0)),
         okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
         trials: scenarioTrials.length,
         metrics
@@ -10267,10 +11568,10 @@ function aggregateTrials(population, scenarioIds, trials) {
     });
     return {
       variantId: variant.id,
-      meanScore: mean5(scenarios.map((s) => s.meanScore)),
-      meanCost: mean5(scenarios.map((s) => s.meanCost)),
-      meanDurationMs: mean5(scenarios.map((s) => s.meanDurationMs)),
-      okRate: mean5(scenarios.map((s) => s.okRate)),
+      meanScore: mean7(scenarios.map((s) => s.meanScore)),
+      meanCost: mean7(scenarios.map((s) => s.meanCost)),
+      meanDurationMs: mean7(scenarios.map((s) => s.meanDurationMs)),
+      okRate: mean7(scenarios.map((s) => s.okRate)),
       scenarios,
       metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
     };
@@ -10287,10 +11588,10 @@ function aggregateMetrics(rows) {
     }
   }
   const out = {};
-  for (const [k, list] of buckets) out[k] = mean5(list);
+  for (const [k, list] of buckets) out[k] = mean7(list);
   return out;
 }
-function mean5(xs) {
+function mean7(xs) {
   if (xs.length === 0) return 0;
   return xs.reduce((a, b) => a + b, 0) / xs.length;
 }
@@ -10331,11 +11632,11 @@ function samePopulation(a, b) {
 }
 // src/jsonl-trial-cache.ts
-import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
+import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
 import { dirname as dirname4 } from "path";
 // src/locked-jsonl-appender.ts
-import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
+import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
 import { dirname as dirname3 } from "path";
 var mutexes = /* @__PURE__ */ new Map();
 function getMutex(path) {
@@ -10350,7 +11651,7 @@ var LockedJsonlAppender = class {
   constructor(path) {
     this.path = path;
     this.mutex = getMutex(path);
-    if (!existsSync5(dirname3(path))) {
+    if (!existsSync7(dirname3(path))) {
       mkdirSync3(dirname3(path), { recursive: true });
     }
   }
@@ -10375,8 +11676,8 @@ var JsonlTrialCache = class {
   appender;
   constructor(path) {
     this.path = path;
-    if (existsSync6(path)) {
-      for (const line of readFileSync5(path, "utf-8").split("\n")) {
+    if (existsSync8(path)) {
+      for (const line of readFileSync7(path, "utf-8").split("\n")) {
         if (!line.trim()) continue;
         try {
           const entry = JSON.parse(line);
@@ -10414,7 +11715,7 @@ var JsonlTrialCache = class {
 };
 // src/evolution-telemetry.ts
-import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
+import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
 import { dirname as dirname5 } from "path";
 var MutationTelemetry = class {
   appender;
@@ -10445,16 +11746,16 @@ var LineageRecorder = class {
     this.snapshotPath = `${path}.snapshot`;
     this.kindOf = kindOf ?? defaultKindOf;
     mkdirSync5(dirname5(path), { recursive: true });
-    if (existsSync7(this.snapshotPath)) {
+    if (existsSync9(this.snapshotPath)) {
       try {
-        const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
+        const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
         for (const n of parsed) this.nodes.set(n.id, n);
       } catch {
       }
     }
-    if (existsSync7(path)) {
+    if (existsSync9(path)) {
       try {
-        for (const line of readFileSync6(path, "utf-8").split("\n")) {
+        for (const line of readFileSync8(path, "utf-8").split("\n")) {
           if (!line.trim()) continue;
           try {
             const entry = JSON.parse(line);
@@ -10466,9 +11767,9 @@ var LineageRecorder = class {
       } catch {
       }
     }
-    if (existsSync7(path) && this.nodes.size === 0) {
+    if (existsSync9(path) && this.nodes.size === 0) {
       try {
-        const raw = readFileSync6(path, "utf-8").trim();
+        const raw = readFileSync8(path, "utf-8").trim();
         if (raw.startsWith("[")) {
           const parsed = JSON.parse(raw);
           for (const n of parsed) this.nodes.set(n.id, n);
@@ -10482,8 +11783,8 @@ var LineageRecorder = class {
       const prev = this.nodes.get(node.id);
       this.nodes.set(node.id, { ...prev, ...node });
       try {
-        if (existsSync7(this.path)) {
-          const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
+        if (existsSync9(this.path)) {
+          const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
           if (head === "[") {
             writeFileSync(this.path, "");
           }
@@ -10549,9 +11850,9 @@ var CostLedger = class {
   mutex = new Mutex();
   constructor(path) {
     this.path = path;
-    if (existsSync7(path)) {
+    if (existsSync9(path)) {
       try {
-        const loaded = JSON.parse(readFileSync6(path, "utf-8"));
+        const loaded = JSON.parse(readFileSync8(path, "utf-8"));
         for (const k of Object.keys(this.totals)) {
           if (k === "byGeneration") {
             if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10975,9 +12276,9 @@ function passOrthogonality(input) {
       sims.push(cosineSimilarity(vectors[i], vectors[j]));
     }
   }
-  const mean7 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
+  const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
   return {
-    orthogonality: Math.max(0, Math.min(1, 1 - mean7)),
+    orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
     passCount: passes.length,
     similarities: sims
   };
@@ -11023,8 +12324,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
   const iterations = options.iterations ?? 1e3;
   const minTotal = options.minTotalSamples ?? 6;
   const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
-  const baselineMean = mean6(baseline);
-  const candidateMean = mean6(candidate);
+  const baselineMean = mean8(baseline);
+  const candidateMean = mean8(candidate);
   const delta = candidateMean - baselineMean;
   if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
     return {
@@ -11042,7 +12343,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
   for (let i = 0; i < iterations; i++) {
     const bResample = resample(baseline, rng);
     const cResample = resample(candidate, rng);
-    deltas[i] = mean6(cResample) - mean6(bResample);
+    deltas[i] = mean8(cResample) - mean8(bResample);
   }
   deltas.sort((a, b) => a - b);
   const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -11065,7 +12366,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
     verdict
   };
 }
-function mean6(xs) {
+function mean8(xs) {
   if (xs.length === 0) return 0;
   let s = 0;
   for (const x of xs) s += x;
@@ -11260,6 +12561,7 @@ function parseReflectionResponse(raw, maxProposals) {
 export {
   AgentDriver,
   AxGepaSteeringOptimizer,
+  BENCHMARK_SPLIT_SEED,
   BenchmarkRunner,
   BudgetBreachError,
   BudgetGuard,
@@ -11288,6 +12590,7 @@ export {
   FileSystemExperimentStore,
   FileSystemOutcomeStore,
   FileSystemTraceStore,
+  HeldOutGate,
   HoldoutAuditor,
   HoldoutLockedError,
   INTENT_MATCH_JUDGE_VERSION,
@@ -11307,6 +12610,7 @@ export {
   MultiLayerVerifier,
   MutationTelemetry,
   Mutex,
+  NoopResearcher,
   OTEL_AGENT_EVAL_SCOPE,
   OptimizationLoop,
   PairwiseSteeringOptimizer,
@@ -11317,6 +12621,7 @@ export {
   PromptRegistry,
   REDACTION_VERSION,
   RunCritic,
+  RunRecordValidationError,
   SEMANTIC_CONCEPT_JUDGE_VERSION,
   SandboxHarness,
   ScenarioRegistry,
@@ -11333,7 +12638,10 @@ export {
   analyzeSeries,
   argHash,
   attributeCounterfactuals,
+  deterministicSplit as benchmarkDeterministicSplit,
+  benchmarks_exports as benchmarks,
   benjaminiHochberg,
+  bhAdjust,
   bisect,
   bonferroni,
   bootstrapCi,
@@ -11413,6 +12721,7 @@ export {
   formatBenchmarkReport,
   formatDriverReport,
   formatFindings,
+  gainHistogram,
   precision as goldenPrecision,
   gradeSemanticStatus,
   groupBy,
@@ -11427,6 +12736,7 @@ export {
   isLlmSpan,
   isPrmVerdict,
   isRetrievalSpan,
+  isRunRecord,
   isSandboxSpan,
   isToolSpan,
   jestTestParser,
@@ -11454,11 +12764,15 @@ export {
   normalizeScores,
   notBlocked,
   outputLengthRubric,
+  pairedBootstrap,
   pairedTTest,
+  pairedWilcoxon,
   paraphraseRobustness,
+  paretoChart,
   paretoFrontier,
   paretoFrontierWithCrowding,
   parseReflectionResponse,
+  parseRunRecordSafe,
   partialCredit,
   passOrthogonality,
   pixelDeltaRatio,
@@ -11489,9 +12803,11 @@ export {
   requiredSampleSize,
   resetLockedAppendersForTesting,
   resumeBuilderSession,
+  roundTripRunRecord,
   rowCount,
   rowWhere,
   runAssertions,
+  runCanaries,
   runCounterfactual,
   runE2EWorkflow,
   runExpectations,
@@ -11526,6 +12842,7 @@ export {
   stuckLoopView,
   summarize,
   summarizeHarnessResults,
+  summaryTable,
   testJudge,
   textInSnapshot,
   toLangfuseEnvelope,
@@ -11539,6 +12856,7 @@ export {
   toolWasteView,
   typoMutator,
   urlContains,
+  validateRunRecord,
   verbosityBias,
   verifyManifest,
   visualDiff,
@@ -11548,6 +12866,7 @@ export {
   weightedRecall,
   welchsTTest,
   whitespaceCollapseMutator,
-  wilcoxonSignedRank
+  wilcoxonSignedRank,
+  wranglerDeployRunner
 };
 //# sourceMappingURL=index.js.map