@tangle-network/agent-eval 0.14.2 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6,6 +6,9 @@ import {
6
6
  probeLlm,
7
7
  stripFencedJson
8
8
  } from "./chunk-ITN4YOZY.js";
9
+ import {
10
+ __export
11
+ } from "./chunk-PZ5AY32C.js";
9
12
 
10
13
  // src/client.ts
11
14
  var ProductClient = class {
@@ -265,12 +268,7 @@ ${codeText}`
265
268
  };
266
269
  var coherenceJudge = async (tc, { scenario, turns }) => {
267
270
  if (turns.length < 2) {
268
- return [{
269
- judgeName: "coherence",
270
- dimension: "coherence",
271
- score: 5,
272
- reasoning: "Single-turn scenario \u2014 coherence not fully testable."
273
- }];
271
+ return [];
274
272
  }
275
273
  const conversation = turns.map(
276
274
  (t, i) => `Turn ${i + 1}:
@@ -396,36 +394,36 @@ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
396
394
  "false_confidence",
397
395
  "worst_failure"
398
396
  ]);
399
- function normalizeScores(scores) {
400
- return scores.map((s) => {
397
+ function normalizeScores(scores2) {
398
+ return scores2.map((s) => {
401
399
  if (INVERTED_DIMENSIONS.has(s.dimension)) {
402
400
  return s;
403
401
  }
404
402
  return s;
405
403
  });
406
404
  }
407
- function weightedMean(scores) {
408
- if (scores.length === 0) return 0;
405
+ function weightedMean(scores2) {
406
+ if (scores2.length === 0) return 0;
409
407
  let totalWeight = 0;
410
408
  let weightedSum = 0;
411
- for (const { score, weight } of scores) {
409
+ for (const { score, weight } of scores2) {
412
410
  const w = weight ?? 1;
413
411
  weightedSum += score * w;
414
412
  totalWeight += w;
415
413
  }
416
414
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
417
415
  }
418
- function confidenceInterval(scores, confidence = 0.95) {
419
- if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
420
- if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
421
- const n = scores.length;
422
- const mean7 = scores.reduce((a, b) => a + b, 0) / n;
416
+ function confidenceInterval(scores2, confidence = 0.95) {
417
+ if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
418
+ if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
419
+ const n = scores2.length;
420
+ const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
423
421
  const B = 1e3;
424
422
  const bootstrapMeans = [];
425
423
  for (let i = 0; i < B; i++) {
426
424
  let sum2 = 0;
427
425
  for (let j = 0; j < n; j++) {
428
- sum2 += scores[Math.floor(Math.random() * n)];
426
+ sum2 += scores2[Math.floor(Math.random() * n)];
429
427
  }
430
428
  bootstrapMeans.push(sum2 / n);
431
429
  }
@@ -434,7 +432,7 @@ function confidenceInterval(scores, confidence = 0.95) {
434
432
  const lowerIdx = Math.floor(alpha / 2 * B);
435
433
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
436
434
  return {
437
- mean: mean7,
435
+ mean: mean9,
438
436
  lower: bootstrapMeans[lowerIdx],
439
437
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
440
438
  };
@@ -522,11 +520,11 @@ function pairedTTest(before, after) {
522
520
  const n = before.length;
523
521
  if (n < 2) return { t: 0, df: 0, p: 1 };
524
522
  const diffs = before.map((b, i) => after[i] - b);
525
- const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
526
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
523
+ const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
524
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
527
525
  const se = Math.sqrt(variance2 / n);
528
- if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
529
- const t = mean7 / se;
526
+ if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
527
+ const t = mean9 / se;
530
528
  const df = n - 1;
531
529
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
532
530
  return { t, df, p };
@@ -544,15 +542,15 @@ function wilcoxonSignedRank(before, after) {
544
542
  while (i < n) {
545
543
  let j = i;
546
544
  while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
547
- const avg = (i + 1 + j) / 2;
548
- for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
545
+ const avg2 = (i + 1 + j) / 2;
546
+ for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg2;
549
547
  i = j;
550
548
  }
551
549
  let wPlus = 0;
552
550
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
553
- const mean7 = n * (n + 1) / 4;
551
+ const mean9 = n * (n + 1) / 4;
554
552
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
555
- const z = (wPlus - mean7) / Math.sqrt(variance2);
553
+ const z = (wPlus - mean9) / Math.sqrt(variance2);
556
554
  const p = 2 * (1 - normalCdf(Math.abs(z)));
557
555
  return { w: wPlus, p };
558
556
  }
@@ -753,8 +751,8 @@ async function executeScenario(tc, scenario, config) {
753
751
  console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
754
752
  await new Promise((r) => setTimeout(r, wait));
755
753
  }
756
- const scores = await judge(tc, judgeInput);
757
- judgeResults.push(scores);
754
+ const scores2 = await judge(tc, judgeInput);
755
+ judgeResults.push(scores2);
758
756
  await new Promise((r) => setTimeout(r, 3e3));
759
757
  break;
760
758
  } catch (err) {
@@ -847,8 +845,8 @@ var BenchmarkRunner = class {
847
845
  byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
848
846
  }
849
847
  for (const [name, data] of Object.entries(byJudge)) {
850
- const avg = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
851
- console.log(` ${name.padEnd(16)} avg=${avg} [${data.dimensions.join(", ")}]`);
848
+ const avg2 = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
849
+ console.log(` ${name.padEnd(16)} avg=${avg2} [${data.dimensions.join(", ")}]`);
852
850
  }
853
851
  console.log(` OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
854
852
  console.log();
@@ -2270,7 +2268,7 @@ var PromptOptimizer = class {
2270
2268
  });
2271
2269
  }
2272
2270
  }
2273
- const scores = config.variants.map((variant) => {
2271
+ const scores2 = config.variants.map((variant) => {
2274
2272
  const scenarioMap = rawScores.get(variant.id);
2275
2273
  const allSamples = [];
2276
2274
  const perScenario = {};
@@ -2293,10 +2291,10 @@ var PromptOptimizer = class {
2293
2291
  };
2294
2292
  });
2295
2293
  const rawPairs = [];
2296
- for (let i = 0; i < scores.length; i++) {
2297
- for (let j = i + 1; j < scores.length; j++) {
2298
- const a = scores[i];
2299
- const b = scores[j];
2294
+ for (let i = 0; i < scores2.length; i++) {
2295
+ for (let j = i + 1; j < scores2.length; j++) {
2296
+ const a = scores2[i];
2297
+ const b = scores2[j];
2300
2298
  const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
2301
2299
  rawPairs.push({ a, b, p });
2302
2300
  }
@@ -2310,7 +2308,7 @@ var PromptOptimizer = class {
2310
2308
  significant: qValues[idx] < alpha,
2311
2309
  meanDelta: r.b.mean - r.a.mean
2312
2310
  }));
2313
- const sorted = scores.slice().sort((x, y) => y.mean - x.mean);
2311
+ const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
2314
2312
  const winner = sorted[0];
2315
2313
  const second = sorted[1];
2316
2314
  const winnerComparisons = pairwise2.filter(
@@ -2324,7 +2322,7 @@ var PromptOptimizer = class {
2324
2322
  significant: significantOverAll,
2325
2323
  ciLowerBoundExceedsSecondMean
2326
2324
  },
2327
- scores,
2325
+ scores: scores2,
2328
2326
  pairwise: pairwise2,
2329
2327
  config: {
2330
2328
  trialsPerScenario: trials,
@@ -2870,20 +2868,20 @@ async function mapLimit(items, limit, fn) {
2870
2868
  function mean(values) {
2871
2869
  return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
2872
2870
  }
2873
- function meanRunScore(scores) {
2871
+ function meanRunScore(scores2) {
2874
2872
  return {
2875
- success: mean(scores.map((s) => s.success)),
2876
- goalProgress: mean(scores.map((s) => s.goalProgress)),
2877
- repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
2878
- driftPenalty: mean(scores.map((s) => s.driftPenalty)),
2879
- toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
2880
- patchQuality: mean(scores.map((s) => s.patchQuality)),
2881
- testReality: mean(scores.map((s) => s.testReality)),
2882
- finalGate: mean(scores.map((s) => s.finalGate)),
2883
- reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
2884
- costUsd: mean(scores.map((s) => s.costUsd)),
2885
- wallSeconds: mean(scores.map((s) => s.wallSeconds)),
2886
- notes: scores.flatMap((s) => s.notes ?? [])
2873
+ success: mean(scores2.map((s) => s.success)),
2874
+ goalProgress: mean(scores2.map((s) => s.goalProgress)),
2875
+ repoGroundedness: mean(scores2.map((s) => s.repoGroundedness)),
2876
+ driftPenalty: mean(scores2.map((s) => s.driftPenalty)),
2877
+ toolUseQuality: mean(scores2.map((s) => s.toolUseQuality)),
2878
+ patchQuality: mean(scores2.map((s) => s.patchQuality)),
2879
+ testReality: mean(scores2.map((s) => s.testReality)),
2880
+ finalGate: mean(scores2.map((s) => s.finalGate)),
2881
+ reviewerBlockers: mean(scores2.map((s) => s.reviewerBlockers)),
2882
+ costUsd: mean(scores2.map((s) => s.costUsd)),
2883
+ wallSeconds: mean(scores2.map((s) => s.wallSeconds)),
2884
+ notes: scores2.flatMap((s) => s.notes ?? [])
2887
2885
  };
2888
2886
  }
2889
2887
 
@@ -3339,12 +3337,12 @@ var SubprocessSandboxDriver = class {
3339
3337
  this.defaultEnv = options.env;
3340
3338
  }
3341
3339
  async exec(phase, command, config) {
3342
- const { spawn } = await import("child_process");
3340
+ const { spawn: spawn2 } = await import("child_process");
3343
3341
  const start = Date.now();
3344
3342
  const effectiveCwd = config.cwd ?? this.defaultCwd;
3345
3343
  const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
3346
3344
  return await new Promise((resolve) => {
3347
- const child = spawn(command, {
3345
+ const child = spawn2(command, {
3348
3346
  shell: true,
3349
3347
  cwd: effectiveCwd,
3350
3348
  env: effectiveEnv
@@ -5392,10 +5390,10 @@ function analyzeSeries(values, options = {}) {
5392
5390
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
5393
5391
  }
5394
5392
  const tail = values.slice(-window);
5395
- const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
5396
- const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
5393
+ const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
5394
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
5397
5395
  const stdDev = Math.sqrt(variance2);
5398
- const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
5396
+ const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
5399
5397
  const cv = stdDev / refMean;
5400
5398
  const stable = tail.length >= window && cv <= stableCv;
5401
5399
  let tailRun = 0;
@@ -5416,7 +5414,7 @@ function analyzeSeries(values, options = {}) {
5416
5414
  } else {
5417
5415
  state = "noisy";
5418
5416
  }
5419
- return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
5417
+ return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
5420
5418
  }
5421
5419
 
5422
5420
  // src/state-continuity.ts
@@ -6012,9 +6010,9 @@ function calibrateJudge(golden, candidate) {
6012
6010
  const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
6013
6011
  return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
6014
6012
  }
6015
- function positionalBias(scores) {
6013
+ function positionalBias(scores2) {
6016
6014
  const pairs = /* @__PURE__ */ new Map();
6017
- for (const s of scores) {
6015
+ for (const s of scores2) {
6018
6016
  const slot = pairs.get(s.itemId) ?? {};
6019
6017
  if (s.positionOfAInput === "first") slot.first = s.score;
6020
6018
  else if (s.positionOfAInput === "second") slot.second = s.score;
@@ -6165,12 +6163,12 @@ function renderMarkdownReport(reports) {
6165
6163
  async function aggregateRunMetrics(runs, store) {
6166
6164
  if (runs.length === 0) return {};
6167
6165
  const durations = [];
6168
- const scores = [];
6166
+ const scores2 = [];
6169
6167
  const passes = [];
6170
6168
  const costs = [];
6171
6169
  for (const r of runs) {
6172
6170
  if (r.endedAt) durations.push(r.endedAt - r.startedAt);
6173
- if (r.outcome?.score !== void 0) scores.push(r.outcome.score);
6171
+ if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
6174
6172
  passes.push(r.outcome?.pass === true ? 1 : 0);
6175
6173
  const llm = await llmSpans(store, r.runId);
6176
6174
  costs.push(aggregateLlm(llm).costUsd);
@@ -6179,7 +6177,7 @@ async function aggregateRunMetrics(runs, store) {
6179
6177
  provisionMs: average(durations),
6180
6178
  firstTokenMs: average(durations),
6181
6179
  wallMs: average(durations),
6182
- overallScore: average(scores),
6180
+ overallScore: average(scores2),
6183
6181
  passRate: average(passes),
6184
6182
  costUsd: average(costs)
6185
6183
  };
@@ -6242,7 +6240,7 @@ async function toLangfuseEnvelope(store, runId) {
6242
6240
  },
6243
6241
  metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
6244
6242
  }));
6245
- const scores = judges.map((j) => ({
6243
+ const scores2 = judges.map((j) => ({
6246
6244
  id: j.spanId,
6247
6245
  traceId: run.runId,
6248
6246
  observationId: j.targetSpanId,
@@ -6250,7 +6248,7 @@ async function toLangfuseEnvelope(store, runId) {
6250
6248
  value: j.score,
6251
6249
  comment: j.rationale
6252
6250
  }));
6253
- return { traceId: run.runId, generations, scores };
6251
+ return { traceId: run.runId, generations, scores: scores2 };
6254
6252
  }
6255
6253
  async function toPrometheusText(store) {
6256
6254
  const runs = await store.listRuns();
@@ -6344,12 +6342,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
6344
6342
  variantScores.push({ mutator: id, score, mutated });
6345
6343
  all.push(score);
6346
6344
  }
6347
- const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
6348
- const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
6345
+ const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
6346
+ const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
6349
6347
  const stdDev = Math.sqrt(variance2);
6350
- const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
6348
+ const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
6351
6349
  const robustness = Math.max(0, 1 - stdDev / ref);
6352
- return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
6350
+ return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
6353
6351
  }
6354
6352
  var lowercaseMutator = (p) => p.toLowerCase();
6355
6353
  var sentenceReorderMutator = (p, seed) => {
@@ -6684,8 +6682,8 @@ function ranks(xs) {
6684
6682
  for (let i = 0; i < indexed.length; i++) {
6685
6683
  let j = i;
6686
6684
  while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
6687
- const avg = (i + j + 2) / 2;
6688
- for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
6685
+ const avg2 = (i + j + 2) / 2;
6686
+ for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
6689
6687
  i = j;
6690
6688
  }
6691
6689
  return r;
@@ -6929,8 +6927,8 @@ function ranks2(xs) {
6929
6927
  for (let i = 0; i < indexed.length; i++) {
6930
6928
  let j = i;
6931
6929
  while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
6932
- const avg = (i + j + 2) / 2;
6933
- for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
6930
+ const avg2 = (i + j + 2) / 2;
6931
+ for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
6934
6932
  i = j;
6935
6933
  }
6936
6934
  return r;
@@ -7270,8 +7268,8 @@ async function prmBestOfN(store, grader, runIds) {
7270
7268
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
7271
7269
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
7272
7270
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
7273
- const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7274
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
7271
+ const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7272
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
7275
7273
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7276
7274
  }
7277
7275
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -7293,8 +7291,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
7293
7291
  const ranked = [...byRun.values()].sort(
7294
7292
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
7295
7293
  );
7296
- const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7297
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
7294
+ const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7295
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
7298
7296
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7299
7297
  }
7300
7298
 
@@ -7672,15 +7670,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7672
7670
  const rejected = [];
7673
7671
  const surviving = [];
7674
7672
  for (const candidate of proposed) {
7675
- const scores = await scorer.scoreCandidate(candidate, targets);
7676
- if (scores.length < 2) {
7673
+ const scores2 = await scorer.scoreCandidate(candidate, targets);
7674
+ if (scores2.length < 2) {
7677
7675
  rejected.push({ candidate, reason: "scorer returned <2 results" });
7678
7676
  continue;
7679
7677
  }
7680
- const values = scores.map((s) => s.score);
7678
+ const values = scores2.map((s) => s.score);
7681
7679
  const spread = Math.max(...values) - Math.min(...values);
7682
7680
  const maxScore = Math.max(...values);
7683
- scored.push({ candidate, scores, spread });
7681
+ scored.push({ candidate, scores: scores2, spread });
7684
7682
  if (maxScore < floor) {
7685
7683
  rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
7686
7684
  continue;
@@ -7822,10 +7820,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7822
7820
  }
7823
7821
  for (const s of scenarios) {
7824
7822
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7825
- const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7826
- if (scores.length < 3) continue;
7827
- const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
7828
- const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
7823
+ const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7824
+ if (scores2.length < 3) continue;
7825
+ const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
7826
+ const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
7829
7827
  if (variance2 > varianceThreshold) {
7830
7828
  targets.push({
7831
7829
  reason: "high-variance",
@@ -8580,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8580
8578
  let durationMs = 0;
8581
8579
  const reasonParts = [];
8582
8580
  const diagnostics = {};
8583
- for (const { adapter, result } of perAdapter) {
8581
+ for (const { adapter: adapter4, result } of perAdapter) {
8584
8582
  status = worst(status, result.status);
8585
8583
  if (typeof result.score === "number") {
8586
8584
  weightedScoreSum += result.score;
8587
8585
  weightCount += 1;
8588
8586
  }
8589
8587
  durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
8590
- reasonParts.push(`${adapter}: ${result.status}`);
8588
+ reasonParts.push(`${adapter4}: ${result.status}`);
8591
8589
  for (const f of result.findings) {
8592
8590
  findings.push({
8593
8591
  ...f,
8594
8592
  layer: name,
8595
- message: prefix ? `${prefix(adapter)} ${f.message}` : f.message,
8596
- detail: { ...f.detail ?? {}, adapter }
8593
+ message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
8594
+ detail: { ...f.detail ?? {}, adapter: adapter4 }
8597
8595
  });
8598
8596
  }
8599
8597
  for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -8612,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8612
8610
  reason: reasonParts.join(" \xB7 "),
8613
8611
  diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
8614
8612
  detail: {
8615
- adapters: perAdapter.map(({ adapter, result }) => ({
8616
- adapter,
8613
+ adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
8614
+ adapter: adapter4,
8617
8615
  status: result.status,
8618
8616
  score: result.score ?? null
8619
8617
  })),
@@ -8639,10 +8637,10 @@ function multiToolchainLayer(config) {
8639
8637
  reason: "no adapters detected"
8640
8638
  };
8641
8639
  }
8642
- const runOne = async (adapter) => {
8643
- const adapterName = config.adapterName(adapter);
8640
+ const runOne = async (adapter4) => {
8641
+ const adapterName = config.adapterName(adapter4);
8644
8642
  try {
8645
- const r = await config.run(adapter, ctx);
8643
+ const r = await config.run(adapter4, ctx);
8646
8644
  return { adapter: adapterName, result: r };
8647
8645
  } catch (err) {
8648
8646
  return {
@@ -9345,6 +9343,57 @@ function viteDeployRunner(input) {
9345
9343
  }
9346
9344
  };
9347
9345
  }
9346
+ function wranglerDeployRunner(input) {
9347
+ return {
9348
+ run: async () => {
9349
+ const start = Date.now();
9350
+ const buildCmd = input.buildCommand ?? "npm run build";
9351
+ const dryCmd = input.dryRunCommand ?? "npx wrangler deploy --dry-run --outdir dist";
9352
+ const timeoutMs = input.timeoutMs ?? 12e4;
9353
+ const hasToml = await input.exists("wrangler.toml");
9354
+ const hasJsonc = hasToml ? false : await input.exists("wrangler.jsonc");
9355
+ if (!hasToml && !hasJsonc) {
9356
+ return {
9357
+ ok: false,
9358
+ output: "no wrangler config found (wrangler.toml / wrangler.jsonc absent)",
9359
+ durationMs: Date.now() - start,
9360
+ artifactDir: "dist",
9361
+ artifactValid: false
9362
+ };
9363
+ }
9364
+ const build = await input.exec(buildCmd, { cwd: input.workdir, timeoutMs });
9365
+ if (build.exitCode !== 0) {
9366
+ const tail2 = ((build.stderr || build.stdout) ?? "").slice(-1500);
9367
+ return {
9368
+ ok: false,
9369
+ output: `build failed: ${tail2}`,
9370
+ durationMs: Date.now() - start,
9371
+ artifactDir: "dist",
9372
+ artifactValid: false
9373
+ };
9374
+ }
9375
+ const dry = await input.exec(dryCmd, { cwd: input.workdir, timeoutMs });
9376
+ if (dry.exitCode !== 0) {
9377
+ const tail2 = ((dry.stderr || dry.stdout) ?? "").slice(-1500);
9378
+ return {
9379
+ ok: false,
9380
+ output: `wrangler dry-run failed: ${tail2}`,
9381
+ durationMs: Date.now() - start,
9382
+ artifactDir: "dist",
9383
+ artifactValid: false
9384
+ };
9385
+ }
9386
+ const tail = ((dry.stdout || dry.stderr) ?? "").slice(-1500);
9387
+ return {
9388
+ ok: true,
9389
+ output: tail,
9390
+ durationMs: Date.now() - start,
9391
+ artifactDir: "dist",
9392
+ artifactValid: true
9393
+ };
9394
+ }
9395
+ };
9396
+ }
9348
9397
 
9349
9398
  // src/keyword-coverage-judge.ts
9350
9399
  function htmlContainsElement(html, selector) {
@@ -9712,15 +9761,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
9712
9761
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
9713
9762
  const matchStrategy = options.matchStrategy ?? "reference-order";
9714
9763
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
9715
- const scores = scenarios.filter((scenario) => {
9764
+ const scores2 = scenarios.filter((scenario) => {
9716
9765
  const split = scenario.split ?? "train";
9717
9766
  if (split === "holdout" && !options.includeHoldout) return false;
9718
9767
  return allowedSplits.has(split);
9719
9768
  }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
9720
9769
  return {
9721
- scenarios: scores,
9722
- aggregate: aggregateScenarioScores(scores),
9723
- bySplit: aggregateBySplit(scores)
9770
+ scenarios: scores2,
9771
+ aggregate: aggregateScenarioScores(scores2),
9772
+ bySplit: aggregateBySplit(scores2)
9724
9773
  };
9725
9774
  }
9726
9775
  function compareReferenceReplay(baseline, candidate) {
@@ -9935,20 +9984,20 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9935
9984
  matches: matches2
9936
9985
  };
9937
9986
  }
9938
- function aggregateBySplit(scores) {
9987
+ function aggregateBySplit(scores2) {
9939
9988
  const out = {};
9940
9989
  for (const split of ALL_SPLITS) {
9941
- const scoped = scores.filter((score) => score.split === split);
9990
+ const scoped = scores2.filter((score) => score.split === split);
9942
9991
  if (scoped.length > 0) out[split] = aggregateScenarioScores(scoped);
9943
9992
  }
9944
9993
  return out;
9945
9994
  }
9946
- function aggregateScenarioScores(scores) {
9947
- const matched = sum(scores.map((score) => score.matched));
9948
- const total = sum(scores.map((score) => score.total));
9949
- const falsePositives = sum(scores.map((score) => score.falsePositives));
9950
- const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9951
- const totalWeight = sum(scores.map((score) => score.totalWeight));
9995
+ function aggregateScenarioScores(scores2) {
9996
+ const matched = sum(scores2.map((score) => score.matched));
9997
+ const total = sum(scores2.map((score) => score.total));
9998
+ const falsePositives = sum(scores2.map((score) => score.falsePositives));
9999
+ const matchedWeight = sum(scores2.map((score) => score.matchedWeight));
10000
+ const totalWeight = sum(scores2.map((score) => score.totalWeight));
9952
10001
  const precision2 = ratio(matched, matched + falsePositives);
9953
10002
  const recall = ratio(matched, total);
9954
10003
  return {
@@ -10027,8 +10076,8 @@ function formatPct(value) {
10027
10076
  function bySplitOrder(a, b) {
10028
10077
  return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
10029
10078
  }
10030
- function runAdapter(adapter, scenario, context) {
10031
- return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
10079
+ function runAdapter(adapter4, scenario, context) {
10080
+ return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
10032
10081
  }
10033
10082
  function throwIfAborted(signal) {
10034
10083
  if (!signal?.aborted) return;
@@ -10066,6 +10115,1258 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
10066
10115
  "which"
10067
10116
  ]);
10068
10117
 
10118
+ // src/paired-stats.ts
10119
/**
 * Percentile-bootstrap confidence interval for the paired difference
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline observations.
 * @param {number[]} after - Candidate observations, index-aligned with `before`.
 * @param {object} [opts]
 * @param {number} [opts.confidence=0.95] - Coverage, strictly between 0 and 1.
 * @param {number} [opts.resamples=2000] - Number of bootstrap draws (positive integer).
 * @param {string} [opts.statistic="median"] - `"mean"` bootstraps the mean;
 *   any other value bootstraps the median.
 * @param {number} [opts.seed] - Passed to makeRng for the resampling stream.
 * @returns {{n: number, median: number, mean: number, low: number, high: number,
 *   confidence: number, resamples: number}} Point estimates of the observed
 *   deltas plus the interval bounds of the bootstrapped statistic.
 * @throws {Error} On unequal sample sizes, an out-of-range or NaN confidence,
 *   or a `resamples` value that is not a positive integer.
 */
function pairedBootstrap(before, after, opts = {}) {
  if (before.length !== after.length) {
    throw new Error(
      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.resamples ?? 2e3;
  const statistic = opts.statistic ?? "median";
  // Written as a positive test so NaN is rejected too.
  if (!(confidence > 0 && confidence < 1)) {
    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
  }
  // Zero draws would leave low/high undefined; a fractional count would make
  // `new Array(resamples)` throw an unrelated RangeError.
  if (!Number.isInteger(resamples) || resamples < 1) {
    throw new Error(`pairedBootstrap: resamples must be a positive integer, got ${resamples}`);
  }
  const n = before.length;
  const deltas = before.map((b, i) => after[i] - b);
  if (n === 0) {
    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
  }
  if (n === 1) {
    // A single pair has no resampling variability: the interval collapses.
    const d = deltas[0];
    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
  }
  const rng = makeRng(opts.seed);
  const samples = new Array(resamples);
  for (let b = 0; b < resamples; b++) {
    if (statistic === "mean") {
      let total = 0;
      for (let k = 0; k < n; k++) {
        total += deltas[Math.floor(rng() * n)];
      }
      samples[b] = total / n;
    } else {
      const draw = new Array(n);
      for (let k = 0; k < n; k++) {
        draw[k] = deltas[Math.floor(rng() * n)];
      }
      samples[b] = medianInPlace(draw);
    }
  }
  samples.sort((x, y) => x - y);
  const alpha = 1 - confidence;
  const lowIdx = Math.floor(alpha / 2 * resamples);
  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return {
    n,
    median: medianInPlace([...deltas]),
    mean: deltas.reduce((s, x) => s + x, 0) / n,
    low: samples[lowIdx],
    // Guard against the upper index falling below the lower one for tiny resample counts.
    high: samples[Math.max(highIdx, lowIdx)],
    confidence,
    resamples
  };
}
10172
/** Paired-samples alias: forwards both samples to wilcoxonSignedRank unchanged. */
function pairedWilcoxon(before, after) {
  const result = wilcoxonSignedRank(before, after);
  return result;
}
10175
/** Alias for benjaminiHochberg; `fdr` defaults to 0.05 when omitted. */
function bhAdjust(pValues, fdr = 0.05) {
  const adjusted = benjaminiHochberg(pValues, fdr);
  return adjusted;
}
10178
/**
 * Median of a numeric array; returns 0 for an empty array.
 * Sorts `xs` ascending IN PLACE, so pass a copy if the caller's order matters.
 */
function medianInPlace(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  xs.sort((left, right) => left - right);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return xs[upper];
  return (xs[upper - 1] + xs[upper]) / 2;
}
10184
/**
 * Builds a random-number source returning floats in [0, 1).
 *
 * With no seed, Math.random itself is returned. With a seed, the seed is
 * coerced to a 32-bit integer (a result of 0 is replaced by 2654435769) and
 * drives a deterministic 32-bit integer-mixing generator, so equal seeds
 * yield equal sequences.
 */
function makeRng(seed) {
  if (seed === void 0) return Math.random;
  const coerced = seed | 0;
  let state = coerced !== 0 ? coerced : 2654435769;
  return function next() {
    state = (state + 1831565813) | 0;
    let mixed = Math.imul(state ^ (state >>> 15), state | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 4294967296;
  };
}
10195
+
10196
+ // src/run-record.ts
10197
// Top-level keys that validateRunRecord requires to be present on every record.
var MANDATORY_TOP_LEVEL = [
  // identity
  "runId", "experimentId", "candidateId",
  // reproducibility
  "seed", "model", "promptHash", "configHash", "commitSha",
  // cost accounting
  "wallMs", "costUsd", "tokenUsage",
  // result
  "outcome", "splitTag"
];
// The only values accepted for a record's `splitTag`.
var SPLIT_TAGS = ["search", "dev", "holdout"];
10213
/**
 * Error raised when a run record fails validation.
 * `path` names the offending field (empty string when not field-specific);
 * a non-empty path is appended to the message as " (at <path>)".
 */
var RunRecordValidationError = class extends Error {
  path;
  constructor(message, path = "") {
    const located = path ? `${message} (at ${path})` : message;
    super(located);
    this.path = path;
    this.name = "RunRecordValidationError";
  }
};
10221
/**
 * Validates an untrusted value as a run record and returns it unchanged.
 *
 * Checks run in a fixed order and stop at the first failure: object shape,
 * presence of every MANDATORY_TOP_LEVEL key, scalar field types, model
 * snapshot suffix, tokenUsage, optional judgeMetadata, outcome (at least one
 * of searchScore / holdoutScore, plus a numeric `raw` map), optional
 * failureMode, and finally splitTag.
 *
 * @throws {RunRecordValidationError} Describing the first violated rule.
 */
function validateRunRecord(input) {
  const isObject = (value) => value !== null && typeof value === "object";
  const requireObject = (value, label) => {
    if (!isObject(value)) {
      throw new RunRecordValidationError(`${label} must be an object`, label);
    }
    return value;
  };
  const optionalNumber = (value, path) => {
    if (value !== void 0) expectFiniteNumber(value, path);
  };

  if (!isObject(input)) {
    throw new RunRecordValidationError("expected object");
  }
  const record = input;
  const absent = MANDATORY_TOP_LEVEL.find((key) => !(key in record));
  if (absent !== void 0) {
    throw new RunRecordValidationError(`missing mandatory field "${absent}"`);
  }

  // Scalar fields, in the order their errors are reported.
  for (const field of ["runId", "experimentId", "candidateId"]) {
    expectString(record[field], field);
  }
  expectFiniteNumber(record.seed, "seed");
  for (const field of ["model", "promptHash", "configHash", "commitSha"]) {
    expectString(record[field], field);
  }
  expectFiniteNumber(record.wallMs, "wallMs");
  optionalNumber(record.queueMs, "queueMs");
  expectFiniteNumber(record.costUsd, "costUsd");
  if (!modelHasSnapshot(record.model)) {
    throw new RunRecordValidationError(
      `model "${record.model}" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,
      "model"
    );
  }

  const usage = requireObject(record.tokenUsage, "tokenUsage");
  expectFiniteNumber(usage.input, "tokenUsage.input");
  expectFiniteNumber(usage.output, "tokenUsage.output");
  optionalNumber(usage.cached, "tokenUsage.cached");

  if (record.judgeMetadata !== void 0) {
    const judge = requireObject(record.judgeMetadata, "judgeMetadata");
    expectString(judge.model, "judgeMetadata.model");
    expectString(judge.promptVersion, "judgeMetadata.promptVersion");
    expectFiniteNumber(judge.confidence, "judgeMetadata.confidence");
    if (typeof judge.fallback !== "boolean") {
      throw new RunRecordValidationError("judgeMetadata.fallback must be boolean", "judgeMetadata.fallback");
    }
  }

  const outcome = requireObject(record.outcome, "outcome");
  optionalNumber(outcome.searchScore, "outcome.searchScore");
  optionalNumber(outcome.holdoutScore, "outcome.holdoutScore");
  const hasAnyScore = outcome.searchScore !== void 0 || outcome.holdoutScore !== void 0;
  if (!hasAnyScore) {
    throw new RunRecordValidationError(
      "outcome must define searchScore or holdoutScore (or both)",
      "outcome"
    );
  }
  const rawMetrics = requireObject(outcome.raw, "outcome.raw");
  Object.entries(rawMetrics).forEach(([metric, value]) => {
    expectFiniteNumber(value, `outcome.raw.${metric}`);
  });

  if (record.failureMode !== void 0) expectString(record.failureMode, "failureMode");

  const tagIsKnown = typeof record.splitTag === "string" && SPLIT_TAGS.includes(record.splitTag);
  if (!tagIsKnown) {
    throw new RunRecordValidationError(
      `splitTag must be one of ${SPLIT_TAGS.join(", ")}, got ${String(record.splitTag)}`,
      "splitTag"
    );
  }
  return input;
}
10298
/** Boolean form of validateRunRecord: true when validation passes, false on any thrown error. */
function isRunRecord(input) {
  let valid = true;
  try {
    validateRunRecord(input);
  } catch {
    valid = false;
  }
  return valid;
}
10306
/**
 * Non-throwing validation. Returns `{ ok: true, value }` on success and
 * `{ ok: false, error }` for a RunRecordValidationError; any other error
 * type is rethrown untouched.
 */
function parseRunRecordSafe(input) {
  let value;
  try {
    value = validateRunRecord(input);
  } catch (err) {
    if (!(err instanceof RunRecordValidationError)) throw err;
    return { ok: false, error: err };
  }
  return { ok: true, value };
}
10314
/** Serializes a record to JSON, parses it back, and validates the parsed copy. */
function roundTripRunRecord(record) {
  const revived = JSON.parse(JSON.stringify(record));
  return validateRunRecord(revived);
}
10318
/** Throws RunRecordValidationError at `path` unless `value` is a non-empty string. */
function expectString(value, path) {
  const acceptable = typeof value === "string" && value.length > 0;
  if (acceptable) return;
  throw new RunRecordValidationError(`expected non-empty string`, path);
}
10323
/** Throws RunRecordValidationError at `path` unless `value` is a finite number (no NaN/Infinity). */
function expectFiniteNumber(value, path) {
  const acceptable = typeof value === "number" && Number.isFinite(value);
  if (acceptable) return;
  throw new RunRecordValidationError(`expected finite number`, path);
}
10328
/**
 * Reports whether a model identifier carries a pinned snapshot marker:
 * any "@", a trailing "-YYYYMMDD" (eight digits), a trailing "-YYYY-MM-DD",
 * or a ":date-" segment anywhere in the string.
 */
function modelHasSnapshot(model) {
  if (model.includes("@")) return true;
  const datedPatterns = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return datedPatterns.some((pattern) => pattern.test(model));
}
10335
+
10336
+ // src/held-out-gate.ts
10337
/**
 * Promotion gate that decides, from holdout evidence, whether a candidate
 * should replace the baseline. Defaults: at least 3 paired holdout runs,
 * paired-delta threshold 0, overfit-gap threshold 0.15, 95% confidence,
 * 2000 bootstrap resamples.
 */
var HeldOutGate = class {
  minProductiveRuns;
  pairedDeltaThreshold;
  overfitGapThreshold;
  baselineKey;
  confidence;
  resamples;
  seed;
  constructor(config) {
    const { baselineKey } = config;
    if (!baselineKey) {
      throw new Error("HeldOutGate: baselineKey is required");
    }
    this.baselineKey = baselineKey;
    this.minProductiveRuns = config.minProductiveRuns ?? 3;
    this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
    this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
    this.confidence = config.confidence ?? 0.95;
    this.resamples = config.bootstrapResamples ?? 2e3;
    this.seed = config.seed;
  }
  /**
   * Decide whether `candidate` should replace `baseline`.
   *
   * Candidate holdout runs are paired with baseline holdout runs sharing the
   * same (experimentId, seed); runs lacking a holdout score on either side
   * are dropped. Rejection reasons are checked in order: too few pairs
   * ("few_runs"), lower bound of the bootstrap CI of the median paired delta
   * not above the threshold ("negative_delta"), and candidate search-holdout
   * gap exceeding the baseline's by more than the allowed margin
   * ("overfit_gap"). Otherwise the candidate is promoted.
   */
  evaluate(candidate, baseline) {
    const candidateId = inferCandidateId(candidate, this.baselineKey);
    const baselineId = this.baselineKey;
    const baselineScores = indexHoldoutByKey(baseline);
    const pairs = { before: [], after: [] };
    for (const run of candidate) {
      const usable = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
      if (!usable) continue;
      const counterpart = baselineScores.get(pairKey(run));
      if (counterpart !== void 0) {
        pairs.before.push(counterpart);
        pairs.after.push(run.outcome.holdoutScore);
      }
    }
    const productiveRuns = pairs.before.length;
    const searchScore = mean5(scores(candidate, "searchScore", "search"));
    const holdoutScore = mean5(scores(candidate, "holdoutScore", "holdout"));
    const baselineSearch = mean5(scores(baseline, "searchScore", "search"));
    const baselineHoldout = mean5(scores(baseline, "holdoutScore", "holdout"));
    const overfitGap = safeDiff(searchScore, holdoutScore);
    const baselineOverfitGap = safeDiff(baselineSearch, baselineHoldout);

    // Builders keep the key order of evidence and decision objects fixed.
    const describe = (medianPairedDelta, pairedCI, pairedPValue) => ({
      productiveRuns,
      medianPairedDelta,
      pairedCI,
      pairedPValue,
      searchScore,
      holdoutScore,
      overfitGap,
      baselineOverfitGap
    });
    const verdict = (rejectionCode, evidence, reason) => ({
      promote: rejectionCode === null,
      candidateId,
      baselineId,
      evidence,
      reason,
      rejectionCode
    });

    if (productiveRuns < this.minProductiveRuns) {
      const observedMedian = productiveRuns > 0 ? medianDelta(pairs.before, pairs.after) : 0;
      return verdict(
        "few_runs",
        describe(observedMedian, { low: 0, high: 0 }, 1),
        `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`
      );
    }

    const ci = pairedBootstrap(pairs.before, pairs.after, {
      confidence: this.confidence,
      resamples: this.resamples,
      statistic: "median",
      seed: this.seed
    });
    const wilcoxon = pairedWilcoxon(pairs.before, pairs.after);
    const evidence = describe(ci.median, { low: ci.low, high: ci.high }, wilcoxon.p);

    const clearsThreshold = ci.low > this.pairedDeltaThreshold;
    if (!clearsThreshold) {
      return verdict(
        "negative_delta",
        evidence,
        `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`
      );
    }

    // The overfit check only applies when both gaps could be computed.
    const gapsComparable = Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap);
    if (gapsComparable && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
      return verdict(
        "overfit_gap",
        evidence,
        `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`
      );
    }

    return verdict(
      null,
      evidence,
      `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`
    );
  }
};
10449
/**
 * Picks the candidate id to report: the first truthy candidateId that differs
 * from `baselineKey`, else the first run's candidateId, else the literal
 * "(unknown candidate)".
 */
function inferCandidateId(candidate, baselineKey) {
  const distinct = candidate.find(
    (run) => run.candidateId && run.candidateId !== baselineKey
  );
  if (distinct) return distinct.candidateId;
  return candidate[0]?.candidateId ?? "(unknown candidate)";
}
10455
/**
 * Maps pairKey(run) -> holdoutScore for every run tagged "holdout" that has
 * a defined holdout score. A later run with the same key overwrites an
 * earlier one.
 */
function indexHoldoutByKey(runs) {
  const byKey = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const eligible = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
    if (eligible) {
      byKey.set(pairKey(run), run.outcome.holdoutScore);
    }
  }
  return byKey;
}
10464
/** Pairing key for a run: "<experimentId>::<seed>". */
function pairKey(r) {
  const { experimentId, seed } = r;
  return `${experimentId}::${seed}`;
}
10467
/**
 * Collects `outcome[field]` from the runs whose splitTag equals `splitFilter`,
 * keeping only finite numbers, in input order.
 */
function scores(runs, field, splitFilter) {
  const collected = [];
  for (const run of runs) {
    if (run.splitTag === splitFilter) {
      const value = run.outcome[field];
      const usable = typeof value === "number" && Number.isFinite(value);
      if (usable) collected.push(value);
    }
  }
  return collected;
}
10476
/** Arithmetic mean; NaN (not 0) for an empty array so callers can detect "no data". */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
10480
/** `a - b` when both operands are finite; NaN otherwise. */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
10484
/** Median of the paired differences `after[i] - before[i]`; 0 when there are no pairs. */
function medianDelta(before, after) {
  const ordered = before.map((b, i) => after[i] - b);
  ordered.sort((x, y) => x - y);
  const count = ordered.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return ordered[upper];
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
10490
/** Formats a number with 4 decimals; non-finite values are stringified as-is. */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
10494
+
10495
+ // src/researcher.ts
10496
/**
 * Placeholder researcher: every operation rejects with an Error whose message
 * is the configured hint followed by "(<method> not implemented)".
 */
var NoopResearcher = class {
  hint;
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }
  // Shared failure path; the message embeds the calling method's name.
  #unimplemented(method) {
    throw new Error(`${this.hint} (${method} not implemented)`);
  }
  async inspectFailures(_runs) {
    this.#unimplemented("inspectFailures");
  }
  async proposeChange(_failures) {
    this.#unimplemented("proposeChange");
  }
  async applyChange(_changes, _baseline) {
    this.#unimplemented("applyChange");
  }
  async evaluateChange(_plan) {
    this.#unimplemented("evaluateChange");
  }
};
10514
+
10515
+ // src/summary-report.ts
10516
/**
 * Builds a per-candidate summary (N, mean, confidence interval, BH-adjusted
 * q-value, Cohen's d) for one split and renders it as a Markdown table.
 *
 * @param {object[]} runs - Run records; only those whose `splitTag` equals the
 *   chosen split and whose score is a finite number are used.
 * @param {object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads `outcome.holdoutScore`;
 *   any other value reads `outcome.searchScore`.
 * @param {number} [opts.confidence=0.95] - Confidence level for the interval;
 *   also forwarded to the Markdown renderer so the column header matches.
 * @param {number} [opts.fdr=0.05] - False-discovery rate passed to benjaminiHochberg.
 * @param {string|null} [opts.comparator=null] - Candidate id to compare against.
 * @returns {{rows: object[], comparator: (string|null), split: string, markdown: string}}
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // Group usable runs (and their scores) by candidate.
  const byCandidate = /* @__PURE__ */ new Map();
  for (const r of runs) {
    if (r.splitTag !== split) continue;
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
    bucket.runs.push(r);
    bucket.scores.push(v);
    byCandidate.set(r.candidateId, bucket);
  }

  const candidateIds = [...byCandidate.keys()].sort();
  const compRuns = comparator ? byCandidate.get(comparator) : void 0;
  const tentative = [];
  for (const id of candidateIds) {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      // The signed-rank test is only run with at least 6 matched pairs.
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    tentative.push({
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      qValue: rawP,
      cohensD: d,
      rawP
    });
  }

  // Replace raw p-values with BH-adjusted q-values across all tested candidates.
  if (comparator) {
    const idxs = [];
    const ps = [];
    for (let i = 0; i < tentative.length; i++) {
      const r = tentative[i];
      if (r.candidateId === comparator) continue;
      if (!Number.isFinite(r.rawP)) continue;
      idxs.push(i);
      ps.push(r.rawP);
    }
    if (ps.length > 0) {
      const { qValues } = benjaminiHochberg(ps, fdr);
      for (let k = 0; k < idxs.length; k++) {
        tentative[idxs[k]].qValue = qValues[k];
      }
    }
  }

  // `rawP` is internal bookkeeping and is stripped from the public rows.
  const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
  // Pass the confidence level along so the table header can reflect the
  // interval that was actually computed instead of always claiming 95%.
  const markdown = renderSummaryTableMarkdown(rows, comparator, split, confidence);
  return { rows, comparator, split, markdown };
}
10579
/**
 * Pairs candidate and baseline scores that share "<experimentId>::<seed>".
 *
 * Only finite numeric `outcome[scoreField]` values participate. Output arrays
 * are index-aligned and follow candidate order; when the baseline repeats a
 * key, its last usable score wins.
 *
 * @returns {{before: number[], after: number[]}} Baseline scores (`before`)
 *   and the matching candidate scores (`after`).
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;
  const usable = (value) => typeof value === "number" && Number.isFinite(value);

  const baselineByKey = /* @__PURE__ */ new Map();
  for (const run of baseline) {
    const score = run.outcome[scoreField];
    if (usable(score)) baselineByKey.set(keyOf(run), score);
  }

  const pairs = { before: [], after: [] };
  for (const run of candidate) {
    const score = run.outcome[scoreField];
    if (!usable(score)) continue;
    const counterpart = baselineByKey.get(keyOf(run));
    if (counterpart === void 0) continue;
    pairs.before.push(counterpart);
    pairs.after.push(score);
  }
  return pairs;
}
10600
/**
 * Renders summary rows as a Markdown table.
 *
 * @param {object[]} rows - Rows with candidateId, n, mean, ciLow, ciHigh, qValue, cohensD.
 * @param {string|null} comparator - When truthy, appended to the title as " (vs <id>)".
 * @param {string} split - Split name shown in the title.
 * @param {number} [confidence=0.95] - Confidence level of the interval columns,
 *   used for the "<pct>% CI" header. Omitting it keeps the previous "95% CI"
 *   header; a non-finite value also falls back to 95.
 * @returns {string} Newline-joined Markdown; non-finite q / d cells show an em dash.
 */
function renderSummaryTableMarkdown(rows, comparator, split, confidence = 0.95) {
  // Round through toFixed so 0.95 * 100 style float noise never leaks into the header.
  const pct = Number.isFinite(confidence) ? Number((confidence * 100).toFixed(2)) : 95;
  const lines = [];
  const cmpLabel = comparator ? ` (vs ${comparator})` : "";
  lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
  lines.push("");
  lines.push(`| Candidate | N | Mean | ${pct}% CI | q (BH) | Cohen's d |`);
  lines.push("|---|---:|---:|---|---:|---:|");
  for (const r of rows) {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
10615
/**
 * Builds cost-vs-quality chart data with one point per candidate.
 *
 * Each point carries the mean `costUsd` and mean score of the candidate's
 * usable runs in the chosen split, the run count, an optional gate label
 * taken from `opts.gateDecisions[candidateId]`, and `onFrontier`, which is
 * true when no other point dominates it.
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const perCandidate = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    const usable = typeof score === "number" && Number.isFinite(score);
    if (!usable) continue;
    const bucket = perCandidate.get(run.candidateId) ?? { cost: [], quality: [] };
    bucket.cost.push(run.costUsd);
    bucket.quality.push(score);
    perCandidate.set(run.candidateId, bucket);
  }

  const points = Array.from(perCandidate.entries(), ([candidateId, bucket]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(bucket.cost),
      quality: avg(bucket.quality),
      n: bucket.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(opts.gateDecisions[candidateId]) : void 0
    };
  });

  points.forEach((point) => {
    const dominated = points.some((other) => other !== point && dominates2(other, point));
    point.onFrontier = !dominated;
  });

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
10649
/** True when `a` is no worse than `b` on both cost and quality and strictly better on at least one. */
function dominates2(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  const strictlyBetter = a.cost < b.cost || a.quality > b.quality;
  return noWorse && strictlyBetter;
}
10652
/** Maps a gate decision to a chart label; null when it is a rejection with an unrecognised code. */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
10659
/**
 * Histogram of paired score gains (candidate minus comparator) for one split,
 * plus the median gain and a bootstrap confidence interval of the median.
 *
 * Bins are equal-width and symmetric around zero: the range is
 * [-bound, +bound] where bound is the largest absolute delta (at least 1e-6).
 *
 * @param {object[]} runs - Run records for both candidate and comparator.
 * @param {string} candidateId - Candidate whose gains are charted.
 * @param {string} comparator - Candidate id used as the baseline.
 * @param {object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads holdoutScore, anything else searchScore.
 * @param {number} [opts.bins=11] - Number of bins; must be a positive integer.
 * @param {number} [opts.confidence=0.95] - Forwarded to pairedBootstrap.
 * @param {number} [opts.resamples=2000] - Forwarded to pairedBootstrap.
 * @param {number} [opts.seed] - Forwarded to pairedBootstrap.
 * @returns {object} `{kind, candidateId, comparator, split, n, bins, median, ci}`;
 *   with no matched pairs, `n` is 0, `bins` is empty and median / ci are zeros.
 * @throws {Error} When `bins` is not an integer >= 1.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  // NaN or a fractional count used to slip past a plain `< 1` check and then
  // crash later on `bins[idx].count`; reject them up front.
  if (!Number.isInteger(binCount) || binCount < 1) {
    throw new Error(`gainHistogram: bins must be an integer \u2265 1, got ${binCount}`);
  }
  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }
  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  // The 1e-6 floor keeps the bin width non-zero when every delta is 0.
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;
  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    // Clamp so a delta exactly at the upper bound lands in the last bin.
    const idx = Math.min(binCount - 1, Math.max(0, Math.floor((d - lo) / width)));
    bins[idx].count += 1;
  }
  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
10716
/** Arithmetic mean of `xs`; NaN when the array is empty. */
function avg(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
10720
/** Median of an array that is ALREADY sorted ascending; 0 when empty. Does not sort or mutate. */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
10725
/** Four-decimal formatting for table cells; NaN / ±Infinity are stringified unchanged. */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
10729
+
10730
+ // src/canary.ts
10731
/**
 * Runs the canary detectors over a run sequence and tallies alerts by kind.
 *
 * Silent-fallback and calibration-drift detection always run (with `{}`
 * options when none are given); distribution-shift detection runs only when
 * `opts.distributionShift` is provided.
 *
 * @returns {{alerts: object[], counts: object}} Alerts in detector order and a
 *   per-kind count.
 */
function runCanaries(runs, opts = {}) {
  const fallbackAlerts = detectSilentFallback(runs, opts.silentFallback ?? {});
  const driftAlerts = detectCalibrationDrift(runs, opts.calibrationDrift ?? {});
  const shiftAlerts = opts.distributionShift
    ? detectDistributionShift(runs, opts.distributionShift)
    : [];
  const alerts = [...fallbackAlerts, ...driftAlerts, ...shiftAlerts];
  const counts = {
    silent_judge_fallback: 0,
    judge_calibration_drift: 0,
    distribution_shift: 0
  };
  alerts.forEach((alert) => {
    counts[alert.kind]++;
  });
  return { alerts, counts };
}
10745
/**
 * Flags streaks of consecutive runs whose judge looks like it silently fell
 * back: `judgeMetadata.fallback === true`, or a confidence within `epsilon`
 * of the fallback constant.
 *
 * Options (all optional): `constant` (default 0.3), `consecutiveThreshold`
 * (default 3), `epsilon` (default 1e-9). A run without judgeMetadata or a
 * non-fallback run ends the current streak. Each alert's evidence lists at
 * most the last 10 confidences of the streak.
 */
function detectSilentFallback(runs, opts) {
  const constant = opts.constant ?? 0.3;
  const threshold = opts.consecutiveThreshold ?? 3;
  const eps = opts.epsilon ?? 1e-9;
  const alerts = [];
  const current = { length: 0, firstRunId: null, confidences: [] };
  const endStreak = () => {
    current.length = 0;
    current.firstRunId = null;
    current.confidences = [];
  };
  let lastFlush = -1;

  for (const [i, run] of runs.entries()) {
    const meta = run.judgeMetadata;
    if (!meta) {
      endStreak();
      continue;
    }
    const looksLikeFallback = meta.fallback === true || Math.abs(meta.confidence - constant) <= eps;
    if (!looksLikeFallback) {
      endStreak();
      lastFlush = -1;
      continue;
    }
    current.length += 1;
    if (current.length === 1) current.firstRunId = run.runId;
    current.confidences.push(meta.confidence);
    // NOTE(review): `lastFlush` is only ever set to an index already visited,
    // so this guard always holds and an alert is emitted for EVERY run at or
    // beyond the threshold. Confirm whether once-per-streak was intended.
    if (current.length >= threshold && lastFlush < i) {
      alerts.push({
        kind: "silent_judge_fallback",
        severity: "error",
        message: `silent judge fallback: ${current.length} consecutive run(s) at confidence\u2248${constant} or fallback=true`,
        evidence: {
          streakLength: current.length,
          firstRunId: current.firstRunId,
          lastRunId: run.runId,
          confidences: current.confidences.slice(-10),
          fallbackConstant: constant
        }
      });
      lastFlush = i;
    }
  }
  return alerts;
}
10792
/**
 * Warns when recent judge confidences differ in distribution from earlier ones.
 *
 * Finite `judgeMetadata.confidence` values are split into the last
 * `recentWindow` (default 20) and up to `historyWindow` (default 50) values
 * before them. A two-sample KS statistic is compared with
 * `c * sqrt((n1 + n2) / (n1 * n2))`, where `c` is picked from `ksAlpha`
 * (default 0.05). Returns `[]` unless both windows hold at least `minRecent`
 * (default 10) values and the statistic exceeds the critical value.
 */
function detectCalibrationDrift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.ksAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;

  const confidences = [];
  for (const run of runs) {
    const meta = run.judgeMetadata;
    if (meta && Number.isFinite(meta.confidence)) confidences.push(meta.confidence);
  }
  if (confidences.length < minRecent + 1) return [];

  const recent = confidences.slice(-Math.min(recentWindow, confidences.length));
  const historical = confidences.slice(0, -recent.length).slice(-historyWindow);
  const enoughData = recent.length >= minRecent && historical.length >= minRecent;
  if (!enoughData) return [];

  const ks = ksTwoSample(recent, historical);
  // Coefficient for the KS critical value, stepped by significance level.
  let c = 1;
  if (alpha <= 0.01) c = 1.63;
  else if (alpha <= 0.05) c = 1.36;
  else if (alpha <= 0.1) c = 1.22;
  const critical = c * Math.sqrt((recent.length + historical.length) / (recent.length * historical.length));
  if (!(ks.d > critical)) return [];

  return [
    {
      kind: "judge_calibration_drift",
      severity: "warn",
      message: `judge calibration drift: KS D=${ks.d.toFixed(4)} exceeds critical=${critical.toFixed(4)} at alpha=${alpha} (recent n=${recent.length}, history n=${historical.length})`,
      evidence: {
        ksD: ks.d,
        critical,
        alpha,
        recentN: recent.length,
        historyN: historical.length,
        recentMean: mean6(recent),
        historyMean: mean6(historical)
      }
    }
  ];
}
10830
/**
 * Two-sample Kolmogorov-Smirnov statistic: the largest absolute gap between
 * the empirical CDFs of `a` and `b`.
 *
 * Tied values are consumed as a group before the gap is measured. Stepping
 * through ties one element at a time compares the CDFs in the middle of a
 * jump and overstates D (e.g. [1, 1] vs [1] would report 0.5 instead of 0).
 * NaN entries are dropped because they cannot be ordered and would otherwise
 * stall the merge loop.
 *
 * @param {number[]} a - First sample (not mutated).
 * @param {number[]} b - Second sample (not mutated).
 * @returns {{d: number}} The KS statistic in [0, 1]; 0 when either sample is empty.
 */
function ksTwoSample(a, b) {
  const ascending = (x, y) => x - y;
  const sortedA = a.filter((v) => !Number.isNaN(v)).sort(ascending);
  const sortedB = b.filter((v) => !Number.isNaN(v)).sort(ascending);
  const n1 = sortedA.length;
  const n2 = sortedB.length;
  let i = 0;
  let j = 0;
  let d = 0;
  while (i < n1 && j < n2) {
    // Advance both cursors past every occurrence of the current smallest
    // value so the ECDFs are compared only after the full jump at that value.
    const x = Math.min(sortedA[i], sortedB[j]);
    while (i < n1 && sortedA[i] <= x) i++;
    while (j < n2 && sortedB[j] <= x) j++;
    d = Math.max(d, Math.abs(i / n1 - j / n2));
  }
  return { d };
}
10848
/**
 * Warns when the category mix of recent runs departs from the historical mix.
 *
 * `opts.category(run)` labels each run; runs whose label is not a non-empty
 * string are ignored. The last `recentWindow` (default 20) labelled runs are
 * compared with up to `historyWindow` (default 50) before them using a
 * chi-square goodness-of-fit statistic, with expected counts scaled from the
 * historical proportions. Categories whose expected count is below 1 are left
 * out of both the statistic and the degrees of freedom. Returns `[]` unless
 * both windows hold at least `minRecent` (default 10) runs and the statistic
 * exceeds the critical value for `chiSquareAlpha` (default 0.05).
 */
function detectDistributionShift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.chiSquareAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;
  const categorize = opts.category;

  const labelled = [];
  for (const run of runs) {
    const bucket = categorize(run);
    const usable = typeof bucket === "string" && bucket.length > 0;
    if (usable) labelled.push({ run, bucket });
  }
  if (labelled.length < minRecent + 1) return [];

  const recent = labelled.slice(-Math.min(recentWindow, labelled.length));
  const historical = labelled.slice(0, -recent.length).slice(-historyWindow);
  const enoughData = recent.length >= minRecent && historical.length >= minRecent;
  if (!enoughData) return [];

  const seen = /* @__PURE__ */ new Set();
  recent.forEach((entry) => seen.add(entry.bucket));
  historical.forEach((entry) => seen.add(entry.bucket));
  const bucketList = [...seen].sort();

  // Zero-initialise both tallies in sorted bucket order, then count.
  const recentCounts = {};
  const histCounts = {};
  bucketList.forEach((name) => {
    recentCounts[name] = 0;
    histCounts[name] = 0;
  });
  recent.forEach((entry) => {
    recentCounts[entry.bucket] += 1;
  });
  historical.forEach((entry) => {
    histCounts[entry.bucket] += 1;
  });

  let chi = 0;
  let usedBuckets = 0;
  bucketList.forEach((name) => {
    const expected = histCounts[name] / historical.length * recent.length;
    if (expected < 1) return;
    const observed = recentCounts[name];
    chi += (observed - expected) ** 2 / expected;
    usedBuckets += 1;
  });
  const df = Math.max(1, usedBuckets - 1);
  const critical = chiSquareCritical(df, alpha);
  if (!(chi > critical)) return [];

  return [
    {
      kind: "distribution_shift",
      severity: "warn",
      message: `eval-set distribution shift: \u03C7\xB2=${chi.toFixed(2)} df=${df} exceeds critical=${critical.toFixed(2)} at alpha=${alpha}`,
      evidence: {
        chi,
        df,
        critical,
        alpha,
        recentCounts,
        historicalCounts: histCounts,
        recentN: recent.length,
        historyN: historical.length
      }
    }
  ];
}
10907
/**
 * Upper-tail chi-square critical value for `df` degrees of freedom.
 *
 * `alpha` is snapped to one of four tabulated levels: >= 0.1 -> 0.10,
 * >= 0.05 -> 0.05, >= 0.025 -> 0.025, anything else -> 0.01. Tabulated df
 * values are returned directly; df above 30 uses a cube-of-normal
 * approximation; df between tabulated rows is linearly interpolated; any
 * other df falls back to the df = 10 row.
 */
function chiSquareCritical(df, alpha) {
  // Columns: alpha = 0.10, 0.05, 0.025, 0.01.
  const TABLE = {
    1: [2.71, 3.84, 5.02, 6.63],
    2: [4.61, 5.99, 7.38, 9.21],
    3: [6.25, 7.81, 9.35, 11.34],
    4: [7.78, 9.49, 11.14, 13.28],
    5: [9.24, 11.07, 12.83, 15.09],
    6: [10.64, 12.59, 14.45, 16.81],
    7: [12.02, 14.07, 16.01, 18.48],
    8: [13.36, 15.51, 17.53, 20.09],
    9: [14.68, 16.92, 19.02, 21.67],
    10: [15.99, 18.31, 20.48, 23.21],
    15: [22.31, 25, 27.49, 30.58],
    20: [28.41, 31.41, 34.17, 37.57],
    25: [34.38, 37.65, 40.65, 44.31],
    30: [40.26, 43.77, 46.98, 50.89]
  };
  let column = 3;
  if (alpha >= 0.1) column = 0;
  else if (alpha >= 0.05) column = 1;
  else if (alpha >= 0.025) column = 2;

  const exactRow = TABLE[df];
  if (exactRow) return exactRow[column];

  if (df > 30) {
    // Standard-normal quantiles matching the four alpha columns.
    const z = [1.282, 1.645, 1.96, 2.326][column] ?? 1.96;
    const term = 1 - 2 / (9 * df) + z * Math.sqrt(2 / (9 * df));
    return df * term ** 3;
  }

  const knots = Object.keys(TABLE).map((k) => Number(k)).sort((a, b) => a - b);
  for (let i = 1; i < knots.length; i++) {
    const lo = knots[i - 1];
    const hi = knots[i];
    const inside = df >= lo && df <= hi;
    if (!inside) continue;
    const t = (df - lo) / (hi - lo);
    return TABLE[lo][column] * (1 - t) + TABLE[hi][column] * t;
  }
  return TABLE[10][column];
}
10943
/** Arithmetic mean of `xs`; 0 when the array is empty. */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
10947
+
10948
+ // src/benchmarks/types.ts
10949
/**
 * 32-bit FNV-1a hash over the low byte of each UTF-16 code unit of `input`.
 *
 * Only `charCodeAt(i) & 255` is mixed in, so code units that differ only in
 * their high byte hash identically.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
function fnv1a32(input) {
  const FNV_PRIME = 16777619;
  let hash = 2166136261;
  for (let i = 0; i < input.length; i += 1) {
    const byte = input.charCodeAt(i) & 255;
    // Math.imul performs the 32-bit wrapping multiply; >>> 0 makes it unsigned.
    hash = Math.imul(hash ^ byte, FNV_PRIME) >>> 0;
  }
  return hash;
}
10957
/** Default seed mixed into every split hash. */
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";

/**
 * Assign an item to a split from a hash of `${seed}::${itemId}`.
 *
 * The hash is scaled into [0, 1): below 0.6 is "search", below 0.8 is "dev",
 * and the rest is "holdout".
 *
 * @param {string} itemId - Item identifier.
 * @param {string} [seed=BENCHMARK_SPLIT_SEED] - Seed prefixed to the hashed key.
 * @returns {"search"|"dev"|"holdout"} Split name.
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  const hashed = fnv1a32(`${seed}::${itemId}`);
  const position = hashed / 4294967296;
  if (position >= 0.8) return "holdout";
  return position < 0.6 ? "search" : "dev";
}
10965
+
10966
+ // src/benchmarks/index.ts
10967
+ var benchmarks_exports = {};
10968
+ __export(benchmarks_exports, {
10969
+ BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
10970
+ deterministicSplit: () => deterministicSplit,
10971
+ gsm8k: () => gsm8k_exports,
10972
+ routing: () => routing_exports,
10973
+ swebenchLite: () => swebench_lite_exports
10974
+ });
10975
+
10976
+ // src/benchmarks/gsm8k/index.ts
10977
+ var gsm8k_exports = {};
10978
+ __export(gsm8k_exports, {
10979
+ Gsm8kAdapter: () => Gsm8kAdapter,
10980
+ assignSplit: () => assignSplit,
10981
+ evaluate: () => evaluate,
10982
+ loadDataset: () => loadDataset,
10983
+ parseGsm8kAnswer: () => parseGsm8kAnswer
10984
+ });
10985
+ import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
10986
/**
 * Benchmark adapter for GSM8K: loads items from a JSONL file named by the
 * AGENT_EVAL_GSM8K_PATH environment variable and grades responses by
 * comparing the parsed numeric answers.
 */
var Gsm8kAdapter = class {
  /**
   * Load the items of the JSONL dataset that belong to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: {question: string, answer: string}}>>}
   * @throws {Error} When the env var is unset or the file does not exist.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH;
    if (!datasetPath) {
      throw new Error(
        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
      );
    }
    if (!existsSync5(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`);
    }
    const belongsToSplit = (entry) => assignSplitImpl(entry.id) === split;
    return parseJsonl(datasetPath).filter(belongsToSplit);
  }

  /**
   * Score a response 1 when its parsed number is within 1e-6 of the
   * reference answer's parsed number, otherwise 0.
   *
   * @param {{payload: {answer: string}}} item - Dataset item.
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>}
   */
  async evaluate(item, response) {
    const reference = item.payload.answer;
    const expected = parseGsm8kAnswer(reference);
    const observed = parseGsm8kAnswer(response);
    if (expected === null) {
      const raw = { reason: "reference_not_numeric", expected: reference };
      return { score: 0, raw };
    }
    if (observed === null) {
      const raw = { reason: "no_numeric_in_response", expected, observed: null };
      return { score: 0, raw };
    }
    const exactMatch = Math.abs(expected - observed) < 1e-6;
    const score = exactMatch ? 1 : 0;
    return { score, raw: { expected, observed, exactMatch } };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split the item is assigned to.
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
11016
/**
 * Split assignment for GSM8K items; the id is namespaced with `gsm8k::`
 * before being passed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split name returned by `deterministicSplit`.
 */
function assignSplitImpl(itemId) {
  const namespacedId = `gsm8k::${itemId}`;
  return deterministicSplit(namespacedId);
}
11019
/**
 * Read a GSM8K JSONL file into benchmark items.
 *
 * Blank lines are skipped. Each remaining line must be a JSON object with
 * non-empty `question` and `answer`; a missing `id` defaults to
 * `gsm8k_<lineNo>`. All three fields are converted with `String()`.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: {question: string, answer: string}}>}
 * @throws {Error} With the 1-based line number when a line is not valid
 *   JSON, is not a JSON object, or lacks question/answer.
 */
function parseJsonl(path) {
  const raw = readFileSync5(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Keep the original SyntaxError reachable for debugging.
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null` and scalars are valid JSON; reject them here so the failure
    // names the line instead of surfacing as a TypeError on `row.id`.
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`);
    }
    const id = String(row.id ?? `gsm8k_${lineNo}`);
    const question = String(row.question ?? "");
    const answer = String(row.answer ?? "");
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
    }
    out.push({ id, payload: { question, answer } });
  }
  return out;
}
11043
/**
 * Extract the numeric answer from GSM8K-style text.
 *
 * The number following a `####` marker wins when it is finite; otherwise the
 * last number-like token in the text is used. Thousands separators (commas)
 * are removed before conversion.
 *
 * @param {string} text - Reference answer or model response.
 * @returns {number|null} Parsed number, or null when none is found.
 */
function parseGsm8kAnswer(text) {
  if (!text) return null;

  const toNumber = (digits) => Number(digits.replace(/,/g, ""));

  const marked = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
  if (marked) {
    const fromMarker = toNumber(marked[1]);
    if (Number.isFinite(fromMarker)) return fromMarker;
  }

  const candidates = text.match(/-?\d[\d,]*\.?\d*/g);
  if (!candidates || candidates.length === 0) return null;

  const fromLast = toNumber(candidates[candidates.length - 1]);
  if (!Number.isFinite(fromLast)) return null;
  return fromLast;
}
11058
+ var adapter = new Gsm8kAdapter();
11059
+ var loadDataset = adapter.loadDataset.bind(adapter);
11060
+ var evaluate = adapter.evaluate.bind(adapter);
11061
+ var assignSplit = adapter.assignSplit.bind(adapter);
11062
+
11063
+ // src/benchmarks/swebench-lite/index.ts
11064
+ var swebench_lite_exports = {};
11065
+ __export(swebench_lite_exports, {
11066
+ SweBenchLiteAdapter: () => SweBenchLiteAdapter,
11067
+ assignSplit: () => assignSplit2,
11068
+ evaluate: () => evaluate2,
11069
+ loadDataset: () => loadDataset2
11070
+ });
11071
+ import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
11072
+ import { spawn } from "child_process";
11073
/**
 * Benchmark adapter for SWE-Bench Lite. The dataset comes from the JSONL
 * file named by AGENT_EVAL_SWEBENCH_PATH, and grading is delegated to the
 * external command named by AGENT_EVAL_SWEBENCH_GRADER_CMD.
 */
var SweBenchLiteAdapter = class {
  /**
   * Load the dataset items assigned to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: object}>>}
   * @throws {Error} When the env var is unset or the file does not exist.
   */
  async loadDataset(split) {
    const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
    if (!path) {
      throw new Error(
        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
      );
    }
    if (!existsSync6(path)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
    }
    const all = parseJsonl2(path);
    return all.filter((it) => assignSplitImpl2(it.id) === split);
  }

  /**
   * Grade a patch by piping `{instance_id, patch}` JSON to the grader
   * command and reading a JSON verdict object from its stdout.
   *
   * @param {{payload: {instanceId: string}}} item - Dataset item.
   * @param {string} response - Candidate patch.
   * @returns {Promise<{score: number, raw: object}>} Score 1 when the
   *   verdict's `passed` field is truthy, otherwise 0.
   * @throws {Error} When the grader is not configured, or its stdout is not
   *   JSON or not a JSON object.
   */
  async evaluate(item, response) {
    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
    if (!cmd) {
      throw new Error(
        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
      );
    }
    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
    const result = await runGrader(cmd, stdinPayload);
    let parsed;
    try {
      parsed = JSON.parse(result.stdout);
    } catch (e) {
      throw new Error(
        `SWE-Bench grader emitted non-JSON stdout: ${e.message}\nstdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`,
        { cause: e }
      );
    }
    // `null`, scalars and arrays are valid JSON but not a verdict; fail
    // loudly rather than crash on `parsed.passed` or score a silent 0.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error(
        `SWE-Bench grader stdout is not a JSON object\nstdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`
      );
    }
    const passed = Boolean(parsed.passed);
    return {
      score: passed ? 1 : 0,
      raw: {
        passed,
        failToPassPassed: Boolean(parsed.fail_to_pass_passed),
        passToPassPassed: Boolean(parsed.pass_to_pass_passed),
        // Cap the stored log at 4000 characters.
        graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
      }
    };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split the item is assigned to.
   */
  assignSplit(itemId) {
    return assignSplitImpl2(itemId);
  }
};
11121
/**
 * Split assignment for SWE-Bench Lite items; the id is namespaced with
 * `swebench-lite::` before being passed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split name returned by `deterministicSplit`.
 */
function assignSplitImpl2(itemId) {
  const namespacedId = `swebench-lite::${itemId}`;
  return deterministicSplit(namespacedId);
}
11124
/**
 * Read a SWE-Bench Lite JSONL file into benchmark items.
 *
 * Blank lines are skipped. Each row must be a JSON object carrying
 * `instance_id` (or `instanceId`); the other fields accept either
 * snake_case or camelCase keys and default to empty values.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: object}>}
 * @throws {Error} With the 1-based line number when a line is not valid
 *   JSON, is not a JSON object, or lacks an instance id.
 */
function parseJsonl2(path) {
  const raw = readFileSync6(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Report the offending line, matching the GSM8K loader's behaviour.
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null` and scalars are valid JSON but cannot carry an instance id.
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`);
    }
    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
    }
    out.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
        repo: String(row.repo ?? ""),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
      }
    });
  }
  return out;
}
11151
/**
 * Coerce a dataset field into an array of strings.
 *
 * - An array keeps only its string elements.
 * - A string holding a JSON array is decoded and filtered the same way.
 * - Any other non-blank string is treated as a single entry, whether or not
 *   it happens to be valid JSON (e.g. `"123"`).
 * - Everything else, including a blank string, yields an empty array.
 *
 * @param {unknown} v - Raw field value.
 * @returns {string[]} String entries extracted from `v`.
 */
function asStringArray(v) {
  const onlyStrings = (list) => list.filter((x) => typeof x === "string");
  if (Array.isArray(v)) return onlyStrings(v);
  if (typeof v !== "string") return [];
  // A blank field means "no entries", not one empty entry.
  if (v.trim() === "") return [];
  try {
    const parsed = JSON.parse(v);
    if (Array.isArray(parsed)) return onlyStrings(parsed);
  } catch {
    // Not JSON: handled below as a single literal entry.
  }
  return [v];
}
11163
/**
 * Run the external grader command, feed it `stdin`, and collect its output.
 *
 * The command is split on whitespace (no shell, no quoting support); the
 * first token is the executable and the rest are its arguments.
 *
 * @param {string} cmd - Whitespace-separated command line.
 * @param {string} stdin - Payload written to the child's stdin.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves when the
 *   child closes with exit code 0.
 * @throws {Error} Rejects when the command is empty, the process cannot be
 *   spawned, or it closes with a non-zero (or null) exit code.
 */
function runGrader(cmd, stdin) {
  return new Promise((resolve, reject) => {
    // Trim first so leading whitespace cannot produce an empty executable.
    const [file, ...args] = cmd.trim().split(/\s+/);
    if (!file) {
      reject(new Error("grader command is empty"));
      return;
    }
    const child = spawn(file, args, { stdio: ["pipe", "pipe", "pipe"] });
    // Let the streams decode UTF-8 so multi-byte characters that straddle
    // chunk boundaries are not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`));
        return;
      }
      resolve({ stdout, stderr });
    });
    // A grader that exits without draining stdin raises EPIPE on this
    // stream; without a listener that would be an unhandled 'error' event.
    // The exit code reported by 'close' decides the outcome in that case.
    child.stdin.on("error", (err) => {
      if (err.code !== "EPIPE") reject(err);
    });
    child.stdin.end(stdin);
  });
}
11183
+ var adapter2 = new SweBenchLiteAdapter();
11184
+ var loadDataset2 = adapter2.loadDataset.bind(adapter2);
11185
+ var evaluate2 = adapter2.evaluate.bind(adapter2);
11186
+ var assignSplit2 = adapter2.assignSplit.bind(adapter2);
11187
+
11188
+ // src/benchmarks/routing/index.ts
11189
+ var routing_exports = {};
11190
+ __export(routing_exports, {
11191
+ ROUTING_DATASET: () => ROUTING_DATASET,
11192
+ RoutingAdapter: () => RoutingAdapter,
11193
+ assignSplit: () => assignSplit3,
11194
+ evaluate: () => evaluate3,
11195
+ extractRouteTokens: () => extractRouteTokens,
11196
+ loadDataset: () => loadDataset3
11197
+ });
11198
+
11199
+ // src/benchmarks/routing/dataset.ts
11200
+ var ROUTING_DATASET = [
11201
+ {
11202
+ id: "file_001",
11203
+ category: "file",
11204
+ prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
11205
+ route: "fs.write",
11206
+ synonyms: ["filesystem.write", "write_file"],
11207
+ hardNegatives: ["fs.read", "chat.reply"]
11208
+ },
11209
+ {
11210
+ id: "file_002",
11211
+ category: "file",
11212
+ prompt: "Read the contents of /etc/hosts and summarize the entries.",
11213
+ route: "fs.read",
11214
+ synonyms: ["filesystem.read", "read_file"],
11215
+ hardNegatives: ["fs.write", "search.web"]
11216
+ },
11217
+ {
11218
+ id: "file_003",
11219
+ category: "file",
11220
+ prompt: "List every Python file under src/ recursively.",
11221
+ route: "fs.list",
11222
+ synonyms: ["filesystem.list", "list_files"],
11223
+ hardNegatives: ["fs.read", "search.code"]
11224
+ },
11225
+ {
11226
+ id: "file_004",
11227
+ category: "file",
11228
+ prompt: "Delete the cached build at .turbo/cache.",
11229
+ route: "fs.delete",
11230
+ synonyms: ["filesystem.delete", "remove_file"],
11231
+ hardNegatives: ["fs.write", "fs.list"]
11232
+ },
11233
+ {
11234
+ id: "math_001",
11235
+ category: "math",
11236
+ prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
11237
+ route: "math.integral",
11238
+ synonyms: ["calculator.integral", "math.solve"],
11239
+ hardNegatives: ["math.derivative", "chat.reply"]
11240
+ },
11241
+ {
11242
+ id: "math_002",
11243
+ category: "math",
11244
+ prompt: "Compute the derivative of sin(x) * cos(x).",
11245
+ route: "math.derivative",
11246
+ synonyms: ["calculator.derivative", "math.solve"],
11247
+ hardNegatives: ["math.integral", "math.algebra"]
11248
+ },
11249
+ {
11250
+ id: "math_003",
11251
+ category: "math",
11252
+ prompt: "Solve 2x + 7 = 19 for x.",
11253
+ route: "math.algebra",
11254
+ synonyms: ["calculator.algebra", "math.solve"],
11255
+ hardNegatives: ["math.derivative", "math.integral"]
11256
+ },
11257
+ {
11258
+ id: "math_004",
11259
+ category: "math",
11260
+ prompt: "What is the prime factorization of 360?",
11261
+ route: "math.numbertheory",
11262
+ synonyms: ["calculator.factor", "math.solve"],
11263
+ hardNegatives: ["math.algebra", "search.web"]
11264
+ },
11265
+ {
11266
+ id: "search_001",
11267
+ category: "search",
11268
+ prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
11269
+ route: "search.web",
11270
+ synonyms: ["web.search", "search.papers"],
11271
+ hardNegatives: ["search.code", "chat.reply"]
11272
+ },
11273
+ {
11274
+ id: "search_002",
11275
+ category: "search",
11276
+ prompt: "Search the codebase for every call site of `runProposeReview`.",
11277
+ route: "search.code",
11278
+ synonyms: ["code.search", "grep"],
11279
+ hardNegatives: ["search.web", "fs.read"]
11280
+ },
11281
+ {
11282
+ id: "search_003",
11283
+ category: "search",
11284
+ prompt: "What is the latest release of the Tangle network on GitHub?",
11285
+ route: "search.web",
11286
+ synonyms: ["web.search", "github.releases"],
11287
+ hardNegatives: ["search.code", "chat.reply"]
11288
+ },
11289
+ {
11290
+ id: "search_004",
11291
+ category: "search",
11292
+ prompt: "Find all TODO comments in the agent-eval src tree.",
11293
+ route: "search.code",
11294
+ synonyms: ["code.search", "grep"],
11295
+ hardNegatives: ["search.web", "fs.list"]
11296
+ },
11297
+ {
11298
+ id: "chat_001",
11299
+ category: "chat",
11300
+ prompt: "Hi there, how are you doing today?",
11301
+ route: "chat.reply",
11302
+ synonyms: ["conversation.reply"],
11303
+ hardNegatives: ["search.web", "fs.read"]
11304
+ },
11305
+ {
11306
+ id: "chat_002",
11307
+ category: "chat",
11308
+ prompt: "Please explain the difference between an LLM and a foundation model.",
11309
+ route: "chat.reply",
11310
+ synonyms: ["conversation.reply", "qa.answer"],
11311
+ hardNegatives: ["search.web", "math.algebra"]
11312
+ },
11313
+ {
11314
+ id: "chat_003",
11315
+ category: "chat",
11316
+ prompt: "Tell me a short joke about distributed systems.",
11317
+ route: "chat.reply",
11318
+ synonyms: ["conversation.reply"],
11319
+ hardNegatives: ["search.web", "fs.read"]
11320
+ },
11321
+ {
11322
+ id: "chat_004",
11323
+ category: "chat",
11324
+ prompt: "Acknowledge my last message with a thumbs up.",
11325
+ route: "chat.reply",
11326
+ synonyms: ["conversation.reply", "react"],
11327
+ hardNegatives: ["fs.write", "search.web"]
11328
+ }
11329
+ ];
11330
+
11331
+ // src/benchmarks/routing/index.ts
11332
/**
 * Benchmark adapter for the built-in routing dataset: a response is correct
 * when it names the item's route or one of its synonyms.
 */
var RoutingAdapter = class {
  /**
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: object}>>} Items of
   *   ROUTING_DATASET assigned to `split`.
   */
  async loadDataset(split) {
    return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl3(it.id) === split);
  }

  /**
   * Score a response 1 when it mentions the expected route or a synonym
   * (case-insensitive), otherwise 0. Hard-negative mentions are reported in
   * `raw` but do not change the score.
   *
   * Dotted tokens (`fs.write`) come from `extractRouteTokens` and are
   * checked first. Standalone identifiers are checked second, because
   * several dataset synonyms (`write_file`, `grep`, `react`) contain no dot
   * and would otherwise never match.
   *
   * @param {{payload: {route: string, synonyms: string[], hardNegatives: string[], category: string}}} item
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>}
   */
  async evaluate(item, response) {
    const tokens2 = extractRouteTokens(response);
    // Identifiers that are not a segment of a dotted token: not preceded by
    // a word character or dot, and not followed by `.name`.
    const bareWords = response.match(/(?<![\w.])[a-z][a-z0-9_]*(?![a-z0-9_]|\.[a-z])/gi) ?? [];
    const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
    const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
    const firstIn = (names) => {
      const isListed = (t) => names.has(t.toLowerCase());
      return tokens2.find(isListed) ?? bareWords.find(isListed) ?? null;
    };
    const firstMatch = firstIn(correct);
    const firstHardNeg = firstIn(hardNeg);
    const score = firstMatch ? 1 : 0;
    return {
      score,
      raw: {
        firstToken: tokens2[0] ?? null,
        matchedRoute: firstMatch,
        hitHardNegative: Boolean(firstHardNeg),
        hardNegativeRoute: firstHardNeg,
        category: item.payload.category
      }
    };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split the item is assigned to.
   */
  assignSplit(itemId) {
    return assignSplitImpl3(itemId);
  }
};
11358
/**
 * Split assignment for routing items; the id is namespaced with `routing::`
 * before being passed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split name returned by `deterministicSplit`.
 */
function assignSplitImpl3(itemId) {
  const namespacedId = `routing::${itemId}`;
  return deterministicSplit(namespacedId);
}
11361
/**
 * Collect every `name.name` token in a response, in order of appearance.
 * Each segment starts with a letter and continues with letters, digits or
 * underscores; matching is case-insensitive and original casing is kept.
 *
 * @param {string} response - Model output to scan.
 * @returns {string[]} Matched tokens, or an empty array when there are none.
 */
function extractRouteTokens(response) {
  const dottedToken = /[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi;
  return Array.from(response.matchAll(dottedToken), (hit) => hit[0]);
}
11365
+ var adapter3 = new RoutingAdapter();
11366
+ var loadDataset3 = adapter3.loadDataset.bind(adapter3);
11367
+ var evaluate3 = adapter3.evaluate.bind(adapter3);
11368
+ var assignSplit3 = adapter3.assignSplit.bind(adapter3);
11369
+
10069
11370
  // src/reference-replay-steering.ts
10070
11371
  function referenceReplayRunsToSteeringRows(runs, options = {}) {
10071
11372
  const rows = [];
@@ -10257,9 +11558,9 @@ function aggregateTrials(population, scenarioIds, trials) {
10257
11558
  return {
10258
11559
  variantId: variant.id,
10259
11560
  scenarioId: sid,
10260
- meanScore: mean5(gradedTrials.map((t) => t.score)),
10261
- meanCost: mean5(gradedTrials.map((t) => t.cost ?? 0)),
10262
- meanDurationMs: mean5(gradedTrials.map((t) => t.durationMs ?? 0)),
11561
+ meanScore: mean7(gradedTrials.map((t) => t.score)),
11562
+ meanCost: mean7(gradedTrials.map((t) => t.cost ?? 0)),
11563
+ meanDurationMs: mean7(gradedTrials.map((t) => t.durationMs ?? 0)),
10263
11564
  okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
10264
11565
  trials: scenarioTrials.length,
10265
11566
  metrics
@@ -10267,10 +11568,10 @@ function aggregateTrials(population, scenarioIds, trials) {
10267
11568
  });
10268
11569
  return {
10269
11570
  variantId: variant.id,
10270
- meanScore: mean5(scenarios.map((s) => s.meanScore)),
10271
- meanCost: mean5(scenarios.map((s) => s.meanCost)),
10272
- meanDurationMs: mean5(scenarios.map((s) => s.meanDurationMs)),
10273
- okRate: mean5(scenarios.map((s) => s.okRate)),
11571
+ meanScore: mean7(scenarios.map((s) => s.meanScore)),
11572
+ meanCost: mean7(scenarios.map((s) => s.meanCost)),
11573
+ meanDurationMs: mean7(scenarios.map((s) => s.meanDurationMs)),
11574
+ okRate: mean7(scenarios.map((s) => s.okRate)),
10274
11575
  scenarios,
10275
11576
  metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
10276
11577
  };
@@ -10287,10 +11588,10 @@ function aggregateMetrics(rows) {
10287
11588
  }
10288
11589
  }
10289
11590
  const out = {};
10290
- for (const [k, list] of buckets) out[k] = mean5(list);
11591
+ for (const [k, list] of buckets) out[k] = mean7(list);
10291
11592
  return out;
10292
11593
  }
10293
- function mean5(xs) {
11594
+ function mean7(xs) {
10294
11595
  if (xs.length === 0) return 0;
10295
11596
  return xs.reduce((a, b) => a + b, 0) / xs.length;
10296
11597
  }
@@ -10331,11 +11632,11 @@ function samePopulation(a, b) {
10331
11632
  }
10332
11633
 
10333
11634
  // src/jsonl-trial-cache.ts
10334
- import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
11635
+ import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
10335
11636
  import { dirname as dirname4 } from "path";
10336
11637
 
10337
11638
  // src/locked-jsonl-appender.ts
10338
- import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
11639
+ import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
10339
11640
  import { dirname as dirname3 } from "path";
10340
11641
  var mutexes = /* @__PURE__ */ new Map();
10341
11642
  function getMutex(path) {
@@ -10350,7 +11651,7 @@ var LockedJsonlAppender = class {
10350
11651
  constructor(path) {
10351
11652
  this.path = path;
10352
11653
  this.mutex = getMutex(path);
10353
- if (!existsSync5(dirname3(path))) {
11654
+ if (!existsSync7(dirname3(path))) {
10354
11655
  mkdirSync3(dirname3(path), { recursive: true });
10355
11656
  }
10356
11657
  }
@@ -10375,8 +11676,8 @@ var JsonlTrialCache = class {
10375
11676
  appender;
10376
11677
  constructor(path) {
10377
11678
  this.path = path;
10378
- if (existsSync6(path)) {
10379
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
11679
+ if (existsSync8(path)) {
11680
+ for (const line of readFileSync7(path, "utf-8").split("\n")) {
10380
11681
  if (!line.trim()) continue;
10381
11682
  try {
10382
11683
  const entry = JSON.parse(line);
@@ -10414,7 +11715,7 @@ var JsonlTrialCache = class {
10414
11715
  };
10415
11716
 
10416
11717
  // src/evolution-telemetry.ts
10417
- import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
11718
+ import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
10418
11719
  import { dirname as dirname5 } from "path";
10419
11720
  var MutationTelemetry = class {
10420
11721
  appender;
@@ -10445,16 +11746,16 @@ var LineageRecorder = class {
10445
11746
  this.snapshotPath = `${path}.snapshot`;
10446
11747
  this.kindOf = kindOf ?? defaultKindOf;
10447
11748
  mkdirSync5(dirname5(path), { recursive: true });
10448
- if (existsSync7(this.snapshotPath)) {
11749
+ if (existsSync9(this.snapshotPath)) {
10449
11750
  try {
10450
- const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
11751
+ const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
10451
11752
  for (const n of parsed) this.nodes.set(n.id, n);
10452
11753
  } catch {
10453
11754
  }
10454
11755
  }
10455
- if (existsSync7(path)) {
11756
+ if (existsSync9(path)) {
10456
11757
  try {
10457
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
11758
+ for (const line of readFileSync8(path, "utf-8").split("\n")) {
10458
11759
  if (!line.trim()) continue;
10459
11760
  try {
10460
11761
  const entry = JSON.parse(line);
@@ -10466,9 +11767,9 @@ var LineageRecorder = class {
10466
11767
  } catch {
10467
11768
  }
10468
11769
  }
10469
- if (existsSync7(path) && this.nodes.size === 0) {
11770
+ if (existsSync9(path) && this.nodes.size === 0) {
10470
11771
  try {
10471
- const raw = readFileSync6(path, "utf-8").trim();
11772
+ const raw = readFileSync8(path, "utf-8").trim();
10472
11773
  if (raw.startsWith("[")) {
10473
11774
  const parsed = JSON.parse(raw);
10474
11775
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -10482,8 +11783,8 @@ var LineageRecorder = class {
10482
11783
  const prev = this.nodes.get(node.id);
10483
11784
  this.nodes.set(node.id, { ...prev, ...node });
10484
11785
  try {
10485
- if (existsSync7(this.path)) {
10486
- const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
11786
+ if (existsSync9(this.path)) {
11787
+ const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10487
11788
  if (head === "[") {
10488
11789
  writeFileSync(this.path, "");
10489
11790
  }
@@ -10549,9 +11850,9 @@ var CostLedger = class {
10549
11850
  mutex = new Mutex();
10550
11851
  constructor(path) {
10551
11852
  this.path = path;
10552
- if (existsSync7(path)) {
11853
+ if (existsSync9(path)) {
10553
11854
  try {
10554
- const loaded = JSON.parse(readFileSync6(path, "utf-8"));
11855
+ const loaded = JSON.parse(readFileSync8(path, "utf-8"));
10555
11856
  for (const k of Object.keys(this.totals)) {
10556
11857
  if (k === "byGeneration") {
10557
11858
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10975,9 +12276,9 @@ function passOrthogonality(input) {
10975
12276
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10976
12277
  }
10977
12278
  }
10978
- const mean7 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
12279
+ const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10979
12280
  return {
10980
- orthogonality: Math.max(0, Math.min(1, 1 - mean7)),
12281
+ orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
10981
12282
  passCount: passes.length,
10982
12283
  similarities: sims
10983
12284
  };
@@ -11023,8 +12324,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
11023
12324
  const iterations = options.iterations ?? 1e3;
11024
12325
  const minTotal = options.minTotalSamples ?? 6;
11025
12326
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
11026
- const baselineMean = mean6(baseline);
11027
- const candidateMean = mean6(candidate);
12327
+ const baselineMean = mean8(baseline);
12328
+ const candidateMean = mean8(candidate);
11028
12329
  const delta = candidateMean - baselineMean;
11029
12330
  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
11030
12331
  return {
@@ -11042,7 +12343,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
11042
12343
  for (let i = 0; i < iterations; i++) {
11043
12344
  const bResample = resample(baseline, rng);
11044
12345
  const cResample = resample(candidate, rng);
11045
- deltas[i] = mean6(cResample) - mean6(bResample);
12346
+ deltas[i] = mean8(cResample) - mean8(bResample);
11046
12347
  }
11047
12348
  deltas.sort((a, b) => a - b);
11048
12349
  const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -11065,7 +12366,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
11065
12366
  verdict
11066
12367
  };
11067
12368
  }
11068
- function mean6(xs) {
12369
+ function mean8(xs) {
11069
12370
  if (xs.length === 0) return 0;
11070
12371
  let s = 0;
11071
12372
  for (const x of xs) s += x;
@@ -11260,6 +12561,7 @@ function parseReflectionResponse(raw, maxProposals) {
11260
12561
  export {
11261
12562
  AgentDriver,
11262
12563
  AxGepaSteeringOptimizer,
12564
+ BENCHMARK_SPLIT_SEED,
11263
12565
  BenchmarkRunner,
11264
12566
  BudgetBreachError,
11265
12567
  BudgetGuard,
@@ -11288,6 +12590,7 @@ export {
11288
12590
  FileSystemExperimentStore,
11289
12591
  FileSystemOutcomeStore,
11290
12592
  FileSystemTraceStore,
12593
+ HeldOutGate,
11291
12594
  HoldoutAuditor,
11292
12595
  HoldoutLockedError,
11293
12596
  INTENT_MATCH_JUDGE_VERSION,
@@ -11307,6 +12610,7 @@ export {
11307
12610
  MultiLayerVerifier,
11308
12611
  MutationTelemetry,
11309
12612
  Mutex,
12613
+ NoopResearcher,
11310
12614
  OTEL_AGENT_EVAL_SCOPE,
11311
12615
  OptimizationLoop,
11312
12616
  PairwiseSteeringOptimizer,
@@ -11317,6 +12621,7 @@ export {
11317
12621
  PromptRegistry,
11318
12622
  REDACTION_VERSION,
11319
12623
  RunCritic,
12624
+ RunRecordValidationError,
11320
12625
  SEMANTIC_CONCEPT_JUDGE_VERSION,
11321
12626
  SandboxHarness,
11322
12627
  ScenarioRegistry,
@@ -11333,7 +12638,10 @@ export {
11333
12638
  analyzeSeries,
11334
12639
  argHash,
11335
12640
  attributeCounterfactuals,
12641
+ deterministicSplit as benchmarkDeterministicSplit,
12642
+ benchmarks_exports as benchmarks,
11336
12643
  benjaminiHochberg,
12644
+ bhAdjust,
11337
12645
  bisect,
11338
12646
  bonferroni,
11339
12647
  bootstrapCi,
@@ -11413,6 +12721,7 @@ export {
11413
12721
  formatBenchmarkReport,
11414
12722
  formatDriverReport,
11415
12723
  formatFindings,
12724
+ gainHistogram,
11416
12725
  precision as goldenPrecision,
11417
12726
  gradeSemanticStatus,
11418
12727
  groupBy,
@@ -11427,6 +12736,7 @@ export {
11427
12736
  isLlmSpan,
11428
12737
  isPrmVerdict,
11429
12738
  isRetrievalSpan,
12739
+ isRunRecord,
11430
12740
  isSandboxSpan,
11431
12741
  isToolSpan,
11432
12742
  jestTestParser,
@@ -11454,11 +12764,15 @@ export {
11454
12764
  normalizeScores,
11455
12765
  notBlocked,
11456
12766
  outputLengthRubric,
12767
+ pairedBootstrap,
11457
12768
  pairedTTest,
12769
+ pairedWilcoxon,
11458
12770
  paraphraseRobustness,
12771
+ paretoChart,
11459
12772
  paretoFrontier,
11460
12773
  paretoFrontierWithCrowding,
11461
12774
  parseReflectionResponse,
12775
+ parseRunRecordSafe,
11462
12776
  partialCredit,
11463
12777
  passOrthogonality,
11464
12778
  pixelDeltaRatio,
@@ -11489,9 +12803,11 @@ export {
11489
12803
  requiredSampleSize,
11490
12804
  resetLockedAppendersForTesting,
11491
12805
  resumeBuilderSession,
12806
+ roundTripRunRecord,
11492
12807
  rowCount,
11493
12808
  rowWhere,
11494
12809
  runAssertions,
12810
+ runCanaries,
11495
12811
  runCounterfactual,
11496
12812
  runE2EWorkflow,
11497
12813
  runExpectations,
@@ -11526,6 +12842,7 @@ export {
11526
12842
  stuckLoopView,
11527
12843
  summarize,
11528
12844
  summarizeHarnessResults,
12845
+ summaryTable,
11529
12846
  testJudge,
11530
12847
  textInSnapshot,
11531
12848
  toLangfuseEnvelope,
@@ -11539,6 +12856,7 @@ export {
11539
12856
  toolWasteView,
11540
12857
  typoMutator,
11541
12858
  urlContains,
12859
+ validateRunRecord,
11542
12860
  verbosityBias,
11543
12861
  verifyManifest,
11544
12862
  visualDiff,
@@ -11548,6 +12866,7 @@ export {
11548
12866
  weightedRecall,
11549
12867
  welchsTTest,
11550
12868
  whitespaceCollapseMutator,
11551
- wilcoxonSignedRank
12869
+ wilcoxonSignedRank,
12870
+ wranglerDeployRunner
11552
12871
  };
11553
12872
  //# sourceMappingURL=index.js.map