@tangle-network/agent-eval 0.14.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6,6 +6,9 @@ import {
6
6
  probeLlm,
7
7
  stripFencedJson
8
8
  } from "./chunk-ITN4YOZY.js";
9
+ import {
10
+ __export
11
+ } from "./chunk-PZ5AY32C.js";
9
12
 
10
13
  // src/client.ts
11
14
  var ProductClient = class {
@@ -396,36 +399,36 @@ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
396
399
  "false_confidence",
397
400
  "worst_failure"
398
401
  ]);
399
- function normalizeScores(scores) {
400
- return scores.map((s) => {
402
+ function normalizeScores(scores2) {
403
+ return scores2.map((s) => {
401
404
  if (INVERTED_DIMENSIONS.has(s.dimension)) {
402
405
  return s;
403
406
  }
404
407
  return s;
405
408
  });
406
409
  }
407
- function weightedMean(scores) {
408
- if (scores.length === 0) return 0;
410
+ function weightedMean(scores2) {
411
+ if (scores2.length === 0) return 0;
409
412
  let totalWeight = 0;
410
413
  let weightedSum = 0;
411
- for (const { score, weight } of scores) {
414
+ for (const { score, weight } of scores2) {
412
415
  const w = weight ?? 1;
413
416
  weightedSum += score * w;
414
417
  totalWeight += w;
415
418
  }
416
419
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
417
420
  }
418
- function confidenceInterval(scores, confidence = 0.95) {
419
- if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
420
- if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
421
- const n = scores.length;
422
- const mean7 = scores.reduce((a, b) => a + b, 0) / n;
421
+ function confidenceInterval(scores2, confidence = 0.95) {
422
+ if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
423
+ if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
424
+ const n = scores2.length;
425
+ const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
423
426
  const B = 1e3;
424
427
  const bootstrapMeans = [];
425
428
  for (let i = 0; i < B; i++) {
426
429
  let sum2 = 0;
427
430
  for (let j = 0; j < n; j++) {
428
- sum2 += scores[Math.floor(Math.random() * n)];
431
+ sum2 += scores2[Math.floor(Math.random() * n)];
429
432
  }
430
433
  bootstrapMeans.push(sum2 / n);
431
434
  }
@@ -434,7 +437,7 @@ function confidenceInterval(scores, confidence = 0.95) {
434
437
  const lowerIdx = Math.floor(alpha / 2 * B);
435
438
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
436
439
  return {
437
- mean: mean7,
440
+ mean: mean9,
438
441
  lower: bootstrapMeans[lowerIdx],
439
442
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
440
443
  };
@@ -522,11 +525,11 @@ function pairedTTest(before, after) {
522
525
  const n = before.length;
523
526
  if (n < 2) return { t: 0, df: 0, p: 1 };
524
527
  const diffs = before.map((b, i) => after[i] - b);
525
- const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
526
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
528
+ const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
529
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
527
530
  const se = Math.sqrt(variance2 / n);
528
- if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
529
- const t = mean7 / se;
531
+ if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
532
+ const t = mean9 / se;
530
533
  const df = n - 1;
531
534
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
532
535
  return { t, df, p };
@@ -544,15 +547,15 @@ function wilcoxonSignedRank(before, after) {
544
547
  while (i < n) {
545
548
  let j = i;
546
549
  while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
547
- const avg = (i + 1 + j) / 2;
548
- for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
550
+ const avg2 = (i + 1 + j) / 2;
551
+ for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg2;
549
552
  i = j;
550
553
  }
551
554
  let wPlus = 0;
552
555
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
553
- const mean7 = n * (n + 1) / 4;
556
+ const mean9 = n * (n + 1) / 4;
554
557
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
555
- const z = (wPlus - mean7) / Math.sqrt(variance2);
558
+ const z = (wPlus - mean9) / Math.sqrt(variance2);
556
559
  const p = 2 * (1 - normalCdf(Math.abs(z)));
557
560
  return { w: wPlus, p };
558
561
  }
@@ -753,8 +756,8 @@ async function executeScenario(tc, scenario, config) {
753
756
  console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
754
757
  await new Promise((r) => setTimeout(r, wait));
755
758
  }
756
- const scores = await judge(tc, judgeInput);
757
- judgeResults.push(scores);
759
+ const scores2 = await judge(tc, judgeInput);
760
+ judgeResults.push(scores2);
758
761
  await new Promise((r) => setTimeout(r, 3e3));
759
762
  break;
760
763
  } catch (err) {
@@ -847,8 +850,8 @@ var BenchmarkRunner = class {
847
850
  byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
848
851
  }
849
852
  for (const [name, data] of Object.entries(byJudge)) {
850
- const avg = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
851
- console.log(` ${name.padEnd(16)} avg=${avg} [${data.dimensions.join(", ")}]`);
853
+ const avg2 = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
854
+ console.log(` ${name.padEnd(16)} avg=${avg2} [${data.dimensions.join(", ")}]`);
852
855
  }
853
856
  console.log(` OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
854
857
  console.log();
@@ -2270,7 +2273,7 @@ var PromptOptimizer = class {
2270
2273
  });
2271
2274
  }
2272
2275
  }
2273
- const scores = config.variants.map((variant) => {
2276
+ const scores2 = config.variants.map((variant) => {
2274
2277
  const scenarioMap = rawScores.get(variant.id);
2275
2278
  const allSamples = [];
2276
2279
  const perScenario = {};
@@ -2293,10 +2296,10 @@ var PromptOptimizer = class {
2293
2296
  };
2294
2297
  });
2295
2298
  const rawPairs = [];
2296
- for (let i = 0; i < scores.length; i++) {
2297
- for (let j = i + 1; j < scores.length; j++) {
2298
- const a = scores[i];
2299
- const b = scores[j];
2299
+ for (let i = 0; i < scores2.length; i++) {
2300
+ for (let j = i + 1; j < scores2.length; j++) {
2301
+ const a = scores2[i];
2302
+ const b = scores2[j];
2300
2303
  const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
2301
2304
  rawPairs.push({ a, b, p });
2302
2305
  }
@@ -2310,7 +2313,7 @@ var PromptOptimizer = class {
2310
2313
  significant: qValues[idx] < alpha,
2311
2314
  meanDelta: r.b.mean - r.a.mean
2312
2315
  }));
2313
- const sorted = scores.slice().sort((x, y) => y.mean - x.mean);
2316
+ const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
2314
2317
  const winner = sorted[0];
2315
2318
  const second = sorted[1];
2316
2319
  const winnerComparisons = pairwise2.filter(
@@ -2324,7 +2327,7 @@ var PromptOptimizer = class {
2324
2327
  significant: significantOverAll,
2325
2328
  ciLowerBoundExceedsSecondMean
2326
2329
  },
2327
- scores,
2330
+ scores: scores2,
2328
2331
  pairwise: pairwise2,
2329
2332
  config: {
2330
2333
  trialsPerScenario: trials,
@@ -2870,20 +2873,20 @@ async function mapLimit(items, limit, fn) {
2870
2873
  function mean(values) {
2871
2874
  return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
2872
2875
  }
2873
- function meanRunScore(scores) {
2876
+ function meanRunScore(scores2) {
2874
2877
  return {
2875
- success: mean(scores.map((s) => s.success)),
2876
- goalProgress: mean(scores.map((s) => s.goalProgress)),
2877
- repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
2878
- driftPenalty: mean(scores.map((s) => s.driftPenalty)),
2879
- toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
2880
- patchQuality: mean(scores.map((s) => s.patchQuality)),
2881
- testReality: mean(scores.map((s) => s.testReality)),
2882
- finalGate: mean(scores.map((s) => s.finalGate)),
2883
- reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
2884
- costUsd: mean(scores.map((s) => s.costUsd)),
2885
- wallSeconds: mean(scores.map((s) => s.wallSeconds)),
2886
- notes: scores.flatMap((s) => s.notes ?? [])
2878
+ success: mean(scores2.map((s) => s.success)),
2879
+ goalProgress: mean(scores2.map((s) => s.goalProgress)),
2880
+ repoGroundedness: mean(scores2.map((s) => s.repoGroundedness)),
2881
+ driftPenalty: mean(scores2.map((s) => s.driftPenalty)),
2882
+ toolUseQuality: mean(scores2.map((s) => s.toolUseQuality)),
2883
+ patchQuality: mean(scores2.map((s) => s.patchQuality)),
2884
+ testReality: mean(scores2.map((s) => s.testReality)),
2885
+ finalGate: mean(scores2.map((s) => s.finalGate)),
2886
+ reviewerBlockers: mean(scores2.map((s) => s.reviewerBlockers)),
2887
+ costUsd: mean(scores2.map((s) => s.costUsd)),
2888
+ wallSeconds: mean(scores2.map((s) => s.wallSeconds)),
2889
+ notes: scores2.flatMap((s) => s.notes ?? [])
2887
2890
  };
2888
2891
  }
2889
2892
 
@@ -3339,12 +3342,12 @@ var SubprocessSandboxDriver = class {
3339
3342
  this.defaultEnv = options.env;
3340
3343
  }
3341
3344
  async exec(phase, command, config) {
3342
- const { spawn } = await import("child_process");
3345
+ const { spawn: spawn2 } = await import("child_process");
3343
3346
  const start = Date.now();
3344
3347
  const effectiveCwd = config.cwd ?? this.defaultCwd;
3345
3348
  const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
3346
3349
  return await new Promise((resolve) => {
3347
- const child = spawn(command, {
3350
+ const child = spawn2(command, {
3348
3351
  shell: true,
3349
3352
  cwd: effectiveCwd,
3350
3353
  env: effectiveEnv
@@ -5392,10 +5395,10 @@ function analyzeSeries(values, options = {}) {
5392
5395
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
5393
5396
  }
5394
5397
  const tail = values.slice(-window);
5395
- const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
5396
- const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
5398
+ const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
5399
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
5397
5400
  const stdDev = Math.sqrt(variance2);
5398
- const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
5401
+ const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
5399
5402
  const cv = stdDev / refMean;
5400
5403
  const stable = tail.length >= window && cv <= stableCv;
5401
5404
  let tailRun = 0;
@@ -5416,7 +5419,7 @@ function analyzeSeries(values, options = {}) {
5416
5419
  } else {
5417
5420
  state = "noisy";
5418
5421
  }
5419
- return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
5422
+ return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
5420
5423
  }
5421
5424
 
5422
5425
  // src/state-continuity.ts
@@ -6012,9 +6015,9 @@ function calibrateJudge(golden, candidate) {
6012
6015
  const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
6013
6016
  return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
6014
6017
  }
6015
- function positionalBias(scores) {
6018
+ function positionalBias(scores2) {
6016
6019
  const pairs = /* @__PURE__ */ new Map();
6017
- for (const s of scores) {
6020
+ for (const s of scores2) {
6018
6021
  const slot = pairs.get(s.itemId) ?? {};
6019
6022
  if (s.positionOfAInput === "first") slot.first = s.score;
6020
6023
  else if (s.positionOfAInput === "second") slot.second = s.score;
@@ -6165,12 +6168,12 @@ function renderMarkdownReport(reports) {
6165
6168
  async function aggregateRunMetrics(runs, store) {
6166
6169
  if (runs.length === 0) return {};
6167
6170
  const durations = [];
6168
- const scores = [];
6171
+ const scores2 = [];
6169
6172
  const passes = [];
6170
6173
  const costs = [];
6171
6174
  for (const r of runs) {
6172
6175
  if (r.endedAt) durations.push(r.endedAt - r.startedAt);
6173
- if (r.outcome?.score !== void 0) scores.push(r.outcome.score);
6176
+ if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
6174
6177
  passes.push(r.outcome?.pass === true ? 1 : 0);
6175
6178
  const llm = await llmSpans(store, r.runId);
6176
6179
  costs.push(aggregateLlm(llm).costUsd);
@@ -6179,7 +6182,7 @@ async function aggregateRunMetrics(runs, store) {
6179
6182
  provisionMs: average(durations),
6180
6183
  firstTokenMs: average(durations),
6181
6184
  wallMs: average(durations),
6182
- overallScore: average(scores),
6185
+ overallScore: average(scores2),
6183
6186
  passRate: average(passes),
6184
6187
  costUsd: average(costs)
6185
6188
  };
@@ -6242,7 +6245,7 @@ async function toLangfuseEnvelope(store, runId) {
6242
6245
  },
6243
6246
  metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
6244
6247
  }));
6245
- const scores = judges.map((j) => ({
6248
+ const scores2 = judges.map((j) => ({
6246
6249
  id: j.spanId,
6247
6250
  traceId: run.runId,
6248
6251
  observationId: j.targetSpanId,
@@ -6250,7 +6253,7 @@ async function toLangfuseEnvelope(store, runId) {
6250
6253
  value: j.score,
6251
6254
  comment: j.rationale
6252
6255
  }));
6253
- return { traceId: run.runId, generations, scores };
6256
+ return { traceId: run.runId, generations, scores: scores2 };
6254
6257
  }
6255
6258
  async function toPrometheusText(store) {
6256
6259
  const runs = await store.listRuns();
@@ -6344,12 +6347,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
6344
6347
  variantScores.push({ mutator: id, score, mutated });
6345
6348
  all.push(score);
6346
6349
  }
6347
- const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
6348
- const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
6350
+ const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
6351
+ const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
6349
6352
  const stdDev = Math.sqrt(variance2);
6350
- const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
6353
+ const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
6351
6354
  const robustness = Math.max(0, 1 - stdDev / ref);
6352
- return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
6355
+ return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
6353
6356
  }
6354
6357
  var lowercaseMutator = (p) => p.toLowerCase();
6355
6358
  var sentenceReorderMutator = (p, seed) => {
@@ -6684,8 +6687,8 @@ function ranks(xs) {
6684
6687
  for (let i = 0; i < indexed.length; i++) {
6685
6688
  let j = i;
6686
6689
  while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
6687
- const avg = (i + j + 2) / 2;
6688
- for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
6690
+ const avg2 = (i + j + 2) / 2;
6691
+ for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
6689
6692
  i = j;
6690
6693
  }
6691
6694
  return r;
@@ -6929,8 +6932,8 @@ function ranks2(xs) {
6929
6932
  for (let i = 0; i < indexed.length; i++) {
6930
6933
  let j = i;
6931
6934
  while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
6932
- const avg = (i + j + 2) / 2;
6933
- for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
6935
+ const avg2 = (i + j + 2) / 2;
6936
+ for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
6934
6937
  i = j;
6935
6938
  }
6936
6939
  return r;
@@ -7270,8 +7273,8 @@ async function prmBestOfN(store, grader, runIds) {
7270
7273
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
7271
7274
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
7272
7275
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
7273
- const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7274
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
7276
+ const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7277
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
7275
7278
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7276
7279
  }
7277
7280
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -7293,8 +7296,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
7293
7296
  const ranked = [...byRun.values()].sort(
7294
7297
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
7295
7298
  );
7296
- const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7297
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
7299
+ const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7300
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
7298
7301
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7299
7302
  }
7300
7303
 
@@ -7672,15 +7675,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7672
7675
  const rejected = [];
7673
7676
  const surviving = [];
7674
7677
  for (const candidate of proposed) {
7675
- const scores = await scorer.scoreCandidate(candidate, targets);
7676
- if (scores.length < 2) {
7678
+ const scores2 = await scorer.scoreCandidate(candidate, targets);
7679
+ if (scores2.length < 2) {
7677
7680
  rejected.push({ candidate, reason: "scorer returned <2 results" });
7678
7681
  continue;
7679
7682
  }
7680
- const values = scores.map((s) => s.score);
7683
+ const values = scores2.map((s) => s.score);
7681
7684
  const spread = Math.max(...values) - Math.min(...values);
7682
7685
  const maxScore = Math.max(...values);
7683
- scored.push({ candidate, scores, spread });
7686
+ scored.push({ candidate, scores: scores2, spread });
7684
7687
  if (maxScore < floor) {
7685
7688
  rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
7686
7689
  continue;
@@ -7822,10 +7825,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7822
7825
  }
7823
7826
  for (const s of scenarios) {
7824
7827
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7825
- const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7826
- if (scores.length < 3) continue;
7827
- const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
7828
- const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
7828
+ const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7829
+ if (scores2.length < 3) continue;
7830
+ const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
7831
+ const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
7829
7832
  if (variance2 > varianceThreshold) {
7830
7833
  targets.push({
7831
7834
  reason: "high-variance",
@@ -8580,20 +8583,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8580
8583
  let durationMs = 0;
8581
8584
  const reasonParts = [];
8582
8585
  const diagnostics = {};
8583
- for (const { adapter, result } of perAdapter) {
8586
+ for (const { adapter: adapter4, result } of perAdapter) {
8584
8587
  status = worst(status, result.status);
8585
8588
  if (typeof result.score === "number") {
8586
8589
  weightedScoreSum += result.score;
8587
8590
  weightCount += 1;
8588
8591
  }
8589
8592
  durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
8590
- reasonParts.push(`${adapter}: ${result.status}`);
8593
+ reasonParts.push(`${adapter4}: ${result.status}`);
8591
8594
  for (const f of result.findings) {
8592
8595
  findings.push({
8593
8596
  ...f,
8594
8597
  layer: name,
8595
- message: prefix ? `${prefix(adapter)} ${f.message}` : f.message,
8596
- detail: { ...f.detail ?? {}, adapter }
8598
+ message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
8599
+ detail: { ...f.detail ?? {}, adapter: adapter4 }
8597
8600
  });
8598
8601
  }
8599
8602
  for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
@@ -8612,8 +8615,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
8612
8615
  reason: reasonParts.join(" \xB7 "),
8613
8616
  diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
8614
8617
  detail: {
8615
- adapters: perAdapter.map(({ adapter, result }) => ({
8616
- adapter,
8618
+ adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
8619
+ adapter: adapter4,
8617
8620
  status: result.status,
8618
8621
  score: result.score ?? null
8619
8622
  })),
@@ -8639,10 +8642,10 @@ function multiToolchainLayer(config) {
8639
8642
  reason: "no adapters detected"
8640
8643
  };
8641
8644
  }
8642
- const runOne = async (adapter) => {
8643
- const adapterName = config.adapterName(adapter);
8645
+ const runOne = async (adapter4) => {
8646
+ const adapterName = config.adapterName(adapter4);
8644
8647
  try {
8645
- const r = await config.run(adapter, ctx);
8648
+ const r = await config.run(adapter4, ctx);
8646
8649
  return { adapter: adapterName, result: r };
8647
8650
  } catch (err) {
8648
8651
  return {
@@ -9345,6 +9348,57 @@ function viteDeployRunner(input) {
9345
9348
  }
9346
9349
  };
9347
9350
  }
9351
+ function wranglerDeployRunner(input) {
9352
+ return {
9353
+ run: async () => {
9354
+ const start = Date.now();
9355
+ const buildCmd = input.buildCommand ?? "npm run build";
9356
+ const dryCmd = input.dryRunCommand ?? "npx wrangler deploy --dry-run --outdir dist";
9357
+ const timeoutMs = input.timeoutMs ?? 12e4;
9358
+ const hasToml = await input.exists("wrangler.toml");
9359
+ const hasJsonc = hasToml ? false : await input.exists("wrangler.jsonc");
9360
+ if (!hasToml && !hasJsonc) {
9361
+ return {
9362
+ ok: false,
9363
+ output: "no wrangler config found (wrangler.toml / wrangler.jsonc absent)",
9364
+ durationMs: Date.now() - start,
9365
+ artifactDir: "dist",
9366
+ artifactValid: false
9367
+ };
9368
+ }
9369
+ const build = await input.exec(buildCmd, { cwd: input.workdir, timeoutMs });
9370
+ if (build.exitCode !== 0) {
9371
+ const tail2 = ((build.stderr || build.stdout) ?? "").slice(-1500);
9372
+ return {
9373
+ ok: false,
9374
+ output: `build failed: ${tail2}`,
9375
+ durationMs: Date.now() - start,
9376
+ artifactDir: "dist",
9377
+ artifactValid: false
9378
+ };
9379
+ }
9380
+ const dry = await input.exec(dryCmd, { cwd: input.workdir, timeoutMs });
9381
+ if (dry.exitCode !== 0) {
9382
+ const tail2 = ((dry.stderr || dry.stdout) ?? "").slice(-1500);
9383
+ return {
9384
+ ok: false,
9385
+ output: `wrangler dry-run failed: ${tail2}`,
9386
+ durationMs: Date.now() - start,
9387
+ artifactDir: "dist",
9388
+ artifactValid: false
9389
+ };
9390
+ }
9391
+ const tail = ((dry.stdout || dry.stderr) ?? "").slice(-1500);
9392
+ return {
9393
+ ok: true,
9394
+ output: tail,
9395
+ durationMs: Date.now() - start,
9396
+ artifactDir: "dist",
9397
+ artifactValid: true
9398
+ };
9399
+ }
9400
+ };
9401
+ }
9348
9402
 
9349
9403
  // src/keyword-coverage-judge.ts
9350
9404
  function htmlContainsElement(html, selector) {
@@ -9712,15 +9766,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
9712
9766
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
9713
9767
  const matchStrategy = options.matchStrategy ?? "reference-order";
9714
9768
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
9715
- const scores = scenarios.filter((scenario) => {
9769
+ const scores2 = scenarios.filter((scenario) => {
9716
9770
  const split = scenario.split ?? "train";
9717
9771
  if (split === "holdout" && !options.includeHoldout) return false;
9718
9772
  return allowedSplits.has(split);
9719
9773
  }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
9720
9774
  return {
9721
- scenarios: scores,
9722
- aggregate: aggregateScenarioScores(scores),
9723
- bySplit: aggregateBySplit(scores)
9775
+ scenarios: scores2,
9776
+ aggregate: aggregateScenarioScores(scores2),
9777
+ bySplit: aggregateBySplit(scores2)
9724
9778
  };
9725
9779
  }
9726
9780
  function compareReferenceReplay(baseline, candidate) {
@@ -9935,20 +9989,20 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9935
9989
  matches: matches2
9936
9990
  };
9937
9991
  }
9938
- function aggregateBySplit(scores) {
9992
+ function aggregateBySplit(scores2) {
9939
9993
  const out = {};
9940
9994
  for (const split of ALL_SPLITS) {
9941
- const scoped = scores.filter((score) => score.split === split);
9995
+ const scoped = scores2.filter((score) => score.split === split);
9942
9996
  if (scoped.length > 0) out[split] = aggregateScenarioScores(scoped);
9943
9997
  }
9944
9998
  return out;
9945
9999
  }
9946
- function aggregateScenarioScores(scores) {
9947
- const matched = sum(scores.map((score) => score.matched));
9948
- const total = sum(scores.map((score) => score.total));
9949
- const falsePositives = sum(scores.map((score) => score.falsePositives));
9950
- const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9951
- const totalWeight = sum(scores.map((score) => score.totalWeight));
10000
+ function aggregateScenarioScores(scores2) {
10001
+ const matched = sum(scores2.map((score) => score.matched));
10002
+ const total = sum(scores2.map((score) => score.total));
10003
+ const falsePositives = sum(scores2.map((score) => score.falsePositives));
10004
+ const matchedWeight = sum(scores2.map((score) => score.matchedWeight));
10005
+ const totalWeight = sum(scores2.map((score) => score.totalWeight));
9952
10006
  const precision2 = ratio(matched, matched + falsePositives);
9953
10007
  const recall = ratio(matched, total);
9954
10008
  return {
@@ -10027,8 +10081,8 @@ function formatPct(value) {
10027
10081
  function bySplitOrder(a, b) {
10028
10082
  return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
10029
10083
  }
10030
- function runAdapter(adapter, scenario, context) {
10031
- return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
10084
+ function runAdapter(adapter4, scenario, context) {
10085
+ return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
10032
10086
  }
10033
10087
  function throwIfAborted(signal) {
10034
10088
  if (!signal?.aborted) return;
@@ -10066,6 +10120,1258 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
10066
10120
  "which"
10067
10121
  ]);
10068
10122
 
10123
+ // src/paired-stats.ts
10124
/**
 * Percentile-bootstrap confidence interval for the paired difference
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - baseline observations.
 * @param {number[]} after - treatment observations, index-aligned with `before`.
 * @param {Object} [opts]
 * @param {number} [opts.confidence=0.95] - interval coverage, strictly inside (0, 1).
 * @param {number} [opts.resamples=2000] - number of bootstrap resamples (positive integer).
 * @param {"median"|"mean"} [opts.statistic="median"] - statistic recomputed on each
 *   resample; any value other than "mean" selects the median.
 * @param {number} [opts.seed] - seed handed to `makeRng`; omitted means `Math.random`.
 * @returns {{n: number, median: number, mean: number, low: number, high: number,
 *   confidence: number, resamples: number}} `median`/`mean` are computed on the
 *   observed deltas; `low`/`high` are percentiles of the resampled statistic.
 * @throws {Error} on unequal sample sizes, a confidence outside (0, 1) (including
 *   NaN), or a `resamples` value that is not a positive integer.
 */
function pairedBootstrap(before, after, opts = {}) {
  if (before.length !== after.length) {
    throw new Error(
      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.resamples ?? 2e3;
  const statistic = opts.statistic ?? "median";
  // Negated range test so that NaN is rejected as well.
  if (!(confidence > 0 && confidence < 1)) {
    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
  }
  // With zero resamples the percentile lookup below would read past the
  // sample array and report `undefined` bounds, so fail loudly instead.
  if (!Number.isInteger(resamples) || resamples < 1) {
    throw new Error(`pairedBootstrap: resamples must be a positive integer, got ${resamples}`);
  }
  const n = before.length;
  const deltas = before.map((b, i) => after[i] - b);
  if (n === 0) {
    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
  }
  if (n === 1) {
    // A single pair has no sampling variability to resample.
    const d = deltas[0];
    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
  }
  const rng = makeRng(opts.seed);
  const drawDelta = () => deltas[Math.floor(rng() * n)];
  const samples = new Array(resamples);
  for (let b = 0; b < resamples; b++) {
    if (statistic === "mean") {
      let total = 0;
      for (let k = 0; k < n; k++) {
        total += drawDelta();
      }
      samples[b] = total / n;
    } else {
      const resample = new Array(n);
      for (let k = 0; k < n; k++) {
        resample[k] = drawDelta();
      }
      samples[b] = medianInPlace(resample);
    }
  }
  samples.sort((a, b) => a - b);
  const alpha = 1 - confidence;
  const lowIdx = Math.floor(alpha / 2 * resamples);
  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return {
    n,
    median: medianInPlace([...deltas]),
    mean: deltas.reduce((s, x) => s + x, 0) / n,
    low: samples[lowIdx],
    // Never let the upper index fall below the lower one for tiny resample counts.
    high: samples[Math.max(highIdx, lowIdx)],
    confidence,
    resamples
  };
}
10177
/**
 * Paired significance test: forwards both samples to `wilcoxonSignedRank`
 * and returns its result unchanged.
 *
 * @param {number[]} before
 * @param {number[]} after
 * @returns {*} the value returned by `wilcoxonSignedRank(before, after)`.
 */
function pairedWilcoxon(before, after) {
  const testResult = wilcoxonSignedRank(before, after);
  return testResult;
}
10180
/**
 * Multiple-comparison adjustment: forwards to `benjaminiHochberg` and returns
 * its result unchanged.
 *
 * @param {number[]} pValues - raw p-values.
 * @param {number} [fdr=0.05] - second argument passed to `benjaminiHochberg`.
 * @returns {*} the value returned by `benjaminiHochberg(pValues, fdr)`.
 */
function bhAdjust(pValues, fdr = 0.05) {
  const adjusted = benjaminiHochberg(pValues, fdr);
  return adjusted;
}
10183
/**
 * Median of a numeric array. Sorts `xs` in place (ascending), so pass a copy
 * when the caller's ordering must survive.
 *
 * @param {number[]} xs
 * @returns {number} the median, or 0 for an empty array.
 */
function medianInPlace(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  xs.sort((left, right) => left - right);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return xs[upper];
  // Even length: average the two middle elements.
  return (xs[upper - 1] + xs[upper]) / 2;
}
10189
/**
 * Build a uniform [0, 1) random source.
 *
 * @param {number} [seed] - when omitted, `Math.random` itself is returned.
 *   Otherwise the seed is truncated to a 32-bit integer (a zero result falls
 *   back to 2654435769) and drives a deterministic 32-bit generator whose
 *   increment and mixing steps match the widely circulated "mulberry32".
 * @returns {() => number} generator returning values in [0, 1).
 */
function makeRng(seed) {
  if (seed === void 0) return Math.random;
  let state = seed | 0 || 2654435769;
  return () => {
    state = state + 1831565813 | 0;
    const stirred = Math.imul(state ^ state >>> 15, state | 1);
    const blended = stirred ^ stirred + Math.imul(stirred ^ stirred >>> 7, stirred | 61);
    // Force unsigned, then scale by 2^32 to land in [0, 1).
    return ((blended ^ blended >>> 14) >>> 0) / 4294967296;
  };
}
10200
+
10201
+ // src/run-record.ts
10202
// Top-level keys whose absence makes validateRunRecord throw
// `missing mandatory field "<key>"`; checked in this order.
var MANDATORY_TOP_LEVEL = [
  "runId", "experimentId", "candidateId",
  "seed", "model",
  "promptHash", "configHash", "commitSha",
  "wallMs", "costUsd", "tokenUsage",
  "outcome", "splitTag"
];
// The only values accepted for a record's `splitTag`.
var SPLIT_TAGS = ["search", "dev", "holdout"];
10218
/**
 * Error raised when a run record fails validation.
 * `path` names the offending field (dot-separated) and is appended to the
 * message as ` (at <path>)` when non-empty.
 */
var RunRecordValidationError = class extends Error {
  path;
  constructor(message, path = "") {
    const decorated = path ? `${message} (at ${path})` : message;
    super(decorated);
    this.name = "RunRecordValidationError";
    this.path = path;
  }
};
10226
/**
 * Validate the shape of a run record and return it unchanged.
 *
 * Checks run in a fixed order and the first failure throws, so the error
 * always names the earliest problem: presence of every mandatory key, scalar
 * field types, model snapshot suffix, `tokenUsage`, optional `judgeMetadata`,
 * `outcome` (at least one score plus a numeric `raw` map), optional
 * `failureMode`, and finally `splitTag`.
 *
 * @param {*} input - candidate record.
 * @returns {*} the same `input` reference once every check has passed.
 * @throws {RunRecordValidationError} describing the first failed check.
 */
function validateRunRecord(input) {
  const isObjectLike = (value) => value !== null && typeof value === "object";
  if (!isObjectLike(input)) {
    throw new RunRecordValidationError("expected object");
  }
  const record = input;
  const absentKey = MANDATORY_TOP_LEVEL.find((key) => !(key in record));
  if (absentKey !== void 0) {
    throw new RunRecordValidationError(`missing mandatory field "${absentKey}"`);
  }

  // Scalar fields, in the order their errors should take precedence.
  const scalarChecks = [
    ["runId", expectString],
    ["experimentId", expectString],
    ["candidateId", expectString],
    ["seed", expectFiniteNumber],
    ["model", expectString],
    ["promptHash", expectString],
    ["configHash", expectString],
    ["commitSha", expectString],
    ["wallMs", expectFiniteNumber]
  ];
  for (const [field, check] of scalarChecks) {
    check(record[field], field);
  }
  if (record.queueMs !== void 0) expectFiniteNumber(record.queueMs, "queueMs");
  expectFiniteNumber(record.costUsd, "costUsd");
  if (!modelHasSnapshot(record.model)) {
    throw new RunRecordValidationError(
      `model "${record.model}" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,
      "model"
    );
  }

  const usage = record.tokenUsage;
  if (!isObjectLike(usage)) {
    throw new RunRecordValidationError("tokenUsage must be an object", "tokenUsage");
  }
  expectFiniteNumber(usage.input, "tokenUsage.input");
  expectFiniteNumber(usage.output, "tokenUsage.output");
  if (usage.cached !== void 0) expectFiniteNumber(usage.cached, "tokenUsage.cached");

  const judge = record.judgeMetadata;
  if (judge !== void 0) {
    if (!isObjectLike(judge)) {
      throw new RunRecordValidationError("judgeMetadata must be an object", "judgeMetadata");
    }
    expectString(judge.model, "judgeMetadata.model");
    expectString(judge.promptVersion, "judgeMetadata.promptVersion");
    expectFiniteNumber(judge.confidence, "judgeMetadata.confidence");
    if (typeof judge.fallback !== "boolean") {
      throw new RunRecordValidationError("judgeMetadata.fallback must be boolean", "judgeMetadata.fallback");
    }
  }

  const outcome = record.outcome;
  if (!isObjectLike(outcome)) {
    throw new RunRecordValidationError("outcome must be an object", "outcome");
  }
  const hasSearch = outcome.searchScore !== void 0;
  const hasHoldout = outcome.holdoutScore !== void 0;
  if (hasSearch) expectFiniteNumber(outcome.searchScore, "outcome.searchScore");
  if (hasHoldout) expectFiniteNumber(outcome.holdoutScore, "outcome.holdoutScore");
  if (!hasSearch && !hasHoldout) {
    throw new RunRecordValidationError(
      "outcome must define searchScore or holdoutScore (or both)",
      "outcome"
    );
  }
  const rawMetrics = outcome.raw;
  if (!isObjectLike(rawMetrics)) {
    throw new RunRecordValidationError("outcome.raw must be an object", "outcome.raw");
  }
  Object.entries(rawMetrics).forEach(([metric, value]) => {
    expectFiniteNumber(value, `outcome.raw.${metric}`);
  });

  if (record.failureMode !== void 0) expectString(record.failureMode, "failureMode");
  const tagIsKnown = typeof record.splitTag === "string" && SPLIT_TAGS.includes(record.splitTag);
  if (!tagIsKnown) {
    throw new RunRecordValidationError(
      `splitTag must be one of ${SPLIT_TAGS.join(", ")}, got ${String(record.splitTag)}`,
      "splitTag"
    );
  }
  return input;
}
10303
/**
 * Boolean form of `validateRunRecord`: true when validation succeeds,
 * false when it throws (any thrown value counts as a failure).
 *
 * @param {*} input
 * @returns {boolean}
 */
function isRunRecord(input) {
  let valid = true;
  try {
    validateRunRecord(input);
  } catch {
    valid = false;
  }
  return valid;
}
10311
/**
 * Result-object form of `validateRunRecord`.
 *
 * @param {*} input
 * @returns {{ok: true, value: *} | {ok: false, error: RunRecordValidationError}}
 * @throws anything thrown by validation that is not a RunRecordValidationError.
 */
function parseRunRecordSafe(input) {
  let value;
  try {
    value = validateRunRecord(input);
  } catch (err) {
    // Only validation failures become results; unexpected errors propagate.
    if (!(err instanceof RunRecordValidationError)) throw err;
    return { ok: false, error: err };
  }
  return { ok: true, value };
}
10319
/**
 * Serialize a record to JSON, parse it back, and validate the parsed copy.
 *
 * @param {*} record
 * @returns {*} the validated copy produced by the JSON round trip.
 * @throws {RunRecordValidationError} when the parsed copy fails validation.
 */
function roundTripRunRecord(record) {
  const revived = JSON.parse(JSON.stringify(record));
  return validateRunRecord(revived);
}
10323
/**
 * Assert that `value` is a non-empty string.
 *
 * @param {*} value
 * @param {string} path - field path reported in the error.
 * @throws {RunRecordValidationError} otherwise.
 */
function expectString(value, path) {
  const isNonEmptyString = typeof value === "string" && value.length > 0;
  if (isNonEmptyString) return;
  throw new RunRecordValidationError(`expected non-empty string`, path);
}
10328
/**
 * Assert that `value` is a finite number (rejects NaN, ±Infinity and non-numbers).
 *
 * @param {*} value
 * @param {string} path - field path reported in the error.
 * @throws {RunRecordValidationError} otherwise.
 */
function expectFiniteNumber(value, path) {
  // Number.isFinite does not coerce, so non-number inputs are rejected here too.
  if (Number.isFinite(value)) return;
  throw new RunRecordValidationError(`expected finite number`, path);
}
10333
/**
 * Heuristic check that a model identifier is pinned to a snapshot.
 * Accepts any id containing "@", ending in `-YYYYMMDD` or `-YYYY-MM-DD`
 * (digits only, no calendar validation), or containing ":date-".
 *
 * @param {string} model
 * @returns {boolean}
 */
function modelHasSnapshot(model) {
  const snapshotPatterns = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return model.includes("@") || snapshotPatterns.some((pattern) => pattern.test(model));
}
10340
+
10341
+ // src/held-out-gate.ts
10342
/**
 * Promotion gate that compares a candidate's runs against a baseline's on the
 * holdout split. A candidate is promoted only if there are enough paired
 * holdout observations, the lower bootstrap bound of the paired median delta
 * clears `pairedDeltaThreshold`, and the candidate's search-minus-holdout gap
 * does not exceed the baseline's by more than `overfitGapThreshold`.
 */
var HeldOutGate = class {
  minProductiveRuns;
  pairedDeltaThreshold;
  overfitGapThreshold;
  baselineKey;
  confidence;
  resamples;
  seed;
  /**
   * @param {Object} config
   * @param {string} config.baselineKey - required; reported as `baselineId`.
   * @param {number} [config.minProductiveRuns=3]
   * @param {number} [config.pairedDeltaThreshold=0]
   * @param {number} [config.overfitGapThreshold=0.15]
   * @param {number} [config.confidence=0.95]
   * @param {number} [config.bootstrapResamples=2000]
   * @param {number} [config.seed] - forwarded to the bootstrap.
   * @throws {Error} when `baselineKey` is missing or falsy.
   */
  constructor(config) {
    if (!config.baselineKey) {
      throw new Error("HeldOutGate: baselineKey is required");
    }
    this.baselineKey = config.baselineKey;
    this.seed = config.seed;
    this.minProductiveRuns = config.minProductiveRuns ?? 3;
    this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
    this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
    this.confidence = config.confidence ?? 0.95;
    this.resamples = config.bootstrapResamples ?? 2e3;
  }
  /** Decide whether `candidate` should replace `baseline`. Pairing
   * is by (experimentId, seed) — identical experiment + seed pairs
   * the candidate run with the matching baseline run. Pairs without
   * a holdout score on both sides are dropped. */
  evaluate(candidate, baseline) {
    const candidateId = inferCandidateId(candidate, this.baselineKey);
    const baselineId = this.baselineKey;

    // Collect index-aligned (baseline, candidate) holdout scores.
    const baselineScoreByKey = indexHoldoutByKey(baseline);
    const beforeHoldout = [];
    const afterHoldout = [];
    for (const run of candidate) {
      if (run.splitTag !== "holdout" || run.outcome.holdoutScore === void 0) continue;
      const counterpart = baselineScoreByKey.get(pairKey(run));
      if (counterpart === void 0) continue;
      beforeHoldout.push(counterpart);
      afterHoldout.push(run.outcome.holdoutScore);
    }
    const productiveRuns = beforeHoldout.length;

    const candidateSearchMean = mean5(scores(candidate, "searchScore", "search"));
    const candidateHoldoutMean = mean5(scores(candidate, "holdoutScore", "holdout"));
    const baselineSearchMean = mean5(scores(baseline, "searchScore", "search"));
    const baselineHoldoutMean = mean5(scores(baseline, "holdoutScore", "holdout"));
    const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
    const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);

    // Shared builders keep the key order of every returned object identical.
    const buildEvidence = (medianPairedDelta, pairedCI, pairedPValue) => ({
      productiveRuns,
      medianPairedDelta,
      pairedCI,
      pairedPValue,
      searchScore: candidateSearchMean,
      holdoutScore: candidateHoldoutMean,
      overfitGap,
      baselineOverfitGap
    });
    const verdict = (promote, evidence, reason, rejectionCode) => ({
      promote,
      candidateId,
      baselineId,
      evidence,
      reason,
      rejectionCode
    });

    if (productiveRuns < this.minProductiveRuns) {
      const observedMedian = productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0;
      return verdict(
        false,
        buildEvidence(observedMedian, { low: 0, high: 0 }, 1),
        `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
        "few_runs"
      );
    }

    const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
      confidence: this.confidence,
      resamples: this.resamples,
      statistic: "median",
      seed: this.seed
    });
    const wilcoxon = pairedWilcoxon(beforeHoldout, afterHoldout);
    const evidence = buildEvidence(ci.median, { low: ci.low, high: ci.high }, wilcoxon.p);

    // Negated comparison so a NaN lower bound is also treated as a rejection.
    if (!(ci.low > this.pairedDeltaThreshold)) {
      return verdict(
        false,
        evidence,
        `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
        "negative_delta"
      );
    }

    // The overfit check only applies when both gaps could be computed.
    const gapsComparable = Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap);
    if (gapsComparable && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
      return verdict(
        false,
        evidence,
        `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
        "overfit_gap"
      );
    }

    return verdict(
      true,
      evidence,
      `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
      null
    );
  }
};
10454
/**
 * Pick the id to report for a candidate's run set: the first truthy
 * `candidateId` that differs from `baselineKey`; otherwise the first run's
 * id; otherwise the placeholder "(unknown candidate)".
 *
 * @param {Array<{candidateId?: string}>} candidate
 * @param {string} baselineKey
 * @returns {string}
 */
function inferCandidateId(candidate, baselineKey) {
  const distinctRun = candidate.find(
    (run) => run.candidateId && run.candidateId !== baselineKey
  );
  if (distinctRun) return distinctRun.candidateId;
  return candidate[0]?.candidateId ?? "(unknown candidate)";
}
10460
/**
 * Index holdout scores by pairing key. Only runs tagged "holdout" that carry
 * a `holdoutScore` are indexed; a later run with the same key overwrites an
 * earlier one.
 *
 * @param {Array<Object>} runs
 * @returns {Map<string, number>} `pairKey(run)` → `run.outcome.holdoutScore`.
 */
function indexHoldoutByKey(runs) {
  const scoreByKey = new Map();
  for (const run of runs) {
    const isScoredHoldout = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
    if (isScoredHoldout) {
      scoreByKey.set(pairKey(run), run.outcome.holdoutScore);
    }
  }
  return scoreByKey;
}
10469
/**
 * Key used to pair a candidate run with its baseline counterpart:
 * `<experimentId>::<seed>`.
 *
 * @param {{experimentId: *, seed: *}} r
 * @returns {string}
 */
function pairKey(r) {
  const { experimentId, seed } = r;
  return `${experimentId}::${seed}`;
}
10472
/**
 * Collect the finite numeric values of `outcome[field]` from the runs whose
 * `splitTag` equals `splitFilter`, preserving run order.
 *
 * @param {Iterable<Object>} runs
 * @param {string} field - key inside each run's `outcome`.
 * @param {string} splitFilter
 * @returns {number[]}
 */
function scores(runs, field, splitFilter) {
  return [...runs]
    .filter((run) => run.splitTag === splitFilter)
    .map((run) => run.outcome[field])
    // Number.isFinite is false for non-numbers, NaN and ±Infinity alike.
    .filter((value) => Number.isFinite(value));
}
10481
/**
 * Arithmetic mean that signals "no data" with NaN rather than 0.
 *
 * @param {number[]} xs
 * @returns {number} the mean, or NaN for an empty array.
 */
function mean5(xs) {
  if (xs.length === 0) return Number.NaN;
  let total = 0;
  for (const x of xs) total += x;
  return total / xs.length;
}
10485
/**
 * `a - b`, or NaN when either operand is not a finite number.
 *
 * @param {number} a
 * @param {number} b
 * @returns {number}
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
10489
/**
 * Median of the paired differences `after[i] - before[i]`.
 *
 * @param {number[]} before
 * @param {number[]} after - index-aligned with `before`.
 * @returns {number} the median delta, or 0 when `before` is empty.
 */
function medianDelta(before, after) {
  const pairCount = before.length;
  if (pairCount === 0) return 0;
  const sortedDeltas = before.map((b, i) => after[i] - b);
  sortedDeltas.sort((x, y) => x - y);
  const upper = Math.floor(pairCount / 2);
  if (pairCount % 2 !== 0) return sortedDeltas[upper];
  return (sortedDeltas[upper - 1] + sortedDeltas[upper]) / 2;
}
10495
/**
 * Format a number with four decimals; non-finite values are stringified as-is.
 *
 * @param {number} x
 * @returns {string}
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
10499
+
10500
+ // src/researcher.ts
10501
/**
 * Placeholder researcher: every method rejects with an Error whose message is
 * the configured hint followed by the name of the unimplemented step.
 */
var NoopResearcher = class {
  hint;
  /** @param {string} [hint] - prefix used in every rejection message. */
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }
  async inspectFailures(_runs) {
    const reason = `${this.hint} (inspectFailures not implemented)`;
    throw new Error(reason);
  }
  async proposeChange(_failures) {
    const reason = `${this.hint} (proposeChange not implemented)`;
    throw new Error(reason);
  }
  async applyChange(_changes, _baseline) {
    const reason = `${this.hint} (applyChange not implemented)`;
    throw new Error(reason);
  }
  async evaluateChange(_plan) {
    const reason = `${this.hint} (evaluateChange not implemented)`;
    throw new Error(reason);
  }
};
10519
+
10520
+ // src/summary-report.ts
10521
/**
 * Build a per-candidate summary of scores on one split.
 *
 * For every candidate with at least one finite score on the chosen split the
 * row holds the sample size, the mean and its confidence interval (from
 * `confidenceInterval`). When `opts.comparator` names a candidate present in
 * the data, each other candidate additionally gets Cohen's d against the
 * comparator and, if at least six (experimentId, seed) pairs exist, a
 * Wilcoxon signed-rank p-value that is then Benjamini–Hochberg adjusted
 * across candidates into `qValue`. Values that could not be computed are NaN.
 *
 * @param {Array<Object>} runs - run records.
 * @param {Object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads `outcome.holdoutScore`;
 *   any other value reads `outcome.searchScore`.
 * @param {number} [opts.confidence=0.95] - coverage of the per-candidate CI; also
 *   used for the CI column label in the markdown.
 * @param {number} [opts.fdr=0.05] - passed to `benjaminiHochberg`.
 * @param {string|null} [opts.comparator=null] - candidate id to compare against.
 * @returns {{rows: Array<Object>, comparator: string|null, split: string, markdown: string}}
 */
function summaryTable(runs, opts = {}) {
  // Below this many pairs no Wilcoxon p-value is reported.
  const MIN_PAIRS_FOR_WILCOXON = 6;
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const byCandidate = new Map();
  for (const r of runs) {
    if (r.splitTag !== split) continue;
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
    bucket.runs.push(r);
    bucket.scores.push(v);
    byCandidate.set(r.candidateId, bucket);
  }

  const candidateIds = [...byCandidate.keys()].sort();
  const compRuns = comparator ? byCandidate.get(comparator) : void 0;
  const tentative = [];
  for (const id of candidateIds) {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      if (paired.before.length >= MIN_PAIRS_FOR_WILCOXON) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    tentative.push({
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      // Starts as the raw p-value; replaced by the BH-adjusted value below.
      qValue: rawP,
      cohensD: d,
      rawP
    });
  }

  if (comparator) {
    // Adjust only the rows that actually produced a p-value.
    const idxs = [];
    const ps = [];
    for (let i = 0; i < tentative.length; i++) {
      const r = tentative[i];
      if (r.candidateId === comparator) continue;
      if (!Number.isFinite(r.rawP)) continue;
      idxs.push(i);
      ps.push(r.rawP);
    }
    if (ps.length > 0) {
      const { qValues } = benjaminiHochberg(ps, fdr);
      for (let k = 0; k < idxs.length; k++) {
        tentative[idxs[k]].qValue = qValues[k];
      }
    }
  }

  // `rawP` is bookkeeping only and is not part of the public row shape.
  const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
  const markdown = renderSummaryTableMarkdown(rows, comparator, split, confidence);
  return { rows, comparator, split, markdown };
}

/**
 * Pair candidate and baseline scores that share the same
 * `<experimentId>::<seed>` key. Runs without a finite score are ignored; a
 * later baseline run with a duplicate key overwrites the earlier one.
 *
 * @param {Array<Object>} candidate
 * @param {Array<Object>} baseline
 * @param {string} scoreField - key inside each run's `outcome`.
 * @returns {{before: number[], after: number[]}} index-aligned baseline
 *   (`before`) and candidate (`after`) scores, in candidate order.
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const baseIdx = new Map();
  for (const r of baseline) {
    const v = r.outcome[scoreField];
    if (typeof v === "number" && Number.isFinite(v)) {
      baseIdx.set(`${r.experimentId}::${r.seed}`, v);
    }
  }
  const before = [];
  const after = [];
  for (const r of candidate) {
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const b = baseIdx.get(`${r.experimentId}::${r.seed}`);
    if (b === void 0) continue;
    before.push(b);
    after.push(v);
  }
  return { before, after };
}

/**
 * Render summary rows as a markdown table.
 *
 * @param {Array<Object>} rows - rows as produced by `summaryTable`.
 * @param {string|null} comparator - shown in the title as "(vs <comparator>)" when truthy.
 * @param {string} split - shown in the title.
 * @param {number} [confidence=0.95] - coverage used for the CI column label
 *   (e.g. 0.9 → "90% CI"); previously the label always read "95% CI".
 * @returns {string} newline-joined markdown.
 */
function renderSummaryTableMarkdown(rows, comparator, split, confidence = 0.95) {
  const lines = [];
  const cmpLabel = comparator ? ` (vs ${comparator})` : "";
  // Round before printing so 0.95 * 100 renders as "95", not "95.00000000000001".
  const ciLabel = `${Number((confidence * 100).toFixed(2))}% CI`;
  lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
  lines.push("");
  lines.push(`| Candidate | N | Mean | ${ciLabel} | q (BH) | Cohen's d |`);
  lines.push("|---|---:|---:|---|---:|---:|");
  for (const r of rows) {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
10620
/**
 * Build cost-vs-quality points, one per candidate, and mark which ones sit on
 * the Pareto frontier (no other point dominates them per `dominates2`).
 *
 * @param {Array<Object>} runs - run records; only those on the chosen split
 *   with a finite score contribute.
 * @param {Object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads `outcome.holdoutScore`;
 *   any other value reads `outcome.searchScore`.
 * @param {Object} [opts.gateDecisions] - candidate id → gate decision; when
 *   present for a candidate its label (via `gateLabel`) is attached as `gate`.
 * @returns {{kind: string, split: string, axes: {x: string, y: string}, points: Array<Object>}}
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const grouped = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const quality = run.outcome[scoreField];
    if (!Number.isFinite(quality)) continue;
    let group = grouped.get(run.candidateId);
    if (group == null) {
      group = { cost: [], quality: [] };
      grouped.set(run.candidateId, group);
    }
    group.cost.push(run.costUsd);
    group.quality.push(quality);
  }

  const points = Array.from(grouped, ([candidateId, group]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(group.cost),
      quality: avg(group.quality),
      n: group.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(decision) : void 0
    };
  });

  for (const point of points) {
    point.onFrontier = points.every((other) => other === point || !dominates2(other, point));
  }

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
10654
/**
 * Pareto dominance on (cost, quality): `a` dominates `b` when it is no more
 * expensive and no worse in quality, and strictly better on at least one axis.
 *
 * @param {{cost: number, quality: number}} a
 * @param {{cost: number, quality: number}} b
 * @returns {boolean}
 */
function dominates2(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  const strictlyBetter = a.cost < b.cost || a.quality > b.quality;
  return noWorse && strictlyBetter;
}
10657
/**
 * Map a gate decision to a short chart label.
 *
 * @param {{promote: boolean, rejectionCode: string|null}} d
 * @returns {string|null} "promote", "reject_<code>" for the three known
 *   rejection codes, or null for anything else.
 */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
10664
/**
 * Histogram of paired score gains (`candidate - comparator`) on one split,
 * with the median gain and a bootstrap confidence interval around it.
 *
 * Runs are paired by (experimentId, seed) via `pairScoresByKey`. Bins are
 * equal-width and symmetric around zero: the range is [-bound, +bound] where
 * `bound` is the largest absolute gain (at least 1e-6 so the width is never 0).
 *
 * @param {Array<Object>} runs - run records for all candidates.
 * @param {string} candidateId - candidate whose gains are plotted.
 * @param {string} comparator - candidate id used as the baseline.
 * @param {Object} [opts]
 * @param {string} [opts.split="holdout"] - "holdout" reads `outcome.holdoutScore`;
 *   any other value reads `outcome.searchScore`.
 * @param {number} [opts.bins=11] - number of bins; must be a positive integer.
 * @param {number} [opts.confidence=0.95] - forwarded to `pairedBootstrap`.
 * @param {number} [opts.resamples=2000] - forwarded to `pairedBootstrap`.
 * @param {number} [opts.seed] - forwarded to `pairedBootstrap`.
 * @returns {Object} `{kind, candidateId, comparator, split, n, bins, median, ci}`;
 *   with no pairs, `n` is 0, `bins` is empty and `median`/`ci` are zeros.
 * @throws {Error} when `bins` is not an integer or is below 1.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  // A NaN or fractional count would index `bins[NaN]` or create bins that
  // spill past the plotted range, so reject it before doing any work.
  if (!Number.isInteger(binCount)) {
    throw new Error(`gainHistogram: bins must be an integer, got ${binCount}`);
  }
  if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");

  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }

  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;

  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    let idx = Math.floor((d - lo) / width);
    // Clamp so a gain exactly at the upper edge lands in the last bin.
    if (idx < 0) idx = 0;
    if (idx >= binCount) idx = binCount - 1;
    bins[idx].count += 1;
  }

  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
10721
/**
 * Arithmetic mean; an empty array yields NaN.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function avg(xs) {
  if (xs.length === 0) return Number.NaN;
  let total = 0;
  for (const x of xs) total += x;
  return total / xs.length;
}
10725
/**
 * Median of an array that is already sorted ascending (it is not re-sorted).
 *
 * @param {number[]} sorted
 * @returns {number} the median, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
10730
/**
 * Format a number with four decimals; non-finite values are stringified as-is.
 *
 * @param {number} x
 * @returns {string}
 */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
10734
+
10735
+ // src/canary.ts
10736
/**
 * Run every canary detector over `runs` and tally the alerts by kind.
 * The distribution-shift detector only runs when `opts.distributionShift`
 * is provided.
 *
 * @param {Array<Object>} runs
 * @param {Object} [opts]
 * @param {Object} [opts.silentFallback] - options for `detectSilentFallback`.
 * @param {Object} [opts.calibrationDrift] - options for `detectCalibrationDrift`.
 * @param {Object} [opts.distributionShift] - options for `detectDistributionShift`.
 * @returns {{alerts: Array<Object>, counts: Object}} alerts in detector order
 *   plus a per-kind count.
 */
function runCanaries(runs, opts = {}) {
  const fallbackAlerts = detectSilentFallback(runs, opts.silentFallback ?? {});
  const driftAlerts = detectCalibrationDrift(runs, opts.calibrationDrift ?? {});
  const shiftAlerts = opts.distributionShift
    ? detectDistributionShift(runs, opts.distributionShift)
    : [];
  const alerts = fallbackAlerts.concat(driftAlerts, shiftAlerts);

  const counts = {
    silent_judge_fallback: 0,
    judge_calibration_drift: 0,
    distribution_shift: 0
  };
  alerts.forEach((alert) => {
    counts[alert.kind]++;
  });
  return { alerts, counts };
}
10750
/**
 * Flag streaks of consecutive runs whose judge looks like it fell back to a
 * constant: `judgeMetadata.fallback === true`, or a confidence within
 * `epsilon` of the fallback constant.
 *
 * A run without `judgeMetadata`, or one that does not look like a fallback,
 * ends the current streak. Once a streak reaches the threshold, one alert is
 * emitted for that run and for every further run that extends the streak.
 *
 * @param {Array<Object>} runs - run records in chronological order.
 * @param {Object} opts
 * @param {number} [opts.constant=0.3] - confidence value treated as the fallback.
 * @param {number} [opts.consecutiveThreshold=3] - streak length that triggers alerts.
 * @param {number} [opts.epsilon=1e-9] - tolerance when comparing to `constant`.
 * @returns {Array<Object>} alerts of kind "silent_judge_fallback".
 */
function detectSilentFallback(runs, opts) {
  const constant = opts.constant ?? 0.3;
  const threshold = opts.consecutiveThreshold ?? 3;
  const eps = opts.epsilon ?? 1e-9;
  const alerts = [];
  // The streak length is simply the number of collected confidences.
  let firstRunId = null;
  let confidences = [];

  for (const run of runs) {
    const meta = run.judgeMetadata;
    const looksLikeFallback = Boolean(meta) && (meta.fallback === true || Math.abs(meta.confidence - constant) <= eps);
    if (!looksLikeFallback) {
      firstRunId = null;
      confidences = [];
      continue;
    }
    if (confidences.length === 0) firstRunId = run.runId;
    confidences.push(meta.confidence);
    const streak = confidences.length;
    if (!(streak >= threshold)) continue;
    alerts.push({
      kind: "silent_judge_fallback",
      severity: "error",
      message: `silent judge fallback: ${streak} consecutive run(s) at confidence\u2248${constant} or fallback=true`,
      evidence: {
        streakLength: streak,
        firstRunId,
        lastRunId: run.runId,
        // Keep the evidence bounded: at most the ten most recent confidences.
        confidences: confidences.slice(-10),
        fallbackConstant: constant
      }
    });
  }
  return alerts;
}
10797
/**
 * Compare the most recent judge confidences against the preceding history
 * with a two-sample Kolmogorov–Smirnov statistic and warn when it exceeds
 * the critical value for the configured alpha.
 *
 * @param {Array<Object>} runs - run records in chronological order; runs
 *   without a finite `judgeMetadata.confidence` are ignored.
 * @param {Object} opts
 * @param {number} [opts.historyWindow=50] - max history samples (those just before the recent window).
 * @param {number} [opts.recentWindow=20] - max recent samples.
 * @param {number} [opts.ksAlpha=0.05] - significance level; mapped to the
 *   coefficient 1.63 (≤0.01), 1.36 (≤0.05), 1.22 (≤0.1) or 1 otherwise.
 * @param {number} [opts.minRecent=10] - minimum size required of both windows.
 * @returns {Array<Object>} zero or one alert of kind "judge_calibration_drift".
 */
function detectCalibrationDrift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.ksAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;

  const confidences = runs
    .filter((run) => run.judgeMetadata && Number.isFinite(run.judgeMetadata.confidence))
    .map((run) => run.judgeMetadata.confidence);
  if (confidences.length < minRecent + 1) return [];

  const recent = confidences.slice(-Math.min(recentWindow, confidences.length));
  const historical = confidences.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  const ks = ksTwoSample(recent, historical);
  let c = 1;
  if (alpha <= 0.01) c = 1.63;
  else if (alpha <= 0.05) c = 1.36;
  else if (alpha <= 0.1) c = 1.22;
  const critical = c * Math.sqrt((recent.length + historical.length) / (recent.length * historical.length));
  if (!(ks.d > critical)) return [];

  const alert = {
    kind: "judge_calibration_drift",
    severity: "warn",
    message: `judge calibration drift: KS D=${ks.d.toFixed(4)} exceeds critical=${critical.toFixed(4)} at alpha=${alpha} (recent n=${recent.length}, history n=${historical.length})`,
    evidence: {
      ksD: ks.d,
      critical,
      alpha,
      recentN: recent.length,
      historyN: historical.length,
      recentMean: mean6(recent),
      historyMean: mean6(historical)
    }
  };
  return [alert];
}
10835
/**
 * Two-sample Kolmogorov–Smirnov statistic: the largest absolute difference
 * between the empirical CDFs of `a` and `b`.
 *
 * The CDFs are compared only after both cursors have moved past every copy
 * of the current value. Stepping one element at a time (as this function
 * previously did) compares the CDFs in the middle of a run of ties and
 * overstates the distance — e.g. `[1, 1, 1]` vs `[1]` reported 0.667
 * instead of 0.
 *
 * @param {number[]} a - first sample; not modified.
 * @param {number[]} b - second sample; not modified.
 * @returns {{d: number}} the KS statistic in [0, 1]; 0 when either sample is empty.
 */
function ksTwoSample(a, b) {
  const sortedA = [...a].sort((x, y) => x - y);
  const sortedB = [...b].sort((x, y) => x - y);
  const n1 = sortedA.length;
  const n2 = sortedB.length;
  let i = 0;
  let j = 0;
  let d = 0;
  while (i < n1 && j < n2) {
    const consumedBefore = i + j;
    const value = Math.min(sortedA[i], sortedB[j]);
    // Step both cursors past every occurrence of the current smallest value.
    while (i < n1 && sortedA[i] === value) i++;
    while (j < n2 && sortedB[j] === value) j++;
    // NaN never compares equal, so no cursor moves; stop rather than spin forever.
    if (i + j === consumedBefore) break;
    const diff = Math.abs(i / n1 - j / n2);
    if (diff > d) d = diff;
  }
  return { d };
}
10853
/**
 * Warn when the category mix of the most recent runs differs from the
 * preceding history, using a chi-square statistic with the historical
 * proportions as the expectation.
 *
 * Categories whose expected recent count is below 1 are left out of the
 * statistic. Degrees of freedom are the number of categories used minus one,
 * floored at 1.
 *
 * @param {Array<Object>} runs - run records in chronological order.
 * @param {Object} opts
 * @param {(run: Object) => *} opts.category - labels a run; only non-empty
 *   string labels are counted.
 * @param {number} [opts.historyWindow=50]
 * @param {number} [opts.recentWindow=20]
 * @param {number} [opts.chiSquareAlpha=0.05] - passed to `chiSquareCritical`.
 * @param {number} [opts.minRecent=10] - minimum size required of both windows.
 * @returns {Array<Object>} zero or one alert of kind "distribution_shift".
 */
function detectDistributionShift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.chiSquareAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;
  const categorize = opts.category;

  const labels = [];
  for (const run of runs) {
    const label = categorize(run);
    if (typeof label === "string" && label.length > 0) labels.push(label);
  }
  if (labels.length < minRecent + 1) return [];

  const recent = labels.slice(-Math.min(recentWindow, labels.length));
  const historical = labels.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  // Every category seen in either window, in sorted order, starts at zero in both tallies.
  const bucketList = [...new Set([...recent, ...historical])].sort();
  const tally = (window) => {
    const counts = {};
    for (const bucket of bucketList) counts[bucket] = 0;
    for (const label of window) counts[label] += 1;
    return counts;
  };
  const recentCounts = tally(recent);
  const histCounts = tally(historical);

  let chi = 0;
  let df = 0;
  for (const bucket of bucketList) {
    const expected = histCounts[bucket] / historical.length * recent.length;
    if (expected < 1) continue;
    const observed = recentCounts[bucket];
    chi += (observed - expected) ** 2 / expected;
    df += 1;
  }
  df = Math.max(1, df - 1);

  const critical = chiSquareCritical(df, alpha);
  if (!(chi > critical)) return [];

  const alert = {
    kind: "distribution_shift",
    severity: "warn",
    message: `eval-set distribution shift: \u03C7\xB2=${chi.toFixed(2)} df=${df} exceeds critical=${critical.toFixed(2)} at alpha=${alpha}`,
    evidence: {
      chi,
      df,
      critical,
      alpha,
      recentCounts,
      historicalCounts: histCounts,
      recentN: recent.length,
      historyN: historical.length
    }
  };
  return [alert];
}
10912
/**
 * Upper-tail chi-square critical value for `df` degrees of freedom.
 *
 * `alpha` is bucketed to one of four columns: ≥0.1 → 0.10, ≥0.05 → 0.05,
 * ≥0.025 → 0.025, otherwise 0.01. Tabulated df are returned directly,
 * df above 30 use the Wilson–Hilferty cube approximation, df between two
 * tabulated rows are linearly interpolated, and anything else falls back to
 * the df=10 row.
 *
 * @param {number} df
 * @param {number} alpha
 * @returns {number}
 */
function chiSquareCritical(df, alpha) {
  // Columns: alpha = 0.10, 0.05, 0.025, 0.01.
  const TABLE = {
    1: [2.71, 3.84, 5.02, 6.63],
    2: [4.61, 5.99, 7.38, 9.21],
    3: [6.25, 7.81, 9.35, 11.34],
    4: [7.78, 9.49, 11.14, 13.28],
    5: [9.24, 11.07, 12.83, 15.09],
    6: [10.64, 12.59, 14.45, 16.81],
    7: [12.02, 14.07, 16.01, 18.48],
    8: [13.36, 15.51, 17.53, 20.09],
    9: [14.68, 16.92, 19.02, 21.67],
    10: [15.99, 18.31, 20.48, 23.21],
    15: [22.31, 25, 27.49, 30.58],
    20: [28.41, 31.41, 34.17, 37.57],
    25: [34.38, 37.65, 40.65, 44.31],
    30: [40.26, 43.77, 46.98, 50.89]
  };
  let column = 3;
  if (alpha >= 0.1) column = 0;
  else if (alpha >= 0.05) column = 1;
  else if (alpha >= 0.025) column = 2;

  const tabulated = TABLE[df];
  if (tabulated) return tabulated[column];

  if (df > 30) {
    // Standard-normal quantiles matching the four alpha columns.
    const z = [1.282, 1.645, 1.96, 2.326][column];
    const term = 1 - 2 / (9 * df) + z * Math.sqrt(2 / (9 * df));
    return df * term ** 3;
  }

  const knots = Object.keys(TABLE).map(Number).sort((p, q) => p - q);
  for (let k = 0; k + 1 < knots.length; k++) {
    const lo = knots[k];
    const hi = knots[k + 1];
    if (!(df >= lo && df <= hi)) continue;
    const t = (df - lo) / (hi - lo);
    return TABLE[lo][column] * (1 - t) + TABLE[hi][column] * t;
  }
  return TABLE[10][column];
}
10948
/**
 * Arithmetic mean; an empty array yields 0.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean6(xs) {
  if (xs.length === 0) return 0;
  let total = 0;
  for (const x of xs) total += x;
  return total / xs.length;
}
10952
+
10953
+ // src/benchmarks/types.ts
10954
/**
 * 32-bit FNV-1a hash of a string.
 *
 * Only the low 8 bits of each UTF-16 code unit are mixed in, so characters
 * whose code units share a low byte hash identically.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
function fnv1a32(input) {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let hash = OFFSET_BASIS;
  for (let pos = 0; pos < input.length; pos += 1) {
    const lowByte = input.charCodeAt(pos) & 0xff;
    // XOR the byte in, then multiply by the FNV prime modulo 2^32.
    hash = Math.imul(hash ^ lowByte, PRIME) >>> 0;
  }
  return hash >>> 0;
}
10962
// Default seed mixed into every split hash.
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";

/**
 * Assign an item to a split based on a hash of `${seed}::${itemId}`.
 *
 * The hash is scaled into [0, 1): values below 0.6 map to "search", values
 * below 0.8 map to "dev", and everything else maps to "holdout".
 *
 * @param {string} itemId - Identifier of the benchmark item.
 * @param {string} [seed=BENCHMARK_SPLIT_SEED] - Seed prefix for the hash.
 * @returns {"search"|"dev"|"holdout"} The split the item falls into.
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  const unitPosition = fnv1a32(`${seed}::${itemId}`) / 2 ** 32;
  if (unitPosition >= 0.8) return "holdout";
  return unitPosition < 0.6 ? "search" : "dev";
}
10970
+
10971
+ // src/benchmarks/index.ts
10972
+ var benchmarks_exports = {};
10973
+ __export(benchmarks_exports, {
10974
+ BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
10975
+ deterministicSplit: () => deterministicSplit,
10976
+ gsm8k: () => gsm8k_exports,
10977
+ routing: () => routing_exports,
10978
+ swebenchLite: () => swebench_lite_exports
10979
+ });
10980
+
10981
+ // src/benchmarks/gsm8k/index.ts
10982
+ var gsm8k_exports = {};
10983
+ __export(gsm8k_exports, {
10984
+ Gsm8kAdapter: () => Gsm8kAdapter,
10985
+ assignSplit: () => assignSplit,
10986
+ evaluate: () => evaluate,
10987
+ loadDataset: () => loadDataset,
10988
+ parseGsm8kAnswer: () => parseGsm8kAnswer
10989
+ });
10990
+ import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
10991
/**
 * Benchmark adapter for GSM8K: loads items from a JSONL file named by the
 * AGENT_EVAL_GSM8K_PATH environment variable and scores responses by
 * comparing the parsed numeric answer against the reference.
 */
var Gsm8kAdapter = class {
  /**
   * Load the items that belong to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array>} Items whose assigned split equals `split`.
   * @throws {Error} When the env var is unset or the file does not exist.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH;
    if (!datasetPath) {
      throw new Error(
        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
      );
    }
    if (!existsSync5(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`);
    }
    const selected = [];
    for (const record of parseJsonl(datasetPath)) {
      if (assignSplitImpl(record.id) === split) selected.push(record);
    }
    return selected;
  }

  /**
   * Score a response: 1 when its parsed number is within 1e-6 of the
   * reference, otherwise 0.
   *
   * @param {object} item - Item whose `payload.answer` holds the reference.
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>} Score plus diagnostics.
   */
  async evaluate(item, response) {
    const expected = parseGsm8kAnswer(item.payload.answer);
    const observed = parseGsm8kAnswer(response);
    // An unparseable reference is reported before an unparseable response.
    if (expected === null) {
      return { score: 0, raw: { reason: "reference_not_numeric", expected: item.payload.answer } };
    }
    if (observed === null) {
      return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
    }
    const exactMatch = Math.abs(expected - observed) < 1e-6;
    return { score: exactMatch ? 1 : 0, raw: { expected, observed, exactMatch } };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split assigned to the item.
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
11021
/**
 * Split assignment for a GSM8K item; the id is namespaced with "gsm8k::"
 * before hashing.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split returned by deterministicSplit.
 */
function assignSplitImpl(itemId) {
  const namespacedId = `gsm8k::${itemId}`;
  return deterministicSplit(namespacedId);
}
11024
/**
 * Read a GSM8K JSONL file into benchmark items.
 *
 * Blank lines are skipped. A record without an `id` gets `gsm8k_<lineNo>`,
 * where the line number is 1-based and counts blank lines too.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: {question: string, answer: string}}>}
 * @throws {Error} With the offending line number when a line is not valid
 *   JSON, is not a JSON object, or lacks a question or an answer.
 */
function parseJsonl(path) {
  const raw = readFileSync5(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Keep the original SyntaxError reachable through `cause`.
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null` and scalars are valid JSON but have no fields; reject them here
    // so the failure names the line instead of surfacing as a TypeError.
    if (row === null || typeof row !== "object") {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`);
    }
    const id = String(row.id ?? `gsm8k_${lineNo}`);
    const question = String(row.question ?? "");
    const answer = String(row.answer ?? "");
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
    }
    out.push({ id, payload: { question, answer } });
  }
  return out;
}
11048
/**
 * Extract the numeric answer from GSM8K-style text.
 *
 * The number following the first `####` marker wins; without a usable
 * marker the last number found anywhere in the text is used. Thousands
 * separators (commas) are removed before conversion.
 *
 * @param {string} text - Reference answer or model response.
 * @returns {number|null} The parsed number, or null when none is found.
 */
function parseGsm8kAnswer(text) {
  if (!text) return null;

  const toNumber = (numeral) => Number(numeral.replace(/,/g, ""));

  const marked = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
  if (marked) {
    const markedValue = toNumber(marked[1]);
    if (Number.isFinite(markedValue)) return markedValue;
  }

  const numerals = text.match(/-?\d[\d,]*\.?\d*/g);
  if (!numerals || numerals.length === 0) return null;

  const lastValue = toNumber(numerals[numerals.length - 1]);
  if (Number.isFinite(lastValue)) return lastValue;
  return null;
}
11063
+ var adapter = new Gsm8kAdapter();
11064
+ var loadDataset = adapter.loadDataset.bind(adapter);
11065
+ var evaluate = adapter.evaluate.bind(adapter);
11066
+ var assignSplit = adapter.assignSplit.bind(adapter);
11067
+
11068
+ // src/benchmarks/swebench-lite/index.ts
11069
+ var swebench_lite_exports = {};
11070
+ __export(swebench_lite_exports, {
11071
+ SweBenchLiteAdapter: () => SweBenchLiteAdapter,
11072
+ assignSplit: () => assignSplit2,
11073
+ evaluate: () => evaluate2,
11074
+ loadDataset: () => loadDataset2
11075
+ });
11076
+ import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
11077
+ import { spawn } from "child_process";
11078
/**
 * Benchmark adapter for SWE-Bench Lite. Items come from the JSONL file named
 * by AGENT_EVAL_SWEBENCH_PATH; grading is delegated to the external command
 * named by AGENT_EVAL_SWEBENCH_GRADER_CMD.
 */
var SweBenchLiteAdapter = class {
  /**
   * Load the items that belong to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array>} Items whose assigned split equals `split`.
   * @throws {Error} When the env var is unset or the file does not exist.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_SWEBENCH_PATH;
    if (!datasetPath) {
      throw new Error(
        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
      );
    }
    if (!existsSync6(datasetPath)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${datasetPath} does not exist`);
    }
    const instances = parseJsonl2(datasetPath);
    return instances.filter((instance) => assignSplitImpl2(instance.id) === split);
  }

  /**
   * Grade a patch by piping `{instance_id, patch}` JSON to the grader command
   * and reading its JSON verdict from stdout.
   *
   * @param {object} item - Item whose `payload.instanceId` identifies the task.
   * @param {string} response - Candidate patch.
   * @returns {Promise<{score: number, raw: object}>} 1 when the grader reports
   *   `passed`, otherwise 0, plus the grader's flags and a truncated log.
   * @throws {Error} When no grader is configured or its stdout is not JSON.
   */
  async evaluate(item, response) {
    const graderCommand = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
    if (!graderCommand) {
      throw new Error(
        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
      );
    }

    const request = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
    const graderOutput = await runGrader(graderCommand, request);

    let verdict;
    try {
      verdict = JSON.parse(graderOutput.stdout);
    } catch (e) {
      // Include a bounded excerpt of both streams to aid debugging.
      const details = [
        `SWE-Bench grader emitted non-JSON stdout: ${e.message}`,
        `stdout=${graderOutput.stdout.slice(0, 400)}`,
        `stderr=${graderOutput.stderr.slice(0, 400)}`
      ];
      throw new Error(details.join("\n"));
    }

    const passed = Boolean(verdict.passed);
    const raw = {
      passed,
      failToPassPassed: Boolean(verdict.fail_to_pass_passed),
      passToPassPassed: Boolean(verdict.pass_to_pass_passed),
      graderLog: typeof verdict.log === "string" ? verdict.log.slice(0, 4e3) : ""
    };
    return { score: passed ? 1 : 0, raw };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split assigned to the item.
   */
  assignSplit(itemId) {
    return assignSplitImpl2(itemId);
  }
};
11126
/**
 * Split assignment for a SWE-Bench Lite item; the id is namespaced with
 * "swebench-lite::" before hashing.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split returned by deterministicSplit.
 */
function assignSplitImpl2(itemId) {
  const namespacedId = `swebench-lite::${itemId}`;
  return deterministicSplit(namespacedId);
}
11129
/**
 * Read a SWE-Bench Lite JSONL file into benchmark items.
 *
 * Blank lines are skipped. Each field is accepted in snake_case or camelCase
 * (`instance_id`/`instanceId`, `FAIL_TO_PASS`/`failToPass`, ...); missing
 * string fields become "" and missing test lists become [].
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: object}>} One item per record.
 * @throws {Error} With the offending 1-based line number when a line is not
 *   valid JSON, is not a JSON object, or has no instance id.
 */
function parseJsonl2(path) {
  const raw = readFileSync6(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Report which line is malformed, mirroring the GSM8K loader, and keep
      // the original SyntaxError reachable through `cause`.
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null` and scalars are valid JSON but carry no fields.
    if (row === null || typeof row !== "object") {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`);
    }
    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
    }
    out.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
        repo: String(row.repo ?? ""),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
      }
    });
  }
  return out;
}
11156
/**
 * Coerce a value into an array of strings.
 *
 * - An array keeps only its string elements.
 * - A string holding a JSON array is decoded and filtered the same way.
 * - A string holding a JSON string literal yields that one decoded string.
 * - Any other string (not JSON, or JSON that is not an array/string) is
 *   treated as a single entry. Previously a string that happened to be valid
 *   JSON but not an array was dropped to [], while an unparseable string was
 *   kept.
 * - Everything else yields [].
 *
 * @param {unknown} v - Value to coerce.
 * @returns {string[]} The extracted strings.
 */
function asStringArray(v) {
  const onlyStrings = (items) => items.filter((x) => typeof x === "string");
  if (Array.isArray(v)) return onlyStrings(v);
  if (typeof v !== "string") return [];
  let parsed;
  try {
    parsed = JSON.parse(v);
  } catch {
    // Not JSON at all: the raw string is the single entry.
    return [v];
  }
  if (Array.isArray(parsed)) return onlyStrings(parsed);
  if (typeof parsed === "string") return [parsed];
  return [v];
}
11168
/**
 * Run an external grader command, feed it `stdin`, and collect its output.
 *
 * The command string is split on whitespace into an executable and its
 * arguments; no shell is involved, so quoting is not interpreted.
 *
 * @param {string} cmd - Whitespace-separated command line.
 * @param {string} stdin - Payload written to the child's standard input.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves when the
 *   child closes with exit code 0.
 * @throws {Error} (as a rejection) when the command is empty, the process
 *   cannot be spawned, or it ends with a non-zero code or a signal. The
 *   message carries the first 400 characters of stderr.
 */
function runGrader(cmd, stdin) {
  return new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise produce an empty
    // executable name.
    const [file, ...args] = cmd.trim().split(/\s+/);
    if (!file) {
      reject(new Error("grader command is empty"));
      return;
    }
    const child = spawn(file, args, { stdio: ["pipe", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    // Let the streams decode UTF-8 so multi-byte characters that straddle a
    // chunk boundary are not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    // A grader that exits without draining stdin makes the write fail with
    // EPIPE; without a listener that error would be unhandled. The exit code
    // reported on "close" decides the outcome in that case.
    child.stdin.on("error", (err) => {
      if (err.code !== "EPIPE") reject(err);
    });
    child.on("close", (code, signal) => {
      if (code !== 0) {
        const outcome = code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
        reject(new Error(`grader ${outcome}: ${stderr.slice(0, 400)}`));
        return;
      }
      resolve({ stdout, stderr });
    });
    child.stdin.end(stdin);
  });
}
11188
+ var adapter2 = new SweBenchLiteAdapter();
11189
+ var loadDataset2 = adapter2.loadDataset.bind(adapter2);
11190
+ var evaluate2 = adapter2.evaluate.bind(adapter2);
11191
+ var assignSplit2 = adapter2.assignSplit.bind(adapter2);
11192
+
11193
+ // src/benchmarks/routing/index.ts
11194
+ var routing_exports = {};
11195
+ __export(routing_exports, {
11196
+ ROUTING_DATASET: () => ROUTING_DATASET,
11197
+ RoutingAdapter: () => RoutingAdapter,
11198
+ assignSplit: () => assignSplit3,
11199
+ evaluate: () => evaluate3,
11200
+ extractRouteTokens: () => extractRouteTokens,
11201
+ loadDataset: () => loadDataset3
11202
+ });
11203
+
11204
+ // src/benchmarks/routing/dataset.ts
11205
+ var ROUTING_DATASET = [
11206
+ {
11207
+ id: "file_001",
11208
+ category: "file",
11209
+ prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
11210
+ route: "fs.write",
11211
+ synonyms: ["filesystem.write", "write_file"],
11212
+ hardNegatives: ["fs.read", "chat.reply"]
11213
+ },
11214
+ {
11215
+ id: "file_002",
11216
+ category: "file",
11217
+ prompt: "Read the contents of /etc/hosts and summarize the entries.",
11218
+ route: "fs.read",
11219
+ synonyms: ["filesystem.read", "read_file"],
11220
+ hardNegatives: ["fs.write", "search.web"]
11221
+ },
11222
+ {
11223
+ id: "file_003",
11224
+ category: "file",
11225
+ prompt: "List every Python file under src/ recursively.",
11226
+ route: "fs.list",
11227
+ synonyms: ["filesystem.list", "list_files"],
11228
+ hardNegatives: ["fs.read", "search.code"]
11229
+ },
11230
+ {
11231
+ id: "file_004",
11232
+ category: "file",
11233
+ prompt: "Delete the cached build at .turbo/cache.",
11234
+ route: "fs.delete",
11235
+ synonyms: ["filesystem.delete", "remove_file"],
11236
+ hardNegatives: ["fs.write", "fs.list"]
11237
+ },
11238
+ {
11239
+ id: "math_001",
11240
+ category: "math",
11241
+ prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
11242
+ route: "math.integral",
11243
+ synonyms: ["calculator.integral", "math.solve"],
11244
+ hardNegatives: ["math.derivative", "chat.reply"]
11245
+ },
11246
+ {
11247
+ id: "math_002",
11248
+ category: "math",
11249
+ prompt: "Compute the derivative of sin(x) * cos(x).",
11250
+ route: "math.derivative",
11251
+ synonyms: ["calculator.derivative", "math.solve"],
11252
+ hardNegatives: ["math.integral", "math.algebra"]
11253
+ },
11254
+ {
11255
+ id: "math_003",
11256
+ category: "math",
11257
+ prompt: "Solve 2x + 7 = 19 for x.",
11258
+ route: "math.algebra",
11259
+ synonyms: ["calculator.algebra", "math.solve"],
11260
+ hardNegatives: ["math.derivative", "math.integral"]
11261
+ },
11262
+ {
11263
+ id: "math_004",
11264
+ category: "math",
11265
+ prompt: "What is the prime factorization of 360?",
11266
+ route: "math.numbertheory",
11267
+ synonyms: ["calculator.factor", "math.solve"],
11268
+ hardNegatives: ["math.algebra", "search.web"]
11269
+ },
11270
+ {
11271
+ id: "search_001",
11272
+ category: "search",
11273
+ prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
11274
+ route: "search.web",
11275
+ synonyms: ["web.search", "search.papers"],
11276
+ hardNegatives: ["search.code", "chat.reply"]
11277
+ },
11278
+ {
11279
+ id: "search_002",
11280
+ category: "search",
11281
+ prompt: "Search the codebase for every call site of `runProposeReview`.",
11282
+ route: "search.code",
11283
+ synonyms: ["code.search", "grep"],
11284
+ hardNegatives: ["search.web", "fs.read"]
11285
+ },
11286
+ {
11287
+ id: "search_003",
11288
+ category: "search",
11289
+ prompt: "What is the latest release of the Tangle network on GitHub?",
11290
+ route: "search.web",
11291
+ synonyms: ["web.search", "github.releases"],
11292
+ hardNegatives: ["search.code", "chat.reply"]
11293
+ },
11294
+ {
11295
+ id: "search_004",
11296
+ category: "search",
11297
+ prompt: "Find all TODO comments in the agent-eval src tree.",
11298
+ route: "search.code",
11299
+ synonyms: ["code.search", "grep"],
11300
+ hardNegatives: ["search.web", "fs.list"]
11301
+ },
11302
+ {
11303
+ id: "chat_001",
11304
+ category: "chat",
11305
+ prompt: "Hi there, how are you doing today?",
11306
+ route: "chat.reply",
11307
+ synonyms: ["conversation.reply"],
11308
+ hardNegatives: ["search.web", "fs.read"]
11309
+ },
11310
+ {
11311
+ id: "chat_002",
11312
+ category: "chat",
11313
+ prompt: "Please explain the difference between an LLM and a foundation model.",
11314
+ route: "chat.reply",
11315
+ synonyms: ["conversation.reply", "qa.answer"],
11316
+ hardNegatives: ["search.web", "math.algebra"]
11317
+ },
11318
+ {
11319
+ id: "chat_003",
11320
+ category: "chat",
11321
+ prompt: "Tell me a short joke about distributed systems.",
11322
+ route: "chat.reply",
11323
+ synonyms: ["conversation.reply"],
11324
+ hardNegatives: ["search.web", "fs.read"]
11325
+ },
11326
+ {
11327
+ id: "chat_004",
11328
+ category: "chat",
11329
+ prompt: "Acknowledge my last message with a thumbs up.",
11330
+ route: "chat.reply",
11331
+ synonyms: ["conversation.reply", "react"],
11332
+ hardNegatives: ["fs.write", "search.web"]
11333
+ }
11334
+ ];
11335
+
11336
+ // src/benchmarks/routing/index.ts
11337
/**
 * Benchmark adapter for the built-in routing dataset. A response scores 1
 * when it names the item's route or one of its synonyms.
 */
var RoutingAdapter = class {
  /**
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: object}>>} Dataset entries
   *   whose assigned split equals `split`.
   */
  async loadDataset(split) {
    return ROUTING_DATASET
      .map((item) => ({ id: item.id, payload: item }))
      .filter((it) => assignSplitImpl3(it.id) === split);
  }

  /**
   * Score a routing response.
   *
   * Dotted tokens (e.g. `fs.write`) are matched first. Accepted names that
   * contain no dot (e.g. `write_file`, `grep`) are then looked up as
   * standalone identifiers, because the dotted-token extractor can never
   * produce them and such synonyms would otherwise be unreachable.
   *
   * @param {object} item - Item whose payload carries `route`, `synonyms`,
   *   `hardNegatives` and `category`.
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>} 1 on a match, else 0,
   *   with the matched route and any hard-negative hit reported in `raw`.
   */
  async evaluate(item, response) {
    const tokens2 = extractRouteTokens(response);
    // Standalone identifiers: not glued to a neighbouring word character and
    // not part of a dotted token. A trailing sentence period is allowed.
    const bareWords = response.match(/(?<![\w.])[a-z][a-z0-9_]*(?!\.?\w)/gi) ?? [];
    const firstIn = (names) =>
      tokens2.find((t) => names.has(t.toLowerCase())) ??
      bareWords.find((w) => names.has(w.toLowerCase())) ??
      null;

    const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
    const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
    const firstMatch = firstIn(correct);
    const firstHardNeg = firstIn(hardNeg);
    // Hard negatives are reported for diagnostics only; they do not lower
    // the score.
    const score = firstMatch ? 1 : 0;
    return {
      score,
      raw: {
        firstToken: tokens2[0] ?? null,
        matchedRoute: firstMatch,
        hitHardNegative: Boolean(firstHardNeg),
        hardNegativeRoute: firstHardNeg,
        category: item.payload.category
      }
    };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split assigned to the item.
   */
  assignSplit(itemId) {
    return assignSplitImpl3(itemId);
  }
};
11363
/**
 * Split assignment for a routing item; the id is namespaced with
 * "routing::" before hashing.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split returned by deterministicSplit.
 */
function assignSplitImpl3(itemId) {
  const namespacedId = `routing::${itemId}`;
  return deterministicSplit(namespacedId);
}
11366
/**
 * Collect every `name.name` token from a response, in order of appearance.
 * Each side must start with a letter and may continue with letters, digits
 * or underscores; matching is case-insensitive.
 *
 * @param {string} response - Text to scan.
 * @returns {string[]} The dotted tokens found, or [] when there are none.
 */
function extractRouteTokens(response) {
  const dottedIdentifier = /[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi;
  const found = response.match(dottedIdentifier);
  if (found == null) return [];
  return found;
}
11370
+ var adapter3 = new RoutingAdapter();
11371
+ var loadDataset3 = adapter3.loadDataset.bind(adapter3);
11372
+ var evaluate3 = adapter3.evaluate.bind(adapter3);
11373
+ var assignSplit3 = adapter3.assignSplit.bind(adapter3);
11374
+
10069
11375
  // src/reference-replay-steering.ts
10070
11376
  function referenceReplayRunsToSteeringRows(runs, options = {}) {
10071
11377
  const rows = [];
@@ -10257,9 +11563,9 @@ function aggregateTrials(population, scenarioIds, trials) {
10257
11563
  return {
10258
11564
  variantId: variant.id,
10259
11565
  scenarioId: sid,
10260
- meanScore: mean5(gradedTrials.map((t) => t.score)),
10261
- meanCost: mean5(gradedTrials.map((t) => t.cost ?? 0)),
10262
- meanDurationMs: mean5(gradedTrials.map((t) => t.durationMs ?? 0)),
11566
+ meanScore: mean7(gradedTrials.map((t) => t.score)),
11567
+ meanCost: mean7(gradedTrials.map((t) => t.cost ?? 0)),
11568
+ meanDurationMs: mean7(gradedTrials.map((t) => t.durationMs ?? 0)),
10263
11569
  okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
10264
11570
  trials: scenarioTrials.length,
10265
11571
  metrics
@@ -10267,10 +11573,10 @@ function aggregateTrials(population, scenarioIds, trials) {
10267
11573
  });
10268
11574
  return {
10269
11575
  variantId: variant.id,
10270
- meanScore: mean5(scenarios.map((s) => s.meanScore)),
10271
- meanCost: mean5(scenarios.map((s) => s.meanCost)),
10272
- meanDurationMs: mean5(scenarios.map((s) => s.meanDurationMs)),
10273
- okRate: mean5(scenarios.map((s) => s.okRate)),
11576
+ meanScore: mean7(scenarios.map((s) => s.meanScore)),
11577
+ meanCost: mean7(scenarios.map((s) => s.meanCost)),
11578
+ meanDurationMs: mean7(scenarios.map((s) => s.meanDurationMs)),
11579
+ okRate: mean7(scenarios.map((s) => s.okRate)),
10274
11580
  scenarios,
10275
11581
  metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
10276
11582
  };
@@ -10287,10 +11593,10 @@ function aggregateMetrics(rows) {
10287
11593
  }
10288
11594
  }
10289
11595
  const out = {};
10290
- for (const [k, list] of buckets) out[k] = mean5(list);
11596
+ for (const [k, list] of buckets) out[k] = mean7(list);
10291
11597
  return out;
10292
11598
  }
10293
- function mean5(xs) {
11599
+ function mean7(xs) {
10294
11600
  if (xs.length === 0) return 0;
10295
11601
  return xs.reduce((a, b) => a + b, 0) / xs.length;
10296
11602
  }
@@ -10331,11 +11637,11 @@ function samePopulation(a, b) {
10331
11637
  }
10332
11638
 
10333
11639
  // src/jsonl-trial-cache.ts
10334
- import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
11640
+ import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
10335
11641
  import { dirname as dirname4 } from "path";
10336
11642
 
10337
11643
  // src/locked-jsonl-appender.ts
10338
- import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3 } from "fs";
11644
+ import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
10339
11645
  import { dirname as dirname3 } from "path";
10340
11646
  var mutexes = /* @__PURE__ */ new Map();
10341
11647
  function getMutex(path) {
@@ -10350,7 +11656,7 @@ var LockedJsonlAppender = class {
10350
11656
  constructor(path) {
10351
11657
  this.path = path;
10352
11658
  this.mutex = getMutex(path);
10353
- if (!existsSync5(dirname3(path))) {
11659
+ if (!existsSync7(dirname3(path))) {
10354
11660
  mkdirSync3(dirname3(path), { recursive: true });
10355
11661
  }
10356
11662
  }
@@ -10375,8 +11681,8 @@ var JsonlTrialCache = class {
10375
11681
  appender;
10376
11682
  constructor(path) {
10377
11683
  this.path = path;
10378
- if (existsSync6(path)) {
10379
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
11684
+ if (existsSync8(path)) {
11685
+ for (const line of readFileSync7(path, "utf-8").split("\n")) {
10380
11686
  if (!line.trim()) continue;
10381
11687
  try {
10382
11688
  const entry = JSON.parse(line);
@@ -10414,7 +11720,7 @@ var JsonlTrialCache = class {
10414
11720
  };
10415
11721
 
10416
11722
  // src/evolution-telemetry.ts
10417
- import { appendFileSync as appendFileSync5, existsSync as existsSync7, mkdirSync as mkdirSync5, readFileSync as readFileSync6, writeFileSync } from "fs";
11723
+ import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
10418
11724
  import { dirname as dirname5 } from "path";
10419
11725
  var MutationTelemetry = class {
10420
11726
  appender;
@@ -10445,16 +11751,16 @@ var LineageRecorder = class {
10445
11751
  this.snapshotPath = `${path}.snapshot`;
10446
11752
  this.kindOf = kindOf ?? defaultKindOf;
10447
11753
  mkdirSync5(dirname5(path), { recursive: true });
10448
- if (existsSync7(this.snapshotPath)) {
11754
+ if (existsSync9(this.snapshotPath)) {
10449
11755
  try {
10450
- const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
11756
+ const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
10451
11757
  for (const n of parsed) this.nodes.set(n.id, n);
10452
11758
  } catch {
10453
11759
  }
10454
11760
  }
10455
- if (existsSync7(path)) {
11761
+ if (existsSync9(path)) {
10456
11762
  try {
10457
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
11763
+ for (const line of readFileSync8(path, "utf-8").split("\n")) {
10458
11764
  if (!line.trim()) continue;
10459
11765
  try {
10460
11766
  const entry = JSON.parse(line);
@@ -10466,9 +11772,9 @@ var LineageRecorder = class {
10466
11772
  } catch {
10467
11773
  }
10468
11774
  }
10469
- if (existsSync7(path) && this.nodes.size === 0) {
11775
+ if (existsSync9(path) && this.nodes.size === 0) {
10470
11776
  try {
10471
- const raw = readFileSync6(path, "utf-8").trim();
11777
+ const raw = readFileSync8(path, "utf-8").trim();
10472
11778
  if (raw.startsWith("[")) {
10473
11779
  const parsed = JSON.parse(raw);
10474
11780
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -10482,8 +11788,8 @@ var LineageRecorder = class {
10482
11788
  const prev = this.nodes.get(node.id);
10483
11789
  this.nodes.set(node.id, { ...prev, ...node });
10484
11790
  try {
10485
- if (existsSync7(this.path)) {
10486
- const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
11791
+ if (existsSync9(this.path)) {
11792
+ const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10487
11793
  if (head === "[") {
10488
11794
  writeFileSync(this.path, "");
10489
11795
  }
@@ -10549,9 +11855,9 @@ var CostLedger = class {
10549
11855
  mutex = new Mutex();
10550
11856
  constructor(path) {
10551
11857
  this.path = path;
10552
- if (existsSync7(path)) {
11858
+ if (existsSync9(path)) {
10553
11859
  try {
10554
- const loaded = JSON.parse(readFileSync6(path, "utf-8"));
11860
+ const loaded = JSON.parse(readFileSync8(path, "utf-8"));
10555
11861
  for (const k of Object.keys(this.totals)) {
10556
11862
  if (k === "byGeneration") {
10557
11863
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10975,9 +12281,9 @@ function passOrthogonality(input) {
10975
12281
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10976
12282
  }
10977
12283
  }
10978
- const mean7 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
12284
+ const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10979
12285
  return {
10980
- orthogonality: Math.max(0, Math.min(1, 1 - mean7)),
12286
+ orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
10981
12287
  passCount: passes.length,
10982
12288
  similarities: sims
10983
12289
  };
@@ -11023,8 +12329,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
11023
12329
  const iterations = options.iterations ?? 1e3;
11024
12330
  const minTotal = options.minTotalSamples ?? 6;
11025
12331
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
11026
- const baselineMean = mean6(baseline);
11027
- const candidateMean = mean6(candidate);
12332
+ const baselineMean = mean8(baseline);
12333
+ const candidateMean = mean8(candidate);
11028
12334
  const delta = candidateMean - baselineMean;
11029
12335
  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
11030
12336
  return {
@@ -11042,7 +12348,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
11042
12348
  for (let i = 0; i < iterations; i++) {
11043
12349
  const bResample = resample(baseline, rng);
11044
12350
  const cResample = resample(candidate, rng);
11045
- deltas[i] = mean6(cResample) - mean6(bResample);
12351
+ deltas[i] = mean8(cResample) - mean8(bResample);
11046
12352
  }
11047
12353
  deltas.sort((a, b) => a - b);
11048
12354
  const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -11065,7 +12371,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
11065
12371
  verdict
11066
12372
  };
11067
12373
  }
11068
- function mean6(xs) {
12374
+ function mean8(xs) {
11069
12375
  if (xs.length === 0) return 0;
11070
12376
  let s = 0;
11071
12377
  for (const x of xs) s += x;
@@ -11260,6 +12566,7 @@ function parseReflectionResponse(raw, maxProposals) {
11260
12566
  export {
11261
12567
  AgentDriver,
11262
12568
  AxGepaSteeringOptimizer,
12569
+ BENCHMARK_SPLIT_SEED,
11263
12570
  BenchmarkRunner,
11264
12571
  BudgetBreachError,
11265
12572
  BudgetGuard,
@@ -11288,6 +12595,7 @@ export {
11288
12595
  FileSystemExperimentStore,
11289
12596
  FileSystemOutcomeStore,
11290
12597
  FileSystemTraceStore,
12598
+ HeldOutGate,
11291
12599
  HoldoutAuditor,
11292
12600
  HoldoutLockedError,
11293
12601
  INTENT_MATCH_JUDGE_VERSION,
@@ -11307,6 +12615,7 @@ export {
11307
12615
  MultiLayerVerifier,
11308
12616
  MutationTelemetry,
11309
12617
  Mutex,
12618
+ NoopResearcher,
11310
12619
  OTEL_AGENT_EVAL_SCOPE,
11311
12620
  OptimizationLoop,
11312
12621
  PairwiseSteeringOptimizer,
@@ -11317,6 +12626,7 @@ export {
11317
12626
  PromptRegistry,
11318
12627
  REDACTION_VERSION,
11319
12628
  RunCritic,
12629
+ RunRecordValidationError,
11320
12630
  SEMANTIC_CONCEPT_JUDGE_VERSION,
11321
12631
  SandboxHarness,
11322
12632
  ScenarioRegistry,
@@ -11333,7 +12643,10 @@ export {
11333
12643
  analyzeSeries,
11334
12644
  argHash,
11335
12645
  attributeCounterfactuals,
12646
+ deterministicSplit as benchmarkDeterministicSplit,
12647
+ benchmarks_exports as benchmarks,
11336
12648
  benjaminiHochberg,
12649
+ bhAdjust,
11337
12650
  bisect,
11338
12651
  bonferroni,
11339
12652
  bootstrapCi,
@@ -11413,6 +12726,7 @@ export {
11413
12726
  formatBenchmarkReport,
11414
12727
  formatDriverReport,
11415
12728
  formatFindings,
12729
+ gainHistogram,
11416
12730
  precision as goldenPrecision,
11417
12731
  gradeSemanticStatus,
11418
12732
  groupBy,
@@ -11427,6 +12741,7 @@ export {
11427
12741
  isLlmSpan,
11428
12742
  isPrmVerdict,
11429
12743
  isRetrievalSpan,
12744
+ isRunRecord,
11430
12745
  isSandboxSpan,
11431
12746
  isToolSpan,
11432
12747
  jestTestParser,
@@ -11454,11 +12769,15 @@ export {
11454
12769
  normalizeScores,
11455
12770
  notBlocked,
11456
12771
  outputLengthRubric,
12772
+ pairedBootstrap,
11457
12773
  pairedTTest,
12774
+ pairedWilcoxon,
11458
12775
  paraphraseRobustness,
12776
+ paretoChart,
11459
12777
  paretoFrontier,
11460
12778
  paretoFrontierWithCrowding,
11461
12779
  parseReflectionResponse,
12780
+ parseRunRecordSafe,
11462
12781
  partialCredit,
11463
12782
  passOrthogonality,
11464
12783
  pixelDeltaRatio,
@@ -11489,9 +12808,11 @@ export {
11489
12808
  requiredSampleSize,
11490
12809
  resetLockedAppendersForTesting,
11491
12810
  resumeBuilderSession,
12811
+ roundTripRunRecord,
11492
12812
  rowCount,
11493
12813
  rowWhere,
11494
12814
  runAssertions,
12815
+ runCanaries,
11495
12816
  runCounterfactual,
11496
12817
  runE2EWorkflow,
11497
12818
  runExpectations,
@@ -11526,6 +12847,7 @@ export {
11526
12847
  stuckLoopView,
11527
12848
  summarize,
11528
12849
  summarizeHarnessResults,
12850
+ summaryTable,
11529
12851
  testJudge,
11530
12852
  textInSnapshot,
11531
12853
  toLangfuseEnvelope,
@@ -11539,6 +12861,7 @@ export {
11539
12861
  toolWasteView,
11540
12862
  typoMutator,
11541
12863
  urlContains,
12864
+ validateRunRecord,
11542
12865
  verbosityBias,
11543
12866
  verifyManifest,
11544
12867
  visualDiff,
@@ -11548,6 +12871,7 @@ export {
11548
12871
  weightedRecall,
11549
12872
  welchsTTest,
11550
12873
  whitespaceCollapseMutator,
11551
- wilcoxonSignedRank
12874
+ wilcoxonSignedRank,
12875
+ wranglerDeployRunner
11552
12876
  };
11553
12877
  //# sourceMappingURL=index.js.map