@tangle-network/agent-eval 0.14.2 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-PZ5AY32C.js.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +963 -4
- package/dist/index.js +1456 -132
- package/dist/index.js.map +1 -1
- package/dist/telemetry/file.js +2 -0
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +2 -0
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -0
- package/package.json +10 -12
package/dist/index.js
CHANGED
|
@@ -6,6 +6,9 @@ import {
|
|
|
6
6
|
probeLlm,
|
|
7
7
|
stripFencedJson
|
|
8
8
|
} from "./chunk-ITN4YOZY.js";
|
|
9
|
+
import {
|
|
10
|
+
__export
|
|
11
|
+
} from "./chunk-PZ5AY32C.js";
|
|
9
12
|
|
|
10
13
|
// src/client.ts
|
|
11
14
|
var ProductClient = class {
|
|
@@ -396,36 +399,36 @@ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
|
|
|
396
399
|
"false_confidence",
|
|
397
400
|
"worst_failure"
|
|
398
401
|
]);
|
|
399
|
-
function normalizeScores(
|
|
400
|
-
return
|
|
402
|
+
function normalizeScores(scores2) {
|
|
403
|
+
return scores2.map((s) => {
|
|
401
404
|
if (INVERTED_DIMENSIONS.has(s.dimension)) {
|
|
402
405
|
return s;
|
|
403
406
|
}
|
|
404
407
|
return s;
|
|
405
408
|
});
|
|
406
409
|
}
|
|
407
|
-
function weightedMean(
|
|
408
|
-
if (
|
|
410
|
+
function weightedMean(scores2) {
|
|
411
|
+
if (scores2.length === 0) return 0;
|
|
409
412
|
let totalWeight = 0;
|
|
410
413
|
let weightedSum = 0;
|
|
411
|
-
for (const { score, weight } of
|
|
414
|
+
for (const { score, weight } of scores2) {
|
|
412
415
|
const w = weight ?? 1;
|
|
413
416
|
weightedSum += score * w;
|
|
414
417
|
totalWeight += w;
|
|
415
418
|
}
|
|
416
419
|
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
417
420
|
}
|
|
418
|
-
function confidenceInterval(
|
|
419
|
-
if (
|
|
420
|
-
if (
|
|
421
|
-
const n =
|
|
422
|
-
const
|
|
421
|
+
function confidenceInterval(scores2, confidence = 0.95) {
|
|
422
|
+
if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
423
|
+
if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
|
|
424
|
+
const n = scores2.length;
|
|
425
|
+
const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
|
|
423
426
|
const B = 1e3;
|
|
424
427
|
const bootstrapMeans = [];
|
|
425
428
|
for (let i = 0; i < B; i++) {
|
|
426
429
|
let sum2 = 0;
|
|
427
430
|
for (let j = 0; j < n; j++) {
|
|
428
|
-
sum2 +=
|
|
431
|
+
sum2 += scores2[Math.floor(Math.random() * n)];
|
|
429
432
|
}
|
|
430
433
|
bootstrapMeans.push(sum2 / n);
|
|
431
434
|
}
|
|
@@ -434,7 +437,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
434
437
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
435
438
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
436
439
|
return {
|
|
437
|
-
mean:
|
|
440
|
+
mean: mean9,
|
|
438
441
|
lower: bootstrapMeans[lowerIdx],
|
|
439
442
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
440
443
|
};
|
|
@@ -522,11 +525,11 @@ function pairedTTest(before, after) {
|
|
|
522
525
|
const n = before.length;
|
|
523
526
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
524
527
|
const diffs = before.map((b, i) => after[i] - b);
|
|
525
|
-
const
|
|
526
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
528
|
+
const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
529
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
|
|
527
530
|
const se = Math.sqrt(variance2 / n);
|
|
528
|
-
if (se === 0) return { t:
|
|
529
|
-
const t =
|
|
531
|
+
if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
|
|
532
|
+
const t = mean9 / se;
|
|
530
533
|
const df = n - 1;
|
|
531
534
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
532
535
|
return { t, df, p };
|
|
@@ -544,15 +547,15 @@ function wilcoxonSignedRank(before, after) {
|
|
|
544
547
|
while (i < n) {
|
|
545
548
|
let j = i;
|
|
546
549
|
while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
|
|
547
|
-
const
|
|
548
|
-
for (let k = i; k < j; k++) ranks3[absRanks[k].i] =
|
|
550
|
+
const avg2 = (i + 1 + j) / 2;
|
|
551
|
+
for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg2;
|
|
549
552
|
i = j;
|
|
550
553
|
}
|
|
551
554
|
let wPlus = 0;
|
|
552
555
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
553
|
-
const
|
|
556
|
+
const mean9 = n * (n + 1) / 4;
|
|
554
557
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
555
|
-
const z = (wPlus -
|
|
558
|
+
const z = (wPlus - mean9) / Math.sqrt(variance2);
|
|
556
559
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
557
560
|
return { w: wPlus, p };
|
|
558
561
|
}
|
|
@@ -753,8 +756,8 @@ async function executeScenario(tc, scenario, config) {
|
|
|
753
756
|
console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
|
|
754
757
|
await new Promise((r) => setTimeout(r, wait));
|
|
755
758
|
}
|
|
756
|
-
const
|
|
757
|
-
judgeResults.push(
|
|
759
|
+
const scores2 = await judge(tc, judgeInput);
|
|
760
|
+
judgeResults.push(scores2);
|
|
758
761
|
await new Promise((r) => setTimeout(r, 3e3));
|
|
759
762
|
break;
|
|
760
763
|
} catch (err) {
|
|
@@ -847,8 +850,8 @@ var BenchmarkRunner = class {
|
|
|
847
850
|
byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
|
|
848
851
|
}
|
|
849
852
|
for (const [name, data] of Object.entries(byJudge)) {
|
|
850
|
-
const
|
|
851
|
-
console.log(` ${name.padEnd(16)} avg=${
|
|
853
|
+
const avg2 = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
|
|
854
|
+
console.log(` ${name.padEnd(16)} avg=${avg2} [${data.dimensions.join(", ")}]`);
|
|
852
855
|
}
|
|
853
856
|
console.log(` OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
|
|
854
857
|
console.log();
|
|
@@ -2270,7 +2273,7 @@ var PromptOptimizer = class {
|
|
|
2270
2273
|
});
|
|
2271
2274
|
}
|
|
2272
2275
|
}
|
|
2273
|
-
const
|
|
2276
|
+
const scores2 = config.variants.map((variant) => {
|
|
2274
2277
|
const scenarioMap = rawScores.get(variant.id);
|
|
2275
2278
|
const allSamples = [];
|
|
2276
2279
|
const perScenario = {};
|
|
@@ -2293,10 +2296,10 @@ var PromptOptimizer = class {
|
|
|
2293
2296
|
};
|
|
2294
2297
|
});
|
|
2295
2298
|
const rawPairs = [];
|
|
2296
|
-
for (let i = 0; i <
|
|
2297
|
-
for (let j = i + 1; j <
|
|
2298
|
-
const a =
|
|
2299
|
-
const b =
|
|
2299
|
+
for (let i = 0; i < scores2.length; i++) {
|
|
2300
|
+
for (let j = i + 1; j < scores2.length; j++) {
|
|
2301
|
+
const a = scores2[i];
|
|
2302
|
+
const b = scores2[j];
|
|
2300
2303
|
const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
|
|
2301
2304
|
rawPairs.push({ a, b, p });
|
|
2302
2305
|
}
|
|
@@ -2310,7 +2313,7 @@ var PromptOptimizer = class {
|
|
|
2310
2313
|
significant: qValues[idx] < alpha,
|
|
2311
2314
|
meanDelta: r.b.mean - r.a.mean
|
|
2312
2315
|
}));
|
|
2313
|
-
const sorted =
|
|
2316
|
+
const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
|
|
2314
2317
|
const winner = sorted[0];
|
|
2315
2318
|
const second = sorted[1];
|
|
2316
2319
|
const winnerComparisons = pairwise2.filter(
|
|
@@ -2324,7 +2327,7 @@ var PromptOptimizer = class {
|
|
|
2324
2327
|
significant: significantOverAll,
|
|
2325
2328
|
ciLowerBoundExceedsSecondMean
|
|
2326
2329
|
},
|
|
2327
|
-
scores,
|
|
2330
|
+
scores: scores2,
|
|
2328
2331
|
pairwise: pairwise2,
|
|
2329
2332
|
config: {
|
|
2330
2333
|
trialsPerScenario: trials,
|
|
@@ -2870,20 +2873,20 @@ async function mapLimit(items, limit, fn) {
|
|
|
2870
2873
|
function mean(values) {
|
|
2871
2874
|
return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
|
|
2872
2875
|
}
|
|
2873
|
-
function meanRunScore(
|
|
2876
|
+
function meanRunScore(scores2) {
|
|
2874
2877
|
return {
|
|
2875
|
-
success: mean(
|
|
2876
|
-
goalProgress: mean(
|
|
2877
|
-
repoGroundedness: mean(
|
|
2878
|
-
driftPenalty: mean(
|
|
2879
|
-
toolUseQuality: mean(
|
|
2880
|
-
patchQuality: mean(
|
|
2881
|
-
testReality: mean(
|
|
2882
|
-
finalGate: mean(
|
|
2883
|
-
reviewerBlockers: mean(
|
|
2884
|
-
costUsd: mean(
|
|
2885
|
-
wallSeconds: mean(
|
|
2886
|
-
notes:
|
|
2878
|
+
success: mean(scores2.map((s) => s.success)),
|
|
2879
|
+
goalProgress: mean(scores2.map((s) => s.goalProgress)),
|
|
2880
|
+
repoGroundedness: mean(scores2.map((s) => s.repoGroundedness)),
|
|
2881
|
+
driftPenalty: mean(scores2.map((s) => s.driftPenalty)),
|
|
2882
|
+
toolUseQuality: mean(scores2.map((s) => s.toolUseQuality)),
|
|
2883
|
+
patchQuality: mean(scores2.map((s) => s.patchQuality)),
|
|
2884
|
+
testReality: mean(scores2.map((s) => s.testReality)),
|
|
2885
|
+
finalGate: mean(scores2.map((s) => s.finalGate)),
|
|
2886
|
+
reviewerBlockers: mean(scores2.map((s) => s.reviewerBlockers)),
|
|
2887
|
+
costUsd: mean(scores2.map((s) => s.costUsd)),
|
|
2888
|
+
wallSeconds: mean(scores2.map((s) => s.wallSeconds)),
|
|
2889
|
+
notes: scores2.flatMap((s) => s.notes ?? [])
|
|
2887
2890
|
};
|
|
2888
2891
|
}
|
|
2889
2892
|
|
|
@@ -3339,12 +3342,12 @@ var SubprocessSandboxDriver = class {
|
|
|
3339
3342
|
this.defaultEnv = options.env;
|
|
3340
3343
|
}
|
|
3341
3344
|
async exec(phase, command, config) {
|
|
3342
|
-
const { spawn } = await import("child_process");
|
|
3345
|
+
const { spawn: spawn2 } = await import("child_process");
|
|
3343
3346
|
const start = Date.now();
|
|
3344
3347
|
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3345
3348
|
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
3346
3349
|
return await new Promise((resolve) => {
|
|
3347
|
-
const child =
|
|
3350
|
+
const child = spawn2(command, {
|
|
3348
3351
|
shell: true,
|
|
3349
3352
|
cwd: effectiveCwd,
|
|
3350
3353
|
env: effectiveEnv
|
|
@@ -5392,10 +5395,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5392
5395
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5393
5396
|
}
|
|
5394
5397
|
const tail = values.slice(-window);
|
|
5395
|
-
const
|
|
5396
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5398
|
+
const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5399
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
|
|
5397
5400
|
const stdDev = Math.sqrt(variance2);
|
|
5398
|
-
const refMean = Math.abs(
|
|
5401
|
+
const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
|
|
5399
5402
|
const cv = stdDev / refMean;
|
|
5400
5403
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5401
5404
|
let tailRun = 0;
|
|
@@ -5416,7 +5419,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5416
5419
|
} else {
|
|
5417
5420
|
state = "noisy";
|
|
5418
5421
|
}
|
|
5419
|
-
return { state, windowMean:
|
|
5422
|
+
return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
|
|
5420
5423
|
}
|
|
5421
5424
|
|
|
5422
5425
|
// src/state-continuity.ts
|
|
@@ -6012,9 +6015,9 @@ function calibrateJudge(golden, candidate) {
|
|
|
6012
6015
|
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
6013
6016
|
return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
|
|
6014
6017
|
}
|
|
6015
|
-
function positionalBias(
|
|
6018
|
+
function positionalBias(scores2) {
|
|
6016
6019
|
const pairs = /* @__PURE__ */ new Map();
|
|
6017
|
-
for (const s of
|
|
6020
|
+
for (const s of scores2) {
|
|
6018
6021
|
const slot = pairs.get(s.itemId) ?? {};
|
|
6019
6022
|
if (s.positionOfAInput === "first") slot.first = s.score;
|
|
6020
6023
|
else if (s.positionOfAInput === "second") slot.second = s.score;
|
|
@@ -6165,12 +6168,12 @@ function renderMarkdownReport(reports) {
|
|
|
6165
6168
|
async function aggregateRunMetrics(runs, store) {
|
|
6166
6169
|
if (runs.length === 0) return {};
|
|
6167
6170
|
const durations = [];
|
|
6168
|
-
const
|
|
6171
|
+
const scores2 = [];
|
|
6169
6172
|
const passes = [];
|
|
6170
6173
|
const costs = [];
|
|
6171
6174
|
for (const r of runs) {
|
|
6172
6175
|
if (r.endedAt) durations.push(r.endedAt - r.startedAt);
|
|
6173
|
-
if (r.outcome?.score !== void 0)
|
|
6176
|
+
if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
|
|
6174
6177
|
passes.push(r.outcome?.pass === true ? 1 : 0);
|
|
6175
6178
|
const llm = await llmSpans(store, r.runId);
|
|
6176
6179
|
costs.push(aggregateLlm(llm).costUsd);
|
|
@@ -6179,7 +6182,7 @@ async function aggregateRunMetrics(runs, store) {
|
|
|
6179
6182
|
provisionMs: average(durations),
|
|
6180
6183
|
firstTokenMs: average(durations),
|
|
6181
6184
|
wallMs: average(durations),
|
|
6182
|
-
overallScore: average(
|
|
6185
|
+
overallScore: average(scores2),
|
|
6183
6186
|
passRate: average(passes),
|
|
6184
6187
|
costUsd: average(costs)
|
|
6185
6188
|
};
|
|
@@ -6242,7 +6245,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
6242
6245
|
},
|
|
6243
6246
|
metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
|
|
6244
6247
|
}));
|
|
6245
|
-
const
|
|
6248
|
+
const scores2 = judges.map((j) => ({
|
|
6246
6249
|
id: j.spanId,
|
|
6247
6250
|
traceId: run.runId,
|
|
6248
6251
|
observationId: j.targetSpanId,
|
|
@@ -6250,7 +6253,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
6250
6253
|
value: j.score,
|
|
6251
6254
|
comment: j.rationale
|
|
6252
6255
|
}));
|
|
6253
|
-
return { traceId: run.runId, generations, scores };
|
|
6256
|
+
return { traceId: run.runId, generations, scores: scores2 };
|
|
6254
6257
|
}
|
|
6255
6258
|
async function toPrometheusText(store) {
|
|
6256
6259
|
const runs = await store.listRuns();
|
|
@@ -6344,12 +6347,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6344
6347
|
variantScores.push({ mutator: id, score, mutated });
|
|
6345
6348
|
all.push(score);
|
|
6346
6349
|
}
|
|
6347
|
-
const
|
|
6348
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6350
|
+
const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6351
|
+
const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
|
|
6349
6352
|
const stdDev = Math.sqrt(variance2);
|
|
6350
|
-
const ref = Math.abs(
|
|
6353
|
+
const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
|
|
6351
6354
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6352
|
-
return { originalScore, variantScores, meanScore:
|
|
6355
|
+
return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
|
|
6353
6356
|
}
|
|
6354
6357
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
6355
6358
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6684,8 +6687,8 @@ function ranks(xs) {
|
|
|
6684
6687
|
for (let i = 0; i < indexed.length; i++) {
|
|
6685
6688
|
let j = i;
|
|
6686
6689
|
while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
|
|
6687
|
-
const
|
|
6688
|
-
for (let k = i; k <= j; k++) r[indexed[k].i] =
|
|
6690
|
+
const avg2 = (i + j + 2) / 2;
|
|
6691
|
+
for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
|
|
6689
6692
|
i = j;
|
|
6690
6693
|
}
|
|
6691
6694
|
return r;
|
|
@@ -6929,8 +6932,8 @@ function ranks2(xs) {
|
|
|
6929
6932
|
for (let i = 0; i < indexed.length; i++) {
|
|
6930
6933
|
let j = i;
|
|
6931
6934
|
while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
|
|
6932
|
-
const
|
|
6933
|
-
for (let k = i; k <= j; k++) r[indexed[k].i] =
|
|
6935
|
+
const avg2 = (i + j + 2) / 2;
|
|
6936
|
+
for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
|
|
6934
6937
|
i = j;
|
|
6935
6938
|
}
|
|
6936
6939
|
return r;
|
|
@@ -7270,8 +7273,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
7270
7273
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
7271
7274
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
7272
7275
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
7273
|
-
const
|
|
7274
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
7276
|
+
const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
7277
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
|
|
7275
7278
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7276
7279
|
}
|
|
7277
7280
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -7293,8 +7296,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
7293
7296
|
const ranked = [...byRun.values()].sort(
|
|
7294
7297
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
7295
7298
|
);
|
|
7296
|
-
const
|
|
7297
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
7299
|
+
const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7300
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
|
|
7298
7301
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7299
7302
|
}
|
|
7300
7303
|
|
|
@@ -7672,15 +7675,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7672
7675
|
const rejected = [];
|
|
7673
7676
|
const surviving = [];
|
|
7674
7677
|
for (const candidate of proposed) {
|
|
7675
|
-
const
|
|
7676
|
-
if (
|
|
7678
|
+
const scores2 = await scorer.scoreCandidate(candidate, targets);
|
|
7679
|
+
if (scores2.length < 2) {
|
|
7677
7680
|
rejected.push({ candidate, reason: "scorer returned <2 results" });
|
|
7678
7681
|
continue;
|
|
7679
7682
|
}
|
|
7680
|
-
const values =
|
|
7683
|
+
const values = scores2.map((s) => s.score);
|
|
7681
7684
|
const spread = Math.max(...values) - Math.min(...values);
|
|
7682
7685
|
const maxScore = Math.max(...values);
|
|
7683
|
-
scored.push({ candidate, scores, spread });
|
|
7686
|
+
scored.push({ candidate, scores: scores2, spread });
|
|
7684
7687
|
if (maxScore < floor) {
|
|
7685
7688
|
rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
|
|
7686
7689
|
continue;
|
|
@@ -7822,10 +7825,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7822
7825
|
}
|
|
7823
7826
|
for (const s of scenarios) {
|
|
7824
7827
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7825
|
-
const
|
|
7826
|
-
if (
|
|
7827
|
-
const
|
|
7828
|
-
const variance2 =
|
|
7828
|
+
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7829
|
+
if (scores2.length < 3) continue;
|
|
7830
|
+
const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
7831
|
+
const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
|
|
7829
7832
|
if (variance2 > varianceThreshold) {
|
|
7830
7833
|
targets.push({
|
|
7831
7834
|
reason: "high-variance",
|
|
@@ -8580,20 +8583,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8580
8583
|
let durationMs = 0;
|
|
8581
8584
|
const reasonParts = [];
|
|
8582
8585
|
const diagnostics = {};
|
|
8583
|
-
for (const { adapter, result } of perAdapter) {
|
|
8586
|
+
for (const { adapter: adapter4, result } of perAdapter) {
|
|
8584
8587
|
status = worst(status, result.status);
|
|
8585
8588
|
if (typeof result.score === "number") {
|
|
8586
8589
|
weightedScoreSum += result.score;
|
|
8587
8590
|
weightCount += 1;
|
|
8588
8591
|
}
|
|
8589
8592
|
durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
|
|
8590
|
-
reasonParts.push(`${
|
|
8593
|
+
reasonParts.push(`${adapter4}: ${result.status}`);
|
|
8591
8594
|
for (const f of result.findings) {
|
|
8592
8595
|
findings.push({
|
|
8593
8596
|
...f,
|
|
8594
8597
|
layer: name,
|
|
8595
|
-
message: prefix ? `${prefix(
|
|
8596
|
-
detail: { ...f.detail ?? {}, adapter }
|
|
8598
|
+
message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
|
|
8599
|
+
detail: { ...f.detail ?? {}, adapter: adapter4 }
|
|
8597
8600
|
});
|
|
8598
8601
|
}
|
|
8599
8602
|
for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
|
|
@@ -8612,8 +8615,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8612
8615
|
reason: reasonParts.join(" \xB7 "),
|
|
8613
8616
|
diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
|
|
8614
8617
|
detail: {
|
|
8615
|
-
adapters: perAdapter.map(({ adapter, result }) => ({
|
|
8616
|
-
adapter,
|
|
8618
|
+
adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
|
|
8619
|
+
adapter: adapter4,
|
|
8617
8620
|
status: result.status,
|
|
8618
8621
|
score: result.score ?? null
|
|
8619
8622
|
})),
|
|
@@ -8639,10 +8642,10 @@ function multiToolchainLayer(config) {
|
|
|
8639
8642
|
reason: "no adapters detected"
|
|
8640
8643
|
};
|
|
8641
8644
|
}
|
|
8642
|
-
const runOne = async (
|
|
8643
|
-
const adapterName = config.adapterName(
|
|
8645
|
+
const runOne = async (adapter4) => {
|
|
8646
|
+
const adapterName = config.adapterName(adapter4);
|
|
8644
8647
|
try {
|
|
8645
|
-
const r = await config.run(
|
|
8648
|
+
const r = await config.run(adapter4, ctx);
|
|
8646
8649
|
return { adapter: adapterName, result: r };
|
|
8647
8650
|
} catch (err) {
|
|
8648
8651
|
return {
|
|
@@ -9345,6 +9348,57 @@ function viteDeployRunner(input) {
|
|
|
9345
9348
|
}
|
|
9346
9349
|
};
|
|
9347
9350
|
}
|
|
9351
|
+
/**
 * Creates a deploy runner that validates a Cloudflare Wrangler project by
 * running the build command followed by a Wrangler dry-run deploy.
 *
 * Fields read from `input`:
 *  - `exists(path)`: async predicate used to probe for a Wrangler config file.
 *  - `exec(command, { cwd, timeoutMs })`: async executor; its result is read
 *    for `exitCode`, `stdout` and `stderr`.
 *  - `workdir`: forwarded to `exec` as `cwd`.
 *  - `buildCommand` (default "npm run build").
 *  - `dryRunCommand` (default "npx wrangler deploy --dry-run --outdir dist").
 *  - `timeoutMs` (default 120000), applied to each command separately.
 *
 * @param {object} input - Runner configuration as described above.
 * @returns {{ run: () => Promise<{ ok: boolean, output: string, durationMs: number, artifactDir: string, artifactValid: boolean }> }}
 *   `run()` resolves to a failure result when no config file is present or when
 *   either command exits non-zero; nothing is thrown by this function itself.
 */
function wranglerDeployRunner(input) {
  // Probed in this order; the first hit stops the search. `wrangler.json` is
  // included because Wrangler accepts it alongside the .toml / .jsonc forms.
  const configFiles = ["wrangler.toml", "wrangler.jsonc", "wrangler.json"];
  // Output is capped to the last 1500 characters of whichever stream has content.
  const tailOf = (preferred, fallback) => ((preferred || fallback) ?? "").slice(-1500);
  return {
    run: async () => {
      const start = Date.now();
      const buildCmd = input.buildCommand ?? "npm run build";
      const dryCmd = input.dryRunCommand ?? "npx wrangler deploy --dry-run --outdir dist";
      const timeoutMs = input.timeoutMs ?? 12e4;
      // Every failure path shares the same result shape; only `output` differs.
      const fail = (output) => ({
        ok: false,
        output,
        durationMs: Date.now() - start,
        artifactDir: "dist",
        artifactValid: false
      });
      let hasConfig = false;
      for (const file of configFiles) {
        if (await input.exists(file)) {
          hasConfig = true;
          break;
        }
      }
      if (!hasConfig) {
        return fail("no wrangler config found (wrangler.toml / wrangler.jsonc / wrangler.json absent)");
      }
      const build = await input.exec(buildCmd, { cwd: input.workdir, timeoutMs });
      if (build.exitCode !== 0) {
        // On failure stderr is preferred over stdout.
        return fail(`build failed: ${tailOf(build.stderr, build.stdout)}`);
      }
      const dry = await input.exec(dryCmd, { cwd: input.workdir, timeoutMs });
      if (dry.exitCode !== 0) {
        return fail(`wrangler dry-run failed: ${tailOf(dry.stderr, dry.stdout)}`);
      }
      // On success stdout is preferred over stderr.
      return {
        ok: true,
        output: tailOf(dry.stdout, dry.stderr),
        durationMs: Date.now() - start,
        artifactDir: "dist",
        artifactValid: true
      };
    }
  };
}
|
|
9348
9402
|
|
|
9349
9403
|
// src/keyword-coverage-judge.ts
|
|
9350
9404
|
function htmlContainsElement(html, selector) {
|
|
@@ -9712,15 +9766,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
|
|
|
9712
9766
|
const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
|
|
9713
9767
|
const matchStrategy = options.matchStrategy ?? "reference-order";
|
|
9714
9768
|
const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
|
|
9715
|
-
const
|
|
9769
|
+
const scores2 = scenarios.filter((scenario) => {
|
|
9716
9770
|
const split = scenario.split ?? "train";
|
|
9717
9771
|
if (split === "holdout" && !options.includeHoldout) return false;
|
|
9718
9772
|
return allowedSplits.has(split);
|
|
9719
9773
|
}).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
|
|
9720
9774
|
return {
|
|
9721
|
-
scenarios:
|
|
9722
|
-
aggregate: aggregateScenarioScores(
|
|
9723
|
-
bySplit: aggregateBySplit(
|
|
9775
|
+
scenarios: scores2,
|
|
9776
|
+
aggregate: aggregateScenarioScores(scores2),
|
|
9777
|
+
bySplit: aggregateBySplit(scores2)
|
|
9724
9778
|
};
|
|
9725
9779
|
}
|
|
9726
9780
|
function compareReferenceReplay(baseline, candidate) {
|
|
@@ -9935,20 +9989,20 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9935
9989
|
matches: matches2
|
|
9936
9990
|
};
|
|
9937
9991
|
}
|
|
9938
|
-
function aggregateBySplit(
|
|
9992
|
+
function aggregateBySplit(scores2) {
|
|
9939
9993
|
const out = {};
|
|
9940
9994
|
for (const split of ALL_SPLITS) {
|
|
9941
|
-
const scoped =
|
|
9995
|
+
const scoped = scores2.filter((score) => score.split === split);
|
|
9942
9996
|
if (scoped.length > 0) out[split] = aggregateScenarioScores(scoped);
|
|
9943
9997
|
}
|
|
9944
9998
|
return out;
|
|
9945
9999
|
}
|
|
9946
|
-
function aggregateScenarioScores(
|
|
9947
|
-
const matched = sum(
|
|
9948
|
-
const total = sum(
|
|
9949
|
-
const falsePositives = sum(
|
|
9950
|
-
const matchedWeight = sum(
|
|
9951
|
-
const totalWeight = sum(
|
|
10000
|
+
function aggregateScenarioScores(scores2) {
|
|
10001
|
+
const matched = sum(scores2.map((score) => score.matched));
|
|
10002
|
+
const total = sum(scores2.map((score) => score.total));
|
|
10003
|
+
const falsePositives = sum(scores2.map((score) => score.falsePositives));
|
|
10004
|
+
const matchedWeight = sum(scores2.map((score) => score.matchedWeight));
|
|
10005
|
+
const totalWeight = sum(scores2.map((score) => score.totalWeight));
|
|
9952
10006
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9953
10007
|
const recall = ratio(matched, total);
|
|
9954
10008
|
return {
|
|
@@ -10027,8 +10081,8 @@ function formatPct(value) {
|
|
|
10027
10081
|
function bySplitOrder(a, b) {
|
|
10028
10082
|
return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
|
|
10029
10083
|
}
|
|
10030
|
-
function runAdapter(
|
|
10031
|
-
return typeof
|
|
10084
|
+
function runAdapter(adapter4, scenario, context) {
|
|
10085
|
+
return typeof adapter4 === "function" ? adapter4(scenario, context) : adapter4.run(scenario, context);
|
|
10032
10086
|
}
|
|
10033
10087
|
function throwIfAborted(signal) {
|
|
10034
10088
|
if (!signal?.aborted) return;
|
|
@@ -10066,6 +10120,1258 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
|
10066
10120
|
"which"
|
|
10067
10121
|
]);
|
|
10068
10122
|
|
|
10123
|
+
// src/paired-stats.ts
|
|
10124
|
+
/**
 * Percentile-bootstrap confidence interval for the paired differences
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, index-aligned with `before`.
 * @param {object} [opts]
 * @param {number} [opts.confidence=0.95] - Interval coverage, strictly between 0 and 1.
 * @param {number} [opts.resamples=2000] - Number of bootstrap resamples (positive integer).
 * @param {string} [opts.statistic="median"] - "mean" bootstraps the mean; any other
 *   value bootstraps the median.
 * @param {number} [opts.seed] - When given, resampling is deterministic.
 * @returns {{ n: number, median: number, mean: number, low: number, high: number, confidence: number, resamples: number }}
 *   `median` and `mean` describe the observed differences; `low`/`high` are the
 *   bootstrap interval bounds. With 0 or 1 pairs no resampling happens and the
 *   bounds collapse to 0 or to the single difference.
 * @throws {Error} If the arrays differ in length, `confidence` is not in (0,1),
 *   or `resamples` is not a positive integer.
 */
function pairedBootstrap(before, after, opts = {}) {
  if (before.length !== after.length) {
    throw new Error(
      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.resamples ?? 2e3;
  const statistic = opts.statistic ?? "median";
  // Written as a negated range test so that NaN is rejected as well.
  if (!(confidence > 0 && confidence < 1)) {
    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
  }
  // Without this check 0 resamples produced `undefined` bounds and negative or
  // fractional counts surfaced as an unrelated "Invalid array length" RangeError.
  if (!Number.isInteger(resamples) || resamples < 1) {
    throw new Error(`pairedBootstrap: resamples must be a positive integer, got ${resamples}`);
  }
  const n = before.length;
  const deltas = before.map((b, i) => after[i] - b);
  if (n === 0) {
    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
  }
  if (n === 1) {
    const d = deltas[0];
    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
  }
  const rng = makeRng(opts.seed);
  const samples = new Array(resamples);
  const useMean = statistic === "mean";
  // One scratch buffer is reused for every median resample: each pass
  // overwrites all n slots before medianInPlace sorts it.
  const scratch = useMean ? null : new Array(n);
  for (let b = 0; b < resamples; b++) {
    if (useMean) {
      let total = 0;
      for (let k = 0; k < n; k++) {
        total += deltas[Math.floor(rng() * n)];
      }
      samples[b] = total / n;
    } else {
      for (let k = 0; k < n; k++) {
        scratch[k] = deltas[Math.floor(rng() * n)];
      }
      samples[b] = medianInPlace(scratch);
    }
  }
  samples.sort((a, b) => a - b);
  const alpha = 1 - confidence;
  const lowIdx = Math.floor(alpha / 2 * resamples);
  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return {
    n,
    // Copy first: medianInPlace sorts its argument.
    median: medianInPlace([...deltas]),
    mean: deltas.reduce((s, x) => s + x, 0) / n,
    low: samples[lowIdx],
    // Guard against the upper index landing below the lower one for tiny resample counts.
    high: samples[Math.max(highIdx, lowIdx)],
    confidence,
    resamples
  };
}

/**
 * Paired Wilcoxon test; a thin alias that forwards both samples to
 * `wilcoxonSignedRank` (defined elsewhere in this bundle) and returns its result.
 */
function pairedWilcoxon(before, after) {
  return wilcoxonSignedRank(before, after);
}

/**
 * Forwards to `benjaminiHochberg` (defined elsewhere in this bundle) with the
 * given p-values and false-discovery rate (default 0.05).
 * NOTE(review): the return shape is whatever `benjaminiHochberg` produces; confirm there.
 */
function bhAdjust(pValues, fdr = 0.05) {
  return benjaminiHochberg(pValues, fdr);
}

/**
 * Median of a numeric array. Sorts `xs` in place (ascending), so callers that
 * need the original order must pass a copy. Returns 0 for an empty array and
 * the average of the two middle values for an even-length array.
 */
function medianInPlace(xs) {
  if (xs.length === 0) return 0;
  xs.sort((a, b) => a - b);
  const mid = Math.floor(xs.length / 2);
  return xs.length % 2 === 0 ? (xs[mid - 1] + xs[mid]) / 2 : xs[mid];
}

/**
 * Returns a generator of numbers in [0, 1).
 * With no seed this is `Math.random` itself; otherwise it is a deterministic
 * 32-bit integer-mixing generator (looks like a mulberry32 variant; confirm
 * before relying on that name). The seed is truncated with `| 0`, and a
 * resulting 0 is replaced by the constant 2654435769.
 */
function makeRng(seed) {
  if (seed === void 0) return Math.random;
  let s = seed | 0 || 2654435769;
  return () => {
    s = s + 1831565813 | 0;
    let t = s;
    t = Math.imul(t ^ t >>> 15, t | 1);
    t ^= t + Math.imul(t ^ t >>> 7, t | 61);
    // `>>> 0` makes the value unsigned before scaling by 2^32.
    return ((t ^ t >>> 14) >>> 0) / 4294967296;
  };
}
|
|
10200
|
+
|
|
10201
|
+
// src/run-record.ts
|
|
10202
|
+
var MANDATORY_TOP_LEVEL = [
|
|
10203
|
+
"runId",
|
|
10204
|
+
"experimentId",
|
|
10205
|
+
"candidateId",
|
|
10206
|
+
"seed",
|
|
10207
|
+
"model",
|
|
10208
|
+
"promptHash",
|
|
10209
|
+
"configHash",
|
|
10210
|
+
"commitSha",
|
|
10211
|
+
"wallMs",
|
|
10212
|
+
"costUsd",
|
|
10213
|
+
"tokenUsage",
|
|
10214
|
+
"outcome",
|
|
10215
|
+
"splitTag"
|
|
10216
|
+
];
|
|
10217
|
+
var SPLIT_TAGS = ["search", "dev", "holdout"];
|
|
10218
|
+
// Error raised when a run record fails validation. `path` holds the location
// of the offending field ("" when the failure is not tied to one); when it is
// non-empty it is also appended to the message as " (at <path>)".
var RunRecordValidationError = class extends Error {
  path;
  constructor(message, path = "") {
    let text = message;
    if (path) {
      text = `${message} (at ${path})`;
    }
    super(text);
    this.name = "RunRecordValidationError";
    this.path = path;
  }
};
|
|
10226
|
+
/**
 * Validates that `input` has the shape of a run record and returns it.
 *
 * Checks run in a fixed order and the first failure throws:
 *   1. `input` is a non-null object,
 *   2. every key in `MANDATORY_TOP_LEVEL` is present,
 *   3. scalar fields have the right type (`queueMs` is optional),
 *   4. `model` carries a snapshot version per `modelHasSnapshot`,
 *   5. `tokenUsage`, optional `judgeMetadata`, and `outcome` (incl. `outcome.raw`),
 *   6. optional `failureMode`, then `splitTag` membership in `SPLIT_TAGS`.
 *
 * @param input - candidate value of unknown shape
 * @returns the same `input` reference when every check passes
 * @throws {RunRecordValidationError} on the first failed check
 */
function validateRunRecord(input) {
  const isObject = (value) => value !== null && typeof value === "object";
  // Shared guard for the nested-object fields; message and path follow one pattern.
  const requireNested = (value, path) => {
    if (!isObject(value)) {
      throw new RunRecordValidationError(`${path} must be an object`, path);
    }
    return value;
  };

  if (!isObject(input)) {
    throw new RunRecordValidationError("expected object");
  }
  const obj = input;
  const absent = MANDATORY_TOP_LEVEL.find((key) => !(key in obj));
  if (absent !== void 0) {
    throw new RunRecordValidationError(`missing mandatory field "${absent}"`);
  }

  for (const field of ["runId", "experimentId", "candidateId"]) {
    expectString(obj[field], field);
  }
  expectFiniteNumber(obj.seed, "seed");
  for (const field of ["model", "promptHash", "configHash", "commitSha"]) {
    expectString(obj[field], field);
  }
  expectFiniteNumber(obj.wallMs, "wallMs");
  if (obj.queueMs !== void 0) {
    expectFiniteNumber(obj.queueMs, "queueMs");
  }
  expectFiniteNumber(obj.costUsd, "costUsd");
  if (!modelHasSnapshot(obj.model)) {
    throw new RunRecordValidationError(
      `model "${obj.model}" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,
      "model"
    );
  }

  const tokenUsage = requireNested(obj.tokenUsage, "tokenUsage");
  expectFiniteNumber(tokenUsage.input, "tokenUsage.input");
  expectFiniteNumber(tokenUsage.output, "tokenUsage.output");
  if (tokenUsage.cached !== void 0) {
    expectFiniteNumber(tokenUsage.cached, "tokenUsage.cached");
  }

  if (obj.judgeMetadata !== void 0) {
    const judge = requireNested(obj.judgeMetadata, "judgeMetadata");
    expectString(judge.model, "judgeMetadata.model");
    expectString(judge.promptVersion, "judgeMetadata.promptVersion");
    expectFiniteNumber(judge.confidence, "judgeMetadata.confidence");
    if (typeof judge.fallback !== "boolean") {
      throw new RunRecordValidationError("judgeMetadata.fallback must be boolean", "judgeMetadata.fallback");
    }
  }

  const outcome = requireNested(obj.outcome, "outcome");
  const hasSearch = outcome.searchScore !== void 0;
  if (hasSearch) {
    expectFiniteNumber(outcome.searchScore, "outcome.searchScore");
  }
  const hasHoldout = outcome.holdoutScore !== void 0;
  if (hasHoldout) {
    expectFiniteNumber(outcome.holdoutScore, "outcome.holdoutScore");
  }
  if (!hasSearch && !hasHoldout) {
    throw new RunRecordValidationError(
      "outcome must define searchScore or holdoutScore (or both)",
      "outcome"
    );
  }
  const raw = requireNested(outcome.raw, "outcome.raw");
  for (const [name, value] of Object.entries(raw)) {
    expectFiniteNumber(value, `outcome.raw.${name}`);
  }

  if (obj.failureMode !== void 0) {
    expectString(obj.failureMode, "failureMode");
  }
  const splitOk = typeof obj.splitTag === "string" && SPLIT_TAGS.includes(obj.splitTag);
  if (!splitOk) {
    throw new RunRecordValidationError(
      `splitTag must be one of ${SPLIT_TAGS.join(", ")}, got ${String(obj.splitTag)}`,
      "splitTag"
    );
  }
  return input;
}
|
|
10303
|
+
/**
 * Boolean form of `validateRunRecord`.
 *
 * @param input - value to test
 * @returns `true` when `validateRunRecord(input)` completes, `false` when it
 *          throws anything at all
 */
function isRunRecord(input) {
  let valid = true;
  try {
    validateRunRecord(input);
  } catch {
    valid = false;
  }
  return valid;
}
|
|
10311
|
+
/**
 * Non-throwing wrapper around `validateRunRecord`.
 *
 * @param input - value to validate
 * @returns `{ ok: true, value }` on success, or `{ ok: false, error }` when
 *          validation throws a `RunRecordValidationError`
 * @throws any other error raised by `validateRunRecord`, unchanged
 */
function parseRunRecordSafe(input) {
  let value;
  try {
    value = validateRunRecord(input);
  } catch (err) {
    if (!(err instanceof RunRecordValidationError)) {
      throw err;
    }
    return { ok: false, error: err };
  }
  return { ok: true, value };
}
|
|
10319
|
+
/**
 * Serialises `record` to JSON, parses it back, and validates the parsed copy.
 *
 * @param record - record to round-trip
 * @returns the result of `validateRunRecord` on the re-parsed copy
 */
function roundTripRunRecord(record) {
  const reparsed = JSON.parse(JSON.stringify(record));
  return validateRunRecord(reparsed);
}
|
|
10323
|
+
/**
 * Asserts that `value` is a non-empty string.
 *
 * @param value - value to check
 * @param path - field path attached to the error
 * @throws {RunRecordValidationError} when `value` is not a string or is empty
 */
function expectString(value, path) {
  const acceptable = typeof value === "string" && value.length !== 0;
  if (acceptable) {
    return;
  }
  throw new RunRecordValidationError(`expected non-empty string`, path);
}
|
|
10328
|
+
/**
 * Asserts that `value` is a finite number (rejects NaN, ±Infinity, non-numbers).
 *
 * @param value - value to check
 * @param path - field path attached to the error
 * @throws {RunRecordValidationError} when the check fails
 */
function expectFiniteNumber(value, path) {
  // Number.isFinite is already false for every non-number input.
  if (Number.isFinite(value)) {
    return;
  }
  throw new RunRecordValidationError(`expected finite number`, path);
}
|
|
10333
|
+
/**
 * Heuristic check that a model identifier pins a snapshot.
 *
 * Accepts any identifier that contains "@", ends in `-` plus eight digits,
 * ends in `-YYYY-MM-DD`-shaped digits, or contains `:date-`.
 *
 * @param model - model identifier string
 * @returns `true` when one of the patterns matches
 */
function modelHasSnapshot(model) {
  const snapshotPatterns = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return model.includes("@") || snapshotPatterns.some((pattern) => pattern.test(model));
}
|
|
10340
|
+
|
|
10341
|
+
// src/held-out-gate.ts
|
|
10342
|
+
/**
 * Promotion gate that compares a candidate's holdout runs against a baseline.
 *
 * Config defaults: `minProductiveRuns` 3, `pairedDeltaThreshold` 0,
 * `overfitGapThreshold` 0.15, `confidence` 0.95, `bootstrapResamples` 2000.
 * `baselineKey` is mandatory; `seed` is forwarded to the bootstrap as given.
 */
var HeldOutGate = class {
  minProductiveRuns;
  pairedDeltaThreshold;
  overfitGapThreshold;
  baselineKey;
  confidence;
  resamples;
  seed;
  constructor(config) {
    const { baselineKey } = config;
    if (!baselineKey) {
      throw new Error("HeldOutGate: baselineKey is required");
    }
    this.minProductiveRuns = config.minProductiveRuns ?? 3;
    this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
    this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
    this.baselineKey = baselineKey;
    this.confidence = config.confidence ?? 0.95;
    this.resamples = config.bootstrapResamples ?? 2e3;
    this.seed = config.seed;
  }
  /**
   * Decide whether `candidate` should replace `baseline`.
   *
   * Runs are paired by `pairKey` (experimentId + seed). Only runs tagged
   * "holdout" with a defined holdout score on both sides form a pair.
   * Rejections, in order: too few pairs (`few_runs`), bootstrap CI lower
   * bound not above `pairedDeltaThreshold` (`negative_delta`), candidate
   * search-minus-holdout gap exceeding the baseline's gap by more than
   * `overfitGapThreshold` (`overfit_gap`). Otherwise the candidate is promoted.
   *
   * @returns `{ promote, candidateId, baselineId, evidence, reason, rejectionCode }`
   */
  evaluate(candidate, baseline) {
    const candidateId = inferCandidateId(candidate, this.baselineKey);
    const baselineId = this.baselineKey;
    const baselineByKey = indexHoldoutByKey(baseline);
    const beforeHoldout = [];
    const afterHoldout = [];
    for (const run of candidate) {
      const scored = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
      if (!scored) continue;
      const counterpart = baselineByKey.get(pairKey(run));
      if (counterpart !== void 0) {
        beforeHoldout.push(counterpart);
        afterHoldout.push(run.outcome.holdoutScore);
      }
    }
    const productiveRuns = beforeHoldout.length;

    const candidateSearchMean = mean5(scores(candidate, "searchScore", "search"));
    const candidateHoldoutMean = mean5(scores(candidate, "holdoutScore", "holdout"));
    const baselineSearchMean = mean5(scores(baseline, "searchScore", "search"));
    const baselineHoldoutMean = mean5(scores(baseline, "holdoutScore", "holdout"));
    const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
    const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);

    // Fields shared by every evidence object; spread last to keep key order.
    const gapEvidence = {
      searchScore: candidateSearchMean,
      holdoutScore: candidateHoldoutMean,
      overfitGap,
      baselineOverfitGap
    };
    const decide = (promote, evidence, reason, rejectionCode) => ({
      promote,
      candidateId,
      baselineId,
      evidence,
      reason,
      rejectionCode
    });

    if (productiveRuns < this.minProductiveRuns) {
      const sparseEvidence = {
        productiveRuns,
        medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
        pairedCI: { low: 0, high: 0 },
        pairedPValue: 1,
        ...gapEvidence
      };
      return decide(
        false,
        sparseEvidence,
        `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
        "few_runs"
      );
    }

    const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
      confidence: this.confidence,
      resamples: this.resamples,
      statistic: "median",
      seed: this.seed
    });
    const wilcoxon = pairedWilcoxon(beforeHoldout, afterHoldout);
    const evidence = {
      productiveRuns,
      medianPairedDelta: ci.median,
      pairedCI: { low: ci.low, high: ci.high },
      pairedPValue: wilcoxon.p,
      ...gapEvidence
    };

    // Written as a negated `>` so a NaN lower bound also rejects.
    if (!(ci.low > this.pairedDeltaThreshold)) {
      return decide(
        false,
        evidence,
        `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
        "negative_delta"
      );
    }

    const gapsKnown = Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap);
    if (gapsKnown && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
      return decide(
        false,
        evidence,
        `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
        "overfit_gap"
      );
    }

    return decide(
      true,
      evidence,
      `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
      null
    );
  }
};
|
|
10454
|
+
/**
 * Picks the candidate id to report for a set of runs.
 *
 * @param candidate - runs to scan
 * @param baselineKey - id treated as "the baseline"
 * @returns the first truthy `candidateId` that differs from `baselineKey`;
 *          failing that, the first run's `candidateId`; failing that, the
 *          literal "(unknown candidate)"
 */
function inferCandidateId(candidate, baselineKey) {
  for (const { candidateId } of candidate) {
    const isDistinct = Boolean(candidateId) && candidateId !== baselineKey;
    if (isDistinct) {
      return candidateId;
    }
  }
  const firstRun = candidate[0];
  const fallbackId = firstRun?.candidateId;
  return fallbackId ?? "(unknown candidate)";
}
|
|
10460
|
+
/**
 * Indexes holdout scores by `pairKey`.
 *
 * Only runs tagged "holdout" with a defined `outcome.holdoutScore` are kept;
 * when several runs share a key, the last one wins.
 *
 * @param runs - runs to index
 * @returns Map from pair key to holdout score
 */
function indexHoldoutByKey(runs) {
  const scoreByKey = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const usable = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
    if (usable) {
      scoreByKey.set(pairKey(run), run.outcome.holdoutScore);
    }
  }
  return scoreByKey;
}
|
|
10469
|
+
/**
 * Key used to pair runs: `<experimentId>::<seed>`.
 *
 * @param r - run providing `experimentId` and `seed`
 * @returns the joined key string
 */
function pairKey(r) {
  const { experimentId, seed } = r;
  return "".concat(experimentId, "::", seed);
}
|
|
10472
|
+
/**
 * Collects finite numeric `outcome[field]` values from runs in one split.
 *
 * @param runs - runs to scan
 * @param field - outcome property to read (e.g. "searchScore")
 * @param splitFilter - only runs whose `splitTag` equals this are considered
 * @returns the matching values in run order
 */
function scores(runs, field, splitFilter) {
  const collected = [];
  for (const run of runs) {
    if (run.splitTag === splitFilter) {
      const value = run.outcome[field];
      // Number.isFinite is false for every non-number, NaN and ±Infinity.
      if (Number.isFinite(value)) {
        collected.push(value);
      }
    }
  }
  return collected;
}
|
|
10481
|
+
/**
 * Arithmetic mean.
 *
 * @param xs - numbers to average
 * @returns the mean, or NaN for an empty array
 */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) {
    return Number.NaN;
  }
  const add = (total, value) => total + value;
  return xs.reduce(add, 0) / count;
}
|
|
10485
|
+
/**
 * Difference `a - b` that yields NaN unless both operands are finite numbers.
 *
 * @returns `a - b`, or NaN when either input is not finite
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
|
|
10489
|
+
/**
 * Median of the element-wise differences `after[i] - before[i]`.
 *
 * @param before - baseline values (its length sets the number of pairs)
 * @param after - candidate values, read at the same indices
 * @returns the median difference, or 0 when `before` is empty
 */
function medianDelta(before, after) {
  const deltas = before.map((base, i) => after[i] - base);
  deltas.sort((x, y) => x - y);
  const count = deltas.length;
  if (count === 0) {
    return 0;
  }
  const mid = Math.floor(count / 2);
  if (count % 2 === 1) {
    return deltas[mid];
  }
  return (deltas[mid - 1] + deltas[mid]) / 2;
}
|
|
10495
|
+
/**
 * Formats a number with four decimals; non-finite inputs fall back to `String(x)`.
 *
 * @returns e.g. "0.5000", "NaN", "Infinity"
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
10499
|
+
|
|
10500
|
+
// src/researcher.ts
|
|
10501
|
+
/**
 * Placeholder researcher: every async operation rejects with an Error whose
 * message is `<hint> (<method> not implemented)`.
 */
var NoopResearcher = class {
  hint;
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }
  /** Always rejects. */
  async inspectFailures(_runs) {
    const reason = `${this.hint} (inspectFailures not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects. */
  async proposeChange(_failures) {
    const reason = `${this.hint} (proposeChange not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects. */
  async applyChange(_changes, _baseline) {
    const reason = `${this.hint} (applyChange not implemented)`;
    throw new Error(reason);
  }
  /** Always rejects. */
  async evaluateChange(_plan) {
    const reason = `${this.hint} (evaluateChange not implemented)`;
    throw new Error(reason);
  }
};
|
|
10519
|
+
|
|
10520
|
+
// src/summary-report.ts
|
|
10521
|
+
/**
 * Builds a per-candidate summary for one split.
 *
 * For each candidate (sorted by id) it reports n, mean and confidence bounds
 * from `confidenceInterval`. When `opts.comparator` names a candidate that has
 * runs in the split, every other candidate also gets Cohen's d against it and,
 * given at least 6 paired observations, a Wilcoxon p-value. Finite p-values
 * are then replaced by Benjamini-Hochberg q-values.
 *
 * @param runs - run records
 * @param opts - `split` ("holdout" default), `confidence` (0.95), `fdr` (0.05),
 *               `comparator` (null)
 * @returns `{ rows, comparator, split, markdown }`
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // Group usable runs by candidate, keeping first-seen insertion order.
  const byCandidate = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (!Number.isFinite(score)) continue;
    let bucket = byCandidate.get(run.candidateId);
    if (bucket === void 0) {
      bucket = { runs: [], scores: [] };
      byCandidate.set(run.candidateId, bucket);
    }
    bucket.runs.push(run);
    bucket.scores.push(score);
  }

  const compRuns = comparator ? byCandidate.get(comparator) : void 0;
  const tentative = [...byCandidate.keys()].sort().map((id) => {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    return {
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      qValue: rawP,
      cohensD: d,
      rawP
    };
  });

  if (comparator) {
    // Only rows with a finite raw p-value take part in the FDR adjustment.
    const adjustable = tentative.filter(
      (row) => row.candidateId !== comparator && Number.isFinite(row.rawP)
    );
    if (adjustable.length > 0) {
      const { qValues } = benjaminiHochberg(adjustable.map((row) => row.rawP), fdr);
      adjustable.forEach((row, k) => {
        row.qValue = qValues[k];
      });
    }
  }

  // `rawP` is an internal working field and is stripped from the output rows.
  const rows = tentative.map((row) => {
    const { rawP: _rawP, ...publicRow } = row;
    return publicRow;
  });
  const markdown = renderSummaryTableMarkdown(rows, comparator, split);
  return { rows, comparator, split, markdown };
}
|
|
10584
|
+
/**
 * Pairs candidate and baseline scores that share `<experimentId>::<seed>`.
 *
 * Runs whose `outcome[scoreField]` is not a finite number are ignored on both
 * sides. If several baseline runs share a key the last one wins.
 *
 * @param candidate - candidate runs (drive the output order)
 * @param baseline - baseline runs
 * @param scoreField - outcome property to read
 * @returns `{ before, after }` index-aligned arrays (baseline, candidate)
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;
  const baselineByKey = /* @__PURE__ */ new Map();
  for (const run of baseline) {
    const score = run.outcome[scoreField];
    if (Number.isFinite(score)) {
      baselineByKey.set(keyOf(run), score);
    }
  }
  const before = [];
  const after = [];
  for (const run of candidate) {
    const score = run.outcome[scoreField];
    if (!Number.isFinite(score)) continue;
    const matched = baselineByKey.get(keyOf(run));
    if (matched !== void 0) {
      before.push(matched);
      after.push(score);
    }
  }
  return { before, after };
}
|
|
10605
|
+
/**
 * Renders summary rows as a Markdown table.
 *
 * Non-finite q-values and Cohen's d values are shown as an em dash. The CI
 * column header is the fixed text "95% CI".
 *
 * @param rows - rows with candidateId, n, mean, ciLow, ciHigh, qValue, cohensD
 * @param comparator - candidate id shown in the title, or a falsy value
 * @param split - split name shown in the title
 * @returns the table as a newline-joined string
 */
function renderSummaryTableMarkdown(rows, comparator, split) {
  const orDash = (value, digits) => (Number.isFinite(value) ? value.toFixed(digits) : "\u2014");
  const versus = comparator ? ` (vs ${comparator})` : "";
  const header = [
    `Summary Table \u2014 ${split} split${versus}`,
    "",
    "| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |",
    "|---|---:|---:|---|---:|---:|"
  ];
  const body = rows.map((r) => {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    const q = orDash(r.qValue, 4);
    const d = orDash(r.cohensD, 3);
    return `| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`;
  });
  return [...header, ...body].join("\n");
}
|
|
10620
|
+
/**
 * Builds cost-versus-quality points per candidate and marks the Pareto frontier.
 *
 * Each point averages `costUsd` and the split's score over the candidate's
 * runs that carry a finite score. A point is on the frontier when no other
 * point dominates it (see `dominates2`).
 *
 * @param runs - run records
 * @param opts - `split` ("holdout" default) and optional `gateDecisions`
 *               keyed by candidate id
 * @returns `{ kind: "pareto-cost-quality", split, axes, points }`
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const buckets = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (!Number.isFinite(score)) continue;
    let bucket = buckets.get(run.candidateId);
    if (bucket === void 0) {
      bucket = { cost: [], quality: [] };
      buckets.set(run.candidateId, bucket);
    }
    bucket.cost.push(run.costUsd);
    bucket.quality.push(score);
  }

  const points = Array.from(buckets, ([candidateId, bucket]) => ({
    candidateId,
    cost: avg(bucket.cost),
    quality: avg(bucket.quality),
    n: bucket.cost.length,
    onFrontier: false,
    gate: opts.gateDecisions?.[candidateId] ? gateLabel(opts.gateDecisions[candidateId]) : void 0
  }));

  for (const point of points) {
    point.onFrontier = points.every((other) => other === point || !dominates2(other, point));
  }

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
|
|
10654
|
+
/**
 * Pareto dominance on (cost, quality): `a` dominates `b` when it is no more
 * expensive and no lower in quality, and strictly better on at least one axis.
 *
 * @returns `true` when `a` dominates `b`
 */
function dominates2(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  const strictlyBetter = a.cost < b.cost || a.quality > b.quality;
  return noWorse && strictlyBetter;
}
|
|
10657
|
+
/**
 * Maps a gate decision to a short label.
 *
 * @param d - decision with `promote` and `rejectionCode`
 * @returns "promote", "reject_few_runs", "reject_negative_delta",
 *          "reject_overfit_gap", or `null` for any other rejection code
 */
function gateLabel(d) {
  if (d.promote) {
    return "promote";
  }
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
|
|
10664
|
+
/**
 * Histogram of paired score gains (candidate minus comparator) for one split.
 *
 * Runs are paired via `pairScoresByKey`. The histogram range is symmetric
 * around zero, `[-bound, +bound]`, where `bound` is the largest absolute
 * delta (at least 1e-6), split into `bins` equal-width buckets. The median
 * comes from the sorted deltas and the interval from `pairedBootstrap` with
 * `statistic: "median"`.
 *
 * @param runs - run records containing both candidates
 * @param candidateId - candidate whose gains are measured
 * @param comparator - candidate id used as the baseline
 * @param opts - `split` ("holdout" default), `bins` (11), `confidence` (0.95),
 *               `resamples` (2000), `seed`
 * @returns `{ kind: "gain-distribution", candidateId, comparator, split, n,
 *          bins, median, ci }`; with no pairs, `n` is 0, `bins` is empty and
 *          median / ci are zeros
 * @throws {Error} when `bins` is below 1, or is NaN / infinite
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");
  // NaN slips past the `< 1` check and would crash on `bins[NaN].count`;
  // Infinity would make the bin-building loop below run forever.
  if (!(binCount < Infinity)) {
    throw new Error("gainHistogram: bins must be a finite number");
  }
  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }
  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  // Symmetric range keeps zero gain centred; the floor avoids a zero-width range.
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;
  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    let idx = Math.floor((d - lo) / width);
    // Clamp so the maximum delta (d === hi) lands in the last bin.
    if (idx < 0) idx = 0;
    if (idx >= binCount) idx = binCount - 1;
    bins[idx].count += 1;
  }
  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
|
|
10721
|
+
/**
 * Arithmetic mean.
 *
 * @param xs - numbers to average
 * @returns the mean, or NaN for an empty array
 */
function avg(xs) {
  const count = xs.length;
  if (count === 0) {
    return Number.NaN;
  }
  const sum = xs.reduce((running, value) => running + value, 0);
  return sum / count;
}
|
|
10725
|
+
/**
 * Median of an array that is ALREADY sorted ascending (no sorting is done here).
 *
 * @param sorted - pre-sorted numbers
 * @returns the middle value, the mean of the two middle values for even
 *          lengths, or 0 for an empty array
 */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) {
    return 0;
  }
  const mid = Math.floor(count / 2);
  if (count % 2 === 1) {
    return sorted[mid];
  }
  return (sorted[mid - 1] + sorted[mid]) / 2;
}
|
|
10730
|
+
/**
 * Formats a number with four decimals; non-finite inputs fall back to `String(x)`.
 *
 * @returns e.g. "0.2500", "NaN"
 */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
10734
|
+
|
|
10735
|
+
// src/canary.ts
|
|
10736
|
+
/**
 * Runs the canary detectors and tallies alerts by kind.
 *
 * `detectSilentFallback` and `detectCalibrationDrift` always run (with `{}`
 * when their options are absent); `detectDistributionShift` runs only when
 * `opts.distributionShift` is provided.
 *
 * @param runs - run records, passed to each detector
 * @param opts - per-detector options
 * @returns `{ alerts, counts }` with alerts in detector order
 */
function runCanaries(runs, opts = {}) {
  const alerts = [];
  for (const alert of detectSilentFallback(runs, opts.silentFallback ?? {})) {
    alerts.push(alert);
  }
  for (const alert of detectCalibrationDrift(runs, opts.calibrationDrift ?? {})) {
    alerts.push(alert);
  }
  if (opts.distributionShift) {
    for (const alert of detectDistributionShift(runs, opts.distributionShift)) {
      alerts.push(alert);
    }
  }
  const counts = {
    silent_judge_fallback: 0,
    judge_calibration_drift: 0,
    distribution_shift: 0
  };
  for (const { kind } of alerts) {
    counts[kind] += 1;
  }
  return { alerts, counts };
}
|
|
10750
|
+
/**
 * Flags streaks of runs whose judge appears to have silently fallen back.
 *
 * A run counts as a fallback when `judgeMetadata.fallback === true` or its
 * confidence is within `epsilon` of the fallback `constant`. A run without
 * `judgeMetadata`, or a non-fallback run, ends the current streak.
 *
 * An alert is pushed for EVERY run at which the streak length is at or above
 * the threshold, so a streak of 5 with threshold 3 yields three alerts.
 * NOTE(review): confirm this is intended rather than one alert per streak.
 *
 * @param runs - run records in chronological order (assumed — TODO confirm)
 * @param opts - `constant` (0.3), `consecutiveThreshold` (3), `epsilon` (1e-9)
 * @returns array of "silent_judge_fallback" alerts
 */
function detectSilentFallback(runs, opts) {
  const constant = opts.constant ?? 0.3;
  const threshold = opts.consecutiveThreshold ?? 3;
  const eps = opts.epsilon ?? 1e-9;
  const alerts = [];
  // The streak length is always the number of confidences collected so far.
  let firstRunId = null;
  let confidences = [];
  for (const run of runs) {
    const meta = run.judgeMetadata;
    const isFallback = Boolean(meta) && (meta.fallback === true || Math.abs(meta.confidence - constant) <= eps);
    if (!isFallback) {
      firstRunId = null;
      confidences = [];
      continue;
    }
    if (confidences.length === 0) {
      firstRunId = run.runId;
    }
    confidences.push(meta.confidence);
    const streak = confidences.length;
    if (streak >= threshold) {
      alerts.push({
        kind: "silent_judge_fallback",
        severity: "error",
        message: `silent judge fallback: ${streak} consecutive run(s) at confidence\u2248${constant} or fallback=true`,
        evidence: {
          streakLength: streak,
          firstRunId,
          lastRunId: run.runId,
          // Only the most recent ten confidences are attached.
          confidences: confidences.slice(-10),
          fallbackConstant: constant
        }
      });
    }
  }
  return alerts;
}
|
|
10797
|
+
/**
 * Compares recent judge confidences with earlier ones using a two-sample
 * KS statistic from `ksTwoSample`.
 *
 * The finite confidences are split into a `recent` tail (up to `recentWindow`)
 * and the `historical` values just before it (up to `historyWindow`). Nothing
 * is reported unless both samples have at least `minRecent` values. The
 * critical value is `c * sqrt((n1 + n2) / (n1 * n2))` with `c` chosen from
 * the alpha bands 0.01 / 0.05 / 0.1 (1.63 / 1.36 / 1.22, else 1).
 *
 * @param runs - run records in chronological order (assumed — TODO confirm)
 * @param opts - `historyWindow` (50), `recentWindow` (20), `ksAlpha` (0.05),
 *               `minRecent` (10)
 * @returns zero or one "judge_calibration_drift" alert
 */
function detectCalibrationDrift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.ksAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;

  const conf = [];
  for (const run of runs) {
    const value = run.judgeMetadata?.confidence;
    if (Number.isFinite(value)) {
      conf.push(value);
    }
  }
  if (conf.length < minRecent + 1) {
    return [];
  }

  const recent = conf.slice(-Math.min(recentWindow, conf.length));
  const historical = conf.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) {
    return [];
  }

  const ks = ksTwoSample(recent, historical);
  let c = 1;
  if (alpha <= 0.01) {
    c = 1.63;
  } else if (alpha <= 0.05) {
    c = 1.36;
  } else if (alpha <= 0.1) {
    c = 1.22;
  }
  const critical = c * Math.sqrt((recent.length + historical.length) / (recent.length * historical.length));
  if (!(ks.d > critical)) {
    return [];
  }
  return [
    {
      kind: "judge_calibration_drift",
      severity: "warn",
      message: `judge calibration drift: KS D=${ks.d.toFixed(4)} exceeds critical=${critical.toFixed(4)} at alpha=${alpha} (recent n=${recent.length}, history n=${historical.length})`,
      evidence: {
        ksD: ks.d,
        critical,
        alpha,
        recentN: recent.length,
        historyN: historical.length,
        recentMean: mean6(recent),
        historyMean: mean6(historical)
      }
    }
  ];
}
|
|
10835
|
+
/**
 * Two-sample Kolmogorov-Smirnov statistic: the largest absolute gap between
 * the empirical CDFs of `a` and `b`.
 *
 * Tied values are handled by moving BOTH cursors past every element that is
 * <= the current smallest value before the gap is measured. Measuring the gap
 * part-way through a run of equal values overstates D (for example `[1,1,1]`
 * vs `[1]` must give 0, not 2/3).
 *
 * NaN entries are dropped up front because they compare false against
 * everything and would stall the merge loop. The inputs are not mutated.
 *
 * @param a - first sample
 * @param b - second sample
 * @returns `{ d }` with `d` in [0, 1]; `d` is 0 when either sample is empty
 */
function ksTwoSample(a, b) {
  const ascending = (x, y) => x - y;
  const sortedA = a.filter((v) => !Number.isNaN(v)).sort(ascending);
  const sortedB = b.filter((v) => !Number.isNaN(v)).sort(ascending);
  const n1 = sortedA.length;
  const n2 = sortedB.length;
  let i = 0;
  let j = 0;
  let d = 0;
  while (i < n1 && j < n2) {
    const x = Math.min(sortedA[i], sortedB[j]);
    // Consume the whole tie group at x in both samples, then compare CDFs.
    while (i < n1 && sortedA[i] <= x) i++;
    while (j < n2 && sortedB[j] <= x) j++;
    const diff = Math.abs(i / n1 - j / n2);
    if (diff > d) d = diff;
  }
  return { d };
}
|
|
10853
|
+
/**
 * Chi-square test for a change in the category mix of recent runs.
 *
 * Each run is bucketed by `opts.category(run)`; runs whose bucket is not a
 * non-empty string are ignored. The `recent` tail (up to `recentWindow`) is
 * compared with the `historical` runs just before it (up to `historyWindow`).
 * Expected counts are the historical proportions scaled to the recent sample
 * size; buckets with an expected count below 1 are left out of the statistic.
 * Degrees of freedom are the number of contributing buckets minus one, floored
 * at 1.
 *
 * @param runs - run records in chronological order (assumed — TODO confirm)
 * @param opts - `category` (required function), `historyWindow` (50),
 *               `recentWindow` (20), `chiSquareAlpha` (0.05), `minRecent` (10)
 * @returns zero or one "distribution_shift" alert
 */
function detectDistributionShift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.chiSquareAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;
  const categorize = opts.category;

  const tagged = [];
  for (const run of runs) {
    const bucket = categorize(run);
    if (typeof bucket === "string" && bucket.length > 0) {
      tagged.push({ run, bucket });
    }
  }
  if (tagged.length < minRecent + 1) {
    return [];
  }

  const recent = tagged.slice(-Math.min(recentWindow, tagged.length));
  const historical = tagged.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) {
    return [];
  }

  const seen = /* @__PURE__ */ new Set();
  for (const item of recent) seen.add(item.bucket);
  for (const item of historical) seen.add(item.bucket);
  const bucketList = [...seen].sort();

  // Zero-fill in sorted bucket order first so both tallies share key order.
  const recentCounts = {};
  const histCounts = {};
  for (const name of bucketList) {
    recentCounts[name] = 0;
    histCounts[name] = 0;
  }
  for (const item of recent) recentCounts[item.bucket] += 1;
  for (const item of historical) histCounts[item.bucket] += 1;

  let chi = 0;
  let contributing = 0;
  for (const name of bucketList) {
    const expected = histCounts[name] / historical.length * recent.length;
    if (expected < 1) continue;
    const observed = recentCounts[name];
    chi += (observed - expected) ** 2 / expected;
    contributing += 1;
  }
  const df = Math.max(1, contributing - 1);
  const critical = chiSquareCritical(df, alpha);
  if (!(chi > critical)) {
    return [];
  }
  return [
    {
      kind: "distribution_shift",
      severity: "warn",
      message: `eval-set distribution shift: \u03C7\xB2=${chi.toFixed(2)} df=${df} exceeds critical=${critical.toFixed(2)} at alpha=${alpha}`,
      evidence: {
        chi,
        df,
        critical,
        alpha,
        recentCounts,
        historicalCounts: histCounts,
        recentN: recent.length,
        historyN: historical.length
      }
    }
  ];
}
|
|
10912
|
+
/**
 * Upper-tail chi-square critical value from a small lookup table.
 *
 * Columns correspond to alpha bands: >= 0.1, >= 0.05, >= 0.025, otherwise the
 * last column. Exact table rows are returned directly; `df > 30` uses the
 * cube formula below; other values between table rows are linearly
 * interpolated; anything else (e.g. `df < 1`) falls back to the df = 10 row.
 *
 * @param df - degrees of freedom
 * @param alpha - significance level
 * @returns the critical value
 */
function chiSquareCritical(df, alpha) {
  const TABLE = {
    1: [2.71, 3.84, 5.02, 6.63],
    2: [4.61, 5.99, 7.38, 9.21],
    3: [6.25, 7.81, 9.35, 11.34],
    4: [7.78, 9.49, 11.14, 13.28],
    5: [9.24, 11.07, 12.83, 15.09],
    6: [10.64, 12.59, 14.45, 16.81],
    7: [12.02, 14.07, 16.01, 18.48],
    8: [13.36, 15.51, 17.53, 20.09],
    9: [14.68, 16.92, 19.02, 21.67],
    10: [15.99, 18.31, 20.48, 23.21],
    15: [22.31, 25, 27.49, 30.58],
    20: [28.41, 31.41, 34.17, 37.57],
    25: [34.38, 37.65, 40.65, 44.31],
    30: [40.26, 43.77, 46.98, 50.89]
  };
  let column = 3;
  if (alpha >= 0.1) {
    column = 0;
  } else if (alpha >= 0.05) {
    column = 1;
  } else if (alpha >= 0.025) {
    column = 2;
  }

  const exactRow = TABLE[df];
  if (exactRow) {
    return exactRow[column];
  }

  if (df > 30) {
    // Normal quantiles matching the four alpha columns.
    const z = [1.282, 1.645, 1.96, 2.326][column] ?? 1.96;
    const term = 1 - 2 / (9 * df) + z * Math.sqrt(2 / (9 * df));
    return df * term ** 3;
  }

  const keys = Object.keys(TABLE).map(Number).sort((a, b) => a - b);
  for (let i = 1; i < keys.length; i++) {
    const lo = keys[i - 1];
    const hi = keys[i];
    if (df >= lo && df <= hi) {
      const t = (df - lo) / (hi - lo);
      return TABLE[lo][column] * (1 - t) + TABLE[hi][column] * t;
    }
  }
  return TABLE[10][column];
}
|
|
10948
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
10952
|
+
|
|
10953
|
+
// src/benchmarks/types.ts
|
|
10954
|
+
/**
 * 32-bit FNV-1a hash of a string.
 *
 * Only the low 8 bits of each UTF-16 code unit are mixed in, so code units
 * that differ only above bit 7 hash identically.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
function fnv1a32(input) {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let hash = OFFSET_BASIS;
  for (let pos = 0; pos < input.length; pos += 1) {
    hash ^= input.charCodeAt(pos) & 255;
    // 32-bit wrapping multiply by the FNV prime, reinterpreted as unsigned.
    hash = Math.imul(hash, PRIME) >>> 0;
  }
  return hash >>> 0;
}
|
|
10962
|
+
/** Default seed mixed into every split hash. */
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";

/**
 * Assigns an item to a split from a hash of `seed` and `itemId`.
 *
 * The hash is scaled into [0, 1): values below 0.6 map to "search", values
 * below 0.8 to "dev", and the rest to "holdout".
 *
 * @param {string} itemId - Identifier of the benchmark item.
 * @param {string} [seed=BENCHMARK_SPLIT_SEED] - Seed prefixed to the id.
 * @returns {"search" | "dev" | "holdout"} The split name.
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  const unit = fnv1a32(`${seed}::${itemId}`) / 4294967296;
  if (unit >= 0.8) {
    return "holdout";
  }
  return unit < 0.6 ? "search" : "dev";
}
|
|
10970
|
+
|
|
10971
|
+
// src/benchmarks/index.ts
|
|
10972
|
+
var benchmarks_exports = {};
|
|
10973
|
+
__export(benchmarks_exports, {
|
|
10974
|
+
BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
|
|
10975
|
+
deterministicSplit: () => deterministicSplit,
|
|
10976
|
+
gsm8k: () => gsm8k_exports,
|
|
10977
|
+
routing: () => routing_exports,
|
|
10978
|
+
swebenchLite: () => swebench_lite_exports
|
|
10979
|
+
});
|
|
10980
|
+
|
|
10981
|
+
// src/benchmarks/gsm8k/index.ts
|
|
10982
|
+
var gsm8k_exports = {};
|
|
10983
|
+
__export(gsm8k_exports, {
|
|
10984
|
+
Gsm8kAdapter: () => Gsm8kAdapter,
|
|
10985
|
+
assignSplit: () => assignSplit,
|
|
10986
|
+
evaluate: () => evaluate,
|
|
10987
|
+
loadDataset: () => loadDataset,
|
|
10988
|
+
parseGsm8kAnswer: () => parseGsm8kAnswer
|
|
10989
|
+
});
|
|
10990
|
+
import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
|
|
10991
|
+
/**
 * Benchmark adapter for GSM8K: loads items from a JSONL file named by the
 * AGENT_EVAL_GSM8K_PATH environment variable and grades responses by
 * numeric match against the reference answer.
 */
var Gsm8kAdapter = class {
  /**
   * Loads the items that belong to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: {question: string, answer: string}}>>}
   * @throws {Error} When the environment variable is unset or the file is missing.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH;
    if (!datasetPath) {
      throw new Error(
        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
      );
    }
    if (!existsSync5(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`);
    }
    const selected = [];
    for (const record of parseJsonl(datasetPath)) {
      if (assignSplitImpl(record.id) === split) {
        selected.push(record);
      }
    }
    return selected;
  }

  /**
   * Scores a response 1 when its parsed number is within 1e-6 of the parsed
   * reference answer, otherwise 0.
   *
   * @param {{payload: {answer: string}}} item - Benchmark item.
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>} Score plus grading details.
   */
  async evaluate(item, response) {
    const reference = item.payload.answer;
    const expected = parseGsm8kAnswer(reference);
    const observed = parseGsm8kAnswer(response);
    if (expected === null) {
      return { score: 0, raw: { reason: "reference_not_numeric", expected: reference } };
    }
    if (observed === null) {
      return { score: 0, raw: { reason: "no_numeric_in_response", expected, observed: null } };
    }
    const exactMatch = Math.abs(expected - observed) < 1e-6;
    return { score: exactMatch ? 1 : 0, raw: { expected, observed, exactMatch } };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split assigned to the item.
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
|
|
11021
|
+
/**
 * Split assignment for a GSM8K item; the id is prefixed with "gsm8k::"
 * before it is handed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split name.
 */
function assignSplitImpl(itemId) {
  const namespacedId = `gsm8k::${itemId}`;
  return deterministicSplit(namespacedId);
}
|
|
11024
|
+
/**
 * Reads a GSM8K JSONL file into benchmark items.
 *
 * Blank lines are skipped. A record without an `id` gets `gsm8k_<lineNo>`.
 * `question` and `answer` are coerced to strings.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: {question: string, answer: string}}>}
 * @throws {Error} With the 1-based line number when a line is not valid
 *   JSON, is not a JSON object, or lacks a question or answer.
 */
function parseJsonl(path) {
  const records = [];
  const lines = readFileSync5(path, "utf8").split("\n");
  for (const [index, line] of lines.entries()) {
    const lineNo = index + 1;
    const trimmed = line.trim();
    if (!trimmed) continue;

    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${reason}`);
    }
    // `null` (and other non-objects) are valid JSON but not records; reject
    // them here so the failure names the line instead of surfacing as a
    // TypeError on property access.
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`);
    }

    const id = String(row.id ?? `gsm8k_${lineNo}`);
    const question = String(row.question ?? "");
    const answer = String(row.answer ?? "");
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
    }
    records.push({ id, payload: { question, answer } });
  }
  return records;
}
|
|
11048
|
+
/**
 * Extracts the numeric answer from GSM8K-style text.
 *
 * A number following a `####` marker wins; otherwise the last number in the
 * text is used. Thousands separators (commas) are removed before conversion.
 * Numbers written with a leading decimal point (".5", "-.25") are recognised
 * as long as the point is not directly preceded by a digit, so dotted
 * sequences such as "12.05.2024" still yield their final integer part.
 *
 * @param {string} text - Reference answer or model response.
 * @returns {number | null} The parsed number, or null when `text` is empty,
 *   is not a string, or contains no number.
 */
function parseGsm8kAnswer(text) {
  if (typeof text !== "string" || text === "") return null;

  // Strips thousands separators and converts; null when not a finite number.
  const toNumber = (token) => {
    const value = Number(token.replace(/,/g, ""));
    return Number.isFinite(value) ? value : null;
  };

  const afterMarker = text.match(/####\s*(-?(?:\d[\d,]*\.?\d*|(?<!\d)\.\d+))/);
  if (afterMarker) {
    const marked = toNumber(afterMarker[1]);
    if (marked !== null) return marked;
  }

  const candidates = text.match(/-?(?:\d[\d,]*\.?\d*|(?<!\d)\.\d+)/g);
  if (!candidates || candidates.length === 0) return null;
  return toNumber(candidates[candidates.length - 1]);
}
|
|
11063
|
+
var adapter = new Gsm8kAdapter();
|
|
11064
|
+
var loadDataset = adapter.loadDataset.bind(adapter);
|
|
11065
|
+
var evaluate = adapter.evaluate.bind(adapter);
|
|
11066
|
+
var assignSplit = adapter.assignSplit.bind(adapter);
|
|
11067
|
+
|
|
11068
|
+
// src/benchmarks/swebench-lite/index.ts
|
|
11069
|
+
var swebench_lite_exports = {};
|
|
11070
|
+
__export(swebench_lite_exports, {
|
|
11071
|
+
SweBenchLiteAdapter: () => SweBenchLiteAdapter,
|
|
11072
|
+
assignSplit: () => assignSplit2,
|
|
11073
|
+
evaluate: () => evaluate2,
|
|
11074
|
+
loadDataset: () => loadDataset2
|
|
11075
|
+
});
|
|
11076
|
+
import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
|
|
11077
|
+
import { spawn } from "child_process";
|
|
11078
|
+
/**
 * Benchmark adapter for SWE-Bench Lite. Instances come from the JSONL file
 * named by AGENT_EVAL_SWEBENCH_PATH; grading is delegated to the external
 * command named by AGENT_EVAL_SWEBENCH_GRADER_CMD.
 */
var SweBenchLiteAdapter = class {
  /**
   * Loads the instances that belong to `split`.
   *
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: object}>>}
   * @throws {Error} When the environment variable is unset or the file is missing.
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_SWEBENCH_PATH;
    if (!datasetPath) {
      throw new Error(
        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
      );
    }
    if (!existsSync6(datasetPath)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${datasetPath} does not exist`);
    }
    const selected = [];
    for (const instance of parseJsonl2(datasetPath)) {
      if (assignSplitImpl2(instance.id) === split) {
        selected.push(instance);
      }
    }
    return selected;
  }

  /**
   * Sends `{instance_id, patch}` as JSON to the grader's stdin and turns the
   * JSON it prints into a score (1 when `passed` is truthy, else 0).
   *
   * @param {{payload: {instanceId: string}}} item - Instance being graded.
   * @param {string} response - Candidate patch.
   * @returns {Promise<{score: number, raw: object}>} Score plus grader details;
   *   the grader log is truncated to 4000 characters.
   * @throws {Error} When the grader command is unset or prints non-JSON.
   */
  async evaluate(item, response) {
    const graderCmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
    if (!graderCmd) {
      throw new Error(
        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
      );
    }
    const request = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
    const { stdout, stderr } = await runGrader(graderCmd, request);

    let verdict;
    try {
      verdict = JSON.parse(stdout);
    } catch (e) {
      const detail = [
        `SWE-Bench grader emitted non-JSON stdout: ${e.message}`,
        `stdout=${stdout.slice(0, 400)}`,
        `stderr=${stderr.slice(0, 400)}`
      ].join("\n");
      throw new Error(detail);
    }

    const passed = Boolean(verdict.passed);
    const raw = {
      passed,
      failToPassPassed: Boolean(verdict.fail_to_pass_passed),
      passToPassPassed: Boolean(verdict.pass_to_pass_passed),
      graderLog: typeof verdict.log === "string" ? verdict.log.slice(0, 4000) : ""
    };
    return { score: passed ? 1 : 0, raw };
  }

  /**
   * @param {string} itemId - Instance identifier.
   * @returns {string} Split assigned to the instance.
   */
  assignSplit(itemId) {
    return assignSplitImpl2(itemId);
  }
};
|
|
11126
|
+
/**
 * Split assignment for a SWE-Bench Lite instance; the id is prefixed with
 * "swebench-lite::" before it is handed to `deterministicSplit`.
 *
 * @param {string} itemId - Instance identifier.
 * @returns {string} Split name.
 */
function assignSplitImpl2(itemId) {
  const namespacedId = `swebench-lite::${itemId}`;
  return deterministicSplit(namespacedId);
}
|
|
11129
|
+
/**
 * Reads a SWE-Bench Lite JSONL file into benchmark items.
 *
 * Blank lines are skipped. Both snake_case and camelCase field names are
 * accepted; missing text fields become empty strings.
 *
 * @param {string} path - Path of the JSONL file.
 * @returns {Array<{id: string, payload: object}>} One item per record.
 * @throws {Error} With the 1-based line number when a line is not valid
 *   JSON, is not a JSON object, or has no instance id.
 */
function parseJsonl2(path) {
  const instances = [];
  const lines = readFileSync6(path, "utf8").split("\n");
  for (const [index, line] of lines.entries()) {
    const lineNo = index + 1;
    const trimmed = line.trim();
    if (!trimmed) continue;

    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Report which line is malformed rather than a bare SyntaxError.
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${reason}`);
    }
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`);
    }

    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
    }
    instances.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
        repo: String(row.repo ?? ""),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
      }
    });
  }
  return instances;
}
|
|
11156
|
+
/**
 * Normalises a test-list field to an array of strings.
 *
 * Accepts an array (non-string members are dropped), a JSON-encoded array,
 * or a single bare identifier. A bare identifier is kept as a one-element
 * list whether or not it happens to be valid JSON (for example "123"), and
 * a JSON-quoted string is unquoted. Blank strings, JSON objects and JSON
 * `null` yield an empty list.
 *
 * @param {unknown} v - Raw field value.
 * @returns {string[]} The normalised list; empty when nothing usable is present.
 */
function asStringArray(v) {
  const onlyStrings = (list) => list.filter((x) => typeof x === "string");

  if (Array.isArray(v)) return onlyStrings(v);
  if (typeof v !== "string" || v.trim() === "") return [];

  let parsed;
  try {
    parsed = JSON.parse(v);
  } catch {
    // Not JSON at all: treat the text as a single identifier.
    return [v];
  }
  if (Array.isArray(parsed)) return onlyStrings(parsed);
  if (typeof parsed === "string") return [parsed];
  // JSON objects and null carry no identifiers; numbers and booleans are
  // bare tokens that merely look like JSON, so keep the original text.
  if (typeof parsed === "object") return [];
  return [v];
}
|
|
11168
|
+
/**
 * Runs the grader command, feeds it `stdin`, and collects its output.
 *
 * The command string is split on whitespace into an executable and its
 * arguments (no shell is involved, so quoting is not interpreted).
 *
 * @param {string} cmd - Executable followed by whitespace-separated arguments.
 * @param {string} stdin - Payload written to the child's standard input.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves when the
 *   child exits with code 0.
 * @throws {Error} Rejects when the command is blank, cannot be spawned,
 *   exits non-zero, or is terminated by a signal; the message carries the
 *   first 400 characters of stderr.
 */
function runGrader(cmd, stdin) {
  return new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise yield an empty executable.
    const [file, ...args] = cmd.trim().split(/\s+/);
    if (!file) {
      reject(new Error("grader command is empty"));
      return;
    }

    const child = spawn(file, args, { stdio: ["pipe", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level so multi-byte characters that straddle a
    // chunk boundary are not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code !== 0) {
        const how = code === null ? `code ${code} (signal ${signal})` : `code ${code}`;
        reject(new Error(`grader exited with ${how}: ${stderr.slice(0, 400)}`));
        return;
      }
      resolve({ stdout, stderr });
    });

    // A grader that exits without draining stdin makes the pipe emit EPIPE.
    // Without a listener that is an uncaught exception; the exit status from
    // the "close" handler is the meaningful outcome, so only other errors
    // are surfaced.
    child.stdin.on("error", (err) => {
      if (err.code !== "EPIPE") reject(err);
    });
    child.stdin.end(stdin);
  });
}
|
|
11188
|
+
var adapter2 = new SweBenchLiteAdapter();
|
|
11189
|
+
var loadDataset2 = adapter2.loadDataset.bind(adapter2);
|
|
11190
|
+
var evaluate2 = adapter2.evaluate.bind(adapter2);
|
|
11191
|
+
var assignSplit2 = adapter2.assignSplit.bind(adapter2);
|
|
11192
|
+
|
|
11193
|
+
// src/benchmarks/routing/index.ts
|
|
11194
|
+
var routing_exports = {};
|
|
11195
|
+
__export(routing_exports, {
|
|
11196
|
+
ROUTING_DATASET: () => ROUTING_DATASET,
|
|
11197
|
+
RoutingAdapter: () => RoutingAdapter,
|
|
11198
|
+
assignSplit: () => assignSplit3,
|
|
11199
|
+
evaluate: () => evaluate3,
|
|
11200
|
+
extractRouteTokens: () => extractRouteTokens,
|
|
11201
|
+
loadDataset: () => loadDataset3
|
|
11202
|
+
});
|
|
11203
|
+
|
|
11204
|
+
// src/benchmarks/routing/dataset.ts
|
|
11205
|
+
/**
 * Built-in routing benchmark: four prompts in each of the file, math,
 * search and chat categories. Every record names the expected route, the
 * synonyms that are also accepted, and the routes recorded as hard negatives.
 */
var ROUTING_DATASET = (() => {
  // Builds one record; the key order matches the serialized record layout.
  const routingCase = (id, category, prompt, route, synonyms, hardNegatives) => ({
    id,
    category,
    prompt,
    route,
    synonyms,
    hardNegatives
  });

  return [
    routingCase(
      "file_001",
      "file",
      "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
      "fs.write",
      ["filesystem.write", "write_file"],
      ["fs.read", "chat.reply"]
    ),
    routingCase(
      "file_002",
      "file",
      "Read the contents of /etc/hosts and summarize the entries.",
      "fs.read",
      ["filesystem.read", "read_file"],
      ["fs.write", "search.web"]
    ),
    routingCase(
      "file_003",
      "file",
      "List every Python file under src/ recursively.",
      "fs.list",
      ["filesystem.list", "list_files"],
      ["fs.read", "search.code"]
    ),
    routingCase(
      "file_004",
      "file",
      "Delete the cached build at .turbo/cache.",
      "fs.delete",
      ["filesystem.delete", "remove_file"],
      ["fs.write", "fs.list"]
    ),
    routingCase(
      "math_001",
      "math",
      "What is the integral of 3x^2 + 2x from 0 to 5?",
      "math.integral",
      ["calculator.integral", "math.solve"],
      ["math.derivative", "chat.reply"]
    ),
    routingCase(
      "math_002",
      "math",
      "Compute the derivative of sin(x) * cos(x).",
      "math.derivative",
      ["calculator.derivative", "math.solve"],
      ["math.integral", "math.algebra"]
    ),
    routingCase(
      "math_003",
      "math",
      "Solve 2x + 7 = 19 for x.",
      "math.algebra",
      ["calculator.algebra", "math.solve"],
      ["math.derivative", "math.integral"]
    ),
    routingCase(
      "math_004",
      "math",
      "What is the prime factorization of 360?",
      "math.numbertheory",
      ["calculator.factor", "math.solve"],
      ["math.algebra", "search.web"]
    ),
    routingCase(
      "search_001",
      "search",
      "Find recent papers on agent prompt optimization with held-out promotion gates.",
      "search.web",
      ["web.search", "search.papers"],
      ["search.code", "chat.reply"]
    ),
    routingCase(
      "search_002",
      "search",
      "Search the codebase for every call site of `runProposeReview`.",
      "search.code",
      ["code.search", "grep"],
      ["search.web", "fs.read"]
    ),
    routingCase(
      "search_003",
      "search",
      "What is the latest release of the Tangle network on GitHub?",
      "search.web",
      ["web.search", "github.releases"],
      ["search.code", "chat.reply"]
    ),
    routingCase(
      "search_004",
      "search",
      "Find all TODO comments in the agent-eval src tree.",
      "search.code",
      ["code.search", "grep"],
      ["search.web", "fs.list"]
    ),
    routingCase(
      "chat_001",
      "chat",
      "Hi there, how are you doing today?",
      "chat.reply",
      ["conversation.reply"],
      ["search.web", "fs.read"]
    ),
    routingCase(
      "chat_002",
      "chat",
      "Please explain the difference between an LLM and a foundation model.",
      "chat.reply",
      ["conversation.reply", "qa.answer"],
      ["search.web", "math.algebra"]
    ),
    routingCase(
      "chat_003",
      "chat",
      "Tell me a short joke about distributed systems.",
      "chat.reply",
      ["conversation.reply"],
      ["search.web", "fs.read"]
    ),
    routingCase(
      "chat_004",
      "chat",
      "Acknowledge my last message with a thumbs up.",
      "chat.reply",
      ["conversation.reply", "react"],
      ["fs.write", "search.web"]
    )
  ];
})();
|
|
11335
|
+
|
|
11336
|
+
// src/benchmarks/routing/index.ts
|
|
11337
|
+
/**
 * Benchmark adapter for the built-in routing dataset. A response scores 1
 * when it names the expected route or one of its accepted synonyms.
 */
var RoutingAdapter = class {
  /**
   * @param {string} split - Split name to keep.
   * @returns {Promise<Array<{id: string, payload: object}>>} Dataset records
   *   assigned to `split`, each wrapped as `{id, payload}`.
   */
  async loadDataset(split) {
    return ROUTING_DATASET
      .filter((record) => assignSplitImpl3(record.id) === split)
      .map((record) => ({ id: record.id, payload: record }));
  }

  /**
   * Grades a response against the item's route, synonyms and hard negatives.
   *
   * Dotted tokens (`fs.write`) are taken from `extractRouteTokens`. Accepted
   * labels that contain no dot (`write_file`, `grep`, ...) can never appear
   * among those tokens, so they are matched separately as whole identifiers
   * that are not part of a longer or dotted name. Comparison is
   * case-insensitive.
   *
   * @param {{payload: {route: string, synonyms: string[], hardNegatives: string[], category: string}}} item
   * @param {string} response - Model output to grade.
   * @returns {Promise<{score: number, raw: object}>} Score (1 or 0) plus the
   *   first dotted token, the matched label, and hard-negative details.
   */
  async evaluate(item, response) {
    const { route, synonyms, hardNegatives, category } = item.payload;
    const tokens2 = extractRouteTokens(response);
    const correct = new Set([route, ...synonyms].map((s) => s.toLowerCase()));
    const hardNeg = new Set(hardNegatives.map((s) => s.toLowerCase()));

    // Earliest whole-identifier occurrence of any accepted label without a dot.
    const findBareLabel = () => {
      let best = null;
      for (const label of correct) {
        if (label.includes(".")) continue;
        const escaped = label.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        const pattern = new RegExp(
          `(?<![a-z0-9_])(?<![a-z0-9_]\\.)${escaped}(?!\\.?[a-z0-9_])`,
          "i"
        );
        const hit = pattern.exec(response);
        if (hit && (best === null || hit.index < best.index)) best = hit;
      }
      return best ? best[0] : null;
    };

    const dottedMatch = tokens2.find((t) => correct.has(t.toLowerCase())) ?? null;
    const firstMatch = dottedMatch ?? findBareLabel();
    const firstHardNeg = tokens2.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
    return {
      score: firstMatch ? 1 : 0,
      raw: {
        firstToken: tokens2[0] ?? null,
        matchedRoute: firstMatch,
        hitHardNegative: Boolean(firstHardNeg),
        hardNegativeRoute: firstHardNeg,
        category
      }
    };
  }

  /**
   * @param {string} itemId - Item identifier.
   * @returns {string} Split assigned to the item.
   */
  assignSplit(itemId) {
    return assignSplitImpl3(itemId);
  }
};
|
|
11363
|
+
/**
 * Split assignment for a routing item; the id is prefixed with "routing::"
 * before it is handed to `deterministicSplit`.
 *
 * @param {string} itemId - Item identifier.
 * @returns {string} Split name.
 */
function assignSplitImpl3(itemId) {
  const namespacedId = `routing::${itemId}`;
  return deterministicSplit(namespacedId);
}
|
|
11366
|
+
/**
 * Collects every `name.name` token from a response, in order of appearance.
 * Each side of the dot starts with a letter and continues with letters,
 * digits or underscores; matching is case-insensitive.
 *
 * @param {string} response - Text to scan.
 * @returns {string[]} Matched tokens as written in the text; empty when none.
 */
function extractRouteTokens(response) {
  const routePattern = /[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi;
  const found = response.match(routePattern);
  if (found == null) {
    return [];
  }
  return found;
}
|
|
11370
|
+
var adapter3 = new RoutingAdapter();
|
|
11371
|
+
var loadDataset3 = adapter3.loadDataset.bind(adapter3);
|
|
11372
|
+
var evaluate3 = adapter3.evaluate.bind(adapter3);
|
|
11373
|
+
var assignSplit3 = adapter3.assignSplit.bind(adapter3);
|
|
11374
|
+
|
|
10069
11375
|
// src/reference-replay-steering.ts
|
|
10070
11376
|
function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
10071
11377
|
const rows = [];
|
|
@@ -10257,9 +11563,9 @@ function aggregateTrials(population, scenarioIds, trials) {
|
|
|
10257
11563
|
return {
|
|
10258
11564
|
variantId: variant.id,
|
|
10259
11565
|
scenarioId: sid,
|
|
10260
|
-
meanScore:
|
|
10261
|
-
meanCost:
|
|
10262
|
-
meanDurationMs:
|
|
11566
|
+
meanScore: mean7(gradedTrials.map((t) => t.score)),
|
|
11567
|
+
meanCost: mean7(gradedTrials.map((t) => t.cost ?? 0)),
|
|
11568
|
+
meanDurationMs: mean7(gradedTrials.map((t) => t.durationMs ?? 0)),
|
|
10263
11569
|
okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
|
|
10264
11570
|
trials: scenarioTrials.length,
|
|
10265
11571
|
metrics
|
|
@@ -10267,10 +11573,10 @@ function aggregateTrials(population, scenarioIds, trials) {
|
|
|
10267
11573
|
});
|
|
10268
11574
|
return {
|
|
10269
11575
|
variantId: variant.id,
|
|
10270
|
-
meanScore:
|
|
10271
|
-
meanCost:
|
|
10272
|
-
meanDurationMs:
|
|
10273
|
-
okRate:
|
|
11576
|
+
meanScore: mean7(scenarios.map((s) => s.meanScore)),
|
|
11577
|
+
meanCost: mean7(scenarios.map((s) => s.meanCost)),
|
|
11578
|
+
meanDurationMs: mean7(scenarios.map((s) => s.meanDurationMs)),
|
|
11579
|
+
okRate: mean7(scenarios.map((s) => s.okRate)),
|
|
10274
11580
|
scenarios,
|
|
10275
11581
|
metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
|
|
10276
11582
|
};
|
|
@@ -10287,10 +11593,10 @@ function aggregateMetrics(rows) {
|
|
|
10287
11593
|
}
|
|
10288
11594
|
}
|
|
10289
11595
|
const out = {};
|
|
10290
|
-
for (const [k, list] of buckets) out[k] =
|
|
11596
|
+
for (const [k, list] of buckets) out[k] = mean7(list);
|
|
10291
11597
|
return out;
|
|
10292
11598
|
}
|
|
10293
|
-
function
|
|
11599
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean7(xs) {
  const { length } = xs;
  return length === 0 ? 0 : xs.reduce((sum, value) => sum + value, 0) / length;
}
|
|
@@ -10331,11 +11637,11 @@ function samePopulation(a, b) {
|
|
|
10331
11637
|
}
|
|
10332
11638
|
|
|
10333
11639
|
// src/jsonl-trial-cache.ts
|
|
10334
|
-
import { appendFileSync as appendFileSync4, existsSync as
|
|
11640
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
|
|
10335
11641
|
import { dirname as dirname4 } from "path";
|
|
10336
11642
|
|
|
10337
11643
|
// src/locked-jsonl-appender.ts
|
|
10338
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
11644
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
|
|
10339
11645
|
import { dirname as dirname3 } from "path";
|
|
10340
11646
|
var mutexes = /* @__PURE__ */ new Map();
|
|
10341
11647
|
function getMutex(path) {
|
|
@@ -10350,7 +11656,7 @@ var LockedJsonlAppender = class {
|
|
|
10350
11656
|
constructor(path) {
|
|
10351
11657
|
this.path = path;
|
|
10352
11658
|
this.mutex = getMutex(path);
|
|
10353
|
-
if (!
|
|
11659
|
+
if (!existsSync7(dirname3(path))) {
|
|
10354
11660
|
mkdirSync3(dirname3(path), { recursive: true });
|
|
10355
11661
|
}
|
|
10356
11662
|
}
|
|
@@ -10375,8 +11681,8 @@ var JsonlTrialCache = class {
|
|
|
10375
11681
|
appender;
|
|
10376
11682
|
constructor(path) {
|
|
10377
11683
|
this.path = path;
|
|
10378
|
-
if (
|
|
10379
|
-
for (const line of
|
|
11684
|
+
if (existsSync8(path)) {
|
|
11685
|
+
for (const line of readFileSync7(path, "utf-8").split("\n")) {
|
|
10380
11686
|
if (!line.trim()) continue;
|
|
10381
11687
|
try {
|
|
10382
11688
|
const entry = JSON.parse(line);
|
|
@@ -10414,7 +11720,7 @@ var JsonlTrialCache = class {
|
|
|
10414
11720
|
};
|
|
10415
11721
|
|
|
10416
11722
|
// src/evolution-telemetry.ts
|
|
10417
|
-
import { appendFileSync as appendFileSync5, existsSync as
|
|
11723
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
|
|
10418
11724
|
import { dirname as dirname5 } from "path";
|
|
10419
11725
|
var MutationTelemetry = class {
|
|
10420
11726
|
appender;
|
|
@@ -10445,16 +11751,16 @@ var LineageRecorder = class {
|
|
|
10445
11751
|
this.snapshotPath = `${path}.snapshot`;
|
|
10446
11752
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
10447
11753
|
mkdirSync5(dirname5(path), { recursive: true });
|
|
10448
|
-
if (
|
|
11754
|
+
if (existsSync9(this.snapshotPath)) {
|
|
10449
11755
|
try {
|
|
10450
|
-
const parsed = JSON.parse(
|
|
11756
|
+
const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
|
|
10451
11757
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
10452
11758
|
} catch {
|
|
10453
11759
|
}
|
|
10454
11760
|
}
|
|
10455
|
-
if (
|
|
11761
|
+
if (existsSync9(path)) {
|
|
10456
11762
|
try {
|
|
10457
|
-
for (const line of
|
|
11763
|
+
for (const line of readFileSync8(path, "utf-8").split("\n")) {
|
|
10458
11764
|
if (!line.trim()) continue;
|
|
10459
11765
|
try {
|
|
10460
11766
|
const entry = JSON.parse(line);
|
|
@@ -10466,9 +11772,9 @@ var LineageRecorder = class {
|
|
|
10466
11772
|
} catch {
|
|
10467
11773
|
}
|
|
10468
11774
|
}
|
|
10469
|
-
if (
|
|
11775
|
+
if (existsSync9(path) && this.nodes.size === 0) {
|
|
10470
11776
|
try {
|
|
10471
|
-
const raw =
|
|
11777
|
+
const raw = readFileSync8(path, "utf-8").trim();
|
|
10472
11778
|
if (raw.startsWith("[")) {
|
|
10473
11779
|
const parsed = JSON.parse(raw);
|
|
10474
11780
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -10482,8 +11788,8 @@ var LineageRecorder = class {
|
|
|
10482
11788
|
const prev = this.nodes.get(node.id);
|
|
10483
11789
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
10484
11790
|
try {
|
|
10485
|
-
if (
|
|
10486
|
-
const head =
|
|
11791
|
+
if (existsSync9(this.path)) {
|
|
11792
|
+
const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
10487
11793
|
if (head === "[") {
|
|
10488
11794
|
writeFileSync(this.path, "");
|
|
10489
11795
|
}
|
|
@@ -10549,9 +11855,9 @@ var CostLedger = class {
|
|
|
10549
11855
|
mutex = new Mutex();
|
|
10550
11856
|
constructor(path) {
|
|
10551
11857
|
this.path = path;
|
|
10552
|
-
if (
|
|
11858
|
+
if (existsSync9(path)) {
|
|
10553
11859
|
try {
|
|
10554
|
-
const loaded = JSON.parse(
|
|
11860
|
+
const loaded = JSON.parse(readFileSync8(path, "utf-8"));
|
|
10555
11861
|
for (const k of Object.keys(this.totals)) {
|
|
10556
11862
|
if (k === "byGeneration") {
|
|
10557
11863
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -10975,9 +12281,9 @@ function passOrthogonality(input) {
|
|
|
10975
12281
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10976
12282
|
}
|
|
10977
12283
|
}
|
|
10978
|
-
const
|
|
12284
|
+
const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10979
12285
|
return {
|
|
10980
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
12286
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
|
|
10981
12287
|
passCount: passes.length,
|
|
10982
12288
|
similarities: sims
|
|
10983
12289
|
};
|
|
@@ -11023,8 +12329,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11023
12329
|
const iterations = options.iterations ?? 1e3;
|
|
11024
12330
|
const minTotal = options.minTotalSamples ?? 6;
|
|
11025
12331
|
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
11026
|
-
const baselineMean =
|
|
11027
|
-
const candidateMean =
|
|
12332
|
+
const baselineMean = mean8(baseline);
|
|
12333
|
+
const candidateMean = mean8(candidate);
|
|
11028
12334
|
const delta = candidateMean - baselineMean;
|
|
11029
12335
|
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
11030
12336
|
return {
|
|
@@ -11042,7 +12348,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11042
12348
|
for (let i = 0; i < iterations; i++) {
|
|
11043
12349
|
const bResample = resample(baseline, rng);
|
|
11044
12350
|
const cResample = resample(candidate, rng);
|
|
11045
|
-
deltas[i] =
|
|
12351
|
+
deltas[i] = mean8(cResample) - mean8(bResample);
|
|
11046
12352
|
}
|
|
11047
12353
|
deltas.sort((a, b) => a - b);
|
|
11048
12354
|
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
@@ -11065,7 +12371,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11065
12371
|
verdict
|
|
11066
12372
|
};
|
|
11067
12373
|
}
|
|
11068
|
-
function
|
|
12374
|
+
function mean8(xs) {
|
|
11069
12375
|
if (xs.length === 0) return 0;
|
|
11070
12376
|
let s = 0;
|
|
11071
12377
|
for (const x of xs) s += x;
|
|
@@ -11260,6 +12566,7 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
11260
12566
|
export {
|
|
11261
12567
|
AgentDriver,
|
|
11262
12568
|
AxGepaSteeringOptimizer,
|
|
12569
|
+
BENCHMARK_SPLIT_SEED,
|
|
11263
12570
|
BenchmarkRunner,
|
|
11264
12571
|
BudgetBreachError,
|
|
11265
12572
|
BudgetGuard,
|
|
@@ -11288,6 +12595,7 @@ export {
|
|
|
11288
12595
|
FileSystemExperimentStore,
|
|
11289
12596
|
FileSystemOutcomeStore,
|
|
11290
12597
|
FileSystemTraceStore,
|
|
12598
|
+
HeldOutGate,
|
|
11291
12599
|
HoldoutAuditor,
|
|
11292
12600
|
HoldoutLockedError,
|
|
11293
12601
|
INTENT_MATCH_JUDGE_VERSION,
|
|
@@ -11307,6 +12615,7 @@ export {
|
|
|
11307
12615
|
MultiLayerVerifier,
|
|
11308
12616
|
MutationTelemetry,
|
|
11309
12617
|
Mutex,
|
|
12618
|
+
NoopResearcher,
|
|
11310
12619
|
OTEL_AGENT_EVAL_SCOPE,
|
|
11311
12620
|
OptimizationLoop,
|
|
11312
12621
|
PairwiseSteeringOptimizer,
|
|
@@ -11317,6 +12626,7 @@ export {
|
|
|
11317
12626
|
PromptRegistry,
|
|
11318
12627
|
REDACTION_VERSION,
|
|
11319
12628
|
RunCritic,
|
|
12629
|
+
RunRecordValidationError,
|
|
11320
12630
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
11321
12631
|
SandboxHarness,
|
|
11322
12632
|
ScenarioRegistry,
|
|
@@ -11333,7 +12643,10 @@ export {
|
|
|
11333
12643
|
analyzeSeries,
|
|
11334
12644
|
argHash,
|
|
11335
12645
|
attributeCounterfactuals,
|
|
12646
|
+
deterministicSplit as benchmarkDeterministicSplit,
|
|
12647
|
+
benchmarks_exports as benchmarks,
|
|
11336
12648
|
benjaminiHochberg,
|
|
12649
|
+
bhAdjust,
|
|
11337
12650
|
bisect,
|
|
11338
12651
|
bonferroni,
|
|
11339
12652
|
bootstrapCi,
|
|
@@ -11413,6 +12726,7 @@ export {
|
|
|
11413
12726
|
formatBenchmarkReport,
|
|
11414
12727
|
formatDriverReport,
|
|
11415
12728
|
formatFindings,
|
|
12729
|
+
gainHistogram,
|
|
11416
12730
|
precision as goldenPrecision,
|
|
11417
12731
|
gradeSemanticStatus,
|
|
11418
12732
|
groupBy,
|
|
@@ -11427,6 +12741,7 @@ export {
|
|
|
11427
12741
|
isLlmSpan,
|
|
11428
12742
|
isPrmVerdict,
|
|
11429
12743
|
isRetrievalSpan,
|
|
12744
|
+
isRunRecord,
|
|
11430
12745
|
isSandboxSpan,
|
|
11431
12746
|
isToolSpan,
|
|
11432
12747
|
jestTestParser,
|
|
@@ -11454,11 +12769,15 @@ export {
|
|
|
11454
12769
|
normalizeScores,
|
|
11455
12770
|
notBlocked,
|
|
11456
12771
|
outputLengthRubric,
|
|
12772
|
+
pairedBootstrap,
|
|
11457
12773
|
pairedTTest,
|
|
12774
|
+
pairedWilcoxon,
|
|
11458
12775
|
paraphraseRobustness,
|
|
12776
|
+
paretoChart,
|
|
11459
12777
|
paretoFrontier,
|
|
11460
12778
|
paretoFrontierWithCrowding,
|
|
11461
12779
|
parseReflectionResponse,
|
|
12780
|
+
parseRunRecordSafe,
|
|
11462
12781
|
partialCredit,
|
|
11463
12782
|
passOrthogonality,
|
|
11464
12783
|
pixelDeltaRatio,
|
|
@@ -11489,9 +12808,11 @@ export {
|
|
|
11489
12808
|
requiredSampleSize,
|
|
11490
12809
|
resetLockedAppendersForTesting,
|
|
11491
12810
|
resumeBuilderSession,
|
|
12811
|
+
roundTripRunRecord,
|
|
11492
12812
|
rowCount,
|
|
11493
12813
|
rowWhere,
|
|
11494
12814
|
runAssertions,
|
|
12815
|
+
runCanaries,
|
|
11495
12816
|
runCounterfactual,
|
|
11496
12817
|
runE2EWorkflow,
|
|
11497
12818
|
runExpectations,
|
|
@@ -11526,6 +12847,7 @@ export {
|
|
|
11526
12847
|
stuckLoopView,
|
|
11527
12848
|
summarize,
|
|
11528
12849
|
summarizeHarnessResults,
|
|
12850
|
+
summaryTable,
|
|
11529
12851
|
testJudge,
|
|
11530
12852
|
textInSnapshot,
|
|
11531
12853
|
toLangfuseEnvelope,
|
|
@@ -11539,6 +12861,7 @@ export {
|
|
|
11539
12861
|
toolWasteView,
|
|
11540
12862
|
typoMutator,
|
|
11541
12863
|
urlContains,
|
|
12864
|
+
validateRunRecord,
|
|
11542
12865
|
verbosityBias,
|
|
11543
12866
|
verifyManifest,
|
|
11544
12867
|
visualDiff,
|
|
@@ -11548,6 +12871,7 @@ export {
|
|
|
11548
12871
|
weightedRecall,
|
|
11549
12872
|
welchsTTest,
|
|
11550
12873
|
whitespaceCollapseMutator,
|
|
11551
|
-
wilcoxonSignedRank
|
|
12874
|
+
wilcoxonSignedRank,
|
|
12875
|
+
wranglerDeployRunner
|
|
11552
12876
|
};
|
|
11553
12877
|
//# sourceMappingURL=index.js.map
|