@tangle-network/agent-eval 0.14.2 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -0
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-PZ5AY32C.js.map +1 -0
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +963 -4
- package/dist/index.js +1457 -138
- package/dist/index.js.map +1 -1
- package/dist/telemetry/file.js +2 -0
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +2 -0
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -6,6 +6,9 @@ import {
|
|
|
6
6
|
probeLlm,
|
|
7
7
|
stripFencedJson
|
|
8
8
|
} from "./chunk-ITN4YOZY.js";
|
|
9
|
+
import {
|
|
10
|
+
__export
|
|
11
|
+
} from "./chunk-PZ5AY32C.js";
|
|
9
12
|
|
|
10
13
|
// src/client.ts
|
|
11
14
|
var ProductClient = class {
|
|
@@ -265,12 +268,7 @@ ${codeText}`
|
|
|
265
268
|
};
|
|
266
269
|
var coherenceJudge = async (tc, { scenario, turns }) => {
|
|
267
270
|
if (turns.length < 2) {
|
|
268
|
-
return [
|
|
269
|
-
judgeName: "coherence",
|
|
270
|
-
dimension: "coherence",
|
|
271
|
-
score: 5,
|
|
272
|
-
reasoning: "Single-turn scenario \u2014 coherence not fully testable."
|
|
273
|
-
}];
|
|
271
|
+
return [];
|
|
274
272
|
}
|
|
275
273
|
const conversation = turns.map(
|
|
276
274
|
(t, i) => `Turn ${i + 1}:
|
|
@@ -396,36 +394,36 @@ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
|
|
|
396
394
|
"false_confidence",
|
|
397
395
|
"worst_failure"
|
|
398
396
|
]);
|
|
399
|
-
function normalizeScores(
|
|
400
|
-
return
|
|
397
|
+
/**
 * Returns the judge scores in the form used for aggregation.
 *
 * Every entry is passed through untouched, so the result is a shallow copy of
 * the input array that holds the very same score objects in the same order.
 * Entries whose dimension is listed in INVERTED_DIMENSIONS are NOT rescaled
 * here; any inversion has to be applied by the consumer.
 *
 * @param {Array<object>} scores2 - Judge score records.
 * @returns {Array<object>} New array containing the same records.
 */
function normalizeScores(scores2) {
  // The former INVERTED_DIMENSIONS branch returned the entry unchanged on both
  // arms, so the lookup was dead work and has been dropped.
  return scores2.map((entry) => entry);
}
|
|
407
|
-
function weightedMean(
|
|
408
|
-
if (
|
|
405
|
+
/**
 * Weighted arithmetic mean of a list of scores.
 *
 * An entry whose `weight` is null or undefined counts with weight 1.
 *
 * @param {Array<{score: number, weight?: number}>} scores2 - Scored entries.
 * @returns {number} The weighted mean, or 0 when the list is empty or the
 *   accumulated weight is not greater than zero.
 */
function weightedMean(scores2) {
  const count = scores2.length;
  if (count === 0) {
    return 0;
  }
  let numerator = 0;
  let denominator = 0;
  for (let idx = 0; idx < count; idx += 1) {
    const entry = scores2[idx];
    const factor = entry.weight ?? 1;
    numerator += entry.score * factor;
    denominator += factor;
  }
  if (denominator > 0) {
    return numerator / denominator;
  }
  return 0;
}
|
|
418
|
-
function confidenceInterval(
|
|
419
|
-
if (
|
|
420
|
-
if (
|
|
421
|
-
const n =
|
|
422
|
-
const
|
|
416
|
+
function confidenceInterval(scores2, confidence = 0.95) {
|
|
417
|
+
if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
418
|
+
if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
|
|
419
|
+
const n = scores2.length;
|
|
420
|
+
const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
|
|
423
421
|
const B = 1e3;
|
|
424
422
|
const bootstrapMeans = [];
|
|
425
423
|
for (let i = 0; i < B; i++) {
|
|
426
424
|
let sum2 = 0;
|
|
427
425
|
for (let j = 0; j < n; j++) {
|
|
428
|
-
sum2 +=
|
|
426
|
+
sum2 += scores2[Math.floor(Math.random() * n)];
|
|
429
427
|
}
|
|
430
428
|
bootstrapMeans.push(sum2 / n);
|
|
431
429
|
}
|
|
@@ -434,7 +432,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
434
432
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
435
433
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
436
434
|
return {
|
|
437
|
-
mean:
|
|
435
|
+
mean: mean9,
|
|
438
436
|
lower: bootstrapMeans[lowerIdx],
|
|
439
437
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
440
438
|
};
|
|
@@ -522,11 +520,11 @@ function pairedTTest(before, after) {
|
|
|
522
520
|
const n = before.length;
|
|
523
521
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
524
522
|
const diffs = before.map((b, i) => after[i] - b);
|
|
525
|
-
const
|
|
526
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
523
|
+
const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
524
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
|
|
527
525
|
const se = Math.sqrt(variance2 / n);
|
|
528
|
-
if (se === 0) return { t:
|
|
529
|
-
const t =
|
|
526
|
+
if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
|
|
527
|
+
const t = mean9 / se;
|
|
530
528
|
const df = n - 1;
|
|
531
529
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
532
530
|
return { t, df, p };
|
|
@@ -544,15 +542,15 @@ function wilcoxonSignedRank(before, after) {
|
|
|
544
542
|
while (i < n) {
|
|
545
543
|
let j = i;
|
|
546
544
|
while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
|
|
547
|
-
const
|
|
548
|
-
for (let k = i; k < j; k++) ranks3[absRanks[k].i] =
|
|
545
|
+
const avg2 = (i + 1 + j) / 2;
|
|
546
|
+
for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg2;
|
|
549
547
|
i = j;
|
|
550
548
|
}
|
|
551
549
|
let wPlus = 0;
|
|
552
550
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
553
|
-
const
|
|
551
|
+
const mean9 = n * (n + 1) / 4;
|
|
554
552
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
555
|
-
const z = (wPlus -
|
|
553
|
+
const z = (wPlus - mean9) / Math.sqrt(variance2);
|
|
556
554
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
557
555
|
return { w: wPlus, p };
|
|
558
556
|
}
|
|
@@ -753,8 +751,8 @@ async function executeScenario(tc, scenario, config) {
|
|
|
753
751
|
console.log(` judge retry ${attempt}/2 (waiting ${wait / 1e3}s)`);
|
|
754
752
|
await new Promise((r) => setTimeout(r, wait));
|
|
755
753
|
}
|
|
756
|
-
const
|
|
757
|
-
judgeResults.push(
|
|
754
|
+
const scores2 = await judge(tc, judgeInput);
|
|
755
|
+
judgeResults.push(scores2);
|
|
758
756
|
await new Promise((r) => setTimeout(r, 3e3));
|
|
759
757
|
break;
|
|
760
758
|
} catch (err) {
|
|
@@ -847,8 +845,8 @@ var BenchmarkRunner = class {
|
|
|
847
845
|
byJudge[js.judgeName].dimensions.push(`${js.dimension}=${js.score}`);
|
|
848
846
|
}
|
|
849
847
|
for (const [name, data] of Object.entries(byJudge)) {
|
|
850
|
-
const
|
|
851
|
-
console.log(` ${name.padEnd(16)} avg=${
|
|
848
|
+
const avg2 = (data.scores.reduce((a, b) => a + b, 0) / data.scores.length).toFixed(1);
|
|
849
|
+
console.log(` ${name.padEnd(16)} avg=${avg2} [${data.dimensions.join(", ")}]`);
|
|
852
850
|
}
|
|
853
851
|
console.log(` OVERALL: ${result.overallScore.toFixed(1)}/10 (${(result.totalDurationMs / 1e3).toFixed(0)}s)`);
|
|
854
852
|
console.log();
|
|
@@ -2270,7 +2268,7 @@ var PromptOptimizer = class {
|
|
|
2270
2268
|
});
|
|
2271
2269
|
}
|
|
2272
2270
|
}
|
|
2273
|
-
const
|
|
2271
|
+
const scores2 = config.variants.map((variant) => {
|
|
2274
2272
|
const scenarioMap = rawScores.get(variant.id);
|
|
2275
2273
|
const allSamples = [];
|
|
2276
2274
|
const perScenario = {};
|
|
@@ -2293,10 +2291,10 @@ var PromptOptimizer = class {
|
|
|
2293
2291
|
};
|
|
2294
2292
|
});
|
|
2295
2293
|
const rawPairs = [];
|
|
2296
|
-
for (let i = 0; i <
|
|
2297
|
-
for (let j = i + 1; j <
|
|
2298
|
-
const a =
|
|
2299
|
-
const b =
|
|
2294
|
+
for (let i = 0; i < scores2.length; i++) {
|
|
2295
|
+
for (let j = i + 1; j < scores2.length; j++) {
|
|
2296
|
+
const a = scores2[i];
|
|
2297
|
+
const b = scores2[j];
|
|
2300
2298
|
const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
|
|
2301
2299
|
rawPairs.push({ a, b, p });
|
|
2302
2300
|
}
|
|
@@ -2310,7 +2308,7 @@ var PromptOptimizer = class {
|
|
|
2310
2308
|
significant: qValues[idx] < alpha,
|
|
2311
2309
|
meanDelta: r.b.mean - r.a.mean
|
|
2312
2310
|
}));
|
|
2313
|
-
const sorted =
|
|
2311
|
+
const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
|
|
2314
2312
|
const winner = sorted[0];
|
|
2315
2313
|
const second = sorted[1];
|
|
2316
2314
|
const winnerComparisons = pairwise2.filter(
|
|
@@ -2324,7 +2322,7 @@ var PromptOptimizer = class {
|
|
|
2324
2322
|
significant: significantOverAll,
|
|
2325
2323
|
ciLowerBoundExceedsSecondMean
|
|
2326
2324
|
},
|
|
2327
|
-
scores,
|
|
2325
|
+
scores: scores2,
|
|
2328
2326
|
pairwise: pairwise2,
|
|
2329
2327
|
config: {
|
|
2330
2328
|
trialsPerScenario: trials,
|
|
@@ -2870,20 +2868,20 @@ async function mapLimit(items, limit, fn) {
|
|
|
2870
2868
|
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The average, or 0 for an empty array.
 */
function mean(values) {
  const count = values.length;
  if (!count) {
    return 0;
  }
  const total = values.reduce((running, item) => running + item, 0);
  return total / count;
}
|
|
2873
|
-
function meanRunScore(
|
|
2871
|
+
/**
 * Collapses several run scores into one by averaging every numeric field with
 * `mean` and concatenating the `notes` arrays.
 *
 * @param {Array<object>} scores2 - Per-run score records.
 * @returns {object} Record with the averaged numeric fields followed by
 *   `notes`; runs without notes contribute nothing to `notes`.
 */
function meanRunScore(scores2) {
  // Listed in the order the keys must appear on the result object.
  const averagedFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const combined = {};
  for (const field of averagedFields) {
    combined[field] = mean(scores2.map((run) => run[field]));
  }
  combined.notes = scores2.flatMap((run) => run.notes ?? []);
  return combined;
}
|
|
2889
2887
|
|
|
@@ -3339,12 +3337,12 @@ var SubprocessSandboxDriver = class {
|
|
|
3339
3337
|
this.defaultEnv = options.env;
|
|
3340
3338
|
}
|
|
3341
3339
|
async exec(phase, command, config) {
|
|
3342
|
-
const { spawn } = await import("child_process");
|
|
3340
|
+
const { spawn: spawn2 } = await import("child_process");
|
|
3343
3341
|
const start = Date.now();
|
|
3344
3342
|
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3345
3343
|
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
3346
3344
|
return await new Promise((resolve) => {
|
|
3347
|
-
const child =
|
|
3345
|
+
const child = spawn2(command, {
|
|
3348
3346
|
shell: true,
|
|
3349
3347
|
cwd: effectiveCwd,
|
|
3350
3348
|
env: effectiveEnv
|
|
@@ -5392,10 +5390,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5392
5390
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5393
5391
|
}
|
|
5394
5392
|
const tail = values.slice(-window);
|
|
5395
|
-
const
|
|
5396
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5393
|
+
const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5394
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
|
|
5397
5395
|
const stdDev = Math.sqrt(variance2);
|
|
5398
|
-
const refMean = Math.abs(
|
|
5396
|
+
const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
|
|
5399
5397
|
const cv = stdDev / refMean;
|
|
5400
5398
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5401
5399
|
let tailRun = 0;
|
|
@@ -5416,7 +5414,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5416
5414
|
} else {
|
|
5417
5415
|
state = "noisy";
|
|
5418
5416
|
}
|
|
5419
|
-
return { state, windowMean:
|
|
5417
|
+
return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
|
|
5420
5418
|
}
|
|
5421
5419
|
|
|
5422
5420
|
// src/state-continuity.ts
|
|
@@ -6012,9 +6010,9 @@ function calibrateJudge(golden, candidate) {
|
|
|
6012
6010
|
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
6013
6011
|
return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
|
|
6014
6012
|
}
|
|
6015
|
-
function positionalBias(
|
|
6013
|
+
function positionalBias(scores2) {
|
|
6016
6014
|
const pairs = /* @__PURE__ */ new Map();
|
|
6017
|
-
for (const s of
|
|
6015
|
+
for (const s of scores2) {
|
|
6018
6016
|
const slot = pairs.get(s.itemId) ?? {};
|
|
6019
6017
|
if (s.positionOfAInput === "first") slot.first = s.score;
|
|
6020
6018
|
else if (s.positionOfAInput === "second") slot.second = s.score;
|
|
@@ -6165,12 +6163,12 @@ function renderMarkdownReport(reports) {
|
|
|
6165
6163
|
async function aggregateRunMetrics(runs, store) {
|
|
6166
6164
|
if (runs.length === 0) return {};
|
|
6167
6165
|
const durations = [];
|
|
6168
|
-
const
|
|
6166
|
+
const scores2 = [];
|
|
6169
6167
|
const passes = [];
|
|
6170
6168
|
const costs = [];
|
|
6171
6169
|
for (const r of runs) {
|
|
6172
6170
|
if (r.endedAt) durations.push(r.endedAt - r.startedAt);
|
|
6173
|
-
if (r.outcome?.score !== void 0)
|
|
6171
|
+
if (r.outcome?.score !== void 0) scores2.push(r.outcome.score);
|
|
6174
6172
|
passes.push(r.outcome?.pass === true ? 1 : 0);
|
|
6175
6173
|
const llm = await llmSpans(store, r.runId);
|
|
6176
6174
|
costs.push(aggregateLlm(llm).costUsd);
|
|
@@ -6179,7 +6177,7 @@ async function aggregateRunMetrics(runs, store) {
|
|
|
6179
6177
|
provisionMs: average(durations),
|
|
6180
6178
|
firstTokenMs: average(durations),
|
|
6181
6179
|
wallMs: average(durations),
|
|
6182
|
-
overallScore: average(
|
|
6180
|
+
overallScore: average(scores2),
|
|
6183
6181
|
passRate: average(passes),
|
|
6184
6182
|
costUsd: average(costs)
|
|
6185
6183
|
};
|
|
@@ -6242,7 +6240,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
6242
6240
|
},
|
|
6243
6241
|
metadata: { finishReason: s.finishReason, cachedTokens: s.cachedTokens }
|
|
6244
6242
|
}));
|
|
6245
|
-
const
|
|
6243
|
+
const scores2 = judges.map((j) => ({
|
|
6246
6244
|
id: j.spanId,
|
|
6247
6245
|
traceId: run.runId,
|
|
6248
6246
|
observationId: j.targetSpanId,
|
|
@@ -6250,7 +6248,7 @@ async function toLangfuseEnvelope(store, runId) {
|
|
|
6250
6248
|
value: j.score,
|
|
6251
6249
|
comment: j.rationale
|
|
6252
6250
|
}));
|
|
6253
|
-
return { traceId: run.runId, generations, scores };
|
|
6251
|
+
return { traceId: run.runId, generations, scores: scores2 };
|
|
6254
6252
|
}
|
|
6255
6253
|
async function toPrometheusText(store) {
|
|
6256
6254
|
const runs = await store.listRuns();
|
|
@@ -6344,12 +6342,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6344
6342
|
variantScores.push({ mutator: id, score, mutated });
|
|
6345
6343
|
all.push(score);
|
|
6346
6344
|
}
|
|
6347
|
-
const
|
|
6348
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6345
|
+
const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6346
|
+
const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
|
|
6349
6347
|
const stdDev = Math.sqrt(variance2);
|
|
6350
|
-
const ref = Math.abs(
|
|
6348
|
+
const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
|
|
6351
6349
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6352
|
-
return { originalScore, variantScores, meanScore:
|
|
6350
|
+
return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
|
|
6353
6351
|
}
|
|
6354
6352
|
/** Prompt mutator that lower-cases the whole prompt; any extra arguments are ignored. */
var lowercaseMutator = (p) => {
  return p.toLowerCase();
};
|
|
6355
6353
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6684,8 +6682,8 @@ function ranks(xs) {
|
|
|
6684
6682
|
for (let i = 0; i < indexed.length; i++) {
|
|
6685
6683
|
let j = i;
|
|
6686
6684
|
while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
|
|
6687
|
-
const
|
|
6688
|
-
for (let k = i; k <= j; k++) r[indexed[k].i] =
|
|
6685
|
+
const avg2 = (i + j + 2) / 2;
|
|
6686
|
+
for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
|
|
6689
6687
|
i = j;
|
|
6690
6688
|
}
|
|
6691
6689
|
return r;
|
|
@@ -6929,8 +6927,8 @@ function ranks2(xs) {
|
|
|
6929
6927
|
for (let i = 0; i < indexed.length; i++) {
|
|
6930
6928
|
let j = i;
|
|
6931
6929
|
while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
|
|
6932
|
-
const
|
|
6933
|
-
for (let k = i; k <= j; k++) r[indexed[k].i] =
|
|
6930
|
+
const avg2 = (i + j + 2) / 2;
|
|
6931
|
+
for (let k = i; k <= j; k++) r[indexed[k].i] = avg2;
|
|
6934
6932
|
i = j;
|
|
6935
6933
|
}
|
|
6936
6934
|
return r;
|
|
@@ -7270,8 +7268,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
7270
7268
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
7271
7269
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
7272
7270
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
7273
|
-
const
|
|
7274
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
7271
|
+
const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
7272
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
|
|
7275
7273
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7276
7274
|
}
|
|
7277
7275
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -7293,8 +7291,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
7293
7291
|
const ranked = [...byRun.values()].sort(
|
|
7294
7292
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
7295
7293
|
);
|
|
7296
|
-
const
|
|
7297
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
7294
|
+
const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7295
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
|
|
7298
7296
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7299
7297
|
}
|
|
7300
7298
|
|
|
@@ -7672,15 +7670,15 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7672
7670
|
const rejected = [];
|
|
7673
7671
|
const surviving = [];
|
|
7674
7672
|
for (const candidate of proposed) {
|
|
7675
|
-
const
|
|
7676
|
-
if (
|
|
7673
|
+
const scores2 = await scorer.scoreCandidate(candidate, targets);
|
|
7674
|
+
if (scores2.length < 2) {
|
|
7677
7675
|
rejected.push({ candidate, reason: "scorer returned <2 results" });
|
|
7678
7676
|
continue;
|
|
7679
7677
|
}
|
|
7680
|
-
const values =
|
|
7678
|
+
const values = scores2.map((s) => s.score);
|
|
7681
7679
|
const spread = Math.max(...values) - Math.min(...values);
|
|
7682
7680
|
const maxScore = Math.max(...values);
|
|
7683
|
-
scored.push({ candidate, scores, spread });
|
|
7681
|
+
scored.push({ candidate, scores: scores2, spread });
|
|
7684
7682
|
if (maxScore < floor) {
|
|
7685
7683
|
rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
|
|
7686
7684
|
continue;
|
|
@@ -7822,10 +7820,10 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7822
7820
|
}
|
|
7823
7821
|
for (const s of scenarios) {
|
|
7824
7822
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7825
|
-
const
|
|
7826
|
-
if (
|
|
7827
|
-
const
|
|
7828
|
-
const variance2 =
|
|
7823
|
+
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7824
|
+
if (scores2.length < 3) continue;
|
|
7825
|
+
const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
7826
|
+
const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
|
|
7829
7827
|
if (variance2 > varianceThreshold) {
|
|
7830
7828
|
targets.push({
|
|
7831
7829
|
reason: "high-variance",
|
|
@@ -8580,20 +8578,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8580
8578
|
let durationMs = 0;
|
|
8581
8579
|
const reasonParts = [];
|
|
8582
8580
|
const diagnostics = {};
|
|
8583
|
-
for (const { adapter, result } of perAdapter) {
|
|
8581
|
+
for (const { adapter: adapter4, result } of perAdapter) {
|
|
8584
8582
|
status = worst(status, result.status);
|
|
8585
8583
|
if (typeof result.score === "number") {
|
|
8586
8584
|
weightedScoreSum += result.score;
|
|
8587
8585
|
weightCount += 1;
|
|
8588
8586
|
}
|
|
8589
8587
|
durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
|
|
8590
|
-
reasonParts.push(`${
|
|
8588
|
+
reasonParts.push(`${adapter4}: ${result.status}`);
|
|
8591
8589
|
for (const f of result.findings) {
|
|
8592
8590
|
findings.push({
|
|
8593
8591
|
...f,
|
|
8594
8592
|
layer: name,
|
|
8595
|
-
message: prefix ? `${prefix(
|
|
8596
|
-
detail: { ...f.detail ?? {}, adapter }
|
|
8593
|
+
message: prefix ? `${prefix(adapter4)} ${f.message}` : f.message,
|
|
8594
|
+
detail: { ...f.detail ?? {}, adapter: adapter4 }
|
|
8597
8595
|
});
|
|
8598
8596
|
}
|
|
8599
8597
|
for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
|
|
@@ -8612,8 +8610,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
8612
8610
|
reason: reasonParts.join(" \xB7 "),
|
|
8613
8611
|
diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
|
|
8614
8612
|
detail: {
|
|
8615
|
-
adapters: perAdapter.map(({ adapter, result }) => ({
|
|
8616
|
-
adapter,
|
|
8613
|
+
adapters: perAdapter.map(({ adapter: adapter4, result }) => ({
|
|
8614
|
+
adapter: adapter4,
|
|
8617
8615
|
status: result.status,
|
|
8618
8616
|
score: result.score ?? null
|
|
8619
8617
|
})),
|
|
@@ -8639,10 +8637,10 @@ function multiToolchainLayer(config) {
|
|
|
8639
8637
|
reason: "no adapters detected"
|
|
8640
8638
|
};
|
|
8641
8639
|
}
|
|
8642
|
-
const runOne = async (
|
|
8643
|
-
const adapterName = config.adapterName(
|
|
8640
|
+
const runOne = async (adapter4) => {
|
|
8641
|
+
const adapterName = config.adapterName(adapter4);
|
|
8644
8642
|
try {
|
|
8645
|
-
const r = await config.run(
|
|
8643
|
+
const r = await config.run(adapter4, ctx);
|
|
8646
8644
|
return { adapter: adapterName, result: r };
|
|
8647
8645
|
} catch (err) {
|
|
8648
8646
|
return {
|
|
@@ -9345,6 +9343,57 @@ function viteDeployRunner(input) {
|
|
|
9345
9343
|
}
|
|
9346
9344
|
};
|
|
9347
9345
|
}
|
|
9346
|
+
/**
 * Creates a deploy runner that validates a Cloudflare Workers project without
 * publishing it: it runs the build command and then a wrangler dry-run deploy.
 *
 * @param {object} input
 * @param {(path: string) => Promise<boolean>} input.exists - Reports whether a
 *   file exists; called with bare config file names such as "wrangler.toml".
 * @param {(command: string, options: {cwd: *, timeoutMs: number}) => Promise<{exitCode: number, stdout?: string, stderr?: string}>} input.exec
 *   Executes a shell command and resolves with its exit code and output.
 * @param {*} [input.workdir] - Forwarded to `exec` as `cwd`.
 * @param {string} [input.buildCommand="npm run build"]
 * @param {string} [input.dryRunCommand="npx wrangler deploy --dry-run --outdir dist"]
 * @param {number} [input.timeoutMs=120000] - Timeout handed to each `exec` call.
 * @returns {{run: () => Promise<{ok: boolean, output: string, durationMs: number, artifactDir: string, artifactValid: boolean}>}}
 *   `run` never publishes; `artifactValid` is true only when both commands
 *   exit with code 0.
 */
function wranglerDeployRunner(input) {
  // Wrangler accepts TOML, JSONC and plain JSON configuration files.
  const configFiles = ["wrangler.toml", "wrangler.jsonc", "wrangler.json"];
  const artifactDir = "dist";
  const tailChars = 1500;
  // Last `tailChars` characters of the preferred stream, falling back to the other one.
  const tailOf = (preferred, fallback) => ((preferred || fallback) ?? "").slice(-tailChars);
  return {
    run: async () => {
      const start = Date.now();
      const buildCmd = input.buildCommand ?? "npm run build";
      const dryCmd = input.dryRunCommand ?? "npx wrangler deploy --dry-run --outdir dist";
      const timeoutMs = input.timeoutMs ?? 12e4;
      // Every outcome shares this shape; the artifact is valid exactly when the run succeeded.
      const finish = (ok, output) => ({
        ok,
        output,
        durationMs: Date.now() - start,
        artifactDir,
        artifactValid: ok
      });
      // Probe sequentially and stop at the first config file that exists.
      let hasConfig = false;
      for (const file of configFiles) {
        if (await input.exists(file)) {
          hasConfig = true;
          break;
        }
      }
      if (!hasConfig) {
        return finish(
          false,
          "no wrangler config found (wrangler.toml / wrangler.jsonc / wrangler.json absent)"
        );
      }
      const build = await input.exec(buildCmd, { cwd: input.workdir, timeoutMs });
      if (build.exitCode !== 0) {
        return finish(false, `build failed: ${tailOf(build.stderr, build.stdout)}`);
      }
      const dry = await input.exec(dryCmd, { cwd: input.workdir, timeoutMs });
      if (dry.exitCode !== 0) {
        return finish(false, `wrangler dry-run failed: ${tailOf(dry.stderr, dry.stdout)}`);
      }
      // On success stdout is the interesting stream, so it is preferred over stderr.
      return finish(true, tailOf(dry.stdout, dry.stderr));
    }
  };
}
|
|
9348
9397
|
|
|
9349
9398
|
// src/keyword-coverage-judge.ts
|
|
9350
9399
|
function htmlContainsElement(html, selector) {
|
|
@@ -9712,15 +9761,15 @@ function scoreReferenceReplay(scenarios, options = {}) {
|
|
|
9712
9761
|
const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
|
|
9713
9762
|
const matchStrategy = options.matchStrategy ?? "reference-order";
|
|
9714
9763
|
const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
|
|
9715
|
-
const
|
|
9764
|
+
const scores2 = scenarios.filter((scenario) => {
|
|
9716
9765
|
const split = scenario.split ?? "train";
|
|
9717
9766
|
if (split === "holdout" && !options.includeHoldout) return false;
|
|
9718
9767
|
return allowedSplits.has(split);
|
|
9719
9768
|
}).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
|
|
9720
9769
|
return {
|
|
9721
|
-
scenarios:
|
|
9722
|
-
aggregate: aggregateScenarioScores(
|
|
9723
|
-
bySplit: aggregateBySplit(
|
|
9770
|
+
scenarios: scores2,
|
|
9771
|
+
aggregate: aggregateScenarioScores(scores2),
|
|
9772
|
+
bySplit: aggregateBySplit(scores2)
|
|
9724
9773
|
};
|
|
9725
9774
|
}
|
|
9726
9775
|
function compareReferenceReplay(baseline, candidate) {
|
|
@@ -9935,20 +9984,20 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9935
9984
|
matches: matches2
|
|
9936
9985
|
};
|
|
9937
9986
|
}
|
|
9938
|
-
function aggregateBySplit(
|
|
9987
|
+
/**
 * Groups scenario scores by their `split` and aggregates each non-empty group
 * with `aggregateScenarioScores`.
 *
 * @param {Array<{split: string}>} scores2 - Per-scenario scores.
 * @returns {object} Map from split name to its aggregate; splits with no
 *   scenarios are omitted, and keys follow the order of ALL_SPLITS.
 */
function aggregateBySplit(scores2) {
  const grouped = {};
  for (const split of ALL_SPLITS) {
    const members = scores2.filter((entry) => entry.split === split);
    if (members.length === 0) {
      continue;
    }
    grouped[split] = aggregateScenarioScores(members);
  }
  return grouped;
}
|
|
9946
|
-
function aggregateScenarioScores(
|
|
9947
|
-
const matched = sum(
|
|
9948
|
-
const total = sum(
|
|
9949
|
-
const falsePositives = sum(
|
|
9950
|
-
const matchedWeight = sum(
|
|
9951
|
-
const totalWeight = sum(
|
|
9995
|
+
function aggregateScenarioScores(scores2) {
|
|
9996
|
+
const matched = sum(scores2.map((score) => score.matched));
|
|
9997
|
+
const total = sum(scores2.map((score) => score.total));
|
|
9998
|
+
const falsePositives = sum(scores2.map((score) => score.falsePositives));
|
|
9999
|
+
const matchedWeight = sum(scores2.map((score) => score.matchedWeight));
|
|
10000
|
+
const totalWeight = sum(scores2.map((score) => score.totalWeight));
|
|
9952
10001
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9953
10002
|
const recall = ratio(matched, total);
|
|
9954
10003
|
return {
|
|
@@ -10027,8 +10076,8 @@ function formatPct(value) {
|
|
|
10027
10076
|
/**
 * Sort comparator that orders split names by their position in ALL_SPLITS.
 *
 * @param {string} a - First split name.
 * @param {string} b - Second split name.
 * @returns {number} Negative when `a` comes first, positive when `b` does.
 */
function bySplitOrder(a, b) {
  const rankOfA = ALL_SPLITS.indexOf(a);
  const rankOfB = ALL_SPLITS.indexOf(b);
  return rankOfA - rankOfB;
}
|
|
10030
|
-
function runAdapter(
|
|
10031
|
-
return typeof
|
|
10079
|
+
/**
 * Invokes an adapter that is either a plain function or an object exposing a
 * `run` method, passing the scenario and context through unchanged.
 *
 * @param {Function|{run: Function}} adapter4 - Adapter to invoke.
 * @param {*} scenario - Scenario handed to the adapter.
 * @param {*} context - Context handed to the adapter.
 * @returns {*} Whatever the adapter returns.
 */
function runAdapter(adapter4, scenario, context) {
  if (typeof adapter4 === "function") {
    return adapter4(scenario, context);
  }
  return adapter4.run(scenario, context);
}
|
|
10033
10082
|
function throwIfAborted(signal) {
|
|
10034
10083
|
if (!signal?.aborted) return;
|
|
@@ -10066,6 +10115,1258 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
|
10066
10115
|
"which"
|
|
10067
10116
|
]);
|
|
10068
10117
|
|
|
10118
|
+
// src/paired-stats.ts
|
|
10119
|
+
/**
 * Percentile bootstrap confidence interval for the paired differences
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, index-aligned with `before`.
 * @param {object} [opts]
 * @param {number} [opts.confidence=0.95] - Interval coverage, strictly between 0 and 1.
 * @param {number} [opts.resamples=2000] - Number of bootstrap resamples; must be
 *   a positive integer whenever a bootstrap is actually run (n >= 2).
 * @param {string} [opts.statistic="median"] - "mean" bootstraps the mean; any
 *   other value bootstraps the median.
 * @param {number} [opts.seed] - Seed handed to `makeRng`.
 * @returns {{n: number, median: number, mean: number, low: number, high: number, confidence: number, resamples: number}}
 *   `median`/`mean` describe the observed differences; `low`/`high` are the
 *   percentile bounds of the bootstrapped statistic. With fewer than two pairs
 *   no resampling happens and the bounds collapse onto the single difference
 *   (or 0 when there are no pairs).
 * @throws {Error} On unequal sample sizes, a confidence outside (0,1)
 *   (including NaN), or an invalid `resamples` when resampling is required.
 */
function pairedBootstrap(before, after, opts = {}) {
  if (before.length !== after.length) {
    throw new Error(
      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.resamples ?? 2e3;
  const statistic = opts.statistic ?? "median";
  // Phrased as a positive range test so that NaN is rejected as well.
  if (!(confidence > 0 && confidence < 1)) {
    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
  }
  const n = before.length;
  const deltas = before.map((b, i) => after[i] - b);
  if (n === 0) {
    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
  }
  if (n === 1) {
    const d = deltas[0];
    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
  }
  // Checked only once resampling is unavoidable, so degenerate calls keep
  // returning exactly as before. Without it, zero/negative counts produce
  // undefined bounds and fractional counts fail inside `new Array`.
  if (!Number.isInteger(resamples) || resamples < 1) {
    throw new Error(`pairedBootstrap: resamples must be a positive integer, got ${resamples}`);
  }
  const rng = makeRng(opts.seed);
  const drawDelta = () => deltas[Math.floor(rng() * n)];
  const samples = new Array(resamples);
  for (let b = 0; b < resamples; b++) {
    if (statistic === "mean") {
      let total = 0;
      for (let k = 0; k < n; k++) {
        total += drawDelta();
      }
      samples[b] = total / n;
    } else {
      const resample = new Array(n);
      for (let k = 0; k < n; k++) {
        resample[k] = drawDelta();
      }
      samples[b] = medianInPlace(resample);
    }
  }
  samples.sort((a, b) => a - b);
  const alpha = 1 - confidence;
  const lowIdx = Math.floor(alpha / 2 * resamples);
  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return {
    n,
    // Copy first: medianInPlace sorts its argument.
    median: medianInPlace([...deltas]),
    mean: deltas.reduce((s, x) => s + x, 0) / n,
    low: samples[lowIdx],
    // Guards against the upper index falling below the lower one for tiny resample counts.
    high: samples[Math.max(highIdx, lowIdx)],
    confidence,
    resamples
  };
}
|
|
10172
|
+
/**
 * Paired-samples alias for `wilcoxonSignedRank`; both arrays are forwarded
 * unchanged and its result is returned as-is.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements.
 * @returns {*} The result of `wilcoxonSignedRank(before, after)`.
 */
function pairedWilcoxon(before, after) {
  const outcome = wilcoxonSignedRank(before, after);
  return outcome;
}
|
|
10175
|
+
/**
 * Alias for `benjaminiHochberg`; forwards the p-values and the false
 * discovery rate and returns that function's result as-is.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [fdr=0.05] - False discovery rate passed through.
 * @returns {*} The result of `benjaminiHochberg(pValues, fdr)`.
 */
function bhAdjust(pValues, fdr = 0.05) {
  const adjusted = benjaminiHochberg(pValues, fdr);
  return adjusted;
}
|
|
10178
|
+
/**
 * Median of a numeric array. The array is sorted ascending IN PLACE, so pass
 * a copy when the caller still needs the original order.
 *
 * @param {number[]} xs - Values; mutated by the sort.
 * @returns {number} The median (mean of the two middle values for an even
 *   count), or 0 for an empty array.
 */
function medianInPlace(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  xs.sort((left, right) => left - right);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) {
    return xs[upper];
  }
  return (xs[upper - 1] + xs[upper]) / 2;
}
|
|
10184
|
+
/**
 * Builds a random source returning floats in [0, 1).
 *
 * Without a seed the global `Math.random` is returned. With a seed the result
 * is a deterministic 32-bit generator (the arithmetic matches mulberry32), so
 * equal seeds yield equal sequences.
 *
 * @param {number} [seed] - Coerced to a 32-bit integer; a seed that coerces to
 *   0 is replaced by the constant 0x9E3779B9.
 * @returns {() => number} Generator function.
 */
function makeRng(seed) {
  if (seed === void 0) {
    return Math.random;
  }
  let state = (seed | 0) || 0x9e3779b9;
  return () => {
    // Advance the 32-bit state, then scramble a copy of it.
    state = (state + 0x6d2b79f5) | 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Unsigned 32-bit result scaled by 2^32.
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
10195
|
+
|
|
10196
|
+
// src/run-record.ts
|
|
10197
|
+
// Keys that must be present on every run record, in the order they are checked.
var MANDATORY_TOP_LEVEL = [
  // identity
  "runId", "experimentId", "candidateId",
  // reproducibility
  "seed", "model", "promptHash", "configHash", "commitSha",
  // cost and usage
  "wallMs", "costUsd", "tokenUsage",
  // result and dataset split
  "outcome", "splitTag"
];
// Dataset split tag values.
var SPLIT_TAGS = ["search", "dev", "holdout"];
|
|
10213
|
+
/**
 * Error raised when a run record fails validation.
 *
 * `path` names the offending field (empty string when the problem is not tied
 * to a field); when set it is also appended to the message as " (at <path>)".
 */
var RunRecordValidationError = class extends Error {
  path;
  /**
   * @param {string} message - Description of the validation failure.
   * @param {string} [path=""] - Field path the failure refers to.
   */
  constructor(message, path = "") {
    const fullMessage = path ? `${message} (at ${path})` : message;
    super(fullMessage);
    this.name = "RunRecordValidationError";
    this.path = path;
  }
};
|
|
10221
|
+
/**
 * Validate an untrusted value as a run record.
 *
 * Checks run in a fixed order and the first failure is thrown:
 *   1. input is a non-null object;
 *   2. every key in MANDATORY_TOP_LEVEL is present;
 *   3. scalar fields have the right type (non-empty strings / finite numbers);
 *   4. `model` carries a snapshot marker (see `modelHasSnapshot`);
 *   5. `tokenUsage`, optional `judgeMetadata`, and `outcome` have valid shapes;
 *   6. `splitTag` is one of SPLIT_TAGS.
 *
 * @param input Value to validate.
 * @returns The same `input` reference once it has passed every check.
 * @throws {RunRecordValidationError} On the first violated constraint.
 */
function validateRunRecord(input) {
  if (typeof input !== "object" || input === null) {
    throw new RunRecordValidationError("expected object");
  }
  const record = input;

  const absent = MANDATORY_TOP_LEVEL.find((field) => !(field in record));
  if (absent !== void 0) {
    throw new RunRecordValidationError(`missing mandatory field "${absent}"`);
  }

  // Scalar fields, validated in declaration order.
  for (const field of ["runId", "experimentId", "candidateId"]) {
    expectString(record[field], field);
  }
  expectFiniteNumber(record.seed, "seed");
  for (const field of ["model", "promptHash", "configHash", "commitSha"]) {
    expectString(record[field], field);
  }
  expectFiniteNumber(record.wallMs, "wallMs");
  if (record.queueMs !== void 0) expectFiniteNumber(record.queueMs, "queueMs");
  expectFiniteNumber(record.costUsd, "costUsd");

  if (!modelHasSnapshot(record.model)) {
    throw new RunRecordValidationError(
      `model "${record.model}" lacks a snapshot version (use 'name@YYYY-MM-DD' or 'name-YYYYMMDD')`,
      "model"
    );
  }

  // tokenUsage: { input, output, cached? }
  const usage = record.tokenUsage;
  if (usage === null || typeof usage !== "object") {
    throw new RunRecordValidationError("tokenUsage must be an object", "tokenUsage");
  }
  expectFiniteNumber(usage.input, "tokenUsage.input");
  expectFiniteNumber(usage.output, "tokenUsage.output");
  if (usage.cached !== void 0) expectFiniteNumber(usage.cached, "tokenUsage.cached");

  // judgeMetadata is optional, but must be complete when supplied.
  const judge = record.judgeMetadata;
  if (judge !== void 0) {
    if (judge === null || typeof judge !== "object") {
      throw new RunRecordValidationError("judgeMetadata must be an object", "judgeMetadata");
    }
    expectString(judge.model, "judgeMetadata.model");
    expectString(judge.promptVersion, "judgeMetadata.promptVersion");
    expectFiniteNumber(judge.confidence, "judgeMetadata.confidence");
    if (typeof judge.fallback !== "boolean") {
      throw new RunRecordValidationError("judgeMetadata.fallback must be boolean", "judgeMetadata.fallback");
    }
  }

  // outcome: at least one of searchScore / holdoutScore, plus a numeric `raw` map.
  const outcome = record.outcome;
  if (outcome === null || typeof outcome !== "object") {
    throw new RunRecordValidationError("outcome must be an object", "outcome");
  }
  const hasSearch = outcome.searchScore !== void 0;
  if (hasSearch) expectFiniteNumber(outcome.searchScore, "outcome.searchScore");
  const hasHoldout = outcome.holdoutScore !== void 0;
  if (hasHoldout) expectFiniteNumber(outcome.holdoutScore, "outcome.holdoutScore");
  if (!hasSearch && !hasHoldout) {
    throw new RunRecordValidationError(
      "outcome must define searchScore or holdoutScore (or both)",
      "outcome"
    );
  }
  const rawMetrics = outcome.raw;
  if (rawMetrics === null || typeof rawMetrics !== "object") {
    throw new RunRecordValidationError("outcome.raw must be an object", "outcome.raw");
  }
  for (const [metric, value] of Object.entries(rawMetrics)) {
    expectFiniteNumber(value, `outcome.raw.${metric}`);
  }

  if (record.failureMode !== void 0) expectString(record.failureMode, "failureMode");

  const tag = record.splitTag;
  const tagOk = typeof tag === "string" && SPLIT_TAGS.includes(tag);
  if (!tagOk) {
    throw new RunRecordValidationError(
      `splitTag must be one of ${SPLIT_TAGS.join(", ")}, got ${String(tag)}`,
      "splitTag"
    );
  }
  return input;
}
|
|
10298
|
+
/**
 * Boolean form of `validateRunRecord`.
 *
 * @param input Value to test.
 * @returns `true` when `validateRunRecord(input)` completes, `false` when it
 *   throws anything at all (every exception is treated as "not a run record").
 */
function isRunRecord(input) {
  let valid = true;
  try {
    validateRunRecord(input);
  } catch {
    valid = false;
  }
  return valid;
}
|
|
10306
|
+
/**
 * Non-throwing wrapper around `validateRunRecord`.
 *
 * @param input Value to validate.
 * @returns `{ ok: true, value }` on success, or `{ ok: false, error }` when
 *   validation fails with a `RunRecordValidationError`.
 * @throws Any other exception type is re-thrown unchanged.
 */
function parseRunRecordSafe(input) {
  let value;
  try {
    value = validateRunRecord(input);
  } catch (err) {
    if (!(err instanceof RunRecordValidationError)) throw err;
    return { ok: false, error: err };
  }
  return { ok: true, value };
}
|
|
10314
|
+
/**
 * Serialise a record to JSON, parse it back, and validate the parsed copy.
 *
 * @param record Record to round-trip.
 * @returns The parsed copy, as returned by `validateRunRecord`.
 * @throws {RunRecordValidationError} If the parsed copy fails validation.
 */
function roundTripRunRecord(record) {
  const revived = JSON.parse(JSON.stringify(record));
  return validateRunRecord(revived);
}
|
|
10318
|
+
/**
 * Assert that `value` is a non-empty string.
 *
 * @param value Value to check.
 * @param path Field location attached to the error.
 * @throws {RunRecordValidationError} When `value` is not a string or is "".
 */
function expectString(value, path) {
  const ok = typeof value === "string" && value.length > 0;
  if (ok) return;
  throw new RunRecordValidationError(`expected non-empty string`, path);
}
|
|
10323
|
+
/**
 * Assert that `value` is a finite number (rejects NaN, ±Infinity, non-numbers).
 *
 * @param value Value to check.
 * @param path Field location attached to the error.
 * @throws {RunRecordValidationError} When the check fails.
 */
function expectFiniteNumber(value, path) {
  const ok = typeof value === "number" && Number.isFinite(value);
  if (ok) return;
  throw new RunRecordValidationError(`expected finite number`, path);
}
|
|
10328
|
+
/**
 * Decide whether a model identifier carries a snapshot marker.
 *
 * Accepted forms: any "@" in the string, a trailing "-DDDDDDDD" (8 digits),
 * a trailing "-DDDD-DD-DD", or the substring ":date-".
 *
 * @param model Model identifier string.
 * @returns `true` when one of the markers is present.
 */
function modelHasSnapshot(model) {
  if (model.includes("@")) return true;
  const markers = [/-\d{8}$/, /-\d{4}-\d{2}-\d{2}$/, /:date-/];
  return markers.some((pattern) => pattern.test(model));
}
|
|
10335
|
+
|
|
10336
|
+
// src/held-out-gate.ts
|
|
10337
|
+
/**
 * Promotion gate that compares a candidate's runs against a baseline's runs
 * on the held-out split.
 *
 * Config (all optional except `baselineKey`):
 *   - minProductiveRuns (default 3): minimum number of paired holdout runs.
 *   - pairedDeltaThreshold (default 0): the CI lower bound must exceed this.
 *   - overfitGapThreshold (default 0.15): allowed excess of the candidate's
 *     search-minus-holdout gap over the baseline's.
 *   - confidence (default 0.95), bootstrapResamples (default 2000), seed:
 *     forwarded to `pairedBootstrap`.
 */
var HeldOutGate = class {
  minProductiveRuns;
  pairedDeltaThreshold;
  overfitGapThreshold;
  baselineKey;
  confidence;
  resamples;
  seed;
  constructor(config) {
    const { baselineKey } = config;
    if (!baselineKey) {
      throw new Error("HeldOutGate: baselineKey is required");
    }
    this.minProductiveRuns = config.minProductiveRuns ?? 3;
    this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
    this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
    this.baselineKey = baselineKey;
    this.confidence = config.confidence ?? 0.95;
    this.resamples = config.bootstrapResamples ?? 2e3;
    this.seed = config.seed;
  }
  /** Decide whether `candidate` should replace `baseline`. Runs are paired
   * by (experimentId, seed): a candidate holdout run is matched with the
   * baseline holdout run that shares both. Runs lacking a holdout score on
   * either side are dropped from the pairing.
   *
   * Rejection order: `few_runs` (too few pairs), `negative_delta` (bootstrap
   * CI lower bound does not exceed the threshold), `overfit_gap` (candidate's
   * search-holdout gap exceeds the baseline's by more than the threshold). */
  evaluate(candidate, baseline) {
    const candidateId = inferCandidateId(candidate, this.baselineKey);
    const baselineId = this.baselineKey;

    // Collect index-aligned (baseline, candidate) holdout score pairs.
    const baselineByKey = indexHoldoutByKey(baseline);
    const beforeHoldout = [];
    const afterHoldout = [];
    for (const run of candidate) {
      const usable = run.splitTag === "holdout" && run.outcome.holdoutScore !== void 0;
      if (!usable) continue;
      const counterpart = baselineByKey.get(pairKey(run));
      if (counterpart !== void 0) {
        beforeHoldout.push(counterpart);
        afterHoldout.push(run.outcome.holdoutScore);
      }
    }
    const productiveRuns = beforeHoldout.length;

    // Unpaired split means feed the overfit-gap comparison.
    const candidateSearchMean = mean5(scores(candidate, "searchScore", "search"));
    const candidateHoldoutMean = mean5(scores(candidate, "holdoutScore", "holdout"));
    const baselineSearchMean = mean5(scores(baseline, "searchScore", "search"));
    const baselineHoldoutMean = mean5(scores(baseline, "holdoutScore", "holdout"));
    const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
    const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);

    // Shared shape for every decision this method returns.
    const decide = (promote, evidence, reason, rejectionCode) => ({
      promote,
      candidateId,
      baselineId,
      evidence,
      reason,
      rejectionCode
    });

    if (productiveRuns < this.minProductiveRuns) {
      const sparseEvidence = {
        productiveRuns,
        medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
        pairedCI: { low: 0, high: 0 },
        pairedPValue: 1,
        searchScore: candidateSearchMean,
        holdoutScore: candidateHoldoutMean,
        overfitGap,
        baselineOverfitGap
      };
      return decide(
        false,
        sparseEvidence,
        `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
        "few_runs"
      );
    }

    const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
      confidence: this.confidence,
      resamples: this.resamples,
      statistic: "median",
      seed: this.seed
    });
    const wilcoxon = pairedWilcoxon(beforeHoldout, afterHoldout);
    const evidence = {
      productiveRuns,
      medianPairedDelta: ci.median,
      pairedCI: { low: ci.low, high: ci.high },
      pairedPValue: wilcoxon.p,
      searchScore: candidateSearchMean,
      holdoutScore: candidateHoldoutMean,
      overfitGap,
      baselineOverfitGap
    };

    // Written as a negated `>` so a NaN lower bound also rejects.
    const clearsThreshold = ci.low > this.pairedDeltaThreshold;
    if (!clearsThreshold) {
      return decide(
        false,
        evidence,
        `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
        "negative_delta"
      );
    }

    // The overfit check only applies when both gaps are computable.
    const gapsKnown = Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap);
    if (gapsKnown && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
      return decide(
        false,
        evidence,
        `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
        "overfit_gap"
      );
    }

    return decide(
      true,
      evidence,
      `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
      null
    );
  }
};
|
|
10449
|
+
/**
 * Pick the identifier that names the candidate side of a comparison.
 *
 * @param candidate Array of candidate runs.
 * @param baselineKey Identifier of the baseline.
 * @returns The first truthy `candidateId` that differs from `baselineKey`;
 *   failing that, the first run's `candidateId`; failing that,
 *   "(unknown candidate)".
 */
function inferCandidateId(candidate, baselineKey) {
  const distinct = candidate.find(
    (run) => run.candidateId && run.candidateId !== baselineKey
  );
  if (distinct) return distinct.candidateId;
  return candidate[0]?.candidateId ?? "(unknown candidate)";
}
|
|
10455
|
+
/**
 * Index holdout scores by pairing key.
 *
 * Only runs tagged "holdout" with a defined `outcome.holdoutScore` are
 * included. When several runs share a key, the last one wins.
 *
 * @param runs Iterable of run records.
 * @returns Map from `pairKey(run)` to that run's holdout score.
 */
function indexHoldoutByKey(runs) {
  const index = new Map();
  for (const run of runs) {
    const score = run.splitTag === "holdout" ? run.outcome.holdoutScore : void 0;
    if (score !== void 0) index.set(pairKey(run), score);
  }
  return index;
}
|
|
10464
|
+
/**
 * Build the key used to pair a candidate run with a baseline run.
 *
 * @param r Run record.
 * @returns `"<experimentId>::<seed>"`.
 */
function pairKey(r) {
  const { experimentId, seed } = r;
  return `${experimentId}::${seed}`;
}
|
|
10467
|
+
/**
 * Collect the finite numeric values of `outcome[field]` from the runs whose
 * `splitTag` equals `splitFilter`, preserving run order.
 *
 * @param runs Array of run records.
 * @param field Name of the outcome field to read.
 * @param splitFilter Split tag a run must carry to be considered.
 * @returns Array of finite scores.
 */
function scores(runs, field, splitFilter) {
  return runs
    .filter((run) => run.splitTag === splitFilter)
    .map((run) => run.outcome[field])
    .filter((value) => typeof value === "number" && Number.isFinite(value));
}
|
|
10476
|
+
/**
 * Arithmetic mean.
 *
 * @param xs Numbers to average.
 * @returns The mean, or NaN for an empty array.
 */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  for (const x of xs) total += x;
  return total / count;
}
|
|
10480
|
+
/**
 * Subtract `b` from `a`, returning NaN unless both operands are finite.
 *
 * @param a Minuend.
 * @param b Subtrahend.
 * @returns `a - b`, or NaN when either input is not finite.
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
|
|
10484
|
+
/**
 * Median of the pairwise differences `after[i] - before[i]`.
 *
 * @param before Baseline values; its length determines the number of pairs.
 * @param after Candidate values, index-aligned with `before`.
 * @returns The median difference, or 0 when there are no pairs.
 */
function medianDelta(before, after) {
  const count = before.length;
  if (count === 0) return 0;
  const deltas = before.map((base, i) => after[i] - base);
  deltas.sort((x, y) => x - y);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return deltas[upper];
  return (deltas[upper - 1] + deltas[upper]) / 2;
}
|
|
10490
|
+
/**
 * Format a number for gate messages.
 *
 * @param x Value to format.
 * @returns Four fixed decimals for finite numbers; `String(x)` otherwise.
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
10494
|
+
|
|
10495
|
+
// src/researcher.ts
|
|
10496
|
+
/**
 * Placeholder researcher whose every operation rejects.
 *
 * Each async method throws an `Error` whose message is the configured `hint`
 * followed by "(<method> not implemented)".
 */
var NoopResearcher = class {
  hint;
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }
  async inspectFailures(_runs) {
    const message = `${this.hint} (inspectFailures not implemented)`;
    throw new Error(message);
  }
  async proposeChange(_failures) {
    const message = `${this.hint} (proposeChange not implemented)`;
    throw new Error(message);
  }
  async applyChange(_changes, _baseline) {
    const message = `${this.hint} (applyChange not implemented)`;
    throw new Error(message);
  }
  async evaluateChange(_plan) {
    const message = `${this.hint} (evaluateChange not implemented)`;
    throw new Error(message);
  }
};
|
|
10514
|
+
|
|
10515
|
+
// src/summary-report.ts
|
|
10516
|
+
/**
 * Build a per-candidate summary of scores on one split.
 *
 * For every candidate with at least one finite score on the chosen split the
 * row carries: N, mean and a confidence interval (from `confidenceInterval`),
 * and, when a `comparator` candidate is given and present, Cohen's d against
 * the comparator plus a Wilcoxon signed-rank p-value on runs paired by
 * (experimentId, seed). The Wilcoxon test is only run with at least 6 pairs.
 * Finite p-values are then Benjamini-Hochberg adjusted across candidates and
 * reported as `qValue`; rows without a test keep `qValue` = NaN.
 *
 * The configured confidence level is forwarded to the markdown renderer so
 * the interval column can be labelled with the level actually used.
 *
 * @param runs Array of run records.
 * @param opts Optional settings:
 *   - split ("holdout" by default; any other value reads `searchScore`),
 *   - confidence (default 0.95),
 *   - fdr (default 0.05),
 *   - comparator (candidate id to compare against; default none).
 * @returns `{ rows, comparator, split, markdown }`; rows are sorted by
 *   candidate id.
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // Group usable runs (right split, finite score) by candidate.
  const byCandidate = /* @__PURE__ */ new Map();
  for (const r of runs) {
    if (r.splitTag !== split) continue;
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
    bucket.runs.push(r);
    bucket.scores.push(v);
    byCandidate.set(r.candidateId, bucket);
  }

  const candidateIds = [...byCandidate.keys()].sort();
  const compRuns = comparator ? byCandidate.get(comparator) : void 0;

  // First pass: per-candidate statistics with the raw (unadjusted) p-value.
  const tentative = [];
  for (const id of candidateIds) {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      // Skip the signed-rank test for very small paired samples.
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    tentative.push({
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      qValue: rawP,
      cohensD: d,
      rawP
    });
  }

  // Second pass: replace raw p-values by BH q-values where a test was run.
  if (comparator) {
    const idxs = [];
    const ps = [];
    for (let i = 0; i < tentative.length; i++) {
      const r = tentative[i];
      if (r.candidateId === comparator) continue;
      if (!Number.isFinite(r.rawP)) continue;
      idxs.push(i);
      ps.push(r.rawP);
    }
    if (ps.length > 0) {
      const { qValues } = benjaminiHochberg(ps, fdr);
      for (let k = 0; k < idxs.length; k++) {
        tentative[idxs[k]].qValue = qValues[k];
      }
    }
  }

  // `rawP` is an internal working field and is not part of the public rows.
  const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
  // Pass the confidence level along so the table header is not hard-wired to 95%.
  const markdown = renderSummaryTableMarkdown(rows, comparator, split, confidence);
  return { rows, comparator, split, markdown };
}
|
|
10579
|
+
/**
 * Pair candidate and baseline scores that share (experimentId, seed).
 *
 * Runs whose `outcome[scoreField]` is not a finite number are ignored on both
 * sides. If several baseline runs share a key, the last one is used. Output
 * order follows the candidate runs.
 *
 * @param candidate Candidate run records.
 * @param baseline Baseline run records.
 * @param scoreField Name of the outcome field holding the score.
 * @returns `{ before, after }`: index-aligned baseline and candidate scores.
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const isScore = (value) => typeof value === "number" && Number.isFinite(value);
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;

  const baselineScores = new Map();
  for (const run of baseline) {
    const score = run.outcome[scoreField];
    if (isScore(score)) baselineScores.set(keyOf(run), score);
  }

  const before = [];
  const after = [];
  for (const run of candidate) {
    const score = run.outcome[scoreField];
    if (!isScore(score)) continue;
    const matched = baselineScores.get(keyOf(run));
    if (matched !== void 0) {
      before.push(matched);
      after.push(score);
    }
  }
  return { before, after };
}
|
|
10600
|
+
/**
 * Render summary rows as a markdown table.
 *
 * @param rows Rows with `candidateId`, `n`, `mean`, `ciLow`, `ciHigh`,
 *   `qValue` and `cohensD`.
 * @param comparator Comparator candidate id, or null/empty for none; shown in
 *   the title as " (vs <id>)".
 * @param split Split name shown in the title.
 * @param confidence Confidence level of the interval columns, as a fraction
 *   (default 0.95). Used only to label the CI column, so the header matches
 *   the level the intervals were computed at; a non-finite value falls back
 *   to the "95%" label.
 * @returns The table as a single newline-joined string. Non-finite q-values
 *   and effect sizes are rendered as an em dash.
 */
function renderSummaryTableMarkdown(rows, comparator, split, confidence = 0.95) {
  // Number(...toFixed(2)) trims float noise and trailing zeros (0.95 -> "95").
  const ciPercent = Number.isFinite(confidence) ? Number((confidence * 100).toFixed(2)) : 95;
  const lines = [];
  const cmpLabel = comparator ? ` (vs ${comparator})` : "";
  lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
  lines.push("");
  lines.push(`| Candidate | N | Mean | ${ciPercent}% CI | q (BH) | Cohen's d |`);
  lines.push("|---|---:|---:|---|---:|---:|");
  for (const r of rows) {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
|
|
10615
|
+
/**
 * Build cost-versus-quality chart data with Pareto-frontier flags.
 *
 * Runs on the chosen split with a finite score are grouped by candidate; each
 * candidate becomes one point at (mean costUsd, mean score). A point is on
 * the frontier when no other point dominates it (see `dominates2`).
 *
 * @param runs Iterable of run records.
 * @param opts Optional settings:
 *   - split ("holdout" by default; any other value reads `searchScore`),
 *   - gateDecisions: map of candidate id -> gate decision, used to label
 *     points via `gateLabel`.
 * @returns `{ kind: "pareto-cost-quality", split, axes, points }`.
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const grouped = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (typeof score !== "number" || !Number.isFinite(score)) continue;
    let entry = grouped.get(run.candidateId);
    if (entry === void 0 || entry === null) {
      entry = { cost: [], quality: [] };
    }
    entry.cost.push(run.costUsd);
    entry.quality.push(score);
    grouped.set(run.candidateId, entry);
  }

  const points = Array.from(grouped, ([candidateId, entry]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(entry.cost),
      quality: avg(entry.quality),
      n: entry.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(decision) : void 0
    };
  });

  // A point stays on the frontier only if every other point fails to dominate it.
  for (const point of points) {
    point.onFrontier = points.every(
      (other) => other === point || !dominates2(other, point)
    );
  }

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
|
|
10649
|
+
/**
 * Pareto dominance on (cost, quality): lower cost and higher quality are better.
 *
 * @param a Point with `cost` and `quality`.
 * @param b Point with `cost` and `quality`.
 * @returns `true` when `a` is no worse than `b` on both axes and strictly
 *   better on at least one.
 */
function dominates2(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  if (!noWorse) return false;
  return a.cost < b.cost || a.quality > b.quality;
}
|
|
10652
|
+
/**
 * Translate a gate decision into a chart label.
 *
 * @param d Decision with `promote` and `rejectionCode`.
 * @returns "promote" for a promotion, "reject_<code>" for the three known
 *   rejection codes, or `null` for any other rejection code.
 */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
|
|
10659
|
+
/**
 * Histogram of paired score gains (candidate minus comparator) on one split.
 *
 * Runs of the two candidates are paired by (experimentId, seed). The
 * histogram is symmetric around zero: it spans [-bound, +bound], where
 * `bound` is the largest absolute delta (at least 1e-6 so the width is never
 * zero), divided into `bins` equal-width bins. Deltas landing exactly on the
 * upper edge are counted in the last bin.
 *
 * @param runs Array of run records containing both candidates.
 * @param candidateId Candidate whose gains are measured.
 * @param comparator Candidate id used as the baseline.
 * @param opts Optional settings:
 *   - split ("holdout" by default; any other value reads `searchScore`),
 *   - bins (default 11; must be a finite integer >= 1),
 *   - confidence (default 0.95), resamples (default 2000), seed: forwarded to
 *     `pairedBootstrap` for the CI of the median.
 * @returns `{ kind: "gain-distribution", candidateId, comparator, split, n,
 *   bins, median, ci }`. With no pairs: n = 0, empty bins, zero median and CI.
 * @throws {Error} When `bins` is below 1, fractional, NaN or infinite.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");
  // NaN would index bins[NaN], Infinity would never finish building the bins,
  // and a fractional count yields bins that overshoot the data range.
  if (!Number.isInteger(binCount)) {
    throw new Error("gainHistogram: bins must be a finite integer");
  }

  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }

  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);

  // Symmetric range around zero so gains and losses are visually comparable.
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;

  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    let idx = Math.floor((d - lo) / width);
    // Clamp: rounding error and the inclusive upper edge can fall outside.
    if (idx < 0) idx = 0;
    if (idx >= binCount) idx = binCount - 1;
    bins[idx].count += 1;
  }

  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
|
|
10716
|
+
/**
 * Arithmetic mean used for chart points.
 *
 * @param xs Numbers to average.
 * @returns The mean, or NaN when `xs` is empty.
 */
function avg(xs) {
  if (xs.length === 0) return Number.NaN;
  let sum = 0;
  for (let i = 0; i < xs.length; i++) sum += xs[i];
  return sum / xs.length;
}
|
|
10720
|
+
/**
 * Median of an array that is ALREADY sorted ascending (no sorting is done).
 *
 * @param sorted Pre-sorted numbers.
 * @returns The middle value, the mean of the two middle values for even
 *   lengths, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  const isOdd = count % 2 !== 0;
  return isOdd ? sorted[upper] : (sorted[upper - 1] + sorted[upper]) / 2;
}
|
|
10725
|
+
/**
 * Format a number for the summary table.
 *
 * @param x Value to format.
 * @returns Four fixed decimals for finite numbers; `String(x)` otherwise.
 */
function fmt2(x) {
  const finite = Number.isFinite(x);
  return finite ? x.toFixed(4) : String(x);
}
|
|
10729
|
+
|
|
10730
|
+
// src/canary.ts
|
|
10731
|
+
/**
 * Run every canary detector over a run history and tally the alerts.
 *
 * The silent-fallback and calibration-drift detectors always run (with their
 * defaults when no options are given). The distribution-shift detector runs
 * only when `opts.distributionShift` is supplied.
 *
 * @param runs Array of run records; detectors treat array order as run order.
 * @param opts Per-detector options: `silentFallback`, `calibrationDrift`,
 *   `distributionShift`.
 * @returns `{ alerts, counts }`: alerts in detector order, and the number of
 *   alerts per kind.
 */
function runCanaries(runs, opts = {}) {
  const fallbackAlerts = detectSilentFallback(runs, opts.silentFallback ?? {});
  const driftAlerts = detectCalibrationDrift(runs, opts.calibrationDrift ?? {});
  const shiftAlerts = opts.distributionShift
    ? detectDistributionShift(runs, opts.distributionShift)
    : [];
  const alerts = [...fallbackAlerts, ...driftAlerts, ...shiftAlerts];

  const counts = {
    silent_judge_fallback: 0,
    judge_calibration_drift: 0,
    distribution_shift: 0
  };
  for (const { kind } of alerts) {
    counts[kind]++;
  }
  return { alerts, counts };
}
|
|
10745
|
+
/**
 * Detect streaks of runs whose judge appears to have fallen back silently.
 *
 * A run counts as a fallback when its `judgeMetadata.fallback` is `true` or
 * its `judgeMetadata.confidence` is within `epsilon` of the fallback constant.
 * A run without `judgeMetadata`, or a non-fallback run, ends the streak.
 *
 * An alert is emitted for EVERY run at which the current streak length is at
 * least the threshold, so a streak of length threshold + k yields k + 1
 * alerts, each reporting the streak so far.
 * NOTE(review): if one alert per streak was intended, callers currently have
 * to de-duplicate themselves; confirm the desired behaviour.
 *
 * @param runs Array of run records, in run order.
 * @param opts Optional settings: `constant` (default 0.3),
 *   `consecutiveThreshold` (default 3), `epsilon` (default 1e-9).
 * @returns Array of "silent_judge_fallback" alerts with severity "error".
 */
function detectSilentFallback(runs, opts) {
  const constant = opts.constant ?? 0.3;
  const threshold = opts.consecutiveThreshold ?? 3;
  const eps = opts.epsilon ?? 1e-9;

  const alerts = [];
  let streakStartRunId = null;
  // Confidences of the current streak; its length is the streak length.
  let streakValues = [];

  for (const run of runs) {
    const meta = run.judgeMetadata;
    const looksLikeFallback = Boolean(meta) && (meta.fallback === true || Math.abs(meta.confidence - constant) <= eps);
    if (!looksLikeFallback) {
      streakStartRunId = null;
      streakValues = [];
      continue;
    }

    if (streakValues.length === 0) streakStartRunId = run.runId;
    streakValues.push(meta.confidence);
    const streak = streakValues.length;
    if (streak < threshold) continue;

    alerts.push({
      kind: "silent_judge_fallback",
      severity: "error",
      message: `silent judge fallback: ${streak} consecutive run(s) at confidence\u2248${constant} or fallback=true`,
      evidence: {
        streakLength: streak,
        firstRunId: streakStartRunId,
        lastRunId: run.runId,
        // Only the 10 most recent confidences are kept as evidence.
        confidences: streakValues.slice(-10),
        fallbackConstant: constant
      }
    });
  }
  return alerts;
}
|
|
10792
|
+
/**
 * Detect drift in judge confidence with a two-sample Kolmogorov-Smirnov test.
 *
 * The finite judge confidences are split into a "recent" window (the last
 * `recentWindow` values) and a "historical" window (up to `historyWindow`
 * values immediately before it). An alert is raised when the KS statistic D
 * exceeds c(alpha) * sqrt((n1 + n2) / (n1 * n2)), with c = 1.63 for
 * alpha <= 0.01, 1.36 for alpha <= 0.05, 1.22 for alpha <= 0.1, else 1.
 *
 * @param runs Array of run records, in run order.
 * @param opts Optional settings: `historyWindow` (default 50), `recentWindow`
 *   (default 20), `ksAlpha` (default 0.05), `minRecent` (default 10: minimum
 *   size required of BOTH windows).
 * @returns An array with one "judge_calibration_drift" warning, or an empty
 *   array when there is too little data or no drift.
 */
function detectCalibrationDrift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.ksAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;

  const conf = runs
    .filter((run) => run.judgeMetadata && Number.isFinite(run.judgeMetadata.confidence))
    .map((run) => run.judgeMetadata.confidence);
  if (conf.length < minRecent + 1) return [];

  const recent = conf.slice(-Math.min(recentWindow, conf.length));
  const historical = conf.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  const ks = ksTwoSample(recent, historical);

  // Coefficient of the large-sample KS critical value for the chosen alpha.
  let c;
  if (alpha <= 0.01) c = 1.63;
  else if (alpha <= 0.05) c = 1.36;
  else if (alpha <= 0.1) c = 1.22;
  else c = 1;
  const critical = c * Math.sqrt((recent.length + historical.length) / (recent.length * historical.length));

  if (!(ks.d > critical)) return [];
  return [
    {
      kind: "judge_calibration_drift",
      severity: "warn",
      message: `judge calibration drift: KS D=${ks.d.toFixed(4)} exceeds critical=${critical.toFixed(4)} at alpha=${alpha} (recent n=${recent.length}, history n=${historical.length})`,
      evidence: {
        ksD: ks.d,
        critical,
        alpha,
        recentN: recent.length,
        historyN: historical.length,
        recentMean: mean6(recent),
        historyMean: mean6(historical)
      }
    }
  ];
}
|
|
10830
|
+
/**
 * Two-sample Kolmogorov-Smirnov statistic.
 *
 * Computes D = max over x of |F_a(x) - F_b(x)|, where F_a and F_b are the
 * empirical CDFs of the two samples.
 *
 * Ties are handled by advancing BOTH cursors past every value equal to the
 * current one before the ECDF gap is measured. Measuring part-way through a
 * run of equal values compares the CDFs at a point where neither has finished
 * its jump and can overstate D (for example [1, 1, 1] against [1] must give 0).
 *
 * NaN entries are dropped before the computation: they have no position in
 * the ordering and would otherwise stall the merge loop.
 *
 * @param a First sample (not mutated).
 * @param b Second sample (not mutated).
 * @returns `{ d }` with D in [0, 1]; D is 0 when either sample is empty.
 */
function ksTwoSample(a, b) {
  const prepare = (xs) => xs.filter((v) => !Number.isNaN(v)).sort((x, y) => x - y);
  const sortedA = prepare(a);
  const sortedB = prepare(b);
  const n1 = sortedA.length;
  const n2 = sortedB.length;
  let i = 0;
  let j = 0;
  let d = 0;
  while (i < n1 && j < n2) {
    // Next distinct value in the merged order.
    const x = sortedA[i] <= sortedB[j] ? sortedA[i] : sortedB[j];
    // Consume the whole tie group on both sides so each ECDF is evaluated at x.
    while (i < n1 && sortedA[i] <= x) i++;
    while (j < n2 && sortedB[j] <= x) j++;
    const diff = Math.abs(i / n1 - j / n2);
    if (diff > d) d = diff;
  }
  // Once one sample is exhausted its ECDF is 1 and the gap can only shrink.
  return { d };
}
|
|
10848
|
+
/**
 * Detect a shift in the category mix of recent runs with a chi-square test.
 *
 * Each run is mapped to a category by `opts.category(run)`; runs whose
 * category is not a non-empty string are ignored. The categorised runs are
 * split into a "recent" window and the "historical" window preceding it. The
 * statistic sums (observed - expected)^2 / expected over categories, where
 * `expected` is the historical proportion scaled to the recent sample size.
 * Categories with an expected count below 1 are left out of the sum, which
 * includes categories that occur only in the recent window.
 * Degrees of freedom are (categories used - 1), floored at 1.
 *
 * @param runs Array of run records, in run order.
 * @param opts Settings: `category` (required function run -> string),
 *   `historyWindow` (default 50), `recentWindow` (default 20),
 *   `chiSquareAlpha` (default 0.05), `minRecent` (default 10: minimum size
 *   required of BOTH windows).
 * @returns An array with one "distribution_shift" warning, or an empty array
 *   when there is too little data or no shift.
 */
function detectDistributionShift(runs, opts) {
  const historyWindow = opts.historyWindow ?? 50;
  const recentWindow = opts.recentWindow ?? 20;
  const alpha = opts.chiSquareAlpha ?? 0.05;
  const minRecent = opts.minRecent ?? 10;
  const categorize = opts.category;

  const labelled = [];
  for (const run of runs) {
    const bucket = categorize(run);
    if (typeof bucket === "string" && bucket.length > 0) labelled.push({ run, bucket });
  }
  if (labelled.length < minRecent + 1) return [];

  const recent = labelled.slice(-Math.min(recentWindow, labelled.length));
  const historical = labelled.slice(0, -recent.length).slice(-historyWindow);
  if (recent.length < minRecent || historical.length < minRecent) return [];

  // Sorted union of the categories seen in either window.
  const bucketList = [...new Set([...recent, ...historical].map((item) => item.bucket))].sort();

  // Count occurrences per category, with every category pre-seeded at zero.
  const tally = (items) => {
    const counts = {};
    for (const name of bucketList) counts[name] = 0;
    for (const item of items) counts[item.bucket] += 1;
    return counts;
  };
  const recentCounts = tally(recent);
  const histCounts = tally(historical);

  let chi = 0;
  let usedBuckets = 0;
  for (const name of bucketList) {
    const expected = histCounts[name] / historical.length * recent.length;
    if (expected < 1) continue;
    const observed = recentCounts[name];
    chi += (observed - expected) ** 2 / expected;
    usedBuckets += 1;
  }
  const df = Math.max(1, usedBuckets - 1);
  const critical = chiSquareCritical(df, alpha);

  if (!(chi > critical)) return [];
  return [
    {
      kind: "distribution_shift",
      severity: "warn",
      message: `eval-set distribution shift: \u03C7\xB2=${chi.toFixed(2)} df=${df} exceeds critical=${critical.toFixed(2)} at alpha=${alpha}`,
      evidence: {
        chi,
        df,
        critical,
        alpha,
        recentCounts,
        historicalCounts: histCounts,
        recentN: recent.length,
        historyN: historical.length
      }
    }
  ];
}
|
|
10907
|
+
/**
 * Approximate upper-tail critical value of the chi-square distribution.
 *
 * `alpha` is snapped to one of four tabulated levels: >= 0.1 -> 0.10,
 * >= 0.05 -> 0.05, >= 0.025 -> 0.025, anything else -> 0.01.
 * The value comes from, in order:
 *   1. a direct table hit for df in 1..10, 15, 20, 25, 30;
 *   2. for df > 30, the cube approximation
 *      df * (1 - 2/(9 df) + z * sqrt(2/(9 df)))^3;
 *   3. linear interpolation between the neighbouring table rows;
 *   4. the df = 10 row when df lies outside the table (e.g. df < 1).
 *
 * @param df Degrees of freedom.
 * @param alpha Significance level.
 * @returns The critical value.
 */
function chiSquareCritical(df, alpha) {
  // Columns: alpha = 0.10, 0.05, 0.025, 0.01.
  const TABLE = {
    1: [2.71, 3.84, 5.02, 6.63],
    2: [4.61, 5.99, 7.38, 9.21],
    3: [6.25, 7.81, 9.35, 11.34],
    4: [7.78, 9.49, 11.14, 13.28],
    5: [9.24, 11.07, 12.83, 15.09],
    6: [10.64, 12.59, 14.45, 16.81],
    7: [12.02, 14.07, 16.01, 18.48],
    8: [13.36, 15.51, 17.53, 20.09],
    9: [14.68, 16.92, 19.02, 21.67],
    10: [15.99, 18.31, 20.48, 23.21],
    15: [22.31, 25, 27.49, 30.58],
    20: [28.41, 31.41, 34.17, 37.57],
    25: [34.38, 37.65, 40.65, 44.31],
    30: [40.26, 43.77, 46.98, 50.89]
  };

  // First threshold that alpha reaches selects the column; none -> last column.
  const hit = [0.1, 0.05, 0.025].findIndex((level) => alpha >= level);
  const idx = hit === -1 ? 3 : hit;

  const exactRow = TABLE[df];
  if (exactRow) return exactRow[idx];

  if (df > 30) {
    // Normal quantiles matching the four alpha columns.
    const z = [1.282, 1.645, 1.96, 2.326][idx] ?? 1.96;
    const shrink = 2 / (9 * df);
    const term = 1 - shrink + z * Math.sqrt(shrink);
    return df * term ** 3;
  }

  const knots = Object.keys(TABLE).map(Number).sort((a, b) => a - b);
  for (let k = 1; k < knots.length; k++) {
    const lo = knots[k - 1];
    const hi = knots[k];
    if (df < lo || df > hi) continue;
    const t = (df - lo) / (hi - lo);
    return TABLE[lo][idx] * (1 - t) + TABLE[hi][idx] * t;
  }
  return TABLE[10][idx];
}
|
|
10943
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs - values to average
 * @returns {number} the mean, or 0 for an empty array
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
10947
|
+
|
|
10948
|
+
// src/benchmarks/types.ts
|
|
10949
|
+
/**
 * 32-bit FNV-1a hash of a string, returned as an unsigned integer.
 *
 * Only the low 8 bits of each UTF-16 code unit are mixed in, so code units
 * that differ only above bit 7 hash identically. That is kept as-is here:
 * changing it would change every hash value this function has produced.
 *
 * @param {string} input - text to hash
 * @returns {number} hash in the range [0, 2^32)
 */
function fnv1a32(input) {
  const FNV_PRIME = 16777619;
  let hash = 2166136261; // FNV offset basis
  for (let pos = 0; pos < input.length; pos += 1) {
    hash ^= input.charCodeAt(pos) & 255;
    // 32-bit wrapping multiply by the FNV prime.
    hash = Math.imul(hash, FNV_PRIME) >>> 0;
  }
  return hash >>> 0;
}
|
|
10957
|
+
// Default seed mixed into every split hash.
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";

/**
 * Assign an item to a benchmark split based on a hash of `seed::itemId`.
 *
 * The 32-bit hash is scaled to [0, 1): below 0.6 is "search", below 0.8 is
 * "dev", everything else is "holdout".
 *
 * @param {string} itemId - identifier of the item to place
 * @param {string} [seed=BENCHMARK_SPLIT_SEED] - seed prefixed to the id before hashing
 * @returns {"search"|"dev"|"holdout"} split name
 */
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
  const unit = fnv1a32(`${seed}::${itemId}`) / 4294967296;
  const bands = [
    [0.6, "search"],
    [0.8, "dev"]
  ];
  for (const [upperBound, splitName] of bands) {
    if (unit < upperBound) return splitName;
  }
  return "holdout";
}
|
|
10965
|
+
|
|
10966
|
+
// src/benchmarks/index.ts
|
|
10967
|
+
var benchmarks_exports = {};
|
|
10968
|
+
__export(benchmarks_exports, {
|
|
10969
|
+
BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
|
|
10970
|
+
deterministicSplit: () => deterministicSplit,
|
|
10971
|
+
gsm8k: () => gsm8k_exports,
|
|
10972
|
+
routing: () => routing_exports,
|
|
10973
|
+
swebenchLite: () => swebench_lite_exports
|
|
10974
|
+
});
|
|
10975
|
+
|
|
10976
|
+
// src/benchmarks/gsm8k/index.ts
|
|
10977
|
+
var gsm8k_exports = {};
|
|
10978
|
+
__export(gsm8k_exports, {
|
|
10979
|
+
Gsm8kAdapter: () => Gsm8kAdapter,
|
|
10980
|
+
assignSplit: () => assignSplit,
|
|
10981
|
+
evaluate: () => evaluate,
|
|
10982
|
+
loadDataset: () => loadDataset,
|
|
10983
|
+
parseGsm8kAnswer: () => parseGsm8kAnswer
|
|
10984
|
+
});
|
|
10985
|
+
import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
|
|
10986
|
+
/**
 * Benchmark adapter for GSM8K: loads items from a user-supplied JSONL file
 * and grades responses by numeric match against the reference answer.
 */
var Gsm8kAdapter = class {
  /**
   * Load the items whose split assignment equals `split`.
   *
   * Reads the file named by the AGENT_EVAL_GSM8K_PATH environment variable.
   *
   * @param {string} split - split name to keep
   * @returns {Promise<Array<{id: string, payload: {question: string, answer: string}}>>}
   * @throws {Error} when the variable is unset or the file does not exist
   */
  async loadDataset(split) {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH;
    if (!datasetPath) {
      throw new Error(
        "GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file with {id, question, answer} records (the HF GSM8K mirror converted to JSONL)."
      );
    }
    if (!existsSync5(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`);
    }
    const selected = [];
    for (const record of parseJsonl(datasetPath)) {
      if (assignSplitImpl(record.id) === split) selected.push(record);
    }
    return selected;
  }

  /**
   * Score a response: 1 when its parsed number is within 1e-6 of the
   * reference number, otherwise 0. `raw.reason` explains a 0 caused by an
   * unparseable reference or response.
   *
   * @param {{payload: {answer: string}}} item - benchmark item
   * @param {string} response - model output
   * @returns {Promise<{score: number, raw: object}>}
   */
  async evaluate(item, response) {
    const expected = parseGsm8kAnswer(item.payload.answer);
    const observed = parseGsm8kAnswer(response);
    if (expected === null) {
      const raw = { reason: "reference_not_numeric", expected: item.payload.answer };
      return { score: 0, raw };
    }
    if (observed === null) {
      const raw = { reason: "no_numeric_in_response", expected, observed: null };
      return { score: 0, raw };
    }
    const exactMatch = Math.abs(expected - observed) < 1e-6;
    let score = 0;
    if (exactMatch) score = 1;
    return { score, raw: { expected, observed, exactMatch } };
  }

  /**
   * @param {string} itemId - item identifier
   * @returns {string} split name for the item
   */
  assignSplit(itemId) {
    return assignSplitImpl(itemId);
  }
};
|
|
11016
|
+
/**
 * Split assignment for a GSM8K item; the id is namespaced with "gsm8k::"
 * before being handed to deterministicSplit.
 *
 * @param {string} itemId - item identifier
 * @returns {string} split name
 */
function assignSplitImpl(itemId) {
  const namespacedId = "gsm8k::".concat(itemId);
  return deterministicSplit(namespacedId);
}
|
|
11019
|
+
/**
 * Read a GSM8K JSONL file into benchmark items.
 *
 * Blank lines are skipped. Each remaining line must be a JSON object with a
 * non-empty `question` and `answer`; a missing `id` defaults to
 * `gsm8k_<lineNumber>`. All three fields are stringified.
 *
 * @param {string} path - path of the JSONL file
 * @returns {Array<{id: string, payload: {question: string, answer: string}}>}
 * @throws {Error} with the 1-based line number when a line is not valid JSON,
 *   is not a JSON object, or lacks question/answer
 */
function parseJsonl(path) {
  const raw = readFileSync5(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Keep the original SyntaxError reachable for debugging.
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // `null`, scalars and arrays are valid JSON but not records; without this
    // guard a `null` row fails on `row.id` with a TypeError that names no line.
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`);
    }
    const id = String(row.id ?? `gsm8k_${lineNo}`);
    const question = String(row.question ?? "");
    const answer = String(row.answer ?? "");
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`);
    }
    out.push({ id, payload: { question, answer } });
  }
  return out;
}
|
|
11043
|
+
/**
 * Extract the numeric answer from GSM8K-style text.
 *
 * A number following a `####` marker wins; otherwise the last number found
 * anywhere in the text is used. Thousands separators (commas) are removed
 * before conversion.
 *
 * @param {string} text - reference answer or model response
 * @returns {number|null} the parsed number, or null when the text is empty,
 *   holds no number, or the chosen number is not finite
 */
function parseGsm8kAnswer(text) {
  if (!text) return null;

  // Strip commas and convert; null signals "not a usable number".
  const toFiniteNumber = (token) => {
    const value = Number(token.replace(/,/g, ""));
    return Number.isFinite(value) ? value : null;
  };

  const marked = text.match(/####\s*(-?\d[\d,]*\.?\d*)/);
  if (marked) {
    const fromMarker = toFiniteNumber(marked[1]);
    if (fromMarker !== null) return fromMarker;
  }

  const numerals = text.match(/-?\d[\d,]*\.?\d*/g);
  if (!numerals || numerals.length === 0) return null;
  return toFiniteNumber(numerals[numerals.length - 1]);
}
|
|
11058
|
+
var adapter = new Gsm8kAdapter();
|
|
11059
|
+
var loadDataset = adapter.loadDataset.bind(adapter);
|
|
11060
|
+
var evaluate = adapter.evaluate.bind(adapter);
|
|
11061
|
+
var assignSplit = adapter.assignSplit.bind(adapter);
|
|
11062
|
+
|
|
11063
|
+
// src/benchmarks/swebench-lite/index.ts
|
|
11064
|
+
var swebench_lite_exports = {};
|
|
11065
|
+
__export(swebench_lite_exports, {
|
|
11066
|
+
SweBenchLiteAdapter: () => SweBenchLiteAdapter,
|
|
11067
|
+
assignSplit: () => assignSplit2,
|
|
11068
|
+
evaluate: () => evaluate2,
|
|
11069
|
+
loadDataset: () => loadDataset2
|
|
11070
|
+
});
|
|
11071
|
+
import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
|
|
11072
|
+
import { spawn } from "child_process";
|
|
11073
|
+
/**
 * Benchmark adapter for SWE-Bench Lite. The dataset and the grader are both
 * supplied by the user through environment variables; nothing is bundled.
 */
var SweBenchLiteAdapter = class {
  /**
   * Load the instances whose split assignment equals `split` from the JSONL
   * file named by AGENT_EVAL_SWEBENCH_PATH.
   *
   * @param {string} split - split name to keep
   * @returns {Promise<Array<{id: string, payload: object}>>}
   * @throws {Error} when the variable is unset or the file does not exist
   */
  async loadDataset(split) {
    const path = process.env.AGENT_EVAL_SWEBENCH_PATH;
    if (!path) {
      throw new Error(
        "SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file with the 30 lite instances. STUB: this wrapper does not bundle the dataset; see https://www.swebench.com/lite.html for the canonical source."
      );
    }
    if (!existsSync6(path)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`);
    }
    const all = parseJsonl2(path);
    return all.filter((it) => assignSplitImpl2(it.id) === split);
  }

  /**
   * Grade a patch by running the external grader named by
   * AGENT_EVAL_SWEBENCH_GRADER_CMD. The grader receives
   * `{instance_id, patch}` as JSON on stdin and must print a JSON object;
   * its `passed` field (coerced to boolean) decides the score.
   *
   * @param {{payload: {instanceId: string}}} item - benchmark item
   * @param {string} response - candidate patch
   * @returns {Promise<{score: number, raw: object}>}
   * @throws {Error} when the grader is not configured, fails, or prints
   *   anything other than a JSON object
   */
  async evaluate(item, response) {
    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD;
    if (!cmd) {
      throw new Error(
        "SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an executable that reads {instance_id, patch} JSON on stdin and writes {passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. TODO(swebench-lite): bundle a default Docker-based runner once the SDK stabilises (https://github.com/swe-bench/SWE-bench)."
      );
    }
    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response });
    const result = await runGrader(cmd, stdinPayload);
    let parsed;
    try {
      parsed = JSON.parse(result.stdout);
    } catch (e) {
      throw new Error(
        `SWE-Bench grader emitted non-JSON stdout: ${e.message}
stdout=${result.stdout.slice(0, 400)}
stderr=${result.stderr.slice(0, 400)}`,
        { cause: e }
      );
    }
    // Valid JSON that is not an object (null, a scalar, an array) is a broken
    // grader, not a failed patch: surface it instead of crashing on
    // `parsed.passed` or silently recording a score of 0.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error(
        `SWE-Bench grader stdout is not a JSON object
stdout=${result.stdout.slice(0, 400)}
stderr=${result.stderr.slice(0, 400)}`
      );
    }
    const passed = Boolean(parsed.passed);
    return {
      score: passed ? 1 : 0,
      raw: {
        passed,
        failToPassPassed: Boolean(parsed.fail_to_pass_passed),
        passToPassPassed: Boolean(parsed.pass_to_pass_passed),
        // Cap the stored log so one noisy run cannot bloat the result record.
        graderLog: typeof parsed.log === "string" ? parsed.log.slice(0, 4e3) : ""
      }
    };
  }

  /**
   * @param {string} itemId - instance identifier
   * @returns {string} split name for the instance
   */
  assignSplit(itemId) {
    return assignSplitImpl2(itemId);
  }
};
|
|
11121
|
+
/**
 * Split assignment for a SWE-Bench Lite instance; the id is namespaced with
 * "swebench-lite::" before being handed to deterministicSplit.
 *
 * @param {string} itemId - instance identifier
 * @returns {string} split name
 */
function assignSplitImpl2(itemId) {
  const namespacedId = "swebench-lite::".concat(itemId);
  return deterministicSplit(namespacedId);
}
|
|
11124
|
+
/**
 * Read a SWE-Bench Lite JSONL file into benchmark items.
 *
 * Blank lines are skipped. Field names are accepted in snake_case or
 * camelCase (`instance_id`/`instanceId`, ...); FAIL_TO_PASS / PASS_TO_PASS
 * are normalised through asStringArray.
 *
 * @param {string} path - path of the JSONL file
 * @returns {Array<{id: string, payload: object}>}
 * @throws {Error} with the 1-based line number when a line is not valid JSON,
 *   is not a JSON object, or has no instance id
 */
function parseJsonl2(path) {
  const raw = readFileSync6(path, "utf8");
  const out = [];
  let lineNo = 0;
  for (const line of raw.split("\n")) {
    lineNo++;
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch (e) {
      // Report which line is broken, as the GSM8K loader does.
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${e.message}`, { cause: e });
    }
    // Guard against `null`/scalar/array rows, which would otherwise fail on
    // property access or yield a misleading "missing instance_id".
    if (row === null || typeof row !== "object" || Array.isArray(row)) {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`);
    }
    const instanceId = String(row.instance_id ?? row.instanceId ?? "");
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`);
    }
    out.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ""),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ""),
        repo: String(row.repo ?? ""),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass)
      }
    });
  }
  return out;
}
|
|
11151
|
+
/**
 * Normalise a test-list field to an array of strings.
 *
 * - array: its string elements.
 * - string holding a JSON array: that array's string elements.
 * - string holding a JSON string: that single string.
 * - string holding JSON `null` or a JSON object: empty array.
 * - any other string (not JSON, or a JSON number/boolean): treated as one
 *   bare name. Previously a bare name that happened to be valid JSON, such
 *   as "123", was dropped while other bare names were kept.
 * - anything else: empty array.
 *
 * @param {unknown} v - raw field value
 * @returns {string[]}
 */
function asStringArray(v) {
  const onlyStrings = (list) => list.filter((x) => typeof x === "string");
  if (Array.isArray(v)) return onlyStrings(v);
  if (typeof v !== "string") return [];
  let parsed;
  try {
    parsed = JSON.parse(v);
  } catch {
    // Not JSON at all: the value is a single bare name.
    return [v];
  }
  if (Array.isArray(parsed)) return onlyStrings(parsed);
  if (typeof parsed === "string") return [parsed];
  if (parsed === null || typeof parsed === "object") return [];
  return [v];
}
|
|
11163
|
+
/**
 * Run the external grader command, feed it `stdin`, and collect its output.
 *
 * The command is split on whitespace (no shell, no quoting support): the
 * first token is the executable, the rest are its arguments.
 *
 * @param {string} cmd - grader command line
 * @param {string} stdin - payload written to the grader's stdin
 * @returns {Promise<{stdout: string, stderr: string}>} resolves on exit code 0
 * @throws {Error} (as a rejection) when the process cannot be spawned or
 *   exits with a non-zero code or a signal
 */
function runGrader(cmd, stdin) {
  return new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise produce an empty executable name.
    const parts = cmd.trim().split(/\s+/);
    const child = spawn(parts[0], parts.slice(1), { stdio: ["pipe", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code !== 0) {
        const how = code === null && signal ? `code ${code} (signal ${signal})` : `code ${code}`;
        reject(new Error(`grader exited with ${how}: ${stderr.slice(0, 400)}`));
        return;
      }
      resolve({ stdout, stderr });
    });
    // A grader that exits before draining stdin triggers EPIPE here; without
    // a listener that is an uncaught exception. The exit code reported by
    // "close" already describes that case, so only other errors are surfaced.
    child.stdin.on("error", (err) => {
      if (err.code !== "EPIPE") reject(err);
    });
    child.stdin.end(stdin);
  });
}
|
|
11183
|
+
var adapter2 = new SweBenchLiteAdapter();
|
|
11184
|
+
var loadDataset2 = adapter2.loadDataset.bind(adapter2);
|
|
11185
|
+
var evaluate2 = adapter2.evaluate.bind(adapter2);
|
|
11186
|
+
var assignSplit2 = adapter2.assignSplit.bind(adapter2);
|
|
11187
|
+
|
|
11188
|
+
// src/benchmarks/routing/index.ts
|
|
11189
|
+
var routing_exports = {};
|
|
11190
|
+
__export(routing_exports, {
|
|
11191
|
+
ROUTING_DATASET: () => ROUTING_DATASET,
|
|
11192
|
+
RoutingAdapter: () => RoutingAdapter,
|
|
11193
|
+
assignSplit: () => assignSplit3,
|
|
11194
|
+
evaluate: () => evaluate3,
|
|
11195
|
+
extractRouteTokens: () => extractRouteTokens,
|
|
11196
|
+
loadDataset: () => loadDataset3
|
|
11197
|
+
});
|
|
11198
|
+
|
|
11199
|
+
// src/benchmarks/routing/dataset.ts
|
|
11200
|
+
var ROUTING_DATASET = [
|
|
11201
|
+
{
|
|
11202
|
+
id: "file_001",
|
|
11203
|
+
category: "file",
|
|
11204
|
+
prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
|
|
11205
|
+
route: "fs.write",
|
|
11206
|
+
synonyms: ["filesystem.write", "write_file"],
|
|
11207
|
+
hardNegatives: ["fs.read", "chat.reply"]
|
|
11208
|
+
},
|
|
11209
|
+
{
|
|
11210
|
+
id: "file_002",
|
|
11211
|
+
category: "file",
|
|
11212
|
+
prompt: "Read the contents of /etc/hosts and summarize the entries.",
|
|
11213
|
+
route: "fs.read",
|
|
11214
|
+
synonyms: ["filesystem.read", "read_file"],
|
|
11215
|
+
hardNegatives: ["fs.write", "search.web"]
|
|
11216
|
+
},
|
|
11217
|
+
{
|
|
11218
|
+
id: "file_003",
|
|
11219
|
+
category: "file",
|
|
11220
|
+
prompt: "List every Python file under src/ recursively.",
|
|
11221
|
+
route: "fs.list",
|
|
11222
|
+
synonyms: ["filesystem.list", "list_files"],
|
|
11223
|
+
hardNegatives: ["fs.read", "search.code"]
|
|
11224
|
+
},
|
|
11225
|
+
{
|
|
11226
|
+
id: "file_004",
|
|
11227
|
+
category: "file",
|
|
11228
|
+
prompt: "Delete the cached build at .turbo/cache.",
|
|
11229
|
+
route: "fs.delete",
|
|
11230
|
+
synonyms: ["filesystem.delete", "remove_file"],
|
|
11231
|
+
hardNegatives: ["fs.write", "fs.list"]
|
|
11232
|
+
},
|
|
11233
|
+
{
|
|
11234
|
+
id: "math_001",
|
|
11235
|
+
category: "math",
|
|
11236
|
+
prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
|
|
11237
|
+
route: "math.integral",
|
|
11238
|
+
synonyms: ["calculator.integral", "math.solve"],
|
|
11239
|
+
hardNegatives: ["math.derivative", "chat.reply"]
|
|
11240
|
+
},
|
|
11241
|
+
{
|
|
11242
|
+
id: "math_002",
|
|
11243
|
+
category: "math",
|
|
11244
|
+
prompt: "Compute the derivative of sin(x) * cos(x).",
|
|
11245
|
+
route: "math.derivative",
|
|
11246
|
+
synonyms: ["calculator.derivative", "math.solve"],
|
|
11247
|
+
hardNegatives: ["math.integral", "math.algebra"]
|
|
11248
|
+
},
|
|
11249
|
+
{
|
|
11250
|
+
id: "math_003",
|
|
11251
|
+
category: "math",
|
|
11252
|
+
prompt: "Solve 2x + 7 = 19 for x.",
|
|
11253
|
+
route: "math.algebra",
|
|
11254
|
+
synonyms: ["calculator.algebra", "math.solve"],
|
|
11255
|
+
hardNegatives: ["math.derivative", "math.integral"]
|
|
11256
|
+
},
|
|
11257
|
+
{
|
|
11258
|
+
id: "math_004",
|
|
11259
|
+
category: "math",
|
|
11260
|
+
prompt: "What is the prime factorization of 360?",
|
|
11261
|
+
route: "math.numbertheory",
|
|
11262
|
+
synonyms: ["calculator.factor", "math.solve"],
|
|
11263
|
+
hardNegatives: ["math.algebra", "search.web"]
|
|
11264
|
+
},
|
|
11265
|
+
{
|
|
11266
|
+
id: "search_001",
|
|
11267
|
+
category: "search",
|
|
11268
|
+
prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
|
|
11269
|
+
route: "search.web",
|
|
11270
|
+
synonyms: ["web.search", "search.papers"],
|
|
11271
|
+
hardNegatives: ["search.code", "chat.reply"]
|
|
11272
|
+
},
|
|
11273
|
+
{
|
|
11274
|
+
id: "search_002",
|
|
11275
|
+
category: "search",
|
|
11276
|
+
prompt: "Search the codebase for every call site of `runProposeReview`.",
|
|
11277
|
+
route: "search.code",
|
|
11278
|
+
synonyms: ["code.search", "grep"],
|
|
11279
|
+
hardNegatives: ["search.web", "fs.read"]
|
|
11280
|
+
},
|
|
11281
|
+
{
|
|
11282
|
+
id: "search_003",
|
|
11283
|
+
category: "search",
|
|
11284
|
+
prompt: "What is the latest release of the Tangle network on GitHub?",
|
|
11285
|
+
route: "search.web",
|
|
11286
|
+
synonyms: ["web.search", "github.releases"],
|
|
11287
|
+
hardNegatives: ["search.code", "chat.reply"]
|
|
11288
|
+
},
|
|
11289
|
+
{
|
|
11290
|
+
id: "search_004",
|
|
11291
|
+
category: "search",
|
|
11292
|
+
prompt: "Find all TODO comments in the agent-eval src tree.",
|
|
11293
|
+
route: "search.code",
|
|
11294
|
+
synonyms: ["code.search", "grep"],
|
|
11295
|
+
hardNegatives: ["search.web", "fs.list"]
|
|
11296
|
+
},
|
|
11297
|
+
{
|
|
11298
|
+
id: "chat_001",
|
|
11299
|
+
category: "chat",
|
|
11300
|
+
prompt: "Hi there, how are you doing today?",
|
|
11301
|
+
route: "chat.reply",
|
|
11302
|
+
synonyms: ["conversation.reply"],
|
|
11303
|
+
hardNegatives: ["search.web", "fs.read"]
|
|
11304
|
+
},
|
|
11305
|
+
{
|
|
11306
|
+
id: "chat_002",
|
|
11307
|
+
category: "chat",
|
|
11308
|
+
prompt: "Please explain the difference between an LLM and a foundation model.",
|
|
11309
|
+
route: "chat.reply",
|
|
11310
|
+
synonyms: ["conversation.reply", "qa.answer"],
|
|
11311
|
+
hardNegatives: ["search.web", "math.algebra"]
|
|
11312
|
+
},
|
|
11313
|
+
{
|
|
11314
|
+
id: "chat_003",
|
|
11315
|
+
category: "chat",
|
|
11316
|
+
prompt: "Tell me a short joke about distributed systems.",
|
|
11317
|
+
route: "chat.reply",
|
|
11318
|
+
synonyms: ["conversation.reply"],
|
|
11319
|
+
hardNegatives: ["search.web", "fs.read"]
|
|
11320
|
+
},
|
|
11321
|
+
{
|
|
11322
|
+
id: "chat_004",
|
|
11323
|
+
category: "chat",
|
|
11324
|
+
prompt: "Acknowledge my last message with a thumbs up.",
|
|
11325
|
+
route: "chat.reply",
|
|
11326
|
+
synonyms: ["conversation.reply", "react"],
|
|
11327
|
+
hardNegatives: ["fs.write", "search.web"]
|
|
11328
|
+
}
|
|
11329
|
+
];
|
|
11330
|
+
|
|
11331
|
+
// src/benchmarks/routing/index.ts
|
|
11332
|
+
/**
 * Benchmark adapter for the built-in routing dataset: a response passes when
 * it names the item's route or one of its synonyms.
 */
var RoutingAdapter = class {
  /**
   * @param {string} split - split name to keep
   * @returns {Promise<Array<{id: string, payload: object}>>} dataset entries
   *   assigned to `split`, each wrapped as `{id, payload}`
   */
  async loadDataset(split) {
    const selected = [];
    for (const entry of ROUTING_DATASET) {
      const wrapped = { id: entry.id, payload: entry };
      if (assignSplitImpl3(wrapped.id) === split) selected.push(wrapped);
    }
    return selected;
  }

  /**
   * Score a response against an item, comparing case-insensitively.
   *
   * The score is 1 when any extracted token equals the route or a synonym.
   * A hard-negative hit is reported in `raw` but does not change the score.
   *
   * NOTE(review): candidates come from extractRouteTokens; if that only
   * yields dotted `name.name` tokens, undotted synonyms such as "write_file"
   * or "grep" can never match - confirm whether that is intended.
   *
   * @param {{payload: {route: string, synonyms: string[], hardNegatives: string[], category: string}}} item
   * @param {string} response - model output
   * @returns {Promise<{score: number, raw: object}>}
   */
  async evaluate(item, response) {
    const candidates = extractRouteTokens(response);
    const lower = (s) => s.toLowerCase();
    const accepted = new Set([item.payload.route, ...item.payload.synonyms].map((s) => lower(s)));
    const rejected = new Set(item.payload.hardNegatives.map((s) => lower(s)));

    let matchedRoute = null;
    let hardNegativeRoute = null;
    for (const token of candidates) {
      const key = lower(token);
      if (matchedRoute === null && accepted.has(key)) matchedRoute = token;
      if (hardNegativeRoute === null && rejected.has(key)) hardNegativeRoute = token;
    }

    return {
      score: matchedRoute ? 1 : 0,
      raw: {
        firstToken: candidates[0] ?? null,
        matchedRoute,
        hitHardNegative: Boolean(hardNegativeRoute),
        hardNegativeRoute,
        category: item.payload.category
      }
    };
  }

  /**
   * @param {string} itemId - item identifier
   * @returns {string} split name for the item
   */
  assignSplit(itemId) {
    return assignSplitImpl3(itemId);
  }
};
|
|
11358
|
+
/**
 * Split assignment for a routing item; the id is namespaced with "routing::"
 * before being handed to deterministicSplit.
 *
 * @param {string} itemId - item identifier
 * @returns {string} split name
 */
function assignSplitImpl3(itemId) {
  const namespacedId = "routing::".concat(itemId);
  return deterministicSplit(namespacedId);
}
|
|
11361
|
+
/**
 * Pull every dotted `name.name` token out of a response, in order of
 * appearance. Each side must start with a letter and may continue with
 * letters, digits or underscores; matching is case-insensitive.
 *
 * @param {string} response - model output
 * @returns {string[]} matched tokens (empty when there are none)
 */
function extractRouteTokens(response) {
  const dottedToken = /[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi;
  const found = response.match(dottedToken);
  if (found === null) return [];
  return found;
}
|
|
11365
|
+
var adapter3 = new RoutingAdapter();
|
|
11366
|
+
var loadDataset3 = adapter3.loadDataset.bind(adapter3);
|
|
11367
|
+
var evaluate3 = adapter3.evaluate.bind(adapter3);
|
|
11368
|
+
var assignSplit3 = adapter3.assignSplit.bind(adapter3);
|
|
11369
|
+
|
|
10069
11370
|
// src/reference-replay-steering.ts
|
|
10070
11371
|
function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
10071
11372
|
const rows = [];
|
|
@@ -10257,9 +11558,9 @@ function aggregateTrials(population, scenarioIds, trials) {
|
|
|
10257
11558
|
return {
|
|
10258
11559
|
variantId: variant.id,
|
|
10259
11560
|
scenarioId: sid,
|
|
10260
|
-
meanScore:
|
|
10261
|
-
meanCost:
|
|
10262
|
-
meanDurationMs:
|
|
11561
|
+
meanScore: mean7(gradedTrials.map((t) => t.score)),
|
|
11562
|
+
meanCost: mean7(gradedTrials.map((t) => t.cost ?? 0)),
|
|
11563
|
+
meanDurationMs: mean7(gradedTrials.map((t) => t.durationMs ?? 0)),
|
|
10263
11564
|
okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
|
|
10264
11565
|
trials: scenarioTrials.length,
|
|
10265
11566
|
metrics
|
|
@@ -10267,10 +11568,10 @@ function aggregateTrials(population, scenarioIds, trials) {
|
|
|
10267
11568
|
});
|
|
10268
11569
|
return {
|
|
10269
11570
|
variantId: variant.id,
|
|
10270
|
-
meanScore:
|
|
10271
|
-
meanCost:
|
|
10272
|
-
meanDurationMs:
|
|
10273
|
-
okRate:
|
|
11571
|
+
meanScore: mean7(scenarios.map((s) => s.meanScore)),
|
|
11572
|
+
meanCost: mean7(scenarios.map((s) => s.meanCost)),
|
|
11573
|
+
meanDurationMs: mean7(scenarios.map((s) => s.meanDurationMs)),
|
|
11574
|
+
okRate: mean7(scenarios.map((s) => s.okRate)),
|
|
10274
11575
|
scenarios,
|
|
10275
11576
|
metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
|
|
10276
11577
|
};
|
|
@@ -10287,10 +11588,10 @@ function aggregateMetrics(rows) {
|
|
|
10287
11588
|
}
|
|
10288
11589
|
}
|
|
10289
11590
|
const out = {};
|
|
10290
|
-
for (const [k, list] of buckets) out[k] =
|
|
11591
|
+
for (const [k, list] of buckets) out[k] = mean7(list);
|
|
10291
11592
|
return out;
|
|
10292
11593
|
}
|
|
10293
|
-
function
|
|
11594
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs - values to average
 * @returns {number} the mean, or 0 for an empty array
 */
function mean7(xs) {
  if (xs.length === 0) return 0;
  let sum = 0;
  xs.forEach((value) => {
    sum += value;
  });
  return sum / xs.length;
}
|
|
@@ -10331,11 +11632,11 @@ function samePopulation(a, b) {
|
|
|
10331
11632
|
}
|
|
10332
11633
|
|
|
10333
11634
|
// src/jsonl-trial-cache.ts
|
|
10334
|
-
import { appendFileSync as appendFileSync4, existsSync as
|
|
11635
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync8, mkdirSync as mkdirSync4, readFileSync as readFileSync7 } from "fs";
|
|
10335
11636
|
import { dirname as dirname4 } from "path";
|
|
10336
11637
|
|
|
10337
11638
|
// src/locked-jsonl-appender.ts
|
|
10338
|
-
import { appendFileSync as appendFileSync3, existsSync as
|
|
11639
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync7, mkdirSync as mkdirSync3 } from "fs";
|
|
10339
11640
|
import { dirname as dirname3 } from "path";
|
|
10340
11641
|
var mutexes = /* @__PURE__ */ new Map();
|
|
10341
11642
|
function getMutex(path) {
|
|
@@ -10350,7 +11651,7 @@ var LockedJsonlAppender = class {
|
|
|
10350
11651
|
constructor(path) {
|
|
10351
11652
|
this.path = path;
|
|
10352
11653
|
this.mutex = getMutex(path);
|
|
10353
|
-
if (!
|
|
11654
|
+
if (!existsSync7(dirname3(path))) {
|
|
10354
11655
|
mkdirSync3(dirname3(path), { recursive: true });
|
|
10355
11656
|
}
|
|
10356
11657
|
}
|
|
@@ -10375,8 +11676,8 @@ var JsonlTrialCache = class {
|
|
|
10375
11676
|
appender;
|
|
10376
11677
|
constructor(path) {
|
|
10377
11678
|
this.path = path;
|
|
10378
|
-
if (
|
|
10379
|
-
for (const line of
|
|
11679
|
+
if (existsSync8(path)) {
|
|
11680
|
+
for (const line of readFileSync7(path, "utf-8").split("\n")) {
|
|
10380
11681
|
if (!line.trim()) continue;
|
|
10381
11682
|
try {
|
|
10382
11683
|
const entry = JSON.parse(line);
|
|
@@ -10414,7 +11715,7 @@ var JsonlTrialCache = class {
|
|
|
10414
11715
|
};
|
|
10415
11716
|
|
|
10416
11717
|
// src/evolution-telemetry.ts
|
|
10417
|
-
import { appendFileSync as appendFileSync5, existsSync as
|
|
11718
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync9, mkdirSync as mkdirSync5, readFileSync as readFileSync8, writeFileSync } from "fs";
|
|
10418
11719
|
import { dirname as dirname5 } from "path";
|
|
10419
11720
|
var MutationTelemetry = class {
|
|
10420
11721
|
appender;
|
|
@@ -10445,16 +11746,16 @@ var LineageRecorder = class {
|
|
|
10445
11746
|
this.snapshotPath = `${path}.snapshot`;
|
|
10446
11747
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
10447
11748
|
mkdirSync5(dirname5(path), { recursive: true });
|
|
10448
|
-
if (
|
|
11749
|
+
if (existsSync9(this.snapshotPath)) {
|
|
10449
11750
|
try {
|
|
10450
|
-
const parsed = JSON.parse(
|
|
11751
|
+
const parsed = JSON.parse(readFileSync8(this.snapshotPath, "utf-8"));
|
|
10451
11752
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
10452
11753
|
} catch {
|
|
10453
11754
|
}
|
|
10454
11755
|
}
|
|
10455
|
-
if (
|
|
11756
|
+
if (existsSync9(path)) {
|
|
10456
11757
|
try {
|
|
10457
|
-
for (const line of
|
|
11758
|
+
for (const line of readFileSync8(path, "utf-8").split("\n")) {
|
|
10458
11759
|
if (!line.trim()) continue;
|
|
10459
11760
|
try {
|
|
10460
11761
|
const entry = JSON.parse(line);
|
|
@@ -10466,9 +11767,9 @@ var LineageRecorder = class {
|
|
|
10466
11767
|
} catch {
|
|
10467
11768
|
}
|
|
10468
11769
|
}
|
|
10469
|
-
if (
|
|
11770
|
+
if (existsSync9(path) && this.nodes.size === 0) {
|
|
10470
11771
|
try {
|
|
10471
|
-
const raw =
|
|
11772
|
+
const raw = readFileSync8(path, "utf-8").trim();
|
|
10472
11773
|
if (raw.startsWith("[")) {
|
|
10473
11774
|
const parsed = JSON.parse(raw);
|
|
10474
11775
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -10482,8 +11783,8 @@ var LineageRecorder = class {
|
|
|
10482
11783
|
const prev = this.nodes.get(node.id);
|
|
10483
11784
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
10484
11785
|
try {
|
|
10485
|
-
if (
|
|
10486
|
-
const head =
|
|
11786
|
+
if (existsSync9(this.path)) {
|
|
11787
|
+
const head = readFileSync8(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
10487
11788
|
if (head === "[") {
|
|
10488
11789
|
writeFileSync(this.path, "");
|
|
10489
11790
|
}
|
|
@@ -10549,9 +11850,9 @@ var CostLedger = class {
|
|
|
10549
11850
|
mutex = new Mutex();
|
|
10550
11851
|
constructor(path) {
|
|
10551
11852
|
this.path = path;
|
|
10552
|
-
if (
|
|
11853
|
+
if (existsSync9(path)) {
|
|
10553
11854
|
try {
|
|
10554
|
-
const loaded = JSON.parse(
|
|
11855
|
+
const loaded = JSON.parse(readFileSync8(path, "utf-8"));
|
|
10555
11856
|
for (const k of Object.keys(this.totals)) {
|
|
10556
11857
|
if (k === "byGeneration") {
|
|
10557
11858
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -10975,9 +12276,9 @@ function passOrthogonality(input) {
|
|
|
10975
12276
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10976
12277
|
}
|
|
10977
12278
|
}
|
|
10978
|
-
const
|
|
12279
|
+
const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10979
12280
|
return {
|
|
10980
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
12281
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
|
|
10981
12282
|
passCount: passes.length,
|
|
10982
12283
|
similarities: sims
|
|
10983
12284
|
};
|
|
@@ -11023,8 +12324,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11023
12324
|
const iterations = options.iterations ?? 1e3;
|
|
11024
12325
|
const minTotal = options.minTotalSamples ?? 6;
|
|
11025
12326
|
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
11026
|
-
const baselineMean =
|
|
11027
|
-
const candidateMean =
|
|
12327
|
+
const baselineMean = mean8(baseline);
|
|
12328
|
+
const candidateMean = mean8(candidate);
|
|
11028
12329
|
const delta = candidateMean - baselineMean;
|
|
11029
12330
|
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
11030
12331
|
return {
|
|
@@ -11042,7 +12343,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11042
12343
|
for (let i = 0; i < iterations; i++) {
|
|
11043
12344
|
const bResample = resample(baseline, rng);
|
|
11044
12345
|
const cResample = resample(candidate, rng);
|
|
11045
|
-
deltas[i] =
|
|
12346
|
+
deltas[i] = mean8(cResample) - mean8(bResample);
|
|
11046
12347
|
}
|
|
11047
12348
|
deltas.sort((a, b) => a - b);
|
|
11048
12349
|
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
@@ -11065,7 +12366,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
11065
12366
|
verdict
|
|
11066
12367
|
};
|
|
11067
12368
|
}
|
|
11068
|
-
function
|
|
12369
|
+
function mean8(xs) {
|
|
11069
12370
|
if (xs.length === 0) return 0;
|
|
11070
12371
|
let s = 0;
|
|
11071
12372
|
for (const x of xs) s += x;
|
|
@@ -11260,6 +12561,7 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
11260
12561
|
export {
|
|
11261
12562
|
AgentDriver,
|
|
11262
12563
|
AxGepaSteeringOptimizer,
|
|
12564
|
+
BENCHMARK_SPLIT_SEED,
|
|
11263
12565
|
BenchmarkRunner,
|
|
11264
12566
|
BudgetBreachError,
|
|
11265
12567
|
BudgetGuard,
|
|
@@ -11288,6 +12590,7 @@ export {
|
|
|
11288
12590
|
FileSystemExperimentStore,
|
|
11289
12591
|
FileSystemOutcomeStore,
|
|
11290
12592
|
FileSystemTraceStore,
|
|
12593
|
+
HeldOutGate,
|
|
11291
12594
|
HoldoutAuditor,
|
|
11292
12595
|
HoldoutLockedError,
|
|
11293
12596
|
INTENT_MATCH_JUDGE_VERSION,
|
|
@@ -11307,6 +12610,7 @@ export {
|
|
|
11307
12610
|
MultiLayerVerifier,
|
|
11308
12611
|
MutationTelemetry,
|
|
11309
12612
|
Mutex,
|
|
12613
|
+
NoopResearcher,
|
|
11310
12614
|
OTEL_AGENT_EVAL_SCOPE,
|
|
11311
12615
|
OptimizationLoop,
|
|
11312
12616
|
PairwiseSteeringOptimizer,
|
|
@@ -11317,6 +12621,7 @@ export {
|
|
|
11317
12621
|
PromptRegistry,
|
|
11318
12622
|
REDACTION_VERSION,
|
|
11319
12623
|
RunCritic,
|
|
12624
|
+
RunRecordValidationError,
|
|
11320
12625
|
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
11321
12626
|
SandboxHarness,
|
|
11322
12627
|
ScenarioRegistry,
|
|
@@ -11333,7 +12638,10 @@ export {
|
|
|
11333
12638
|
analyzeSeries,
|
|
11334
12639
|
argHash,
|
|
11335
12640
|
attributeCounterfactuals,
|
|
12641
|
+
deterministicSplit as benchmarkDeterministicSplit,
|
|
12642
|
+
benchmarks_exports as benchmarks,
|
|
11336
12643
|
benjaminiHochberg,
|
|
12644
|
+
bhAdjust,
|
|
11337
12645
|
bisect,
|
|
11338
12646
|
bonferroni,
|
|
11339
12647
|
bootstrapCi,
|
|
@@ -11413,6 +12721,7 @@ export {
|
|
|
11413
12721
|
formatBenchmarkReport,
|
|
11414
12722
|
formatDriverReport,
|
|
11415
12723
|
formatFindings,
|
|
12724
|
+
gainHistogram,
|
|
11416
12725
|
precision as goldenPrecision,
|
|
11417
12726
|
gradeSemanticStatus,
|
|
11418
12727
|
groupBy,
|
|
@@ -11427,6 +12736,7 @@ export {
|
|
|
11427
12736
|
isLlmSpan,
|
|
11428
12737
|
isPrmVerdict,
|
|
11429
12738
|
isRetrievalSpan,
|
|
12739
|
+
isRunRecord,
|
|
11430
12740
|
isSandboxSpan,
|
|
11431
12741
|
isToolSpan,
|
|
11432
12742
|
jestTestParser,
|
|
@@ -11454,11 +12764,15 @@ export {
|
|
|
11454
12764
|
normalizeScores,
|
|
11455
12765
|
notBlocked,
|
|
11456
12766
|
outputLengthRubric,
|
|
12767
|
+
pairedBootstrap,
|
|
11457
12768
|
pairedTTest,
|
|
12769
|
+
pairedWilcoxon,
|
|
11458
12770
|
paraphraseRobustness,
|
|
12771
|
+
paretoChart,
|
|
11459
12772
|
paretoFrontier,
|
|
11460
12773
|
paretoFrontierWithCrowding,
|
|
11461
12774
|
parseReflectionResponse,
|
|
12775
|
+
parseRunRecordSafe,
|
|
11462
12776
|
partialCredit,
|
|
11463
12777
|
passOrthogonality,
|
|
11464
12778
|
pixelDeltaRatio,
|
|
@@ -11489,9 +12803,11 @@ export {
|
|
|
11489
12803
|
requiredSampleSize,
|
|
11490
12804
|
resetLockedAppendersForTesting,
|
|
11491
12805
|
resumeBuilderSession,
|
|
12806
|
+
roundTripRunRecord,
|
|
11492
12807
|
rowCount,
|
|
11493
12808
|
rowWhere,
|
|
11494
12809
|
runAssertions,
|
|
12810
|
+
runCanaries,
|
|
11495
12811
|
runCounterfactual,
|
|
11496
12812
|
runE2EWorkflow,
|
|
11497
12813
|
runExpectations,
|
|
@@ -11526,6 +12842,7 @@ export {
|
|
|
11526
12842
|
stuckLoopView,
|
|
11527
12843
|
summarize,
|
|
11528
12844
|
summarizeHarnessResults,
|
|
12845
|
+
summaryTable,
|
|
11529
12846
|
testJudge,
|
|
11530
12847
|
textInSnapshot,
|
|
11531
12848
|
toLangfuseEnvelope,
|
|
@@ -11539,6 +12856,7 @@ export {
|
|
|
11539
12856
|
toolWasteView,
|
|
11540
12857
|
typoMutator,
|
|
11541
12858
|
urlContains,
|
|
12859
|
+
validateRunRecord,
|
|
11542
12860
|
verbosityBias,
|
|
11543
12861
|
verifyManifest,
|
|
11544
12862
|
visualDiff,
|
|
@@ -11548,6 +12866,7 @@ export {
|
|
|
11548
12866
|
weightedRecall,
|
|
11549
12867
|
welchsTTest,
|
|
11550
12868
|
whitespaceCollapseMutator,
|
|
11551
|
-
wilcoxonSignedRank
|
|
12869
|
+
wilcoxonSignedRank,
|
|
12870
|
+
wranglerDeployRunner
|
|
11552
12871
|
};
|
|
11553
12872
|
//# sourceMappingURL=index.js.map
|