@tangle-network/agent-eval 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -38
- package/dist/index.d.ts +255 -32
- package/dist/index.js +410 -67
- package/dist/index.js.map +1 -1
- package/package.json +10 -9
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean4,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean4 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean4 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean4) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -2135,12 +2135,14 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
|
|
|
2135
2135
|
toolUseQuality: 1,
|
|
2136
2136
|
patchQuality: 1.25,
|
|
2137
2137
|
testReality: 1.5,
|
|
2138
|
+
finalGate: 3,
|
|
2139
|
+
reviewerBlockers: -2,
|
|
2138
2140
|
costUsd: -0.2,
|
|
2139
2141
|
wallSeconds: -0.1
|
|
2140
2142
|
};
|
|
2141
2143
|
/**
 * Collapse a per-dimension run score into a single weighted scalar.
 *
 * Quality dimensions are passed through `clamp01` before weighting. Cost is
 * weighted per USD and wall time per minute (`wallSeconds / 60`), both floored
 * at zero.
 *
 * A missing or non-finite `costUsd` / `wallSeconds` contributes 0 instead of
 * turning the whole aggregate into NaN, mirroring how `clamp01` treats
 * non-finite quality values.
 *
 * @param {object} score - Run score with the dimensions listed below.
 * @param {object} [weights] - Per-dimension overrides merged over
 *   `DEFAULT_RUN_SCORE_WEIGHTS`.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
  const clampedDimensions = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers"
  ];
  let total = 0;
  for (const key of clampedDimensions) {
    total += w[key] * clamp01(score[key]);
  }
  // Resource penalties: guard against undefined/NaN/Infinity so one missing
  // measurement cannot poison the aggregate.
  const costUsd = Number.isFinite(score.costUsd) ? Math.max(0, score.costUsd) : 0;
  const wallMinutes = Number.isFinite(score.wallSeconds) ? Math.max(0, score.wallSeconds / 60) : 0;
  total += w.costUsd * costUsd;
  total += w.wallSeconds * wallMinutes;
  return total;
}
|
|
2145
2147
|
function clamp01(value) {
|
|
2146
2148
|
if (!Number.isFinite(value)) return 0;
|
|
@@ -2180,6 +2182,9 @@ var RunCritic = class {
|
|
|
2180
2182
|
const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
|
|
2181
2183
|
const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
|
|
2182
2184
|
const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
|
|
2185
|
+
const finalGateSpans = judgeSpans2.filter(
|
|
2186
|
+
(span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
|
|
2187
|
+
);
|
|
2183
2188
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
2184
2189
|
if (!success) notes.push("run did not complete with pass=true");
|
|
2185
2190
|
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
@@ -2194,6 +2199,15 @@ var RunCritic = class {
|
|
|
2194
2199
|
const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
|
|
2195
2200
|
const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
|
|
2196
2201
|
if (!testReality) notes.push("no real test/build evidence recorded");
|
|
2202
|
+
const blockerSpans = judgeSpans2.filter(
|
|
2203
|
+
(span) => isBlockingJudge(span)
|
|
2204
|
+
);
|
|
2205
|
+
const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
|
|
2206
|
+
const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
|
|
2207
|
+
if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
|
|
2208
|
+
else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
|
|
2209
|
+
const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
|
|
2210
|
+
if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
|
|
2197
2211
|
const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
|
|
2198
2212
|
const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
|
|
2199
2213
|
const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
|
|
@@ -2209,6 +2223,8 @@ var RunCritic = class {
|
|
|
2209
2223
|
toolUseQuality,
|
|
2210
2224
|
patchQuality,
|
|
2211
2225
|
testReality,
|
|
2226
|
+
finalGate,
|
|
2227
|
+
reviewerBlockers,
|
|
2212
2228
|
costUsd,
|
|
2213
2229
|
wallSeconds,
|
|
2214
2230
|
notes
|
|
@@ -2227,6 +2243,12 @@ function normalizeJudgeScore(score) {
|
|
|
2227
2243
|
function looksRepoGrounded(text) {
|
|
2228
2244
|
return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
|
|
2229
2245
|
}
|
|
2246
|
+
/**
 * Decide whether a judge span should be treated as a blocking reviewer signal.
 *
 * A span blocks when any of the following holds:
 *  - `attributes.blocking === true`
 *  - `attributes.verdict === "BLOCKING"`
 *  - `attributes.blockingFindings` or `attributes.highFindings` is a positive number
 *  - `score` is a number at or below 2
 *
 * The score check requires an actual number: a `null` or otherwise missing
 * score no longer coerces to 0 and counts as blocking.
 *
 * NOTE(review): the `<= 2` threshold presumes scores on a scale where 2 is
 * low (e.g. 0-10) — confirm against how judge spans record `score`.
 *
 * @param {object} span - Judge span with optional `attributes` and `score`.
 * @returns {boolean} True when the span carries a blocking signal.
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  return typeof span.score === "number" && span.score <= 2;
}
|
|
2249
|
+
/**
 * Report whether `value` is a number strictly greater than zero.
 * Non-number inputs (including numeric strings) and NaN yield false.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True only for numbers > 0.
 */
function positiveNumber(value) {
  if (typeof value !== "number") return false;
  return value > 0;
}
|
|
2230
2252
|
|
|
2231
2253
|
// src/playbook.ts
|
|
2232
2254
|
function distillPlaybook(entries, options = {}) {
|
|
@@ -2430,6 +2452,144 @@ function createAxService(aiFactory, provider, apiKey, model) {
|
|
|
2430
2452
|
});
|
|
2431
2453
|
}
|
|
2432
2454
|
|
|
2455
|
+
// src/pareto.ts
|
|
2456
|
+
/**
 * Pareto dominance check: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * Each objective supplies `value(candidate)` and a `direction`; any direction
 * other than "maximize" is treated as minimize. If either side yields a
 * non-finite value for any objective, the result is false.
 *
 * @param {*} a - Candidate tested as the dominator.
 * @param {*} b - Candidate tested as the dominated one.
 * @param {Array<{direction: string, value: Function}>} objectives
 * @returns {boolean}
 */
function dominates(a, b, objectives) {
  let hasStrictWin = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) return false;
    if (left === right) continue; // tie on this objective: neither better nor worse
    const leftWins = objective.direction === "maximize" ? left > right : left < right;
    if (!leftWins) return false;
    hasStrictWin = true;
  }
  return hasStrictWin;
}
|
|
2469
|
+
/**
 * Partition candidates into the Pareto frontier and the dominated set.
 *
 * Candidates with a non-finite value on any objective are dropped before
 * comparison and appear in neither output list. `dominanceMap` lists, for each
 * frontier member, the dominated candidates it dominates.
 *
 * @param {Array} candidates - Items to compare.
 * @param {Array<{direction: string, value: Function}>} objectives - At least one.
 * @returns {{frontier: Array, dominated: Array, dominanceMap: Array<{dominator: *, dominated: Array}>}}
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const comparable = candidates.filter((candidate) => {
    for (const objective of objectives) {
      if (!Number.isFinite(objective.value(candidate))) return false;
    }
    return true;
  });
  const frontier = [];
  const dominated = [];
  comparable.forEach((candidate) => {
    const beaten = comparable.some(
      (rival) => rival !== candidate && dominates(rival, candidate, objectives)
    );
    (beaten ? dominated : frontier).push(candidate);
  });
  const dominanceMap = frontier.map((dominator) => {
    const losers = dominated.filter((loser) => dominates(dominator, loser, objectives));
    return { dominator, dominated: losers };
  });
  return { frontier, dominated, dominanceMap };
}
|
|
2489
|
+
|
|
2490
|
+
// src/harness-optimizer.ts
|
|
2491
|
+
// Default objectives for comparing harness variant reports: maximize the mean
// aggregate score and pass rate, minimize mean cost (USD) and mean wall time.
var DEFAULT_HARNESS_OBJECTIVES = [
  {
    name: "aggregate",
    direction: "maximize",
    value: (report) => report.aggregateMean
  },
  {
    name: "pass_rate",
    direction: "maximize",
    value: (report) => report.passRate
  },
  {
    name: "cost",
    direction: "minimize",
    value: (report) => report.costUsdMean
  },
  {
    name: "wall",
    direction: "minimize",
    value: (report) => report.wallSecondsMean
  }
];
|
|
2497
|
+
/**
 * Run every (variant, scenario, trial) job and pick a winning variant.
 *
 * Jobs come from `buildJobs(config)`. Each job is executed through
 * `config.adapter.run`, scored with `config.score` (or, when absent, a
 * `RunCritic` built from `config.weights`), and reduced to a scalar with
 * `aggregateRunScore`. `config.onResult`, if provided, is awaited per result.
 * Concurrency is `config.parallelism`, defaulting to 1.
 *
 * @param {object} config - Experiment configuration (variants, scenarios,
 *   adapter, and optional score/weights/parallelism/onResult/objectives).
 * @returns {Promise<{results: Array, selection: object}>} All per-job results
 *   plus the output of `selectHarnessVariant`.
 */
async function runHarnessExperiment(config) {
  const jobs = buildJobs(config);
  const critic = new RunCritic({ weights: config.weights });
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));

  const executeJob = async (request) => {
    const trace = await config.adapter.run(request);
    const runScore = await score(trace, request);
    const { variant, scenario, trialIndex } = request;
    const result = {
      variant,
      scenario,
      trialIndex,
      trace,
      score: runScore,
      aggregate: aggregateRunScore(runScore, config.weights)
    };
    await config.onResult?.(result);
    return result;
  };

  const results = await mapLimit(jobs, config.parallelism ?? 1, executeJob);
  const selection = selectHarnessVariant(results, config.objectives);
  return { results, selection };
}
|
|
2517
|
+
/**
 * Choose the winning harness variant from raw per-run results.
 *
 * Results are summarized per variant, the Pareto frontier over `objectives`
 * is computed, and the frontier member with the highest `aggregateMean` wins.
 * If the frontier is empty, all reports are considered instead.
 *
 * @param {Array} results - Per-run harness results.
 * @param {Array} [objectives=DEFAULT_HARNESS_OBJECTIVES] - Pareto objectives.
 * @returns {{winner: object, frontier: object, reports: Array}}
 * @throws {Error} When there are no results or no winner can be picked.
 */
function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
  const reports = summarizeHarnessResults(results);
  if (reports.length === 0) {
    throw new Error("selectHarnessVariant: no results");
  }
  const frontier = paretoFrontier(reports, objectives);
  const pool = frontier.frontier.length ? frontier.frontier : reports;
  // Sort a copy so neither the frontier nor the reports array is reordered.
  const ranked = pool.slice().sort((left, right) => right.aggregateMean - left.aggregateMean);
  const winner = ranked[0];
  if (!winner) {
    throw new Error("selectHarnessVariant: no winner");
  }
  return { winner, frontier, reports };
}
|
|
2526
|
+
/**
 * Group per-run results by `variant.id` and compute one report per variant.
 *
 * Each report carries the variant, its runs, the mean aggregate, the mean of
 * `score.success` (exposed as `passRate`), mean cost and wall time, and the
 * per-dimension mean score. Reports are sorted by `aggregateMean`, highest
 * first.
 *
 * Runs are appended to their bucket in place rather than re-spreading the
 * bucket for every result, which made grouping quadratic in the run count.
 *
 * @param {Array} results - Per-run results with `variant`, `aggregate`, `score`.
 * @returns {Array<object>} One report per distinct variant id.
 * @throws {Error} If a variant bucket is unexpectedly empty.
 */
function summarizeHarnessResults(results) {
  const byVariant = /* @__PURE__ */ new Map();
  for (const result of results) {
    const key = result.variant.id;
    const bucket = byVariant.get(key);
    if (bucket) bucket.push(result);
    else byVariant.set(key, [result]);
  }
  return [...byVariant.values()].map((runs) => {
    const variant = runs[0]?.variant;
    if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
    return {
      variant,
      runs,
      aggregateMean: mean(runs.map((r) => r.aggregate)),
      passRate: mean(runs.map((r) => r.score.success)),
      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
      scoreMean: meanRunScore(runs.map((r) => r.score))
    };
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
|
|
2545
|
+
/**
 * Expand an experiment config into the full list of jobs: one per
 * (variant, scenario, trialIndex), ordered variant-major.
 *
 * `trialsPerScenario` is floored and clamped to at least 1. A NaN value
 * (e.g. from a failed numeric parse) now falls back to 1 instead of silently
 * producing zero jobs, and an infinite value is rejected instead of looping
 * without bound.
 *
 * @param {object} config - Must provide non-empty `variants` and `scenarios`;
 *   `trialsPerScenario` is optional.
 * @returns {Array<{variant: *, scenario: *, trialIndex: number}>}
 * @throws {Error} When `variants` or `scenarios` is empty.
 * @throws {RangeError} When `trialsPerScenario` is positive infinity.
 */
function buildJobs(config) {
  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
  const requested = Math.floor(config.trialsPerScenario ?? 1);
  if (requested === Infinity) {
    throw new RangeError("runHarnessExperiment: trialsPerScenario must be finite");
  }
  // Math.max(1, NaN) is NaN, which would make the trial loop run zero times.
  const trials = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const jobs = [];
  for (const variant of config.variants) {
    for (const scenario of config.scenarios) {
      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
        jobs.push({ variant, scenario, trialIndex });
      }
    }
  }
  return jobs;
}
|
|
2559
|
+
/**
 * Map `fn` over `items` with at most `limit` calls in flight, preserving
 * input order in the returned array.
 *
 * `limit` is floored and clamped to the range [1, items.length] (minimum 1).
 * A NaN or undefined limit now falls back to 1 worker; previously it produced
 * zero workers, so no item was processed and an unfilled array was returned.
 *
 * Items that are `undefined` are skipped and leave their result slot unset.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit - Maximum concurrent `fn` calls.
 * @param {(item: *) => Promise<*>} fn - Async mapper, called with one item.
 * @returns {Promise<Array>} Results positioned at the index of their input.
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length);
  let next = 0;
  const requested = Math.floor(limit);
  const cap = Number.isNaN(requested) ? 1 : requested;
  const workerCount = Math.max(1, Math.min(cap, items.length));
  // Each worker pulls the next unclaimed index until the list is exhausted.
  const worker = async () => {
    while (next < items.length) {
      const index = next++;
      const item = items[index];
      if (item === void 0) continue;
      results[index] = await fn(item);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
2573
|
+
/**
 * Arithmetic mean of a list of numbers; an empty list yields 0.
 *
 * @param {number[]} values
 * @returns {number}
 */
function mean(values) {
  if (!values.length) return 0;
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / values.length;
}
|
|
2576
|
+
/**
 * Average a list of run scores dimension by dimension.
 *
 * Every numeric dimension is averaged with `mean`; `notes` from all scores are
 * concatenated in order (scores without notes contribute nothing).
 *
 * @param {Array<object>} scores - Run scores to combine.
 * @returns {object} A run-score-shaped object of per-dimension means plus notes.
 */
function meanRunScore(scores) {
  const numericDimensions = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const dimension of numericDimensions) {
    averaged[dimension] = mean(scores.map((s) => s[dimension]));
  }
  averaged.notes = scores.flatMap((s) => s.notes ?? []);
  return averaged;
}
|
|
2592
|
+
|
|
2433
2593
|
// src/trace/store.ts
|
|
2434
2594
|
var InMemoryTraceStore = class {
|
|
2435
2595
|
runs = /* @__PURE__ */ new Map();
|
|
@@ -2875,14 +3035,22 @@ function composeParsers(...parsers) {
|
|
|
2875
3035
|
}
|
|
2876
3036
|
var SubprocessSandboxDriver = class {
|
|
2877
3037
|
id = "subprocess";
|
|
3038
|
+
defaultCwd;
|
|
3039
|
+
defaultEnv;
|
|
3040
|
+
constructor(options = {}) {
|
|
3041
|
+
this.defaultCwd = options.cwd;
|
|
3042
|
+
this.defaultEnv = options.env;
|
|
3043
|
+
}
|
|
2878
3044
|
async exec(phase, command, config) {
|
|
2879
3045
|
const { spawn } = await import("child_process");
|
|
2880
3046
|
const start = Date.now();
|
|
3047
|
+
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3048
|
+
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
2881
3049
|
return await new Promise((resolve) => {
|
|
2882
3050
|
const child = spawn(command, {
|
|
2883
3051
|
shell: true,
|
|
2884
|
-
cwd:
|
|
2885
|
-
env:
|
|
3052
|
+
cwd: effectiveCwd,
|
|
3053
|
+
env: effectiveEnv
|
|
2886
3054
|
});
|
|
2887
3055
|
let stdout = "";
|
|
2888
3056
|
let stderr = "";
|
|
@@ -4308,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
|
|
|
4308
4476
|
if (s.baseline.length < 2 || s.candidate.length < 2) {
|
|
4309
4477
|
throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
|
|
4310
4478
|
}
|
|
4311
|
-
const bMean =
|
|
4312
|
-
const cMean =
|
|
4479
|
+
const bMean = mean2(s.baseline);
|
|
4480
|
+
const cMean = mean2(s.candidate);
|
|
4313
4481
|
const delta = cMean - bMean;
|
|
4314
4482
|
const d = cohensD(s.baseline, s.candidate);
|
|
4315
4483
|
const { t, df, p } = welchsTTest(s.baseline, s.candidate);
|
|
@@ -4348,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
|
|
|
4348
4516
|
hasUnstable: metrics.some((m) => m.verdict === "unstable")
|
|
4349
4517
|
};
|
|
4350
4518
|
}
|
|
4351
|
-
function
|
|
4519
|
+
/**
 * Arithmetic mean of `xs`. There is no empty-input guard: an empty array
 * yields NaN (0 / 0), so callers are expected to pass at least one sample.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean2(xs) {
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / xs.length;
}
|
|
4354
4522
|
function iqr(xs) {
|
|
@@ -4364,8 +4532,8 @@ function iqr(xs) {
|
|
|
4364
4532
|
}
|
|
4365
4533
|
function welchsTTest(a, b) {
|
|
4366
4534
|
if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
|
|
4367
|
-
const mA =
|
|
4368
|
-
const mB =
|
|
4535
|
+
const mA = mean2(a);
|
|
4536
|
+
const mB = mean2(b);
|
|
4369
4537
|
const vA = variance(a, mA);
|
|
4370
4538
|
const vB = variance(b, mB);
|
|
4371
4539
|
const seSquared = vA / a.length + vB / b.length;
|
|
@@ -4685,6 +4853,30 @@ var CostTracker = class {
|
|
|
4685
4853
|
if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
|
|
4686
4854
|
bucket.completed = completed;
|
|
4687
4855
|
}
|
|
4856
|
+
/**
|
|
4857
|
+
* Convenience: record + markOutcome in one call from a
|
|
4858
|
+
* `{ usage, verdict }`-shaped response (starter-foundry's
|
|
4859
|
+
* `invokeMetaJudge` returns this shape; consumers that wrap any
|
|
4860
|
+
* judge/critic can follow the same convention).
|
|
4861
|
+
*
|
|
4862
|
+
* `usage.model` must be present in `MODEL_PRICING` for cost math to
|
|
4863
|
+
* populate; otherwise totalCostUsd stays at 0 for the entry but
|
|
4864
|
+
* tokens still aggregate.
|
|
4865
|
+
*/
|
|
4866
|
+
recordVerdict(verdict, scenarioId, tags) {
|
|
4867
|
+
if (!verdict.usage) return null;
|
|
4868
|
+
const entry = this.record({
|
|
4869
|
+
scenarioId,
|
|
4870
|
+
model: verdict.usage.model,
|
|
4871
|
+
inputTokens: verdict.usage.inputTokens,
|
|
4872
|
+
outputTokens: verdict.usage.outputTokens,
|
|
4873
|
+
cachedTokens: verdict.usage.cachedTokens,
|
|
4874
|
+
reasoningTokens: verdict.usage.reasoningTokens,
|
|
4875
|
+
tags
|
|
4876
|
+
});
|
|
4877
|
+
this.markOutcome(scenarioId, verdict.verdict === "pass");
|
|
4878
|
+
return entry;
|
|
4879
|
+
}
|
|
4688
4880
|
get(scenarioId) {
|
|
4689
4881
|
return this.byScenario.get(scenarioId);
|
|
4690
4882
|
}
|
|
@@ -4721,39 +4913,177 @@ function assertNonNegative(n, name) {
|
|
|
4721
4913
|
}
|
|
4722
4914
|
}
|
|
4723
4915
|
|
|
4724
|
-
// src/
|
|
4725
|
-
|
|
4726
|
-
|
|
4727
|
-
|
|
4728
|
-
|
|
4729
|
-
const bv = obj.value(b);
|
|
4730
|
-
if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
|
|
4731
|
-
const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
|
|
4732
|
-
const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
|
|
4733
|
-
if (aIsWorse) return false;
|
|
4734
|
-
if (aIsBetter) strictlyBetter = true;
|
|
4735
|
-
}
|
|
4736
|
-
return strictlyBetter;
|
|
4916
|
+
// src/muffled-gate-scanner.ts
|
|
4917
|
+
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
4918
|
+
import { join } from "path";
|
|
4919
|
+
/**
 * Reduce a source line to its code portion for pattern scanning.
 *
 * Strips a trailing `//` comment and blanks lines that are block-comment
 * continuations (leading `*`).
 *
 * Two fixes over a plain regex strip:
 *  - `//` inside a quoted string (e.g. a URL in a command string) no longer
 *    truncates the line, which could hide a pattern that follows it.
 *  - A trailing `\r` (CRLF input split on "\n") is dropped first; with it in
 *    place the `.*$` patterns never matched, so comments were not stripped.
 *
 * If a quote is left unterminated on the line (regex literals containing
 * quotes, multi-line template strings), it falls back to cutting at the first
 * `//`, as before.
 *
 * @param {string} line - One line of source text.
 * @returns {string} The line without comment content (possibly empty).
 */
function codeOf(line) {
  const text = line.endsWith("\r") ? line.slice(0, -1) : line;
  let commentStart = -1;
  let quote = "";
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (quote) {
      if (ch === "\\") i++; // skip the escaped character
      else if (ch === quote) quote = "";
    } else if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
    } else if (ch === "/" && text[i + 1] === "/") {
      commentStart = i;
      break;
    }
  }
  let withoutLineComment;
  if (quote) {
    // Unbalanced quoting: quote tracking is unreliable, use the naive cut.
    withoutLineComment = text.replace(/\/\/.*$/, "");
  } else if (commentStart === -1) {
    withoutLineComment = text;
  } else {
    withoutLineComment = text.slice(0, commentStart);
  }
  return withoutLineComment.replace(/^\s*\*.*$/, "");
}
|
|
4738
|
-
function
|
|
4739
|
-
|
|
4740
|
-
|
|
4922
|
+
/**
 * Report whether a line carries the "muffle-ok:" suppression marker anywhere
 * in its text.
 *
 * @param {string} line - Raw source line.
 * @returns {boolean}
 */
function isMuffleOk(line) {
  const marker = "muffle-ok:";
  return line.indexOf(marker) !== -1;
}
|
|
4925
|
+
/**
 * Finder: flag lines whose code contains "|| true" together with one of
 * testCommand / setupCommand / cmd / command.
 *
 * Lines marked with "muffle-ok:" and lines with no code left after comment
 * stripping are skipped.
 *
 * @param {string} file - Path reported in each finding.
 * @param {string} text - Full file contents.
 * @returns {Array<{file: string, line: number, lineText: string, pattern: string}>}
 */
var findFallbackToPass = (file, text) => {
  const hits = [];
  text.split("\n").forEach((raw, index) => {
    if (isMuffleOk(raw)) return;
    const code = codeOf(raw);
    if (code.trim() === "") return;
    const hasFallback = /\|\| true/.test(code);
    if (hasFallback && /(testCommand|setupCommand|cmd|command)/.test(code)) {
      hits.push({
        file,
        line: index + 1,
        lineText: raw.trim(),
        pattern: "fallback-to-pass (|| true in command string)"
      });
    }
  });
  return hits;
};
|
|
4939
|
+
/**
 * Finder: flag lines that set `testCommand` to the literal string "true"
 * (single- or double-quoted).
 *
 * Lines marked with "muffle-ok:" and lines with no code left after comment
 * stripping are skipped.
 *
 * @param {string} file - Path reported in each finding.
 * @param {string} text - Full file contents.
 * @returns {Array<{file: string, line: number, lineText: string, pattern: string}>}
 */
var findLiteralTruePass = (file, text) => {
  const hits = [];
  for (const [index, raw] of text.split("\n").entries()) {
    if (isMuffleOk(raw)) continue;
    const code = codeOf(raw);
    if (code.trim().length === 0) continue;
    if (!/testCommand\s*:\s*['"]true['"]/.test(code)) continue;
    hits.push({
      file,
      line: index + 1,
      lineText: raw.trim(),
      pattern: 'literal-true-pass (testCommand: "true")'
    });
  }
  return hits;
};
|
|
4953
|
+
/**
 * Finder: flag `new SubprocessSandboxDriver({ ... cwd: ... })` calls where the
 * options object literal and its `cwd` key appear on one line.
 *
 * Lines marked with "muffle-ok:" and lines with no code left after comment
 * stripping are skipped.
 *
 * NOTE(review): the SubprocessSandboxDriver constructor visible in this bundle
 * stores `options.cwd` as a default for `exec` — confirm whether this finder's
 * message is still accurate for current driver versions.
 *
 * @param {string} file - Path reported in each finding.
 * @param {string} text - Full file contents.
 * @returns {Array<{file: string, line: number, lineText: string, pattern: string}>}
 */
var findConstructorCwdDropped = (file, text) => {
  return text.split("\n").flatMap((raw, index) => {
    if (isMuffleOk(raw)) return [];
    const code = codeOf(raw);
    if (!code.trim()) return [];
    if (!/new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(code)) return [];
    return [
      {
        file,
        line: index + 1,
        lineText: raw.trim(),
        pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
      }
    ];
  });
};
|
|
4972
|
+
/**
 * Finder: flag `if (!expected) return true` — a check that passes
 * automatically when no expectation is supplied.
 *
 * Lines marked with "muffle-ok:" and lines with no code left after comment
 * stripping are skipped.
 *
 * @param {string} file - Path reported in each finding.
 * @param {string} text - Full file contents.
 * @returns {Array<{file: string, line: number, lineText: string, pattern: string}>}
 */
var findAutoMatchNoExpectation = (file, text) => {
  const collect = (hits, raw, index) => {
    if (isMuffleOk(raw)) return hits;
    const code = codeOf(raw);
    if (!code.trim()) return hits;
    if (/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) {
      hits.push({
        file,
        line: index + 1,
        lineText: raw.trim(),
        pattern: "auto-match-no-expectation (if (!expected) return true)"
      });
    }
    return hits;
  };
  return text.split("\n").reduce(collect, []);
};
|
|
4991
|
+
/**
 * Finder: flag `if (<name>.skipped) return true` — a skipped result being
 * reported as a pass.
 *
 * Lines marked with "muffle-ok:" and lines with no code left after comment
 * stripping are skipped.
 *
 * @param {string} file - Path reported in each finding.
 * @param {string} text - Full file contents.
 * @returns {Array<{file: string, line: number, lineText: string, pattern: string}>}
 */
var findSkipCountsAsPass = (file, text) => {
  const hits = [];
  let lineNumber = 0;
  for (const raw of text.split("\n")) {
    lineNumber += 1;
    if (isMuffleOk(raw)) continue;
    const code = codeOf(raw);
    const hasCode = code.trim() !== "";
    if (hasCode && /if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code)) {
      hits.push({
        file,
        line: lineNumber,
        lineText: raw.trim(),
        pattern: "skip-counts-as-pass (if (.skipped) return true)"
      });
    }
  }
  return hits;
};
|
|
5010
|
+
// Finder set exported for explicitly listed scan files. The constructor-cwd
// finder is not part of this list; it lives in UNIVERSAL_FINDERS.
var DEFAULT_FINDERS = [
  findFallbackToPass, // "|| true" next to a command key
  findLiteralTruePass, // testCommand set to the literal "true"
  findAutoMatchNoExpectation, // if (!expected) return true
  findSkipCountsAsPass // if (x.skipped) return true
];
|
|
5016
|
+
// Finder set exported separately from DEFAULT_FINDERS; currently only the
// constructor-cwd check.
var UNIVERSAL_FINDERS = [
  findConstructorCwdDropped // new SubprocessSandboxDriver({ cwd: ... })
];
|
|
5019
|
+
/**
 * Walk `roots` (relative to `repoRoot`) and collect the relative paths of
 * files whose name matches `extensions` and whose text contains
 * `importsContain`.
 *
 * Skipped along the way: directories named node_modules, dist, dist-tests or
 * starting with "."; files ending in .test.ts / .test.mjs / .test.js; and any
 * entry that cannot be stat'ed or read.
 *
 * Fixes over the previous version:
 *  - A root that is a regular file or an unreadable directory is skipped
 *    instead of throwing from `readdirSync`, matching the best-effort handling
 *    already applied to stat/read failures.
 *  - `extensions.lastIndex` is reset before each test so a regex carrying the
 *    `g` or `y` flag no longer skips files because of leftover match state.
 *
 * @param {string} repoRoot - Base directory that relative paths resolve against.
 * @param {string[]} roots - Directories to walk, relative to `repoRoot`.
 * @param {RegExp} extensions - Tested against each file name.
 * @param {string} importsContain - Substring a file must contain to match.
 * @returns {string[]} Matching file paths, relative to `repoRoot`.
 */
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
  const matches2 = [];
  const walk = (rel) => {
    const abs = join(repoRoot, rel);
    if (!existsSync2(abs)) return;
    let entries;
    try {
      entries = readdirSync(abs);
    } catch {
      // Not a directory, or not readable: nothing to walk here.
      return;
    }
    for (const entry of entries) {
      const sub = join(rel, entry);
      const subAbs = join(repoRoot, sub);
      let st;
      try {
        st = statSync(subAbs);
      } catch {
        continue;
      }
      if (st.isDirectory()) {
        if (entry === "node_modules" || entry === "dist" || entry === "dist-tests" || entry.startsWith(".")) continue;
        walk(sub);
        continue;
      }
      if (!st.isFile()) continue;
      extensions.lastIndex = 0; // neutralize stateful g/y regexes
      if (!extensions.test(entry)) continue;
      if (entry.endsWith(".test.ts") || entry.endsWith(".test.mjs") || entry.endsWith(".test.js")) continue;
      let text;
      try {
        text = readFileSync2(subAbs, "utf8");
      } catch {
        continue;
      }
      if (text.includes(importsContain)) matches2.push(sub);
    }
  };
  for (const r of roots) walk(r);
  return matches2;
}
|
|
5051
|
+
/**
 * Run muffled-gate finders over a repository.
 *
 * Every existing file in `opts.scanFiles` is read and passed to each finder in
 * `opts.finders`. When `opts.autoDerive` is set, files discovered by
 * `autoDeriveImporters` that were not already scanned are passed to
 * `opts.autoDerive.universalFinders`.
 *
 * Auto-derived files are now recorded as scanned, so a path that is discovered
 * more than once (e.g. through overlapping roots) yields its findings only
 * once instead of being reported repeatedly.
 *
 * @param {object} opts
 * @param {string} opts.repoRoot - Base directory for relative file paths.
 * @param {string[]} opts.scanFiles - Files to scan with `opts.finders`.
 * @param {Function[]} opts.finders - `(file, text) => findings[]` callbacks.
 * @param {object} [opts.autoDerive] - `{ roots, extensions, importsContain,
 *   universalFinders }` controlling importer discovery.
 * @returns {Array<object>} All findings, explicit files first.
 */
function scanForMuffledGates(opts) {
  const findings = [];
  const scanned = /* @__PURE__ */ new Set();
  for (const file of opts.scanFiles) {
    const abs = join(opts.repoRoot, file);
    if (!existsSync2(abs)) continue;
    const text = readFileSync2(abs, "utf8");
    for (const find of opts.finders) findings.push(...find(file, text));
    scanned.add(file);
  }
  if (opts.autoDerive) {
    const importers = autoDeriveImporters(
      opts.repoRoot,
      opts.autoDerive.roots,
      opts.autoDerive.extensions,
      opts.autoDerive.importsContain
    );
    for (const file of importers) {
      if (scanned.has(file)) continue;
      const abs = join(opts.repoRoot, file);
      if (!existsSync2(abs)) continue;
      const text = readFileSync2(abs, "utf8");
      for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
      // Remember the file so a repeated importer entry is not scanned again.
      scanned.add(file);
    }
  }
  return findings;
}
|
|
5078
|
+
/**
 * Render findings as a human-readable report; returns "" when there are none.
 *
 * The report is a two-line header, a blank line, then one entry per finding
 * giving `file:line — pattern` followed by the offending line text.
 *
 * NOTE(review): whitespace inside the per-finding template may have been
 * collapsed when this listing was extracted — confirm the exact indentation
 * against the published bundle.
 *
 * @param {Array<{file: string, line: number, pattern: string, lineText: string}>} findings
 * @returns {string}
 */
function formatFindings(findings) {
  if (findings.length === 0) return "";
  const header = [
    `Found ${findings.length} muffled-gate pattern(s).`,
    `Fix each or annotate the line with "// muffle-ok: <reason>".`,
    ""
  ];
  const entries = findings.map((f) => ` ${f.file}:${f.line} \u2014 ${f.pattern}\n${f.lineText}`);
  return header.concat(entries).join("\n");
}
|
|
4758
5088
|
|
|
4759
5089
|
// src/series-convergence.ts
|
|
@@ -4765,10 +5095,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
4765
5095
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
4766
5096
|
}
|
|
4767
5097
|
const tail = values.slice(-window);
|
|
4768
|
-
const
|
|
4769
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5098
|
+
const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5099
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
|
|
4770
5100
|
const stdDev = Math.sqrt(variance2);
|
|
4771
|
-
const refMean = Math.abs(
|
|
5101
|
+
const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
4772
5102
|
const cv = stdDev / refMean;
|
|
4773
5103
|
const stable = tail.length >= window && cv <= stableCv;
|
|
4774
5104
|
let tailRun = 0;
|
|
@@ -4789,7 +5119,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
4789
5119
|
} else {
|
|
4790
5120
|
state = "noisy";
|
|
4791
5121
|
}
|
|
4792
|
-
return { state, windowMean:
|
|
5122
|
+
return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
|
|
4793
5123
|
}
|
|
4794
5124
|
|
|
4795
5125
|
// src/state-continuity.ts
|
|
@@ -5717,12 +6047,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
5717
6047
|
variantScores.push({ mutator: id, score, mutated });
|
|
5718
6048
|
all.push(score);
|
|
5719
6049
|
}
|
|
5720
|
-
const
|
|
5721
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6050
|
+
const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6051
|
+
const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
|
|
5722
6052
|
const stdDev = Math.sqrt(variance2);
|
|
5723
|
-
const ref = Math.abs(
|
|
6053
|
+
const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
5724
6054
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
5725
|
-
return { originalScore, variantScores, meanScore:
|
|
6055
|
+
return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
|
|
5726
6056
|
}
|
|
5727
6057
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
5728
6058
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6407,8 +6737,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
|
|
|
6407
6737
|
function toBin(chunk, lower, upper) {
|
|
6408
6738
|
const xs = chunk.map((c) => c.x);
|
|
6409
6739
|
const ys = chunk.map((c) => c.y);
|
|
6410
|
-
const evalMean =
|
|
6411
|
-
const outcomeMean =
|
|
6740
|
+
const evalMean = mean3(xs);
|
|
6741
|
+
const outcomeMean = mean3(ys);
|
|
6412
6742
|
return {
|
|
6413
6743
|
lower: lower ?? Math.min(...xs),
|
|
6414
6744
|
upper: upper ?? Math.max(...xs),
|
|
@@ -6418,7 +6748,7 @@ function toBin(chunk, lower, upper) {
|
|
|
6418
6748
|
gap: Math.abs(outcomeMean - evalMean)
|
|
6419
6749
|
};
|
|
6420
6750
|
}
|
|
6421
|
-
function
|
|
6751
|
+
/**
 * Arithmetic mean of `xs`. There is no empty-input guard: an empty array
 * yields NaN (0 / 0).
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean3(xs) {
  let sum = 0;
  xs.forEach((x) => {
    sum += x;
  });
  return sum / xs.length;
}
|
|
6424
6754
|
function defaultExtract4(metric) {
|
|
@@ -6643,8 +6973,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6643
6973
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6644
6974
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6645
6975
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6646
|
-
const
|
|
6647
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
6976
|
+
const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
6977
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
|
|
6648
6978
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6649
6979
|
}
|
|
6650
6980
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6666,8 +6996,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6666
6996
|
const ranked = [...byRun.values()].sort(
|
|
6667
6997
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6668
6998
|
);
|
|
6669
|
-
const
|
|
6670
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
6999
|
+
const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7000
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
|
|
6671
7001
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6672
7002
|
}
|
|
6673
7003
|
|
|
@@ -6725,7 +7055,7 @@ async function commitBisect(options) {
|
|
|
6725
7055
|
}
|
|
6726
7056
|
async function promptBisect(options) {
|
|
6727
7057
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
6728
|
-
const
|
|
7058
|
+
const join2 = (paragraphs) => paragraphs.join("\n\n");
|
|
6729
7059
|
const goodParas = split(options.good);
|
|
6730
7060
|
const badParas = split(options.bad);
|
|
6731
7061
|
if (goodParas.length !== badParas.length) {
|
|
@@ -6743,7 +7073,7 @@ async function promptBisect(options) {
|
|
|
6743
7073
|
const result = await bisect({
|
|
6744
7074
|
good: goodMask,
|
|
6745
7075
|
bad: badMask,
|
|
6746
|
-
runEval: (mask) => options.runEval(
|
|
7076
|
+
runEval: (mask) => options.runEval(join2(paragraphsFor(mask))),
|
|
6747
7077
|
maxIterations: options.maxIterations ?? n + 5,
|
|
6748
7078
|
halfway: (g, b) => {
|
|
6749
7079
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -6774,12 +7104,12 @@ async function promptBisect(options) {
|
|
|
6774
7104
|
}
|
|
6775
7105
|
}
|
|
6776
7106
|
const materializedPath = result.path.map((s) => ({
|
|
6777
|
-
state:
|
|
7107
|
+
state: join2(paragraphsFor(s.state)),
|
|
6778
7108
|
score: s.score,
|
|
6779
7109
|
pass: s.pass
|
|
6780
7110
|
}));
|
|
6781
7111
|
return {
|
|
6782
|
-
culprit:
|
|
7112
|
+
culprit: join2(paragraphsFor(culprit)),
|
|
6783
7113
|
path: materializedPath,
|
|
6784
7114
|
converged: result.converged,
|
|
6785
7115
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7197,8 +7527,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7197
7527
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7198
7528
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7199
7529
|
if (scores.length < 3) continue;
|
|
7200
|
-
const
|
|
7201
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7530
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7531
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
|
|
7202
7532
|
if (variance2 > varianceThreshold) {
|
|
7203
7533
|
targets.push({
|
|
7204
7534
|
reason: "high-variance",
|
|
@@ -7688,6 +8018,8 @@ export {
|
|
|
7688
8018
|
CostTracker,
|
|
7689
8019
|
DEFAULT_AGENT_SLOS,
|
|
7690
8020
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
8021
|
+
DEFAULT_FINDERS,
|
|
8022
|
+
DEFAULT_HARNESS_OBJECTIVES,
|
|
7691
8023
|
DEFAULT_MUTATORS,
|
|
7692
8024
|
DEFAULT_REDACTION_RULES,
|
|
7693
8025
|
DEFAULT_RED_TEAM_CORPUS,
|
|
@@ -7724,6 +8056,7 @@ export {
|
|
|
7724
8056
|
TRACE_SCHEMA_VERSION,
|
|
7725
8057
|
TokenCounter,
|
|
7726
8058
|
TraceEmitter,
|
|
8059
|
+
UNIVERSAL_FINDERS,
|
|
7727
8060
|
adversarialJudge,
|
|
7728
8061
|
aggregateLlm,
|
|
7729
8062
|
aggregateRunScore,
|
|
@@ -7782,9 +8115,15 @@ export {
|
|
|
7782
8115
|
failureClusterView,
|
|
7783
8116
|
fileContains,
|
|
7784
8117
|
fileExists,
|
|
8118
|
+
findAutoMatchNoExpectation,
|
|
8119
|
+
findConstructorCwdDropped,
|
|
8120
|
+
findFallbackToPass,
|
|
8121
|
+
findLiteralTruePass,
|
|
8122
|
+
findSkipCountsAsPass,
|
|
7785
8123
|
firstDivergenceView,
|
|
7786
8124
|
formatBenchmarkReport,
|
|
7787
8125
|
formatDriverReport,
|
|
8126
|
+
formatFindings,
|
|
7788
8127
|
groupBy,
|
|
7789
8128
|
hashContent,
|
|
7790
8129
|
hashScenarios,
|
|
@@ -7851,16 +8190,19 @@ export {
|
|
|
7851
8190
|
runE2EWorkflow,
|
|
7852
8191
|
runExpectations,
|
|
7853
8192
|
runFailureClass,
|
|
8193
|
+
runHarnessExperiment,
|
|
7854
8194
|
runJudgeFleet,
|
|
7855
8195
|
runProposeReview,
|
|
7856
8196
|
runSelfPlay,
|
|
7857
8197
|
runTestGradedScenario,
|
|
7858
8198
|
runsForScenario,
|
|
8199
|
+
scanForMuffledGates,
|
|
7859
8200
|
scoreAllProjects,
|
|
7860
8201
|
scoreContinuity,
|
|
7861
8202
|
scoreProject,
|
|
7862
8203
|
scoreRedTeamOutput,
|
|
7863
8204
|
securityJudge,
|
|
8205
|
+
selectHarnessVariant,
|
|
7864
8206
|
selfPreference,
|
|
7865
8207
|
sentenceReorderMutator,
|
|
7866
8208
|
signManifest,
|
|
@@ -7868,6 +8210,7 @@ export {
|
|
|
7868
8210
|
statusAdvanced,
|
|
7869
8211
|
stuckLoopView,
|
|
7870
8212
|
summarize,
|
|
8213
|
+
summarizeHarnessResults,
|
|
7871
8214
|
testJudge,
|
|
7872
8215
|
textInSnapshot,
|
|
7873
8216
|
toLangfuseEnvelope,
|