@tangle-network/agent-eval 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -38
- package/dist/index.d.ts +142 -41
- package/dist/index.js +205 -68
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean4,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean4 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean4 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean4) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -2135,12 +2135,14 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
|
|
|
2135
2135
|
toolUseQuality: 1,
|
|
2136
2136
|
patchQuality: 1.25,
|
|
2137
2137
|
testReality: 1.5,
|
|
2138
|
+
finalGate: 3,
|
|
2139
|
+
reviewerBlockers: -2,
|
|
2138
2140
|
costUsd: -0.2,
|
|
2139
2141
|
wallSeconds: -0.1
|
|
2140
2142
|
};
|
|
2141
2143
|
/**
 * Collapse a per-run score record into a single weighted number.
 *
 * Quality dimensions are passed through `clamp01` before weighting; cost and
 * wall time are floored at zero (wall time is converted from seconds to
 * minutes first). Caller-supplied weights override the defaults key by key.
 *
 * @param {object} score - Run score with the numeric fields read below.
 * @param {object} [weights] - Partial overrides for DEFAULT_RUN_SCORE_WEIGHTS.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
  // Math.max(0, undefined) is NaN, and a single NaN term would turn the whole
  // aggregate into NaN. Treat a missing/NaN measurement as "no penalty", the
  // same way clamp01 maps non-finite quality values to 0.
  const nonNegative = (value) => {
    const floored = Math.max(0, value);
    return Number.isNaN(floored) ? 0 : floored;
  };
  return w.success * clamp01(score.success)
    + w.goalProgress * clamp01(score.goalProgress)
    + w.repoGroundedness * clamp01(score.repoGroundedness)
    + w.driftPenalty * clamp01(score.driftPenalty)
    + w.toolUseQuality * clamp01(score.toolUseQuality)
    + w.patchQuality * clamp01(score.patchQuality)
    + w.testReality * clamp01(score.testReality)
    + w.finalGate * clamp01(score.finalGate)
    + w.reviewerBlockers * clamp01(score.reviewerBlockers)
    + w.costUsd * nonNegative(score.costUsd)
    + w.wallSeconds * nonNegative(score.wallSeconds / 60);
}
|
|
2145
2147
|
function clamp01(value) {
|
|
2146
2148
|
if (!Number.isFinite(value)) return 0;
|
|
@@ -2180,6 +2182,9 @@ var RunCritic = class {
|
|
|
2180
2182
|
const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
|
|
2181
2183
|
const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
|
|
2182
2184
|
const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
|
|
2185
|
+
const finalGateSpans = judgeSpans2.filter(
|
|
2186
|
+
(span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
|
|
2187
|
+
);
|
|
2183
2188
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
2184
2189
|
if (!success) notes.push("run did not complete with pass=true");
|
|
2185
2190
|
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
@@ -2194,6 +2199,15 @@ var RunCritic = class {
|
|
|
2194
2199
|
const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
|
|
2195
2200
|
const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
|
|
2196
2201
|
if (!testReality) notes.push("no real test/build evidence recorded");
|
|
2202
|
+
const blockerSpans = judgeSpans2.filter(
|
|
2203
|
+
(span) => isBlockingJudge(span)
|
|
2204
|
+
);
|
|
2205
|
+
const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
|
|
2206
|
+
const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
|
|
2207
|
+
if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
|
|
2208
|
+
else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
|
|
2209
|
+
const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
|
|
2210
|
+
if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
|
|
2197
2211
|
const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
|
|
2198
2212
|
const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
|
|
2199
2213
|
const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
|
|
@@ -2209,6 +2223,8 @@ var RunCritic = class {
|
|
|
2209
2223
|
toolUseQuality,
|
|
2210
2224
|
patchQuality,
|
|
2211
2225
|
testReality,
|
|
2226
|
+
finalGate,
|
|
2227
|
+
reviewerBlockers,
|
|
2212
2228
|
costUsd,
|
|
2213
2229
|
wallSeconds,
|
|
2214
2230
|
notes
|
|
@@ -2227,6 +2243,12 @@ function normalizeJudgeScore(score) {
|
|
|
2227
2243
|
/**
 * Heuristic check for whether a piece of text mentions repository artifacts:
 * source/test directories, package or tsconfig files, TypeScript extensions,
 * or common package-manager / test-runner commands (case-insensitive).
 *
 * @param {string} text - Text to scan.
 * @returns {boolean} True when at least one repo-related token is present.
 */
function looksRepoGrounded(text) {
  const repoSignal = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoSignal.test(text);
}
|
|
2246
|
+
/**
 * Decide whether a judge span represents a blocking review verdict.
 *
 * A span blocks when any of the following holds:
 *  - `attributes.blocking === true`
 *  - `attributes.verdict === "BLOCKING"`
 *  - `attributes.blockingFindings` or `attributes.highFindings` is a positive number
 *  - `score` is a number less than or equal to 2
 *
 * NOTE(review): the `<= 2` cut-off presumably assumes a judge scale on which
 * 2 or lower means failure — confirm against the judges that emit these spans.
 *
 * @param {object} span - Judge span with optional `attributes` and `score`.
 * @returns {boolean} True when the span should count as a blocker.
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  // Require a real number: a bare `span.score <= 2` coerces null / "" to 0 and
  // would report an unscored span as a blocker.
  return typeof span.score === "number" && span.score <= 2;
}
|
|
2249
|
+
/**
 * Type-safe "greater than zero" check.
 *
 * @param {*} value - Any value.
 * @returns {boolean} True only for values of type number that exceed 0.
 */
function positiveNumber(value) {
  if (typeof value !== "number") return false;
  return value > 0;
}
|
|
2230
2252
|
|
|
2231
2253
|
// src/playbook.ts
|
|
2232
2254
|
function distillPlaybook(entries, options = {}) {
|
|
@@ -2430,6 +2452,144 @@ function createAxService(aiFactory, provider, apiKey, model) {
|
|
|
2430
2452
|
});
|
|
2431
2453
|
}
|
|
2432
2454
|
|
|
2455
|
+
// src/pareto.ts
|
|
2456
|
+
/**
 * Pareto dominance test: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * Any objective that evaluates to a non-finite number for either candidate
 * makes the comparison fail (returns false).
 *
 * @param {*} a - Candidate that might dominate.
 * @param {*} b - Candidate that might be dominated.
 * @param {Array<{direction: string, value: Function}>} objectives - Objectives;
 *   `direction === "maximize"` prefers larger values, anything else smaller.
 * @returns {boolean} True when `a` dominates `b`.
 */
function dominates(a, b, objectives) {
  let sawStrictGain = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) {
      return false;
    }
    const maximizing = objective.direction === "maximize";
    const worse = maximizing ? left < right : left > right;
    if (worse) {
      return false;
    }
    const better = maximizing ? left > right : left < right;
    if (better) {
      sawStrictGain = true;
    }
  }
  return sawStrictGain;
}
|
|
2469
|
+
/**
 * Split candidates into the Pareto frontier and the dominated remainder.
 *
 * Candidates with a non-finite value on any objective are dropped before the
 * comparison and appear in neither output list.
 *
 * @param {Array<*>} candidates - Items to compare.
 * @param {Array<{direction: string, value: Function}>} objectives - At least one objective.
 * @returns {{frontier: Array, dominated: Array, dominanceMap: Array<{dominator: *, dominated: Array}>}}
 *   Frontier members, dominated members, and for each frontier member the
 *   dominated candidates it beats.
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const hasFiniteObjectives = (candidate) =>
    objectives.every((objective) => Number.isFinite(objective.value(candidate)));
  const pool = candidates.filter(hasFiniteObjectives);

  const frontier = [];
  const dominated = [];
  for (const candidate of pool) {
    let beaten = false;
    for (const rival of pool) {
      if (rival !== candidate && dominates(rival, candidate, objectives)) {
        beaten = true;
        break;
      }
    }
    (beaten ? dominated : frontier).push(candidate);
  }

  const dominanceMap = [];
  for (const dominator of frontier) {
    dominanceMap.push({
      dominator,
      dominated: dominated.filter((loser) => dominates(dominator, loser, objectives))
    });
  }
  return { frontier, dominated, dominanceMap };
}
|
|
2489
|
+
|
|
2490
|
+
// src/harness-optimizer.ts
|
|
2491
|
+
// Default objectives for comparing harness variant reports: higher aggregate
// and pass rate are preferred, lower mean cost and mean wall time are preferred.
var DEFAULT_HARNESS_OBJECTIVES = [
  {
    name: "aggregate",
    direction: "maximize",
    value: (report) => report.aggregateMean
  },
  {
    name: "pass_rate",
    direction: "maximize",
    value: (report) => report.passRate
  },
  {
    name: "cost",
    direction: "minimize",
    value: (report) => report.costUsdMean
  },
  {
    name: "wall",
    direction: "minimize",
    value: (report) => report.wallSecondsMean
  }
];
|
|
2497
|
+
/**
 * Run every (variant, scenario, trial) job through the adapter, score each
 * resulting trace, and pick a winning variant.
 *
 * Scoring uses `config.score` when provided, otherwise a `RunCritic` built
 * with `config.weights`. `config.onResult`, when present, is awaited once per
 * finished job. Jobs run through `mapLimit` with `config.parallelism`
 * (default 1).
 *
 * @param {object} config - Experiment configuration (variants, scenarios,
 *   adapter, and the optional fields read below).
 * @returns {Promise<{results: Array, selection: object}>} Per-job results and
 *   the output of `selectHarnessVariant`.
 */
async function runHarnessExperiment(config) {
  const jobs = buildJobs(config);
  const critic = new RunCritic({ weights: config.weights });
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));

  const runJob = async (request) => {
    const trace = await config.adapter.run(request);
    const runScore = await score(trace, request);
    const { variant, scenario, trialIndex } = request;
    const result = {
      variant,
      scenario,
      trialIndex,
      trace,
      score: runScore,
      aggregate: aggregateRunScore(runScore, config.weights)
    };
    await config.onResult?.(result);
    return result;
  };

  const results = await mapLimit(jobs, config.parallelism ?? 1, runJob);
  const selection = selectHarnessVariant(results, config.objectives);
  return { results, selection };
}
|
|
2517
|
+
/**
 * Choose the winning harness variant from raw run results.
 *
 * Results are summarized per variant, the Pareto frontier over `objectives`
 * is computed, and the frontier member with the highest `aggregateMean` wins.
 * If the frontier is empty, all reports are considered instead.
 *
 * @param {Array<object>} results - Per-run results.
 * @param {Array<object>} [objectives] - Objectives for the frontier; defaults
 *   to DEFAULT_HARNESS_OBJECTIVES.
 * @returns {{winner: object, frontier: object, reports: Array<object>}}
 * @throws {Error} When there are no results or no winner can be chosen.
 */
function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
  const reports = summarizeHarnessResults(results);
  if (reports.length === 0) {
    throw new Error("selectHarnessVariant: no results");
  }
  const frontier = paretoFrontier(reports, objectives);
  const pool = frontier.frontier.length ? frontier.frontier : reports;
  const byAggregateDesc = (left, right) => right.aggregateMean - left.aggregateMean;
  // Sort a copy so neither the frontier nor the reports array is reordered.
  const [winner] = pool.slice().sort(byAggregateDesc);
  if (!winner) {
    throw new Error("selectHarnessVariant: no winner");
  }
  return { winner, frontier, reports };
}
|
|
2526
|
+
/**
 * Group run results by `variant.id` and compute one summary report per
 * variant, sorted by descending `aggregateMean`.
 *
 * Each report carries the variant (taken from the first run in its group),
 * the runs themselves, the means of `aggregate`, `score.success` (reported as
 * `passRate`), `score.costUsd` and `score.wallSeconds`, plus a field-wise
 * mean score from `meanRunScore`.
 *
 * @param {Array<object>} results - Per-run results with `variant`, `aggregate`, `score`.
 * @returns {Array<object>} One report per distinct variant id.
 * @throws {Error} If a variant group has no usable variant.
 */
function summarizeHarnessResults(results) {
  const byVariant = new Map();
  for (const result of results) {
    const key = result.variant.id;
    const bucket = byVariant.get(key);
    // Append in place; re-spreading the bucket on every insert is quadratic
    // in the number of runs per variant.
    if (bucket) bucket.push(result);
    else byVariant.set(key, [result]);
  }
  return [...byVariant.values()].map((runs) => {
    const variant = runs[0]?.variant;
    if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
    return {
      variant,
      runs,
      aggregateMean: mean(runs.map((r) => r.aggregate)),
      passRate: mean(runs.map((r) => r.score.success)),
      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
      scoreMean: meanRunScore(runs.map((r) => r.score))
    };
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
|
|
2545
|
+
/**
 * Expand an experiment config into the full list of jobs: one per
 * (variant, scenario, trial index), ordered variant-major, then scenario,
 * then trial.
 *
 * `trialsPerScenario` is floored and clamped to at least 1; it defaults to 1.
 *
 * @param {object} config - Must provide non-empty `variants` and `scenarios`.
 * @returns {Array<{variant: *, scenario: *, trialIndex: number}>} Job list.
 * @throws {Error} When `variants` or `scenarios` is empty.
 */
function buildJobs(config) {
  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
  // Math.max(1, NaN) is NaN, which would make the trial loop run zero times
  // and silently produce an empty experiment. Fall back to a single trial
  // when the requested count is not a finite number.
  const requested = Math.floor(Number(config.trialsPerScenario ?? 1));
  const trials = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const jobs = [];
  for (const variant of config.variants) {
    for (const scenario of config.scenarios) {
      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
        jobs.push({ variant, scenario, trialIndex });
      }
    }
  }
  return jobs;
}
|
|
2559
|
+
/**
 * Map `fn` over `items` with at most `limit` calls in flight, returning the
 * results in input order.
 *
 * Workers pull the next unclaimed index from a shared counter. Items that are
 * `undefined` are skipped and leave their result slot unset.
 *
 * @param {Array<*>} items - Inputs to process.
 * @param {number} limit - Maximum concurrent calls; floored, clamped to
 *   [1, items.length], and treated as 1 when it is not a number.
 * @param {Function} fn - Async mapper called with a single item.
 * @returns {Promise<Array<*>>} Results aligned with `items`.
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length);
  let next = 0;
  // A NaN limit would propagate through Math.min/Math.max and make
  // Array.from({ length: NaN }) create zero workers, so nothing would run.
  const requested = Math.floor(limit);
  const cap = Number.isNaN(requested) ? 1 : requested;
  const workerCount = Math.max(1, Math.min(cap, items.length));
  await Promise.all(Array.from({ length: workerCount }, async () => {
    while (next < items.length) {
      const index = next++;
      const item = items[index];
      if (item === void 0) continue;
      results[index] = await fn(item);
    }
  }));
  return results;
}
|
|
2573
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {Array<number>} values - Numbers to average.
 * @returns {number} The average, or 0 for an empty list.
 */
function mean(values) {
  if (!values.length) {
    return 0;
  }
  const total = values.reduce((runningSum, current) => runningSum + current, 0);
  return total / values.length;
}
|
|
2576
|
+
/**
 * Average a list of run scores field by field.
 *
 * Every numeric dimension is averaged with `mean`; `notes` from all scores
 * are concatenated in order, with missing `notes` treated as empty.
 *
 * @param {Array<object>} scores - Run score records.
 * @returns {object} A run score holding the per-field means and merged notes.
 */
function meanRunScore(scores) {
  const numericFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const field of numericFields) {
    averaged[field] = mean(scores.map((entry) => entry[field]));
  }
  averaged.notes = scores.flatMap((entry) => entry.notes ?? []);
  return averaged;
}
|
|
2592
|
+
|
|
2433
2593
|
// src/trace/store.ts
|
|
2434
2594
|
var InMemoryTraceStore = class {
|
|
2435
2595
|
runs = /* @__PURE__ */ new Map();
|
|
@@ -2875,14 +3035,22 @@ function composeParsers(...parsers) {
|
|
|
2875
3035
|
}
|
|
2876
3036
|
var SubprocessSandboxDriver = class {
|
|
2877
3037
|
id = "subprocess";
|
|
3038
|
+
defaultCwd;
|
|
3039
|
+
defaultEnv;
|
|
3040
|
+
constructor(options = {}) {
|
|
3041
|
+
this.defaultCwd = options.cwd;
|
|
3042
|
+
this.defaultEnv = options.env;
|
|
3043
|
+
}
|
|
2878
3044
|
async exec(phase, command, config) {
|
|
2879
3045
|
const { spawn } = await import("child_process");
|
|
2880
3046
|
const start = Date.now();
|
|
3047
|
+
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3048
|
+
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
2881
3049
|
return await new Promise((resolve) => {
|
|
2882
3050
|
const child = spawn(command, {
|
|
2883
3051
|
shell: true,
|
|
2884
|
-
cwd:
|
|
2885
|
-
env:
|
|
3052
|
+
cwd: effectiveCwd,
|
|
3053
|
+
env: effectiveEnv
|
|
2886
3054
|
});
|
|
2887
3055
|
let stdout = "";
|
|
2888
3056
|
let stderr = "";
|
|
@@ -4308,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
|
|
|
4308
4476
|
if (s.baseline.length < 2 || s.candidate.length < 2) {
|
|
4309
4477
|
throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
|
|
4310
4478
|
}
|
|
4311
|
-
const bMean =
|
|
4312
|
-
const cMean =
|
|
4479
|
+
const bMean = mean2(s.baseline);
|
|
4480
|
+
const cMean = mean2(s.candidate);
|
|
4313
4481
|
const delta = cMean - bMean;
|
|
4314
4482
|
const d = cohensD(s.baseline, s.candidate);
|
|
4315
4483
|
const { t, df, p } = welchsTTest(s.baseline, s.candidate);
|
|
@@ -4348,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
|
|
|
4348
4516
|
hasUnstable: metrics.some((m) => m.verdict === "unstable")
|
|
4349
4517
|
};
|
|
4350
4518
|
}
|
|
4351
|
-
function
|
|
4519
|
+
/**
 * Arithmetic mean of `xs`.
 *
 * There is no empty-input guard: an empty array yields NaN (0 / 0).
 *
 * @param {Array<number>} xs - Samples to average.
 * @returns {number} Sum of the samples divided by their count.
 */
function mean2(xs) {
  const total = xs.reduce((runningSum, sample) => runningSum + sample, 0);
  return total / xs.length;
}
|
|
4354
4522
|
function iqr(xs) {
|
|
@@ -4364,8 +4532,8 @@ function iqr(xs) {
|
|
|
4364
4532
|
}
|
|
4365
4533
|
function welchsTTest(a, b) {
|
|
4366
4534
|
if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
|
|
4367
|
-
const mA =
|
|
4368
|
-
const mB =
|
|
4535
|
+
const mA = mean2(a);
|
|
4536
|
+
const mB = mean2(b);
|
|
4369
4537
|
const vA = variance(a, mA);
|
|
4370
4538
|
const vB = variance(b, mB);
|
|
4371
4539
|
const seSquared = vA / a.length + vB / b.length;
|
|
@@ -4721,41 +4889,6 @@ function assertNonNegative(n, name) {
|
|
|
4721
4889
|
}
|
|
4722
4890
|
}
|
|
4723
4891
|
|
|
4724
|
-
// src/pareto.ts
|
|
4725
|
-
function dominates(a, b, objectives) {
|
|
4726
|
-
let strictlyBetter = false;
|
|
4727
|
-
for (const obj of objectives) {
|
|
4728
|
-
const av = obj.value(a);
|
|
4729
|
-
const bv = obj.value(b);
|
|
4730
|
-
if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
|
|
4731
|
-
const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
|
|
4732
|
-
const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
|
|
4733
|
-
if (aIsWorse) return false;
|
|
4734
|
-
if (aIsBetter) strictlyBetter = true;
|
|
4735
|
-
}
|
|
4736
|
-
return strictlyBetter;
|
|
4737
|
-
}
|
|
4738
|
-
function paretoFrontier(candidates, objectives) {
|
|
4739
|
-
if (objectives.length === 0) {
|
|
4740
|
-
throw new Error("paretoFrontier: at least 1 objective required");
|
|
4741
|
-
}
|
|
4742
|
-
const valid = candidates.filter(
|
|
4743
|
-
(c) => objectives.every((o) => Number.isFinite(o.value(c)))
|
|
4744
|
-
);
|
|
4745
|
-
const frontier = [];
|
|
4746
|
-
const dominated = [];
|
|
4747
|
-
for (const c of valid) {
|
|
4748
|
-
const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
|
|
4749
|
-
if (isDominated) dominated.push(c);
|
|
4750
|
-
else frontier.push(c);
|
|
4751
|
-
}
|
|
4752
|
-
const dominanceMap = frontier.map((d) => ({
|
|
4753
|
-
dominator: d,
|
|
4754
|
-
dominated: dominated.filter((x) => dominates(d, x, objectives))
|
|
4755
|
-
}));
|
|
4756
|
-
return { frontier, dominated, dominanceMap };
|
|
4757
|
-
}
|
|
4758
|
-
|
|
4759
4892
|
// src/series-convergence.ts
|
|
4760
4893
|
function analyzeSeries(values, options = {}) {
|
|
4761
4894
|
const window = options.window ?? 5;
|
|
@@ -4765,10 +4898,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
4765
4898
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
4766
4899
|
}
|
|
4767
4900
|
const tail = values.slice(-window);
|
|
4768
|
-
const
|
|
4769
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
4901
|
+
const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
4902
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
|
|
4770
4903
|
const stdDev = Math.sqrt(variance2);
|
|
4771
|
-
const refMean = Math.abs(
|
|
4904
|
+
const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
4772
4905
|
const cv = stdDev / refMean;
|
|
4773
4906
|
const stable = tail.length >= window && cv <= stableCv;
|
|
4774
4907
|
let tailRun = 0;
|
|
@@ -4789,7 +4922,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
4789
4922
|
} else {
|
|
4790
4923
|
state = "noisy";
|
|
4791
4924
|
}
|
|
4792
|
-
return { state, windowMean:
|
|
4925
|
+
return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
|
|
4793
4926
|
}
|
|
4794
4927
|
|
|
4795
4928
|
// src/state-continuity.ts
|
|
@@ -5717,12 +5850,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
5717
5850
|
variantScores.push({ mutator: id, score, mutated });
|
|
5718
5851
|
all.push(score);
|
|
5719
5852
|
}
|
|
5720
|
-
const
|
|
5721
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
5853
|
+
const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
5854
|
+
const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
|
|
5722
5855
|
const stdDev = Math.sqrt(variance2);
|
|
5723
|
-
const ref = Math.abs(
|
|
5856
|
+
const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
5724
5857
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
5725
|
-
return { originalScore, variantScores, meanScore:
|
|
5858
|
+
return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
|
|
5726
5859
|
}
|
|
5727
5860
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
5728
5861
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6407,8 +6540,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
|
|
|
6407
6540
|
function toBin(chunk, lower, upper) {
|
|
6408
6541
|
const xs = chunk.map((c) => c.x);
|
|
6409
6542
|
const ys = chunk.map((c) => c.y);
|
|
6410
|
-
const evalMean =
|
|
6411
|
-
const outcomeMean =
|
|
6543
|
+
const evalMean = mean3(xs);
|
|
6544
|
+
const outcomeMean = mean3(ys);
|
|
6412
6545
|
return {
|
|
6413
6546
|
lower: lower ?? Math.min(...xs),
|
|
6414
6547
|
upper: upper ?? Math.max(...xs),
|
|
@@ -6418,7 +6551,7 @@ function toBin(chunk, lower, upper) {
|
|
|
6418
6551
|
gap: Math.abs(outcomeMean - evalMean)
|
|
6419
6552
|
};
|
|
6420
6553
|
}
|
|
6421
|
-
function
|
|
6554
|
+
/**
 * Arithmetic mean of `xs`.
 *
 * There is no empty-input guard: an empty array yields NaN (0 / 0).
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} Sum of the values divided by their count.
 */
function mean3(xs) {
  const total = xs.reduce((runningSum, entry) => runningSum + entry, 0);
  return total / xs.length;
}
|
|
6424
6557
|
function defaultExtract4(metric) {
|
|
@@ -6643,8 +6776,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6643
6776
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6644
6777
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6645
6778
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6646
|
-
const
|
|
6647
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
6779
|
+
const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
6780
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
|
|
6648
6781
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6649
6782
|
}
|
|
6650
6783
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6666,8 +6799,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6666
6799
|
const ranked = [...byRun.values()].sort(
|
|
6667
6800
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6668
6801
|
);
|
|
6669
|
-
const
|
|
6670
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
6802
|
+
const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
6803
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
|
|
6671
6804
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6672
6805
|
}
|
|
6673
6806
|
|
|
@@ -7197,8 +7330,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7197
7330
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7198
7331
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7199
7332
|
if (scores.length < 3) continue;
|
|
7200
|
-
const
|
|
7201
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7333
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7334
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
|
|
7202
7335
|
if (variance2 > varianceThreshold) {
|
|
7203
7336
|
targets.push({
|
|
7204
7337
|
reason: "high-variance",
|
|
@@ -7688,6 +7821,7 @@ export {
|
|
|
7688
7821
|
CostTracker,
|
|
7689
7822
|
DEFAULT_AGENT_SLOS,
|
|
7690
7823
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
7824
|
+
DEFAULT_HARNESS_OBJECTIVES,
|
|
7691
7825
|
DEFAULT_MUTATORS,
|
|
7692
7826
|
DEFAULT_REDACTION_RULES,
|
|
7693
7827
|
DEFAULT_RED_TEAM_CORPUS,
|
|
@@ -7851,6 +7985,7 @@ export {
|
|
|
7851
7985
|
runE2EWorkflow,
|
|
7852
7986
|
runExpectations,
|
|
7853
7987
|
runFailureClass,
|
|
7988
|
+
runHarnessExperiment,
|
|
7854
7989
|
runJudgeFleet,
|
|
7855
7990
|
runProposeReview,
|
|
7856
7991
|
runSelfPlay,
|
|
@@ -7861,6 +7996,7 @@ export {
|
|
|
7861
7996
|
scoreProject,
|
|
7862
7997
|
scoreRedTeamOutput,
|
|
7863
7998
|
securityJudge,
|
|
7999
|
+
selectHarnessVariant,
|
|
7864
8000
|
selfPreference,
|
|
7865
8001
|
sentenceReorderMutator,
|
|
7866
8002
|
signManifest,
|
|
@@ -7868,6 +8004,7 @@ export {
|
|
|
7868
8004
|
statusAdvanced,
|
|
7869
8005
|
stuckLoopView,
|
|
7870
8006
|
summarize,
|
|
8007
|
+
summarizeHarnessResults,
|
|
7871
8008
|
testJudge,
|
|
7872
8009
|
textInSnapshot,
|
|
7873
8010
|
toLangfuseEnvelope,
|