@tangle-network/agent-eval 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -38
- package/dist/index.d.ts +661 -119
- package/dist/index.js +1186 -336
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean4,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean4 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean4 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean4) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -2094,113 +2094,500 @@ function flatSamples(score) {
|
|
|
2094
2094
|
return out;
|
|
2095
2095
|
}
|
|
2096
2096
|
|
|
2097
|
-
// src/
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2097
|
+
// src/steering.ts
|
|
2098
|
+
/**
 * Overlay a partial steering bundle (`delta`) on top of `base`.
 *
 * - `coderPrompt` / `continuePrompt` replace the base value only when the
 *   delta defines them (an explicit `undefined` leaves the base untouched).
 * - `reviewerPrompts`, `rolePrompts` and `metadata` are merged key-by-key,
 *   with delta keys winning.
 * - `skills` is replaced wholesale when the delta provides a non-nullish list.
 *
 * Every other property of `base` is carried over unchanged; neither input
 * is mutated.
 *
 * @param {object} base - Bundle providing the defaults.
 * @param {object} delta - Partial bundle whose fields take precedence.
 * @returns {object} A new merged bundle.
 */
function mergeSteeringBundle(base, delta) {
  const merged = { ...base };
  if (delta.coderPrompt !== void 0) {
    merged.coderPrompt = delta.coderPrompt;
  }
  if (delta.continuePrompt !== void 0) {
    merged.continuePrompt = delta.continuePrompt;
  }
  merged.reviewerPrompts = {
    ...(base.reviewerPrompts ?? {}),
    ...(delta.reviewerPrompts ?? {})
  };
  merged.skills = delta.skills ?? base.skills;
  merged.rolePrompts = {
    ...(base.rolePrompts ?? {}),
    ...(delta.rolePrompts ?? {})
  };
  merged.metadata = {
    ...(base.metadata ?? {}),
    ...(delta.metadata ?? {})
  };
  return merged;
}
|
|
2118
|
+
/**
 * Render a steering bundle as newline-separated `key:value` text.
 *
 * The first line is always `bundle:<id>`. Coder and continue prompts are
 * emitted only when truthy, reviewer prompts are listed in `localeCompare`
 * order of their names, and skills are emitted as one comma-joined line in
 * default sort order (omitted when there are none).
 *
 * @param {object} bundle - Steering bundle to render.
 * @returns {string} The rendered text.
 */
function renderSteeringText(bundle) {
  const out = [`bundle:${bundle.id}`];
  if (bundle.coderPrompt) {
    out.push(`coder:${bundle.coderPrompt}`);
  }
  if (bundle.continuePrompt) {
    out.push(`continue:${bundle.continuePrompt}`);
  }
  const reviewerPrompts = bundle.reviewerPrompts ?? {};
  const reviewerNames = Object.keys(reviewerPrompts).sort((left, right) => left.localeCompare(right));
  for (const reviewerName of reviewerNames) {
    out.push(`reviewer:${reviewerName}:${reviewerPrompts[reviewerName]}`);
  }
  const sortedSkills = Array.from(bundle.skills ?? []).sort();
  if (sortedSkills.length > 0) {
    out.push(`skills:${sortedSkills.join(",")}`);
  }
  return out.join("\n");
}
|
|
2128
|
+
|
|
2129
|
+
// src/run-score.ts
|
|
2130
|
+
// Default per-dimension weights for aggregateRunScore. Negative weights are
// penalties. costUsd is applied per dollar and wallSeconds per minute.
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  finalGate: 3,
  reviewerBlockers: -2,
  costUsd: -0.2,
  wallSeconds: -0.1
};
/**
 * Collapse a run score into a single weighted number.
 *
 * The nine quality dimensions are clamped to [0, 1] before weighting.
 * `costUsd` is floored at 0; `wallSeconds` is converted to minutes and
 * floored at 0.
 *
 * A missing or non-finite `costUsd` / `wallSeconds` contributes 0, matching
 * how clamp01 treats the other dimensions; previously it turned the whole
 * aggregate into NaN. Weight overrides that are not finite numbers are
 * ignored so the default weight still applies.
 *
 * @param {object} score - Run score with the fields named in the defaults.
 * @param {object} [weights] - Partial overrides for DEFAULT_RUN_SCORE_WEIGHTS.
 * @returns {number} The weighted aggregate.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS };
  for (const [key, value] of Object.entries(weights ?? {})) {
    // An explicit `undefined`/NaN override must not erase the default.
    if (typeof value === "number" && Number.isFinite(value)) w[key] = value;
  }
  const nonNegative = (value) => Number.isFinite(value) ? Math.max(0, value) : 0;
  return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * nonNegative(score.costUsd) + w.wallSeconds * nonNegative(score.wallSeconds / 60);
}
/**
 * Clamp a value into [0, 1].
 *
 * @param {number} value - Candidate value.
 * @returns {number} 0 for anything that is not a finite number (including
 *   ±Infinity), otherwise the value limited to the unit interval.
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}
|
|
2151
|
+
|
|
2152
|
+
// src/run-critic.ts
|
|
2153
|
+
// Case-insensitive patterns RunCritic uses by default to flag "drift" text.
// None of them is global/sticky, so `.test()` keeps no state between calls.
var DEFAULT_DRIFT_PATTERNS = [
  // Any http(s) URL.
  /https?:\/\//i,
  // "title: ", "summary: " and "url: " field labels.
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  // Literal phrases.
  /\bnpm package usage\b/i,
  /\bnews\b/i
];
|
|
2161
|
+
/**
 * Heuristic scorer that turns a recorded run trace (run record, spans,
 * events, artifacts, budget ledger) into a run score object.
 */
var RunCritic = class {
  weights;
  driftPatterns;
  /**
   * @param {{weights?: object, driftPatterns?: RegExp[]}} [options]
   *   `weights` is forwarded to aggregateRunScore by `rank`;
   *   `driftPatterns` defaults to DEFAULT_DRIFT_PATTERNS.
   */
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }
  /**
   * Load a run and its trace data from `store`, then score it.
   *
   * @param {object} store - Provides getRun, spans, events, artifacts, budget.
   * @param {string} runId - Identifier of the run to score.
   * @returns {Promise<object>} The run score.
   * @throws {Error} When the store has no run with that id.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new Error(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }
  /**
   * Score an already-loaded trace.
   *
   * @param {{run: object, spans: object[], events: object[], artifacts: object[], budget: object[]}} trace
   * @returns {object} Score dimensions plus human-readable `notes`.
   */
  scoreTrace(trace) {
    const notes = [];
    const spansOfKind = (kind) => trace.spans.filter((s) => s.kind === kind);
    const llmSpans = spansOfKind("llm");
    const toolSpans = spansOfKind("tool");
    const judgeSpans = spansOfKind("judge");
    const sandboxSpans = spansOfKind("sandbox");
    const finalGateSpans = judgeSpans.filter(
      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
    );
    // 1 for an explicit pass, 0.5 for a completed run without one, else 0.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    if (!success) notes.push("run did not complete with pass=true");
    const judgeAverage = judgeSpans.length ? judgeSpans.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans.length : void 0;
    // Outcome scores above 1 are treated as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;
    const successfulTools = toolSpans.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans.length === 0 ? 0 : successfulTools / toolSpans.length;
    if (toolSpans.length === 0) notes.push("no tool spans recorded");
    // Artifacts and write-like tool calls both count as patch evidence;
    // four pieces of evidence saturate the dimension.
    const patchEvidence = trace.artifacts.length + toolSpans.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");
    const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
    // Prefer real sandbox pass ratios; otherwise give partial credit (0.4)
    // when any tool call's arguments mention a test/build command.
    const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");
    const blockerSpans = judgeSpans.filter((span) => isBlockingJudge(span));
    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
    // Without any final-gate judgment the gate mirrors `success`.
    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
    if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
    const reviewerBlockers = judgeSpans.length ? blockerSpans.length / judgeSpans.length : 0;
    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    const totalSignals = positiveGroundingSignals + driftSignals;
    const repoGroundedness = totalSignals === 0 ? 0 : positiveGroundingSignals / totalSignals;
    const driftPenalty = totalSignals === 0 ? 0 : driftSignals / totalSignals;
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
    // Use the budget ledger only when it actually tracks dollars; a ledger
    // with no "usd" rows must not mask the per-span LLM cost.
    const usdConsumed = trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed);
    const costUsd = usdConsumed.length ? Math.max(...usdConsumed, 0) : llmSpans.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      finalGate,
      reviewerBlockers,
      costUsd,
      wallSeconds,
      notes
    };
  }
  /**
   * Collapse a run score into one number using this critic's weights.
   *
   * @param {object} score - Run score, e.g. from scoreTrace.
   * @returns {number} Weighted aggregate.
   */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }
  /**
   * Report whether `text` matches any configured drift pattern.
   *
   * @param {string} text - Text to inspect.
   * @returns {boolean} True when at least one pattern matches.
   */
  isDrift(text) {
    return this.driftPatterns.some((pattern) => {
      // Caller-supplied /g or /y patterns keep lastIndex between calls;
      // reset it so every text is scanned from the start.
      pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
};
|
|
2240
|
+
/**
 * Map a judge score onto [0, 1].
 *
 * Scores greater than 1 are divided by 10 first; anything else is used
 * as-is. The result is then passed through clamp01.
 *
 * @param {number} score - Raw judge score.
 * @returns {number} Score in the unit interval.
 */
function normalizeJudgeScore(score) {
  const scaled = score > 1 ? score / 10 : score;
  return clamp01(scaled);
}
|
|
2243
|
+
/**
 * Report whether `text` mentions something repository-specific: a `src/` or
 * `test(s)/` path, package.json, tsconfig, a `.ts`/`.tsx` file, `git status`,
 * a pnpm/npm command, or a test runner name (case-insensitive).
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when any of those markers appears.
 */
function looksRepoGrounded(text) {
  const repoMarkers = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoMarkers.test(text);
}
|
|
2246
|
+
/**
 * Decide whether a judge span represents a blocking review.
 *
 * A span blocks when its attributes say so (`blocking === true`,
 * `verdict === "BLOCKING"`, or a positive `blockingFindings` /
 * `highFindings` count), or when its numeric score is low.
 *
 * The score is put on the same footing normalizeJudgeScore uses (values
 * greater than 1 are divided by 10) before it is compared with 0.2. The
 * previous `score <= 2` test flagged every fractional score — including a
 * near-perfect 0.95 — and also treated a `null` score as blocking.
 *
 * @param {object} span - Judge span with optional `attributes` and `score`.
 * @returns {boolean} True when the span should count as a blocker.
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  const raw = span.score;
  // A missing or non-numeric score carries no blocking signal by itself.
  if (typeof raw !== "number") return false;
  const normalized = raw > 1 ? raw / 10 : raw;
  return normalized <= 0.2;
}
/**
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True only for numbers strictly greater than zero.
 */
function positiveNumber(value) {
  return typeof value === "number" && value > 0;
}
|
|
2252
|
+
|
|
2253
|
+
// src/playbook.ts
|
|
2254
|
+
/**
 * De-duplicate playbook entries and keep the heaviest ones.
 *
 * Entries whose instructions match after normalizeInstruction are treated as
 * duplicates; the one with the strictly greater weight wins (missing weights
 * count as 0, ties keep the earlier entry). Surviving entries get a
 * canonicalized instruction, are sorted by descending weight, and are capped
 * at `options.maxEntries` (default 12).
 *
 * @param {object[]} entries - Candidate playbook entries.
 * @param {{maxEntries?: number}} [options]
 * @returns {{entries: object[]}} The distilled playbook.
 */
function distillPlaybook(entries, options = {}) {
  const limit = options.maxEntries ?? 12;
  const strongest = new Map();
  for (const candidate of entries) {
    const dedupeKey = normalizeInstruction(candidate.instruction);
    const current = strongest.get(dedupeKey);
    if (current) {
      const outweighs = (candidate.weight ?? 0) > (current.weight ?? 0);
      if (!outweighs) continue;
    }
    strongest.set(dedupeKey, {
      ...candidate,
      instruction: canonicalInstruction(candidate.instruction)
    });
  }
  const ranked = Array.from(strongest.values());
  ranked.sort((left, right) => (right.weight ?? 0) - (left.weight ?? 0));
  return { entries: ranked.slice(0, limit) };
}
|
|
2267
|
+
/**
 * Render a playbook as a Markdown bullet list under a "# Playbook" heading.
 *
 * Each entry becomes a bullet with its instruction, followed by a Rationale
 * line and — only when truthy — Category, Evidence and Source run lines.
 * The output is trimmed and always ends with exactly one newline.
 *
 * @param {{entries: object[]}} playbook - Playbook to render.
 * @returns {string} Markdown text.
 */
function renderPlaybookMarkdown(playbook) {
  const out = ["# Playbook", ""];
  for (const entry of playbook.entries) {
    out.push(`- ${entry.instruction}`, ` Rationale: ${entry.rationale}`);
    const optionalFields = [
      ["Category", entry.category],
      ["Evidence", entry.evidence],
      ["Source run", entry.sourceRunId]
    ];
    for (const [label, value] of optionalFields) {
      if (value) out.push(` ${label}: ${value}`);
    }
    out.push("");
  }
  const body = out.join("\n").trim();
  return `${body}\n`;
}
|
|
2279
|
+
/**
 * Build the de-duplication key for an instruction: trimmed, lower-cased,
 * with every whitespace run collapsed to a single space.
 *
 * @param {string} value - Raw instruction text.
 * @returns {string} Normalized key.
 */
function normalizeInstruction(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
|
|
2282
|
+
/**
 * Produce the display form of an instruction: trimmed, whitespace runs
 * collapsed to single spaces, and the first character upper-cased.
 *
 * @param {string} value - Raw instruction text.
 * @returns {string} Canonical instruction (empty input stays empty).
 */
function canonicalInstruction(value) {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (collapsed.length === 0) {
    return collapsed;
  }
  const head = collapsed[0].toUpperCase();
  const tail = collapsed.slice(1);
  return head + tail;
}
|
|
2286
|
+
|
|
2287
|
+
// src/optimization-loop.ts
|
|
2288
|
+
/**
 * Runs steering-bundle variants through a prompt optimizer and maps the
 * optimizer's result back onto the original bundles.
 */
var OptimizationLoop = class {
  optimizer;
  /**
   * @param {object} [optimizer] - Object exposing `run(config)`; defaults to
   *   a new PromptOptimizer.
   */
  constructor(optimizer = new PromptOptimizer()) {
    this.optimizer = optimizer;
  }
  /**
   * Evaluate every variant against every example.
   *
   * Each bundle is handed to the optimizer as a prompt variant whose prompt
   * is its rendered steering text. Scoring delegates to `config.evaluate`
   * and collapses the returned run score with `config.scoreWeights`.
   *
   * @param {object} config - Needs `variants`, `examples`,
   *   `trialsPerScenario`, `evaluate` and optionally `scoreWeights`.
   * @returns {Promise<object>} Winner bundle, significance flag, per-variant
   *   reports and pairwise comparisons.
   */
  async run(config) {
    const bundlesById = new Map();
    for (const bundle of config.variants) {
      bundlesById.set(bundle.id, bundle);
    }
    const promptVariants = config.variants.map((bundle) => ({
      id: bundle.id,
      prompt: renderSteeringText(bundle),
      metadata: { bundle }
    }));
    const scoreVariant = async ({ variant, scenarioId, trialIndex }) => {
      const bundle = bundlesById.get(variant.id);
      if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
      const example = config.examples.find((item) => item.scenarioId === scenarioId);
      if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
      const runScore = await config.evaluate({ variant: bundle, example, trialIndex });
      return aggregateRunScore(runScore, config.scoreWeights);
    };
    const result = await this.optimizer.run({
      variants: promptVariants,
      scenarioIds: config.examples.map((example) => example.scenarioId),
      trialsPerScenario: config.trialsPerScenario,
      scoreVariant
    });
    const reports = result.scores.map((entry) => ({
      variantId: entry.variantId,
      bundle: bundlesById.get(entry.variantId),
      mean: entry.mean,
      ci95: entry.ci95,
      scenarioScores: entry.perScenario
    }));
    return {
      winner: bundlesById.get(result.winner.variantId),
      significant: result.winner.significant,
      reports,
      pairwise: result.pairwise
    };
  }
};
|
|
2326
|
+
|
|
2327
|
+
// src/steering-optimizer.ts
|
|
2328
|
+
/**
 * Baseline steering optimizer: recommends the variant with the highest mean
 * aggregate score across the supplied rows.
 */
var PairwiseSteeringOptimizer = class {
  /**
   * @param {object[]} rows - Scored runs (`variantId`, `score`, ...).
   * @param {{weights?: object}} [config] - Optional aggregate weights.
   * @returns {object} Recommendation with the full ranking attached.
   * @throws {Error} When ranking the rows yields no variants.
   */
  optimize(rows, config = {}) {
    const rankings = rankRows(rows, config.weights);
    if (rankings.length === 0) {
      throw new Error("no steering optimization rows");
    }
    const [leader] = rankings;
    return {
      backend: "pairwise",
      recommendedVariantId: leader.variantId,
      rationale: `Highest observed mean aggregate across ${rows.length} scored run(s).`,
      rankings
    };
  }
};
|
|
2340
|
+
/**
 * Steering optimizer that trains a variant selector with AxGEPA from the
 * optional `@ax-llm/ax` package. The recommended variant always comes from
 * PairwiseSteeringOptimizer; when training is not possible the result is the
 * pairwise recommendation marked `skipped: true` with the reason.
 */
var AxGepaSteeringOptimizer = class {
  config;
  /**
   * @param {object} config - Reads `minRows`, `weights`, `provider`,
   *   `apiKey`, `model` and `teacherModel`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param {object[]} rows - Scored runs (`variantId`, `scenarioId`, `score`, `metadata`).
   * @returns {Promise<object>} Pairwise recommendation, tagged "ax-gepa",
   *   plus `selector` details when training ran.
   */
  async optimize(rows) {
    const settings = this.config;
    const fallback = new PairwiseSteeringOptimizer().optimize(rows, settings);
    const minRows = settings.minRows ?? 6;
    const variantIds = Array.from(new Set(rows.map((row) => row.variantId)));
    const byScenario = collapseScenarioWinners(rows, settings.weights);
    // Every early exit returns the pairwise result relabelled as skipped.
    const skippedBecause = (rationale) => ({
      ...fallback,
      backend: "ax-gepa",
      skipped: true,
      rationale
    });
    if (variantIds.length < 2 || byScenario.length < minRows) {
      return skippedBecause(
        `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
      );
    }
    let axLib;
    try {
      // Optional dependency: loaded lazily so the package works without it.
      axLib = await import("@ax-llm/ax");
    } catch {
      return skippedBecause("AxGEPA unavailable: install @ax-llm/ax to enable selector optimization.");
    }
    const { ai, ax, AxGEPA } = axLib;
    const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
    const selector = ax(signature, {
      description: "Choose the best steering bundle variant for an autopilot task."
    });
    // First 80% of scenario winners (at least one) train; the rest validate.
    const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8));
    const train = byScenario.slice(0, splitIndex);
    const validation = byScenario.slice(splitIndex);
    if (validation.length === 0) {
      return skippedBecause("AxGEPA skipped: no validation examples after split.");
    }
    const optimizer = new AxGEPA({
      studentAI: createAxService(ai, settings.provider, settings.apiKey, settings.model),
      teacherAI: createAxService(ai, settings.provider, settings.apiKey, settings.teacherModel ?? settings.model),
      numTrials: 8,
      minibatch: true,
      minibatchSize: 4,
      earlyStoppingTrials: 3,
      sampleCount: 1
    });
    // Metric: 1 when the predicted variant equals the scenario winner, else 0.
    const exactMatch = ({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0;
    const compiled = await optimizer.compile(selector, train, exactMatch, {
      validationExamples: validation,
      maxMetricCalls: 64
    });
    selector.applyOptimization(compiled.optimizedProgram);
    return {
      ...fallback,
      backend: "ax-gepa",
      rationale: `AxGEPA trained a variant selector from ${byScenario.length} scored scenario winner(s); default winner remains ${fallback.recommendedVariantId}.`,
      selector: {
        backend: "ax-gepa",
        signature,
        labels: variantIds,
        rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
      }
    };
  }
};
|
|
2417
|
+
/**
 * Group scored rows by variant and rank variants by mean aggregate score.
 *
 * @param {object[]} rows - Rows with `variantId` and `score`.
 * @param {object} [weights] - Weights forwarded to aggregateRunScore.
 * @returns {{variantId: string, mean: number, runs: number}[]} Variants in
 *   descending order of mean aggregate.
 */
function rankRows(rows, weights) {
  const aggregatesByVariant = new Map();
  for (const row of rows) {
    const bucket = aggregatesByVariant.get(row.variantId) ?? [];
    bucket.push(aggregateRunScore(row.score, weights));
    aggregatesByVariant.set(row.variantId, bucket);
  }
  const ranking = [];
  for (const [variantId, aggregates] of aggregatesByVariant) {
    const total = aggregates.reduce((sum, value) => sum + value, 0);
    ranking.push({ variantId, mean: total / aggregates.length, runs: aggregates.length });
  }
  ranking.sort((left, right) => right.mean - left.mean);
  return ranking;
}
|
|
2430
|
+
/**
 * Reduce scored rows to one training example per scenario: the row with the
 * highest aggregate score in that scenario.
 *
 * `task` falls back from `metadata.task` to `metadata.seed_preview` to the
 * scenario id; `split` defaults to "train"; `seedPreview` defaults to "".
 *
 * @param {object[]} rows - Rows with `scenarioId`, `variantId`, `score`, `metadata`.
 * @param {object} [weights] - Weights forwarded to aggregateRunScore.
 * @returns {{task: string, split: string, seedPreview: string, variantId: string}[]}
 */
function collapseScenarioWinners(rows, weights) {
  const rowsByScenario = new Map();
  for (const row of rows) {
    const group = rowsByScenario.get(row.scenarioId) ?? [];
    group.push(row);
    rowsByScenario.set(row.scenarioId, group);
  }
  const winners = [];
  for (const [scenarioId, scenarioRows] of rowsByScenario) {
    const scored = scenarioRows.map((row) => ({ row, aggregate: aggregateRunScore(row.score, weights) }));
    scored.sort((left, right) => right.aggregate - left.aggregate);
    const top = scored[0].row;
    const meta = top.metadata;
    winners.push({
      task: String(meta?.task ?? meta?.seed_preview ?? scenarioId),
      split: String(meta?.split ?? "train"),
      seedPreview: String(meta?.seed_preview ?? ""),
      variantId: top.variantId
    });
  }
  return winners;
}
|
|
2447
|
+
/**
 * Create an AI service through the supplied factory.
 *
 * @param {Function} aiFactory - Factory invoked with `{ name, apiKey, config }`.
 * @param {string} provider - Passed to the factory as `name`.
 * @param {string} apiKey - Credential passed through unchanged.
 * @param {string} model - Passed to the factory as `config.model`.
 * @returns {*} Whatever the factory returns.
 */
function createAxService(aiFactory, provider, apiKey, model) {
  const serviceOptions = {
    name: provider,
    apiKey,
    config: { model }
  };
  return aiFactory(serviceOptions);
}
|
|
2168
2454
|
|
|
2169
|
-
// src/
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
"format_drift",
|
|
2183
|
-
"permission_escalation",
|
|
2184
|
-
"pii_leak",
|
|
2185
|
-
"cost_overrun",
|
|
2186
|
-
"timeout",
|
|
2187
|
-
"sandbox_failure",
|
|
2188
|
-
"unknown"
|
|
2189
|
-
];
|
|
2190
|
-
function isLlmSpan(s) {
|
|
2191
|
-
return s.kind === "llm";
|
|
2455
|
+
// src/pareto.ts
|
|
2456
|
+
/**
 * Pareto dominance test: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * If any objective value of either candidate is not a finite number the
 * result is false.
 *
 * @param {*} a - Candidate that may dominate.
 * @param {*} b - Candidate that may be dominated.
 * @param {{direction: string, value: Function}[]} objectives - Objectives;
 *   any direction other than "maximize" is treated as minimize.
 * @returns {boolean} True when `a` dominates `b`.
 */
function dominates(a, b, objectives) {
  let hasStrictWin = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) {
      return false;
    }
    if (left === right) {
      continue;
    }
    // Both values are finite and unequal, so `a` either wins or loses here.
    const leftWins = objective.direction === "maximize" ? left > right : left < right;
    if (!leftWins) {
      return false;
    }
    hasStrictWin = true;
  }
  return hasStrictWin;
}
|
|
2193
|
-
function
|
|
2194
|
-
|
|
2469
|
+
/**
 * Split candidates into the Pareto frontier and the dominated set.
 *
 * Candidates with any non-finite objective value are dropped and appear in
 * neither list. `dominanceMap` lists, for each frontier member, the
 * dominated candidates it dominates.
 *
 * @param {Array} candidates - Items to compare.
 * @param {{direction: string, value: Function}[]} objectives - At least one.
 * @returns {{frontier: Array, dominated: Array, dominanceMap: Array}}
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const isComparable = (candidate) => objectives.every((objective) => Number.isFinite(objective.value(candidate)));
  const valid = candidates.filter(isComparable);
  const frontier = [];
  const dominated = [];
  for (const candidate of valid) {
    const beaten = valid.some((rival) => rival !== candidate && dominates(rival, candidate, objectives));
    const target = beaten ? dominated : frontier;
    target.push(candidate);
  }
  const dominanceMap = frontier.map((dominator) => ({
    dominator,
    dominated: dominated.filter((loser) => dominates(dominator, loser, objectives))
  }));
  return { frontier, dominated, dominanceMap };
}
|
|
2196
|
-
|
|
2197
|
-
|
|
2489
|
+
|
|
2490
|
+
// src/harness-optimizer.ts
|
|
2491
|
+
// Default objectives for harness variant selection: maximize the mean
// aggregate and pass rate, minimize mean dollar cost and wall-clock time.
// Each `value` reads the matching field of a variant report.
var DEFAULT_HARNESS_OBJECTIVES = [
  {
    name: "aggregate",
    direction: "maximize",
    value: (report) => report.aggregateMean
  },
  {
    name: "pass_rate",
    direction: "maximize",
    value: (report) => report.passRate
  },
  {
    name: "cost",
    direction: "minimize",
    value: (report) => report.costUsdMean
  },
  {
    name: "wall",
    direction: "minimize",
    value: (report) => report.wallSecondsMean
  }
];
|
|
2497
|
+
/**
 * Run every variant × scenario × trial job through the adapter, score each
 * resulting trace, and select a winning variant.
 *
 * Jobs run with at most `config.parallelism` (default 1) in flight. Traces
 * are scored with `config.score` when provided, otherwise with a RunCritic
 * built from `config.weights`. `config.onResult`, if present, is awaited
 * after each job.
 *
 * @param {object} config - Experiment configuration (`variants`,
 *   `scenarios`, `adapter`, plus the optional fields described above).
 * @returns {Promise<{results: object[], selection: object}>}
 */
async function runHarnessExperiment(config) {
  const jobs = buildJobs(config);
  const critic = new RunCritic({ weights: config.weights });
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));
  const executeJob = async (request) => {
    const trace = await config.adapter.run(request);
    const runScore = await score(trace, request);
    const { variant, scenario, trialIndex } = request;
    const result = {
      variant,
      scenario,
      trialIndex,
      trace,
      score: runScore,
      aggregate: aggregateRunScore(runScore, config.weights)
    };
    await config.onResult?.(result);
    return result;
  };
  const results = await mapLimit(jobs, config.parallelism ?? 1, executeJob);
  const selection = selectHarnessVariant(results, config.objectives);
  return { results, selection };
}
|
|
2517
|
+
/**
 * Pick the winning harness variant from raw results.
 *
 * Results are summarized per variant and the Pareto frontier over
 * `objectives` is computed. The winner is the frontier member with the
 * highest mean aggregate; if the frontier is empty, all reports compete.
 *
 * @param {object[]} results - Per-job results.
 * @param {object[]} [objectives] - Defaults to DEFAULT_HARNESS_OBJECTIVES.
 * @returns {{winner: object, frontier: object, reports: object[]}}
 * @throws {Error} When there are no results or no winner can be chosen.
 */
function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
  const reports = summarizeHarnessResults(results);
  if (reports.length === 0) {
    throw new Error("selectHarnessVariant: no results");
  }
  const frontier = paretoFrontier(reports, objectives);
  const pool = frontier.frontier.length ? frontier.frontier : reports;
  const byAggregateDesc = pool.slice().sort((left, right) => right.aggregateMean - left.aggregateMean);
  const winner = byAggregateDesc[0];
  if (!winner) {
    throw new Error("selectHarnessVariant: no winner");
  }
  return { winner, frontier, reports };
}
|
|
2526
|
+
/**
 * Group harness results by variant id and compute per-variant means.
 *
 * Results are appended to their bucket in place; the previous version
 * rebuilt the bucket array with a spread copy for every result, which is
 * quadratic in the number of runs per variant.
 *
 * @param {object[]} results - Per-job results (`variant`, `aggregate`, `score`).
 * @returns {object[]} One report per variant, sorted by descending
 *   `aggregateMean`.
 * @throws {Error} When a bucket unexpectedly has no variant.
 */
function summarizeHarnessResults(results) {
  const byVariant = /* @__PURE__ */ new Map();
  for (const result of results) {
    const bucket = byVariant.get(result.variant.id);
    if (bucket) bucket.push(result);
    else byVariant.set(result.variant.id, [result]);
  }
  return [...byVariant.values()].map((runs) => {
    const variant = runs[0]?.variant;
    if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
    return {
      variant,
      runs,
      aggregateMean: mean(runs.map((r) => r.aggregate)),
      passRate: mean(runs.map((r) => r.score.success)),
      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
      scoreMean: meanRunScore(runs.map((r) => r.score))
    };
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
|
|
2545
|
+
/**
 * Expand an experiment config into one job per variant × scenario × trial.
 *
 * `trialsPerScenario` is floored and must be at least 1. A non-finite value
 * (NaN, ±Infinity) falls back to a single trial: NaN previously produced no
 * jobs at all and Infinity never terminated.
 *
 * @param {object} config - Needs non-empty `variants` and `scenarios`.
 * @returns {{variant: object, scenario: object, trialIndex: number}[]}
 * @throws {Error} When `variants` or `scenarios` is empty.
 */
function buildJobs(config) {
  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
  const requested = Math.floor(config.trialsPerScenario ?? 1);
  const trials = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const jobs = [];
  for (const variant of config.variants) {
    for (const scenario of config.scenarios) {
      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
        jobs.push({ variant, scenario, trialIndex });
      }
    }
  }
  return jobs;
}
|
|
2199
|
-
function
|
|
2200
|
-
|
|
2559
|
+
/**
 * Map `fn` over `items` with at most `limit` calls in flight, returning the
 * results in input order.
 *
 * The limit is floored and bounded to [1, items.length]. A NaN limit falls
 * back to 1; previously it spawned no workers and silently returned an
 * array of empty slots.
 *
 * @param {Array} items - Inputs; `undefined` entries are skipped and leave
 *   their result slot empty.
 * @param {number} limit - Maximum number of concurrent calls.
 * @param {Function} fn - Async mapper invoked with each item.
 * @returns {Promise<Array>} Results aligned with `items`.
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length);
  let next = 0;
  const requested = Math.floor(limit);
  const cap = Number.isNaN(requested) ? 1 : requested;
  const workerCount = Math.max(1, Math.min(cap, items.length));
  // Each worker pulls the next unclaimed index until the list is drained.
  await Promise.all(Array.from({ length: workerCount }, async () => {
    while (next < items.length) {
      const index = next++;
      const item = items[index];
      if (item === void 0) continue;
      results[index] = await fn(item);
    }
  }));
  return results;
}
|
|
2202
|
-
function
|
|
2203
|
-
return
|
|
2573
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(values) {
  if (!values.length) {
    return 0;
  }
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / values.length;
}
|
|
2576
|
+
/**
 * Average a list of run scores field by field.
 *
 * Every numeric dimension becomes the mean of that dimension across
 * `scores`; `notes` is the concatenation of all notes (missing notes count
 * as none).
 *
 * @param {object[]} scores - Run scores to combine.
 * @returns {object} A run score holding the per-field means.
 */
function meanRunScore(scores) {
  const numericFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const field of numericFields) {
    averaged[field] = mean(scores.map((entry) => entry[field]));
  }
  averaged.notes = scores.flatMap((entry) => entry.notes ?? []);
  return averaged;
}
|
|
2205
2592
|
|
|
2206
2593
|
// src/trace/store.ts
|
|
@@ -2597,6 +2984,651 @@ function llmSpanFromProvider(args) {
|
|
|
2597
2984
|
};
|
|
2598
2985
|
}
|
|
2599
2986
|
|
|
2987
|
+
// src/sandbox-harness.ts
|
|
2988
|
+
/**
 * Test-output parser for the Vitest summary line, e.g.
 * "Tests  2 failed | 5 passed". It reads the first count after "Tests" and,
 * when present, a second `| N passed/failed` count.
 */
var vitestTestParser = {
  id: "vitest",
  /**
   * @param {string} stdout - Captured standard output.
   * @returns {{testsTotal: number, testsPassed: number}|undefined}
   *   Undefined when no summary line is found.
   */
  parse(stdout) {
    const summary = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
    if (!summary) return void 0;
    const [, firstCount, firstLabel, secondCount, secondLabel] = summary;
    const tally = { passed: 0, failed: 0 };
    const record = (count, label) => {
      const bucket = label.toLowerCase() === "passed" ? "passed" : "failed";
      tally[bucket] += parseInt(count, 10);
    };
    record(firstCount, firstLabel);
    if (secondCount && secondLabel) {
      record(secondCount, secondLabel);
    }
    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
  }
};
|
|
3007
|
+
/**
 * Test-output parser for pytest: the total comes from "collected N item(s)"
 * and the pass count from "N passed".
 */
var pytestTestParser = {
  id: "pytest",
  /**
   * @param {string} stdout - Captured standard output.
   * @returns {{testsTotal: number, testsPassed: number}|undefined}
   *   Undefined when no "collected N items" line is present.
   */
  parse(stdout) {
    const total = stdout.match(/collected\s+(\d+)\s+items?/i);
    if (!total) return void 0;
    // A run in which every test fails prints no "N passed" fragment; report
    // that as zero passes rather than discarding the run.
    const passed = stdout.match(/(\d+)\s+passed/);
    return {
      testsTotal: parseInt(total[1], 10),
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
|
|
3016
|
+
/**
 * Test-output parser for the Jest "Tests:" summary line, e.g.
 * "Tests:       1 failed, 2 passed, 3 total".
 */
var jestTestParser = {
  id: "jest",
  /**
   * Reads the "N total" and "N passed" counts from the first "Tests:" line
   * that contains a total. The counts are matched independently, so lines
   * with extra categories ("1 skipped", "2 todo") or without any passing
   * tests are parsed too; the previous single regex rejected both.
   *
   * @param {string} stdout - Captured standard output.
   * @returns {{testsTotal: number, testsPassed: number}|undefined}
   *   Undefined when no summary line with a total is found.
   */
  parse(stdout) {
    for (const summary of stdout.matchAll(/Tests:\s+([^\n]*)/gi)) {
      const counts = summary[1];
      const total = counts.match(/(\d+)\s+total/i);
      if (!total) continue;
      const passed = counts.match(/(\d+)\s+passed/i);
      return {
        testsTotal: parseInt(total[1], 10),
        testsPassed: passed ? parseInt(passed[1], 10) : 0
      };
    }
    return void 0;
  }
};
|
|
3024
|
+
/**
 * Combine several test-output parsers into one that tries them in order and
 * returns the first truthy result.
 *
 * @param {...{id: string, parse: Function}} parsers - Parsers to chain.
 * @returns {{id: string, parse: Function}} Composite parser whose id is the
 *   member ids joined with "|"; its `parse` yields undefined when no member
 *   produces a result.
 */
function composeParsers(...parsers) {
  const id = parsers.map((parser) => parser.id).join("|");
  return {
    id,
    parse(stdout, stderr, exitCode) {
      for (let i = 0; i < parsers.length; i++) {
        const outcome = parsers[i].parse(stdout, stderr, exitCode);
        if (outcome) {
          return outcome;
        }
      }
      return void 0;
    }
  };
}
|
|
3036
|
+
var SubprocessSandboxDriver = class {
|
|
3037
|
+
id = "subprocess";
|
|
3038
|
+
defaultCwd;
|
|
3039
|
+
defaultEnv;
|
|
3040
|
+
constructor(options = {}) {
|
|
3041
|
+
this.defaultCwd = options.cwd;
|
|
3042
|
+
this.defaultEnv = options.env;
|
|
3043
|
+
}
|
|
3044
|
+
async exec(phase, command, config) {
|
|
3045
|
+
const { spawn } = await import("child_process");
|
|
3046
|
+
const start = Date.now();
|
|
3047
|
+
const effectiveCwd = config.cwd ?? this.defaultCwd;
|
|
3048
|
+
const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
|
|
3049
|
+
return await new Promise((resolve) => {
|
|
3050
|
+
const child = spawn(command, {
|
|
3051
|
+
shell: true,
|
|
3052
|
+
cwd: effectiveCwd,
|
|
3053
|
+
env: effectiveEnv
|
|
3054
|
+
});
|
|
3055
|
+
let stdout = "";
|
|
3056
|
+
let stderr = "";
|
|
3057
|
+
child.stdout?.on("data", (d) => {
|
|
3058
|
+
stdout += String(d);
|
|
3059
|
+
});
|
|
3060
|
+
child.stderr?.on("data", (d) => {
|
|
3061
|
+
stderr += String(d);
|
|
3062
|
+
});
|
|
3063
|
+
const timeout = setTimeout(() => {
|
|
3064
|
+
try {
|
|
3065
|
+
child.kill("SIGKILL");
|
|
3066
|
+
} catch {
|
|
3067
|
+
}
|
|
3068
|
+
}, config.timeoutMs ?? 10 * 6e4);
|
|
3069
|
+
child.on("close", (code) => {
|
|
3070
|
+
clearTimeout(timeout);
|
|
3071
|
+
const wallMs = Date.now() - start;
|
|
3072
|
+
const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
|
|
3073
|
+
resolve({
|
|
3074
|
+
phase,
|
|
3075
|
+
exitCode: code ?? 1,
|
|
3076
|
+
stdout,
|
|
3077
|
+
stderr,
|
|
3078
|
+
wallMs,
|
|
3079
|
+
testsTotal: parsed?.testsTotal,
|
|
3080
|
+
testsPassed: parsed?.testsPassed
|
|
3081
|
+
});
|
|
3082
|
+
});
|
|
3083
|
+
child.on("error", (err) => {
|
|
3084
|
+
clearTimeout(timeout);
|
|
3085
|
+
const wallMs = Date.now() - start;
|
|
3086
|
+
resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
|
|
3087
|
+
});
|
|
3088
|
+
});
|
|
3089
|
+
}
|
|
3090
|
+
};
|
|
3091
|
+
var DockerSandboxDriver = class {
|
|
3092
|
+
id = "docker";
|
|
3093
|
+
async exec(phase, command, config) {
|
|
3094
|
+
if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
|
|
3095
|
+
const sub = new SubprocessSandboxDriver();
|
|
3096
|
+
const envArgs = Object.entries(config.env ?? {}).map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`).join(" ");
|
|
3097
|
+
const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
|
|
3098
|
+
return sub.exec(phase, wrapped, { ...config, env: void 0 });
|
|
3099
|
+
}
|
|
3100
|
+
};
|
|
3101
|
+
function shellQuote(v) {
|
|
3102
|
+
if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(v)) return v;
|
|
3103
|
+
return `'${v.replace(/'/g, `'\\''`)}'`;
|
|
3104
|
+
}
|
|
3105
|
+
var SandboxHarness = class {
|
|
3106
|
+
driver;
|
|
3107
|
+
constructor(driver = new SubprocessSandboxDriver()) {
|
|
3108
|
+
this.driver = driver;
|
|
3109
|
+
}
|
|
3110
|
+
async run(config, emitter) {
|
|
3111
|
+
const handle = await emitter.sandbox({
|
|
3112
|
+
name: `sandbox(${this.driver.id})`,
|
|
3113
|
+
image: config.image,
|
|
3114
|
+
command: [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ")
|
|
3115
|
+
});
|
|
3116
|
+
const result = { passed: false, totalWallMs: 0, score: 0 };
|
|
3117
|
+
try {
|
|
3118
|
+
if (config.setupCommand) {
|
|
3119
|
+
result.setup = await this.driver.exec("setup", config.setupCommand, config);
|
|
3120
|
+
result.totalWallMs += result.setup.wallMs;
|
|
3121
|
+
if (result.setup.exitCode !== 0) {
|
|
3122
|
+
await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
|
|
3123
|
+
exitCode: result.setup.exitCode,
|
|
3124
|
+
wallMs: result.totalWallMs
|
|
3125
|
+
});
|
|
3126
|
+
return result;
|
|
3127
|
+
}
|
|
3128
|
+
}
|
|
3129
|
+
if (config.runCommand) {
|
|
3130
|
+
result.run = await this.driver.exec("run", config.runCommand, config);
|
|
3131
|
+
result.totalWallMs += result.run.wallMs;
|
|
3132
|
+
if (result.run.exitCode !== 0) {
|
|
3133
|
+
await handle.fail(`run failed (exit ${result.run.exitCode})`, {
|
|
3134
|
+
exitCode: result.run.exitCode,
|
|
3135
|
+
wallMs: result.totalWallMs
|
|
3136
|
+
});
|
|
3137
|
+
return result;
|
|
3138
|
+
}
|
|
3139
|
+
}
|
|
3140
|
+
if (config.testCommand) {
|
|
3141
|
+
result.test = await this.driver.exec("test", config.testCommand, config);
|
|
3142
|
+
result.totalWallMs += result.test.wallMs;
|
|
3143
|
+
const passed = result.test.exitCode === 0;
|
|
3144
|
+
result.passed = passed;
|
|
3145
|
+
if (result.test.testsTotal !== void 0 && result.test.testsTotal > 0) {
|
|
3146
|
+
result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal;
|
|
3147
|
+
} else {
|
|
3148
|
+
result.score = passed ? 1 : 0;
|
|
3149
|
+
}
|
|
3150
|
+
await handle.end({
|
|
3151
|
+
exitCode: result.test.exitCode,
|
|
3152
|
+
testsTotal: result.test.testsTotal,
|
|
3153
|
+
testsPassed: result.test.testsPassed,
|
|
3154
|
+
wallMs: result.totalWallMs,
|
|
3155
|
+
status: passed ? "ok" : "error"
|
|
3156
|
+
});
|
|
3157
|
+
} else {
|
|
3158
|
+
result.passed = true;
|
|
3159
|
+
result.score = 1;
|
|
3160
|
+
await handle.end({ wallMs: result.totalWallMs });
|
|
3161
|
+
}
|
|
3162
|
+
} catch (err) {
|
|
3163
|
+
await handle.fail(err instanceof Error ? err : String(err));
|
|
3164
|
+
throw err;
|
|
3165
|
+
}
|
|
3166
|
+
return result;
|
|
3167
|
+
}
|
|
3168
|
+
};
|
|
3169
|
+
|
|
3170
|
+
// src/judge-runner.ts
|
|
3171
|
+
var JudgeRunner = class {
|
|
3172
|
+
driver;
|
|
3173
|
+
constructor(driver = new SubprocessSandboxDriver()) {
|
|
3174
|
+
this.driver = driver;
|
|
3175
|
+
}
|
|
3176
|
+
async run(spec) {
|
|
3177
|
+
const store = new InMemoryTraceStore();
|
|
3178
|
+
const emitter = new TraceEmitter(store, { runId: `judge-${spec.id}` });
|
|
3179
|
+
await emitter.startRun({
|
|
3180
|
+
scenarioId: spec.id,
|
|
3181
|
+
layer: "meta",
|
|
3182
|
+
projectId: "judge-runner"
|
|
3183
|
+
});
|
|
3184
|
+
const harness = new SandboxHarness(this.driver);
|
|
3185
|
+
const detail = await harness.run(spec.config, emitter);
|
|
3186
|
+
await emitter.endRun({ pass: detail.passed, score: detail.score, notes: `${spec.kind} judge` });
|
|
3187
|
+
return {
|
|
3188
|
+
id: spec.id,
|
|
3189
|
+
kind: spec.kind,
|
|
3190
|
+
passed: detail.passed,
|
|
3191
|
+
score: detail.score,
|
|
3192
|
+
summary: renderJudgeSummary(spec.kind, detail),
|
|
3193
|
+
detail
|
|
3194
|
+
};
|
|
3195
|
+
}
|
|
3196
|
+
};
|
|
3197
|
+
async function runJudgeFleet(specs, options = {}) {
|
|
3198
|
+
const runner = new JudgeRunner(options.driver);
|
|
3199
|
+
if (options.parallel === false) {
|
|
3200
|
+
const results = [];
|
|
3201
|
+
for (const spec of specs) results.push(await runner.run(spec));
|
|
3202
|
+
return results;
|
|
3203
|
+
}
|
|
3204
|
+
return await Promise.all(specs.map((spec) => runner.run(spec)));
|
|
3205
|
+
}
|
|
3206
|
+
function compilerJudge(id, config) {
|
|
3207
|
+
return { id, kind: "compiler", config };
|
|
3208
|
+
}
|
|
3209
|
+
function testJudge(id, config) {
|
|
3210
|
+
return { id, kind: "test", config };
|
|
3211
|
+
}
|
|
3212
|
+
function linterJudge(id, config) {
|
|
3213
|
+
return { id, kind: "linter", config };
|
|
3214
|
+
}
|
|
3215
|
+
function securityJudge(id, config) {
|
|
3216
|
+
return { id, kind: "security", config };
|
|
3217
|
+
}
|
|
3218
|
+
function renderJudgeSummary(kind, detail) {
|
|
3219
|
+
if (!detail.passed) return `${kind} judge failed`;
|
|
3220
|
+
if (detail.test?.testsTotal) return `${kind} judge passed ${detail.test.testsPassed}/${detail.test.testsTotal} tests`;
|
|
3221
|
+
return `${kind} judge passed`;
|
|
3222
|
+
}
|
|
3223
|
+
|
|
3224
|
+
// src/dual-agent-bench.ts
|
|
3225
|
+
var DualAgentBench = class {
|
|
3226
|
+
async run(config) {
|
|
3227
|
+
const maxRounds = config.maxRounds ?? 5;
|
|
3228
|
+
const threshold = config.convergenceThreshold ?? 0.85;
|
|
3229
|
+
if (config.scenarios.length === 0) {
|
|
3230
|
+
throw new Error("DualAgentBench requires at least 1 scenario");
|
|
3231
|
+
}
|
|
3232
|
+
const results = [];
|
|
3233
|
+
for (const scenario of config.scenarios) {
|
|
3234
|
+
const history = [];
|
|
3235
|
+
let converged = false;
|
|
3236
|
+
let roundsToConverge = null;
|
|
3237
|
+
let finalProposal = "";
|
|
3238
|
+
let lastScore = 0;
|
|
3239
|
+
let priorCritique;
|
|
3240
|
+
for (let r = 0; r < maxRounds; r++) {
|
|
3241
|
+
const priorProposal = history[history.length - 1]?.proposal;
|
|
3242
|
+
const proposal = await config.propose({
|
|
3243
|
+
scenario,
|
|
3244
|
+
roundIndex: r,
|
|
3245
|
+
priorProposal,
|
|
3246
|
+
priorCritique
|
|
3247
|
+
});
|
|
3248
|
+
const { critique, convergenceScore } = await config.critique({
|
|
3249
|
+
scenario,
|
|
3250
|
+
roundIndex: r,
|
|
3251
|
+
proposal
|
|
3252
|
+
});
|
|
3253
|
+
if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
|
|
3254
|
+
throw new Error(
|
|
3255
|
+
`critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
|
|
3256
|
+
);
|
|
3257
|
+
}
|
|
3258
|
+
const round = {
|
|
3259
|
+
roundIndex: r,
|
|
3260
|
+
proposal,
|
|
3261
|
+
critique,
|
|
3262
|
+
convergenceScore
|
|
3263
|
+
};
|
|
3264
|
+
history.push(round);
|
|
3265
|
+
config.onRoundComplete?.({ scenarioId: scenario.id, round });
|
|
3266
|
+
finalProposal = proposal;
|
|
3267
|
+
lastScore = convergenceScore;
|
|
3268
|
+
priorCritique = critique;
|
|
3269
|
+
if (convergenceScore >= threshold) {
|
|
3270
|
+
converged = true;
|
|
3271
|
+
roundsToConverge = r + 1;
|
|
3272
|
+
break;
|
|
3273
|
+
}
|
|
3274
|
+
}
|
|
3275
|
+
results.push({
|
|
3276
|
+
scenarioId: scenario.id,
|
|
3277
|
+
converged,
|
|
3278
|
+
roundsToConverge,
|
|
3279
|
+
finalProposal,
|
|
3280
|
+
history,
|
|
3281
|
+
finalScore: lastScore
|
|
3282
|
+
});
|
|
3283
|
+
}
|
|
3284
|
+
const convergedResults = results.filter((r) => r.converged);
|
|
3285
|
+
const convergenceRate = results.length ? convergedResults.length / results.length : 0;
|
|
3286
|
+
const avgRoundsToConverge = convergedResults.length ? convergedResults.reduce((acc, r) => acc + (r.roundsToConverge ?? 0), 0) / convergedResults.length : null;
|
|
3287
|
+
const avgFinalScore = results.length ? results.reduce((acc, r) => acc + r.finalScore, 0) / results.length : 0;
|
|
3288
|
+
return {
|
|
3289
|
+
scenarios: results,
|
|
3290
|
+
aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
|
|
3291
|
+
config: { maxRounds, convergenceThreshold: threshold }
|
|
3292
|
+
};
|
|
3293
|
+
}
|
|
3294
|
+
};
|
|
3295
|
+
|
|
3296
|
+
// src/propose-review.ts
|
|
3297
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
|
|
3298
|
+
import { dirname } from "path";
|
|
3299
|
+
function inMemoryReviewStore(initial = []) {
|
|
3300
|
+
const entries = [...initial];
|
|
3301
|
+
return {
|
|
3302
|
+
async load() {
|
|
3303
|
+
return [...entries];
|
|
3304
|
+
},
|
|
3305
|
+
async append(entry) {
|
|
3306
|
+
entries.push(entry);
|
|
3307
|
+
}
|
|
3308
|
+
};
|
|
3309
|
+
}
|
|
3310
|
+
function jsonlReviewStore(path) {
|
|
3311
|
+
return {
|
|
3312
|
+
async load() {
|
|
3313
|
+
if (!existsSync(path)) return [];
|
|
3314
|
+
const raw = readFileSync(path, "utf8");
|
|
3315
|
+
const out = [];
|
|
3316
|
+
for (const line of raw.split("\n")) {
|
|
3317
|
+
const trimmed = line.trim();
|
|
3318
|
+
if (!trimmed) continue;
|
|
3319
|
+
try {
|
|
3320
|
+
out.push(JSON.parse(trimmed));
|
|
3321
|
+
} catch {
|
|
3322
|
+
}
|
|
3323
|
+
}
|
|
3324
|
+
return out;
|
|
3325
|
+
},
|
|
3326
|
+
async append(entry) {
|
|
3327
|
+
mkdirSync(dirname(path), { recursive: true });
|
|
3328
|
+
appendFileSync(path, JSON.stringify(entry) + "\n");
|
|
3329
|
+
}
|
|
3330
|
+
};
|
|
3331
|
+
}
|
|
3332
|
+
var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
|
|
3333
|
+
async function runProposeReview(config) {
|
|
3334
|
+
const maxShots = config.maxShots ?? 10;
|
|
3335
|
+
const maxWallMs = config.maxWallMs ?? 10 * 60 * 1e3;
|
|
3336
|
+
const confidenceFloor = config.confidenceFloor ?? 0.3;
|
|
3337
|
+
const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
|
|
3338
|
+
const memory = config.memory ?? inMemoryReviewStore();
|
|
3339
|
+
const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
|
|
3340
|
+
const emitter = config.store ? new TraceEmitter(config.store) : null;
|
|
3341
|
+
if (emitter) {
|
|
3342
|
+
await emitter.startRun({
|
|
3343
|
+
scenarioId: config.scenarioId ?? "propose-review",
|
|
3344
|
+
projectId: config.projectId,
|
|
3345
|
+
variantId: config.variantId,
|
|
3346
|
+
layer: "meta",
|
|
3347
|
+
tags: {
|
|
3348
|
+
goal: config.goal.slice(0, 120),
|
|
3349
|
+
maxShots: String(maxShots)
|
|
3350
|
+
}
|
|
3351
|
+
});
|
|
3352
|
+
}
|
|
3353
|
+
const abort = new AbortController();
|
|
3354
|
+
const wallStart = Date.now();
|
|
3355
|
+
const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), maxWallMs);
|
|
3356
|
+
const shots = [];
|
|
3357
|
+
let state = config.initialState;
|
|
3358
|
+
let priorReview = null;
|
|
3359
|
+
let lastVerification = { pass: false };
|
|
3360
|
+
let failureClass;
|
|
3361
|
+
let completed = false;
|
|
3362
|
+
let lowConfidenceStreak = 0;
|
|
3363
|
+
try {
|
|
3364
|
+
for (let shot = 1; shot <= maxShots; shot++) {
|
|
3365
|
+
if (abort.signal.aborted) {
|
|
3366
|
+
failureClass = "timeout";
|
|
3367
|
+
break;
|
|
3368
|
+
}
|
|
3369
|
+
const shotStart = Date.now();
|
|
3370
|
+
const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
|
|
3371
|
+
let proposeOut;
|
|
3372
|
+
try {
|
|
3373
|
+
proposeOut = await config.propose({
|
|
3374
|
+
shot,
|
|
3375
|
+
goal: config.goal,
|
|
3376
|
+
state,
|
|
3377
|
+
priorReview,
|
|
3378
|
+
abortSignal: abort.signal,
|
|
3379
|
+
emitter: emitter ?? void 0
|
|
3380
|
+
});
|
|
3381
|
+
} catch (err) {
|
|
3382
|
+
await shotHandle?.fail(err instanceof Error ? err : String(err));
|
|
3383
|
+
failureClass = "unknown";
|
|
3384
|
+
throw err;
|
|
3385
|
+
}
|
|
3386
|
+
state = proposeOut.state;
|
|
3387
|
+
const traceSummary = proposeOut.traceSummary;
|
|
3388
|
+
let verification;
|
|
3389
|
+
try {
|
|
3390
|
+
verification = await config.verify(state);
|
|
3391
|
+
} catch (err) {
|
|
3392
|
+
await shotHandle?.fail(err instanceof Error ? err : String(err));
|
|
3393
|
+
failureClass = "unknown";
|
|
3394
|
+
throw err;
|
|
3395
|
+
}
|
|
3396
|
+
lastVerification = verification;
|
|
3397
|
+
const memorySnapshot = await memory.load();
|
|
3398
|
+
const verificationDigest = {
|
|
3399
|
+
pass: verification.pass,
|
|
3400
|
+
score: verification.score,
|
|
3401
|
+
failingLayers: verification.failingLayers ?? []
|
|
3402
|
+
};
|
|
3403
|
+
let review;
|
|
3404
|
+
let reviewAvailable = true;
|
|
3405
|
+
let reviewError;
|
|
3406
|
+
if (verification.pass) {
|
|
3407
|
+
review = {
|
|
3408
|
+
observations: "verification passed \u2014 skipping reviewer LLM call",
|
|
3409
|
+
diagnosis: "no failures to diagnose",
|
|
3410
|
+
nextShotInstruction: "(done)",
|
|
3411
|
+
shouldContinue: false,
|
|
3412
|
+
confidence: 1
|
|
3413
|
+
};
|
|
3414
|
+
} else {
|
|
3415
|
+
try {
|
|
3416
|
+
review = await config.review({
|
|
3417
|
+
shot,
|
|
3418
|
+
goal: config.goal,
|
|
3419
|
+
state,
|
|
3420
|
+
verification,
|
|
3421
|
+
traceSummary,
|
|
3422
|
+
memory: memorySnapshot
|
|
3423
|
+
});
|
|
3424
|
+
review = coerceReview(review);
|
|
3425
|
+
} catch (err) {
|
|
3426
|
+
reviewAvailable = false;
|
|
3427
|
+
reviewError = err instanceof Error ? err.message : String(err);
|
|
3428
|
+
const lastInstruction = memorySnapshot.length > 0 ? memorySnapshot[memorySnapshot.length - 1].nextShotInstruction : fallbackInstruction;
|
|
3429
|
+
review = {
|
|
3430
|
+
observations: "(reviewer unavailable \u2014 using last-known instruction)",
|
|
3431
|
+
diagnosis: reviewError,
|
|
3432
|
+
nextShotInstruction: lastInstruction,
|
|
3433
|
+
shouldContinue: true,
|
|
3434
|
+
confidence: 0.3
|
|
3435
|
+
};
|
|
3436
|
+
}
|
|
3437
|
+
}
|
|
3438
|
+
const entry = {
|
|
3439
|
+
shot,
|
|
3440
|
+
timestamp: Date.now(),
|
|
3441
|
+
...review,
|
|
3442
|
+
verification: verificationDigest
|
|
3443
|
+
};
|
|
3444
|
+
await memory.append(entry);
|
|
3445
|
+
const shotRecord = {
|
|
3446
|
+
shot,
|
|
3447
|
+
state,
|
|
3448
|
+
verification,
|
|
3449
|
+
traceSummary,
|
|
3450
|
+
review,
|
|
3451
|
+
reviewAvailable,
|
|
3452
|
+
reviewError,
|
|
3453
|
+
durationMs: Date.now() - shotStart
|
|
3454
|
+
};
|
|
3455
|
+
shots.push(shotRecord);
|
|
3456
|
+
await shotHandle?.end({
|
|
3457
|
+
attributes: {
|
|
3458
|
+
verificationPass: verification.pass,
|
|
3459
|
+
verificationScore: verification.score ?? null,
|
|
3460
|
+
reviewShouldContinue: review.shouldContinue,
|
|
3461
|
+
reviewConfidence: review.confidence,
|
|
3462
|
+
reviewAvailable
|
|
3463
|
+
}
|
|
3464
|
+
});
|
|
3465
|
+
if (verification.pass) {
|
|
3466
|
+
completed = true;
|
|
3467
|
+
break;
|
|
3468
|
+
}
|
|
3469
|
+
if (!review.shouldContinue) {
|
|
3470
|
+
break;
|
|
3471
|
+
}
|
|
3472
|
+
if (confidenceFloorWindow > 0 && review.confidence <= confidenceFloor) {
|
|
3473
|
+
lowConfidenceStreak += 1;
|
|
3474
|
+
if (lowConfidenceStreak >= confidenceFloorWindow) break;
|
|
3475
|
+
} else {
|
|
3476
|
+
lowConfidenceStreak = 0;
|
|
3477
|
+
}
|
|
3478
|
+
priorReview = review;
|
|
3479
|
+
}
|
|
3480
|
+
if (!completed && !failureClass) {
|
|
3481
|
+
failureClass = shots.length >= maxShots ? "budget_exceeded" : "unknown";
|
|
3482
|
+
}
|
|
3483
|
+
} finally {
|
|
3484
|
+
clearTimeout(wallTimer);
|
|
3485
|
+
}
|
|
3486
|
+
const score = lastVerification.pass ? 1 : typeof lastVerification.score === "number" ? lastVerification.score : 0;
|
|
3487
|
+
if (emitter) {
|
|
3488
|
+
await emitter.endRun({
|
|
3489
|
+
pass: completed,
|
|
3490
|
+
score,
|
|
3491
|
+
failureClass,
|
|
3492
|
+
notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
|
|
3493
|
+
});
|
|
3494
|
+
}
|
|
3495
|
+
return {
|
|
3496
|
+
runId: emitter?.runId ?? null,
|
|
3497
|
+
completed,
|
|
3498
|
+
shots,
|
|
3499
|
+
finalState: state,
|
|
3500
|
+
finalVerification: lastVerification,
|
|
3501
|
+
failureClass,
|
|
3502
|
+
wallMs: Date.now() - wallStart,
|
|
3503
|
+
score
|
|
3504
|
+
};
|
|
3505
|
+
}
|
|
3506
|
+
var REVIEWER_SYSTEM_PROMPT = `You are a senior reviewer directing a multi-shot build loop.
|
|
3507
|
+
You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.
|
|
3508
|
+
You are blind to the worker's inner monologue. You see what it DID, not what it thought.
|
|
3509
|
+
Return STRICT JSON matching the schema. No prose outside the JSON.`;
|
|
3510
|
+
function createLlmReviewer(cfg) {
|
|
3511
|
+
const renderState = cfg.renderState ?? ((s) => safeJson(s));
|
|
3512
|
+
const renderTraceSummary = cfg.renderTraceSummary ?? ((s) => s === void 0 ? "(none)" : safeJson(s));
|
|
3513
|
+
const system = cfg.systemPromptAddendum ? `${REVIEWER_SYSTEM_PROMPT}
|
|
3514
|
+
|
|
3515
|
+
${cfg.systemPromptAddendum}` : REVIEWER_SYSTEM_PROMPT;
|
|
3516
|
+
return async (input) => {
|
|
3517
|
+
const memoryBlock = input.memory.length === 0 ? "(no prior shots \u2014 this is shot 1)" : input.memory.map((m) => [
|
|
3518
|
+
`shot ${m.shot} \u2014 verification.pass=${m.verification.pass}` + (typeof m.verification.score === "number" ? ` score=${m.verification.score.toFixed(2)}` : "") + ` confidence=${m.confidence.toFixed(2)} failing=[${(m.verification.failingLayers ?? []).join(",")}]`,
|
|
3519
|
+
` observations: ${m.observations.slice(0, 400)}`,
|
|
3520
|
+
` diagnosis: ${m.diagnosis.slice(0, 400)}`,
|
|
3521
|
+
` instruction given: ${m.nextShotInstruction.slice(0, 400)}`
|
|
3522
|
+
].join("\n")).join("\n\n");
|
|
3523
|
+
const user = [
|
|
3524
|
+
`=== GOAL ===`,
|
|
3525
|
+
input.goal,
|
|
3526
|
+
``,
|
|
3527
|
+
`=== SHOT NUMBER ===`,
|
|
3528
|
+
String(input.shot),
|
|
3529
|
+
``,
|
|
3530
|
+
`=== CURRENT STATE ===`,
|
|
3531
|
+
renderState(input.state),
|
|
3532
|
+
``,
|
|
3533
|
+
`=== TRACE SUMMARY ===`,
|
|
3534
|
+
renderTraceSummary(input.traceSummary),
|
|
3535
|
+
``,
|
|
3536
|
+
`=== VERIFICATION ===`,
|
|
3537
|
+
summarizeVerification(input.verification),
|
|
3538
|
+
``,
|
|
3539
|
+
`=== REVIEWER MEMORY (prior shots) ===`,
|
|
3540
|
+
memoryBlock,
|
|
3541
|
+
``,
|
|
3542
|
+
`=== YOUR TASK ===`,
|
|
3543
|
+
`Return STRICT JSON:`,
|
|
3544
|
+
`{`,
|
|
3545
|
+
` "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)`,
|
|
3546
|
+
` "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)`,
|
|
3547
|
+
` "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)`,
|
|
3548
|
+
` "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)`,
|
|
3549
|
+
` "confidence": number in [0,1]`,
|
|
3550
|
+
`}`
|
|
3551
|
+
].join("\n");
|
|
3552
|
+
const raw = await cfg.callJson({ system, user });
|
|
3553
|
+
return coerceReview(raw);
|
|
3554
|
+
};
|
|
3555
|
+
}
|
|
3556
|
+
function coerceReview(raw) {
|
|
3557
|
+
if (!raw || typeof raw !== "object") {
|
|
3558
|
+
throw new Error("reviewer returned non-object");
|
|
3559
|
+
}
|
|
3560
|
+
const observations = typeof raw.observations === "string" ? raw.observations : "";
|
|
3561
|
+
const diagnosis = typeof raw.diagnosis === "string" ? raw.diagnosis : "";
|
|
3562
|
+
const nextShotInstruction = typeof raw.nextShotInstruction === "string" ? raw.nextShotInstruction : "";
|
|
3563
|
+
if (!observations || !diagnosis || !nextShotInstruction) {
|
|
3564
|
+
throw new Error("reviewer missing required string fields");
|
|
3565
|
+
}
|
|
3566
|
+
if (typeof raw.shouldContinue !== "boolean") {
|
|
3567
|
+
throw new Error("reviewer missing shouldContinue boolean");
|
|
3568
|
+
}
|
|
3569
|
+
const confidenceRaw = Number(raw.confidence);
|
|
3570
|
+
if (!Number.isFinite(confidenceRaw)) {
|
|
3571
|
+
throw new Error("reviewer confidence not finite");
|
|
3572
|
+
}
|
|
3573
|
+
return {
|
|
3574
|
+
observations,
|
|
3575
|
+
diagnosis,
|
|
3576
|
+
nextShotInstruction,
|
|
3577
|
+
shouldContinue: raw.shouldContinue,
|
|
3578
|
+
confidence: Math.max(0, Math.min(1, confidenceRaw))
|
|
3579
|
+
};
|
|
3580
|
+
}
|
|
3581
|
+
function summarizeVerification(v) {
|
|
3582
|
+
const header = `pass=${v.pass}` + (typeof v.score === "number" ? ` score=${v.score.toFixed(3)}` : "") + (v.failingLayers && v.failingLayers.length > 0 ? ` failing=[${v.failingLayers.join(", ")}]` : "");
|
|
3583
|
+
const details = v.details === void 0 ? "" : `
|
|
3584
|
+
${safeJson(v.details).slice(0, 1500)}`;
|
|
3585
|
+
return header + details;
|
|
3586
|
+
}
|
|
3587
|
+
function safeJson(x) {
|
|
3588
|
+
try {
|
|
3589
|
+
return JSON.stringify(x, null, 2);
|
|
3590
|
+
} catch {
|
|
3591
|
+
return String(x);
|
|
3592
|
+
}
|
|
3593
|
+
}
|
|
3594
|
+
|
|
3595
|
+
// src/trace/schema.ts
|
|
3596
|
+
var TRACE_SCHEMA_VERSION = "1.0.0";
|
|
3597
|
+
var FAILURE_CLASSES = [
|
|
3598
|
+
"success",
|
|
3599
|
+
"reasoning_error",
|
|
3600
|
+
"tool_selection_error",
|
|
3601
|
+
"tool_argument_error",
|
|
3602
|
+
"tool_recovery_failure",
|
|
3603
|
+
"hallucination",
|
|
3604
|
+
"instruction_following",
|
|
3605
|
+
"safety_refusal_miss",
|
|
3606
|
+
"policy_violation",
|
|
3607
|
+
"budget_exceeded",
|
|
3608
|
+
"format_drift",
|
|
3609
|
+
"permission_escalation",
|
|
3610
|
+
"pii_leak",
|
|
3611
|
+
"cost_overrun",
|
|
3612
|
+
"timeout",
|
|
3613
|
+
"sandbox_failure",
|
|
3614
|
+
"unknown"
|
|
3615
|
+
];
|
|
3616
|
+
function isLlmSpan(s) {
|
|
3617
|
+
return s.kind === "llm";
|
|
3618
|
+
}
|
|
3619
|
+
function isToolSpan(s) {
|
|
3620
|
+
return s.kind === "tool";
|
|
3621
|
+
}
|
|
3622
|
+
function isRetrievalSpan(s) {
|
|
3623
|
+
return s.kind === "retrieval";
|
|
3624
|
+
}
|
|
3625
|
+
function isJudgeSpan(s) {
|
|
3626
|
+
return s.kind === "judge";
|
|
3627
|
+
}
|
|
3628
|
+
function isSandboxSpan(s) {
|
|
3629
|
+
return s.kind === "sandbox";
|
|
3630
|
+
}
|
|
3631
|
+
|
|
2600
3632
|
// src/trace/query.ts
|
|
2601
3633
|
async function runsForScenario(store, scenarioId) {
|
|
2602
3634
|
return store.listRuns({ scenarioId });
|
|
@@ -2825,181 +3857,6 @@ function runToTraceId(run) {
|
|
|
2825
3857
|
return cleaned.slice(0, 32).padEnd(32, "0");
|
|
2826
3858
|
}
|
|
2827
3859
|
|
|
2828
|
-
// src/sandbox-harness.ts
|
|
2829
|
-
var vitestTestParser = {
|
|
2830
|
-
id: "vitest",
|
|
2831
|
-
parse(stdout) {
|
|
2832
|
-
const m = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
|
|
2833
|
-
if (!m) return void 0;
|
|
2834
|
-
let passed = 0;
|
|
2835
|
-
let failed = 0;
|
|
2836
|
-
const a = parseInt(m[1], 10);
|
|
2837
|
-
const aLabel = m[2].toLowerCase();
|
|
2838
|
-
if (aLabel === "passed") passed += a;
|
|
2839
|
-
else failed += a;
|
|
2840
|
-
if (m[3] && m[4]) {
|
|
2841
|
-
const b = parseInt(m[3], 10);
|
|
2842
|
-
if (m[4].toLowerCase() === "passed") passed += b;
|
|
2843
|
-
else failed += b;
|
|
2844
|
-
}
|
|
2845
|
-
return { testsTotal: passed + failed, testsPassed: passed };
|
|
2846
|
-
}
|
|
2847
|
-
};
|
|
2848
|
-
var pytestTestParser = {
|
|
2849
|
-
id: "pytest",
|
|
2850
|
-
parse(stdout) {
|
|
2851
|
-
const total = stdout.match(/collected\s+(\d+)\s+items?/i);
|
|
2852
|
-
const passed = stdout.match(/(\d+)\s+passed/);
|
|
2853
|
-
if (!total || !passed) return void 0;
|
|
2854
|
-
return { testsTotal: parseInt(total[1], 10), testsPassed: parseInt(passed[1], 10) };
|
|
2855
|
-
}
|
|
2856
|
-
};
|
|
2857
|
-
var jestTestParser = {
|
|
2858
|
-
id: "jest",
|
|
2859
|
-
parse(stdout) {
|
|
2860
|
-
const m = stdout.match(/Tests:\s+(?:(\d+)\s+failed[^,]*,\s*)?(\d+)\s+passed,\s+(\d+)\s+total/i);
|
|
2861
|
-
if (!m) return void 0;
|
|
2862
|
-
return { testsTotal: parseInt(m[3], 10), testsPassed: parseInt(m[2], 10) };
|
|
2863
|
-
}
|
|
2864
|
-
};
|
|
2865
|
-
function composeParsers(...parsers) {
|
|
2866
|
-
return {
|
|
2867
|
-
id: parsers.map((p) => p.id).join("|"),
|
|
2868
|
-
parse(stdout, stderr, exitCode) {
|
|
2869
|
-
for (const p of parsers) {
|
|
2870
|
-
const res = p.parse(stdout, stderr, exitCode);
|
|
2871
|
-
if (res) return res;
|
|
2872
|
-
}
|
|
2873
|
-
return void 0;
|
|
2874
|
-
}
|
|
2875
|
-
};
|
|
2876
|
-
}
|
|
2877
|
-
var SubprocessSandboxDriver = class {
|
|
2878
|
-
id = "subprocess";
|
|
2879
|
-
async exec(phase, command, config) {
|
|
2880
|
-
const { spawn } = await import("child_process");
|
|
2881
|
-
const start = Date.now();
|
|
2882
|
-
return await new Promise((resolve) => {
|
|
2883
|
-
const child = spawn(command, {
|
|
2884
|
-
shell: true,
|
|
2885
|
-
cwd: config.cwd,
|
|
2886
|
-
env: { ...process.env, ...config.env ?? {} }
|
|
2887
|
-
});
|
|
2888
|
-
let stdout = "";
|
|
2889
|
-
let stderr = "";
|
|
2890
|
-
child.stdout?.on("data", (d) => {
|
|
2891
|
-
stdout += String(d);
|
|
2892
|
-
});
|
|
2893
|
-
child.stderr?.on("data", (d) => {
|
|
2894
|
-
stderr += String(d);
|
|
2895
|
-
});
|
|
2896
|
-
const timeout = setTimeout(() => {
|
|
2897
|
-
try {
|
|
2898
|
-
child.kill("SIGKILL");
|
|
2899
|
-
} catch {
|
|
2900
|
-
}
|
|
2901
|
-
}, config.timeoutMs ?? 10 * 6e4);
|
|
2902
|
-
child.on("close", (code) => {
|
|
2903
|
-
clearTimeout(timeout);
|
|
2904
|
-
const wallMs = Date.now() - start;
|
|
2905
|
-
const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
|
|
2906
|
-
resolve({
|
|
2907
|
-
phase,
|
|
2908
|
-
exitCode: code ?? 1,
|
|
2909
|
-
stdout,
|
|
2910
|
-
stderr,
|
|
2911
|
-
wallMs,
|
|
2912
|
-
testsTotal: parsed?.testsTotal,
|
|
2913
|
-
testsPassed: parsed?.testsPassed
|
|
2914
|
-
});
|
|
2915
|
-
});
|
|
2916
|
-
child.on("error", (err) => {
|
|
2917
|
-
clearTimeout(timeout);
|
|
2918
|
-
const wallMs = Date.now() - start;
|
|
2919
|
-
resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
|
|
2920
|
-
});
|
|
2921
|
-
});
|
|
2922
|
-
}
|
|
2923
|
-
};
|
|
2924
|
-
var DockerSandboxDriver = class {
|
|
2925
|
-
id = "docker";
|
|
2926
|
-
async exec(phase, command, config) {
|
|
2927
|
-
if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
|
|
2928
|
-
const sub = new SubprocessSandboxDriver();
|
|
2929
|
-
const envArgs = Object.entries(config.env ?? {}).map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`).join(" ");
|
|
2930
|
-
const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
|
|
2931
|
-
return sub.exec(phase, wrapped, { ...config, env: void 0 });
|
|
2932
|
-
}
|
|
2933
|
-
};
|
|
2934
|
-
function shellQuote(v) {
|
|
2935
|
-
if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(v)) return v;
|
|
2936
|
-
return `'${v.replace(/'/g, `'\\''`)}'`;
|
|
2937
|
-
}
|
|
2938
|
-
var SandboxHarness = class {
|
|
2939
|
-
driver;
|
|
2940
|
-
constructor(driver = new SubprocessSandboxDriver()) {
|
|
2941
|
-
this.driver = driver;
|
|
2942
|
-
}
|
|
2943
|
-
async run(config, emitter) {
|
|
2944
|
-
const handle = await emitter.sandbox({
|
|
2945
|
-
name: `sandbox(${this.driver.id})`,
|
|
2946
|
-
image: config.image,
|
|
2947
|
-
command: [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ")
|
|
2948
|
-
});
|
|
2949
|
-
const result = { passed: false, totalWallMs: 0, score: 0 };
|
|
2950
|
-
try {
|
|
2951
|
-
if (config.setupCommand) {
|
|
2952
|
-
result.setup = await this.driver.exec("setup", config.setupCommand, config);
|
|
2953
|
-
result.totalWallMs += result.setup.wallMs;
|
|
2954
|
-
if (result.setup.exitCode !== 0) {
|
|
2955
|
-
await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
|
|
2956
|
-
exitCode: result.setup.exitCode,
|
|
2957
|
-
wallMs: result.totalWallMs
|
|
2958
|
-
});
|
|
2959
|
-
return result;
|
|
2960
|
-
}
|
|
2961
|
-
}
|
|
2962
|
-
if (config.runCommand) {
|
|
2963
|
-
result.run = await this.driver.exec("run", config.runCommand, config);
|
|
2964
|
-
result.totalWallMs += result.run.wallMs;
|
|
2965
|
-
if (result.run.exitCode !== 0) {
|
|
2966
|
-
await handle.fail(`run failed (exit ${result.run.exitCode})`, {
|
|
2967
|
-
exitCode: result.run.exitCode,
|
|
2968
|
-
wallMs: result.totalWallMs
|
|
2969
|
-
});
|
|
2970
|
-
return result;
|
|
2971
|
-
}
|
|
2972
|
-
}
|
|
2973
|
-
if (config.testCommand) {
|
|
2974
|
-
result.test = await this.driver.exec("test", config.testCommand, config);
|
|
2975
|
-
result.totalWallMs += result.test.wallMs;
|
|
2976
|
-
const passed = result.test.exitCode === 0;
|
|
2977
|
-
result.passed = passed;
|
|
2978
|
-
if (result.test.testsTotal !== void 0 && result.test.testsTotal > 0) {
|
|
2979
|
-
result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal;
|
|
2980
|
-
} else {
|
|
2981
|
-
result.score = passed ? 1 : 0;
|
|
2982
|
-
}
|
|
2983
|
-
await handle.end({
|
|
2984
|
-
exitCode: result.test.exitCode,
|
|
2985
|
-
testsTotal: result.test.testsTotal,
|
|
2986
|
-
testsPassed: result.test.testsPassed,
|
|
2987
|
-
wallMs: result.totalWallMs,
|
|
2988
|
-
status: passed ? "ok" : "error"
|
|
2989
|
-
});
|
|
2990
|
-
} else {
|
|
2991
|
-
result.passed = true;
|
|
2992
|
-
result.score = 1;
|
|
2993
|
-
await handle.end({ wallMs: result.totalWallMs });
|
|
2994
|
-
}
|
|
2995
|
-
} catch (err) {
|
|
2996
|
-
await handle.fail(err instanceof Error ? err : String(err));
|
|
2997
|
-
throw err;
|
|
2998
|
-
}
|
|
2999
|
-
return result;
|
|
3000
|
-
}
|
|
3001
|
-
};
|
|
3002
|
-
|
|
3003
3860
|
// src/test-graded-scenario.ts
|
|
3004
3861
|
async function runTestGradedScenario(scenario, store, options = {}) {
|
|
3005
3862
|
const emitter = new TraceEmitter(store);
|
|
@@ -3619,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
|
|
|
3619
4476
|
if (s.baseline.length < 2 || s.candidate.length < 2) {
|
|
3620
4477
|
throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
|
|
3621
4478
|
}
|
|
3622
|
-
const bMean =
|
|
3623
|
-
const cMean =
|
|
4479
|
+
const bMean = mean2(s.baseline);
|
|
4480
|
+
const cMean = mean2(s.candidate);
|
|
3624
4481
|
const delta = cMean - bMean;
|
|
3625
4482
|
const d = cohensD(s.baseline, s.candidate);
|
|
3626
4483
|
const { t, df, p } = welchsTTest(s.baseline, s.candidate);
|
|
@@ -3659,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
|
|
|
3659
4516
|
hasUnstable: metrics.some((m) => m.verdict === "unstable")
|
|
3660
4517
|
};
|
|
3661
4518
|
}
|
|
3662
|
-
function
|
|
4519
|
+
function mean2(xs) {
|
|
3663
4520
|
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
3664
4521
|
}
|
|
3665
4522
|
function iqr(xs) {
|
|
@@ -3675,8 +4532,8 @@ function iqr(xs) {
|
|
|
3675
4532
|
}
|
|
3676
4533
|
function welchsTTest(a, b) {
|
|
3677
4534
|
if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
|
|
3678
|
-
const mA =
|
|
3679
|
-
const mB =
|
|
4535
|
+
const mA = mean2(a);
|
|
4536
|
+
const mB = mean2(b);
|
|
3680
4537
|
const vA = variance(a, mA);
|
|
3681
4538
|
const vB = variance(b, mB);
|
|
3682
4539
|
const seSquared = vA / a.length + vB / b.length;
|
|
@@ -4032,41 +4889,6 @@ function assertNonNegative(n, name) {
|
|
|
4032
4889
|
}
|
|
4033
4890
|
}
|
|
4034
4891
|
|
|
4035
|
-
// src/pareto.ts
|
|
4036
|
-
function dominates(a, b, objectives) {
|
|
4037
|
-
let strictlyBetter = false;
|
|
4038
|
-
for (const obj of objectives) {
|
|
4039
|
-
const av = obj.value(a);
|
|
4040
|
-
const bv = obj.value(b);
|
|
4041
|
-
if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
|
|
4042
|
-
const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
|
|
4043
|
-
const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
|
|
4044
|
-
if (aIsWorse) return false;
|
|
4045
|
-
if (aIsBetter) strictlyBetter = true;
|
|
4046
|
-
}
|
|
4047
|
-
return strictlyBetter;
|
|
4048
|
-
}
|
|
4049
|
-
function paretoFrontier(candidates, objectives) {
|
|
4050
|
-
if (objectives.length === 0) {
|
|
4051
|
-
throw new Error("paretoFrontier: at least 1 objective required");
|
|
4052
|
-
}
|
|
4053
|
-
const valid = candidates.filter(
|
|
4054
|
-
(c) => objectives.every((o) => Number.isFinite(o.value(c)))
|
|
4055
|
-
);
|
|
4056
|
-
const frontier = [];
|
|
4057
|
-
const dominated = [];
|
|
4058
|
-
for (const c of valid) {
|
|
4059
|
-
const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
|
|
4060
|
-
if (isDominated) dominated.push(c);
|
|
4061
|
-
else frontier.push(c);
|
|
4062
|
-
}
|
|
4063
|
-
const dominanceMap = frontier.map((d) => ({
|
|
4064
|
-
dominator: d,
|
|
4065
|
-
dominated: dominated.filter((x) => dominates(d, x, objectives))
|
|
4066
|
-
}));
|
|
4067
|
-
return { frontier, dominated, dominanceMap };
|
|
4068
|
-
}
|
|
4069
|
-
|
|
4070
4892
|
// src/series-convergence.ts
|
|
4071
4893
|
function analyzeSeries(values, options = {}) {
|
|
4072
4894
|
const window = options.window ?? 5;
|
|
@@ -4076,10 +4898,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
4076
4898
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
4077
4899
|
}
|
|
4078
4900
|
const tail = values.slice(-window);
|
|
4079
|
-
const
|
|
4080
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
4901
|
+
const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
4902
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
|
|
4081
4903
|
const stdDev = Math.sqrt(variance2);
|
|
4082
|
-
const refMean = Math.abs(
|
|
4904
|
+
const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
4083
4905
|
const cv = stdDev / refMean;
|
|
4084
4906
|
const stable = tail.length >= window && cv <= stableCv;
|
|
4085
4907
|
let tailRun = 0;
|
|
@@ -4100,7 +4922,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
4100
4922
|
} else {
|
|
4101
4923
|
state = "noisy";
|
|
4102
4924
|
}
|
|
4103
|
-
return { state, windowMean:
|
|
4925
|
+
return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
|
|
4104
4926
|
}
|
|
4105
4927
|
|
|
4106
4928
|
// src/state-continuity.ts
|
|
@@ -5028,12 +5850,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
5028
5850
|
variantScores.push({ mutator: id, score, mutated });
|
|
5029
5851
|
all.push(score);
|
|
5030
5852
|
}
|
|
5031
|
-
const
|
|
5032
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
5853
|
+
const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
5854
|
+
const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
|
|
5033
5855
|
const stdDev = Math.sqrt(variance2);
|
|
5034
|
-
const ref = Math.abs(
|
|
5856
|
+
const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
|
|
5035
5857
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
5036
|
-
return { originalScore, variantScores, meanScore:
|
|
5858
|
+
return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
|
|
5037
5859
|
}
|
|
5038
5860
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
5039
5861
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -5284,8 +6106,11 @@ async function scoreProject(store, projectId) {
|
|
|
5284
6106
|
const runtimeScore = runtimeScores.length > 0 ? runtimeScores.reduce((a, b) => a + b, 0) / runtimeScores.length : null;
|
|
5285
6107
|
const runtimePassed = runtime.filter((r) => r.outcome?.pass === true).length;
|
|
5286
6108
|
const runtimePassRate = runtime.length > 0 ? runtimePassed / runtime.length : null;
|
|
6109
|
+
const kind = runtime.length === 0 ? "scaffold-only" : "full";
|
|
6110
|
+
const complete = kind === "scaffold-only" ? metaScore !== null && buildScore !== null : metaScore !== null && buildScore !== null && runtimeScore !== null;
|
|
5287
6111
|
return {
|
|
5288
6112
|
projectId,
|
|
6113
|
+
kind,
|
|
5289
6114
|
builderRunId: builder?.runId,
|
|
5290
6115
|
metaScore,
|
|
5291
6116
|
buildRunId: build?.runId,
|
|
@@ -5293,7 +6118,7 @@ async function scoreProject(store, projectId) {
|
|
|
5293
6118
|
appRuntimeRunIds: runtime.map((r) => r.runId),
|
|
5294
6119
|
runtimeScore,
|
|
5295
6120
|
runtimePassRate,
|
|
5296
|
-
complete
|
|
6121
|
+
complete
|
|
5297
6122
|
};
|
|
5298
6123
|
}
|
|
5299
6124
|
async function scoreAllProjects(store) {
|
|
@@ -5715,8 +6540,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
|
|
|
5715
6540
|
function toBin(chunk, lower, upper) {
|
|
5716
6541
|
const xs = chunk.map((c) => c.x);
|
|
5717
6542
|
const ys = chunk.map((c) => c.y);
|
|
5718
|
-
const evalMean =
|
|
5719
|
-
const outcomeMean =
|
|
6543
|
+
const evalMean = mean3(xs);
|
|
6544
|
+
const outcomeMean = mean3(ys);
|
|
5720
6545
|
return {
|
|
5721
6546
|
lower: lower ?? Math.min(...xs),
|
|
5722
6547
|
upper: upper ?? Math.max(...xs),
|
|
@@ -5726,7 +6551,7 @@ function toBin(chunk, lower, upper) {
|
|
|
5726
6551
|
gap: Math.abs(outcomeMean - evalMean)
|
|
5727
6552
|
};
|
|
5728
6553
|
}
|
|
5729
|
-
function
|
|
6554
|
+
function mean3(xs) {
|
|
5730
6555
|
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
|
5731
6556
|
}
|
|
5732
6557
|
function defaultExtract4(metric) {
|
|
@@ -5951,8 +6776,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
5951
6776
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
5952
6777
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
5953
6778
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
5954
|
-
const
|
|
5955
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
6779
|
+
const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
6780
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
|
|
5956
6781
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
5957
6782
|
}
|
|
5958
6783
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -5974,8 +6799,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
5974
6799
|
const ranked = [...byRun.values()].sort(
|
|
5975
6800
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
5976
6801
|
);
|
|
5977
|
-
const
|
|
5978
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
6802
|
+
const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
6803
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
|
|
5979
6804
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
5980
6805
|
}
|
|
5981
6806
|
|
|
@@ -6505,8 +7330,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
6505
7330
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
6506
7331
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
6507
7332
|
if (scores.length < 3) continue;
|
|
6508
|
-
const
|
|
6509
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7333
|
+
const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7334
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
|
|
6510
7335
|
if (variance2 > varianceThreshold) {
|
|
6511
7336
|
targets.push({
|
|
6512
7337
|
reason: "high-variance",
|
|
@@ -6987,6 +7812,7 @@ async function euAiActReport(ctx, signals) {
|
|
|
6987
7812
|
}
|
|
6988
7813
|
export {
|
|
6989
7814
|
AgentDriver,
|
|
7815
|
+
AxGepaSteeringOptimizer,
|
|
6990
7816
|
BenchmarkRunner,
|
|
6991
7817
|
BudgetBreachError,
|
|
6992
7818
|
BudgetGuard,
|
|
@@ -6995,9 +7821,11 @@ export {
|
|
|
6995
7821
|
CostTracker,
|
|
6996
7822
|
DEFAULT_AGENT_SLOS,
|
|
6997
7823
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
7824
|
+
DEFAULT_HARNESS_OBJECTIVES,
|
|
6998
7825
|
DEFAULT_MUTATORS,
|
|
6999
7826
|
DEFAULT_REDACTION_RULES,
|
|
7000
7827
|
DEFAULT_RED_TEAM_CORPUS,
|
|
7828
|
+
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
7001
7829
|
Dataset,
|
|
7002
7830
|
DockerSandboxDriver,
|
|
7003
7831
|
DualAgentBench,
|
|
@@ -7011,15 +7839,19 @@ export {
|
|
|
7011
7839
|
InMemoryOutcomeStore,
|
|
7012
7840
|
InMemoryTraceStore,
|
|
7013
7841
|
InMemoryWorkspaceInspector,
|
|
7842
|
+
JudgeRunner,
|
|
7014
7843
|
MODEL_PRICING,
|
|
7015
7844
|
MetricsCollector,
|
|
7016
7845
|
OTEL_AGENT_EVAL_SCOPE,
|
|
7846
|
+
OptimizationLoop,
|
|
7847
|
+
PairwiseSteeringOptimizer,
|
|
7017
7848
|
PrmGrader,
|
|
7018
7849
|
ProductClient,
|
|
7019
7850
|
ProjectRegistry,
|
|
7020
7851
|
PromptOptimizer,
|
|
7021
7852
|
PromptRegistry,
|
|
7022
7853
|
REDACTION_VERSION,
|
|
7854
|
+
RunCritic,
|
|
7023
7855
|
SandboxHarness,
|
|
7024
7856
|
ScenarioRegistry,
|
|
7025
7857
|
SubprocessSandboxDriver,
|
|
@@ -7028,6 +7860,7 @@ export {
|
|
|
7028
7860
|
TraceEmitter,
|
|
7029
7861
|
adversarialJudge,
|
|
7030
7862
|
aggregateLlm,
|
|
7863
|
+
aggregateRunScore,
|
|
7031
7864
|
analyzeAntiSlop,
|
|
7032
7865
|
analyzeSeries,
|
|
7033
7866
|
argHash,
|
|
@@ -7044,6 +7877,7 @@ export {
|
|
|
7044
7877
|
causalAttribution,
|
|
7045
7878
|
checkCanaries,
|
|
7046
7879
|
checkSlos,
|
|
7880
|
+
clamp01,
|
|
7047
7881
|
classifyEuAiRisk,
|
|
7048
7882
|
classifyFailure,
|
|
7049
7883
|
codeExecutionJudge,
|
|
@@ -7052,6 +7886,7 @@ export {
|
|
|
7052
7886
|
collectionPreserved,
|
|
7053
7887
|
commitBisect,
|
|
7054
7888
|
compareToBaseline,
|
|
7889
|
+
compilerJudge,
|
|
7055
7890
|
composeParsers,
|
|
7056
7891
|
composeValidators,
|
|
7057
7892
|
computeToolUseMetrics,
|
|
@@ -7062,8 +7897,10 @@ export {
|
|
|
7062
7897
|
createAntiSlopJudge,
|
|
7063
7898
|
createCustomJudge,
|
|
7064
7899
|
createDomainExpertJudge,
|
|
7900
|
+
createLlmReviewer,
|
|
7065
7901
|
crossTraceDiff,
|
|
7066
7902
|
defaultJudges,
|
|
7903
|
+
distillPlaybook,
|
|
7067
7904
|
dominates,
|
|
7068
7905
|
estimateCost,
|
|
7069
7906
|
estimateTokens,
|
|
@@ -7085,6 +7922,7 @@ export {
|
|
|
7085
7922
|
groupBy,
|
|
7086
7923
|
hashContent,
|
|
7087
7924
|
hashScenarios,
|
|
7925
|
+
inMemoryReviewStore,
|
|
7088
7926
|
interRaterReliability,
|
|
7089
7927
|
iqr,
|
|
7090
7928
|
isJudgeSpan,
|
|
@@ -7096,14 +7934,17 @@ export {
|
|
|
7096
7934
|
jestTestParser,
|
|
7097
7935
|
jsonHasKeys,
|
|
7098
7936
|
jsonShape,
|
|
7937
|
+
jsonlReviewStore,
|
|
7099
7938
|
judgeAgreementView,
|
|
7100
7939
|
judgeSpans,
|
|
7101
7940
|
keyPreserved,
|
|
7941
|
+
linterJudge,
|
|
7102
7942
|
llmSpanFromProvider,
|
|
7103
7943
|
llmSpans,
|
|
7104
7944
|
loadScorerFromGrader,
|
|
7105
7945
|
lowercaseMutator,
|
|
7106
7946
|
mannWhitneyU,
|
|
7947
|
+
mergeSteeringBundle,
|
|
7107
7948
|
nistAiRmfReport,
|
|
7108
7949
|
nonRefusalRubric,
|
|
7109
7950
|
normalizeScores,
|
|
@@ -7131,6 +7972,8 @@ export {
|
|
|
7131
7972
|
regressionView,
|
|
7132
7973
|
renderMarkdown,
|
|
7133
7974
|
renderMarkdownReport,
|
|
7975
|
+
renderPlaybookMarkdown,
|
|
7976
|
+
renderSteeringText,
|
|
7134
7977
|
replayScorerOverCorpus,
|
|
7135
7978
|
replayTraceThroughJudge,
|
|
7136
7979
|
requiredSampleSize,
|
|
@@ -7142,6 +7985,9 @@ export {
|
|
|
7142
7985
|
runE2EWorkflow,
|
|
7143
7986
|
runExpectations,
|
|
7144
7987
|
runFailureClass,
|
|
7988
|
+
runHarnessExperiment,
|
|
7989
|
+
runJudgeFleet,
|
|
7990
|
+
runProposeReview,
|
|
7145
7991
|
runSelfPlay,
|
|
7146
7992
|
runTestGradedScenario,
|
|
7147
7993
|
runsForScenario,
|
|
@@ -7149,6 +7995,8 @@ export {
|
|
|
7149
7995
|
scoreContinuity,
|
|
7150
7996
|
scoreProject,
|
|
7151
7997
|
scoreRedTeamOutput,
|
|
7998
|
+
securityJudge,
|
|
7999
|
+
selectHarnessVariant,
|
|
7152
8000
|
selfPreference,
|
|
7153
8001
|
sentenceReorderMutator,
|
|
7154
8002
|
signManifest,
|
|
@@ -7156,6 +8004,8 @@ export {
|
|
|
7156
8004
|
statusAdvanced,
|
|
7157
8005
|
stuckLoopView,
|
|
7158
8006
|
summarize,
|
|
8007
|
+
summarizeHarnessResults,
|
|
8008
|
+
testJudge,
|
|
7159
8009
|
textInSnapshot,
|
|
7160
8010
|
toLangfuseEnvelope,
|
|
7161
8011
|
toNdjson,
|