@tangle-network/agent-eval 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +520 -79
- package/dist/index.js +1035 -322
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
package/dist/index.js
CHANGED
|
@@ -2094,113 +2094,340 @@ function flatSamples(score) {
|
|
|
2094
2094
|
return out;
|
|
2095
2095
|
}
|
|
2096
2096
|
|
|
2097
|
-
// src/
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
const proposal = await config.propose({
|
|
2116
|
-
scenario,
|
|
2117
|
-
roundIndex: r,
|
|
2118
|
-
priorProposal,
|
|
2119
|
-
priorCritique
|
|
2120
|
-
});
|
|
2121
|
-
const { critique, convergenceScore } = await config.critique({
|
|
2122
|
-
scenario,
|
|
2123
|
-
roundIndex: r,
|
|
2124
|
-
proposal
|
|
2125
|
-
});
|
|
2126
|
-
if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
|
|
2127
|
-
throw new Error(
|
|
2128
|
-
`critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
|
|
2129
|
-
);
|
|
2130
|
-
}
|
|
2131
|
-
const round = {
|
|
2132
|
-
roundIndex: r,
|
|
2133
|
-
proposal,
|
|
2134
|
-
critique,
|
|
2135
|
-
convergenceScore
|
|
2136
|
-
};
|
|
2137
|
-
history.push(round);
|
|
2138
|
-
config.onRoundComplete?.({ scenarioId: scenario.id, round });
|
|
2139
|
-
finalProposal = proposal;
|
|
2140
|
-
lastScore = convergenceScore;
|
|
2141
|
-
priorCritique = critique;
|
|
2142
|
-
if (convergenceScore >= threshold) {
|
|
2143
|
-
converged = true;
|
|
2144
|
-
roundsToConverge = r + 1;
|
|
2145
|
-
break;
|
|
2146
|
-
}
|
|
2147
|
-
}
|
|
2148
|
-
results.push({
|
|
2149
|
-
scenarioId: scenario.id,
|
|
2150
|
-
converged,
|
|
2151
|
-
roundsToConverge,
|
|
2152
|
-
finalProposal,
|
|
2153
|
-
history,
|
|
2154
|
-
finalScore: lastScore
|
|
2155
|
-
});
|
|
2097
|
+
// src/steering.ts
|
|
2098
|
+
/**
 * Overlay a partial steering bundle on top of a base bundle.
 *
 * - `coderPrompt` / `continuePrompt` are replaced only when `delta` defines them.
 * - `reviewerPrompts`, `rolePrompts` and `metadata` are shallow-merged key by key
 *   (delta wins) and are always present on the result, even when both inputs omit them.
 * - `skills` is taken wholesale from `delta` when it is non-nullish, otherwise from `base`.
 *
 * @param {object} base - Bundle providing the defaults.
 * @param {object} delta - Partial bundle whose values take precedence.
 * @returns {object} A new bundle object; neither input is mutated.
 */
function mergeSteeringBundle(base, delta) {
  const merged = { ...base };
  if (delta.coderPrompt !== undefined) {
    merged.coderPrompt = delta.coderPrompt;
  }
  if (delta.continuePrompt !== undefined) {
    merged.continuePrompt = delta.continuePrompt;
  }
  merged.reviewerPrompts = Object.assign({}, base.reviewerPrompts ?? {}, delta.reviewerPrompts ?? {});
  merged.skills = delta.skills ?? base.skills;
  merged.rolePrompts = Object.assign({}, base.rolePrompts ?? {}, delta.rolePrompts ?? {});
  merged.metadata = Object.assign({}, base.metadata ?? {}, delta.metadata ?? {});
  return merged;
}
|
|
2118
|
+
/**
 * Render a steering bundle as a newline-separated text block.
 *
 * Emits `bundle:<id>`, then the coder and continue prompts when truthy, then one
 * `reviewer:<name>:<prompt>` line per reviewer prompt (names ordered with
 * `localeCompare`), then a single `skills:` line with the skills sorted and
 * comma-joined when there is at least one skill.
 *
 * @param {object} bundle - Steering bundle to render.
 * @returns {string} The rendered lines joined with "\n".
 */
function renderSteeringText(bundle) {
  const out = [];
  out.push(`bundle:${bundle.id}`);
  if (bundle.coderPrompt) {
    out.push(`coder:${bundle.coderPrompt}`);
  }
  if (bundle.continuePrompt) {
    out.push(`continue:${bundle.continuePrompt}`);
  }
  const reviewerEntries = Object.entries(bundle.reviewerPrompts ?? {});
  reviewerEntries.sort((left, right) => left[0].localeCompare(right[0]));
  for (const entry of reviewerEntries) {
    out.push(`reviewer:${entry[0]}:${entry[1]}`);
  }
  // Sort a copy so the caller's skills array is left untouched.
  const sortedSkills = [...bundle.skills ?? []];
  sortedSkills.sort();
  if (sortedSkills.length) {
    out.push(`skills:${sortedSkills.join(",")}`);
  }
  return out.join("\n");
}
|
|
2128
|
+
|
|
2129
|
+
// src/run-score.ts
|
|
2130
|
+
/**
 * Default weight applied to each run-score dimension by `aggregateRunScore`.
 * Negative weights are penalties (drift, dollar cost, wall-clock minutes).
 */
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  costUsd: -0.2,
  wallSeconds: -0.1
};

/**
 * Collapse a run-score record into a single weighted number.
 *
 * Quality dimensions are clamped to [0, 1] before weighting. `costUsd` is used as a
 * non-negative dollar amount and `wallSeconds` as non-negative minutes
 * (`wallSeconds / 60`). Missing or non-finite cost/time values count as 0, matching
 * how `clamp01` treats the quality dimensions, so a partially filled score can no
 * longer turn the whole aggregate into NaN.
 *
 * @param {object} score - Run score with the dimensions listed in DEFAULT_RUN_SCORE_WEIGHTS.
 * @param {object} [weights] - Per-dimension overrides; keys set to `undefined` are
 *   ignored so they fall back to the default weight.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS };
  for (const [key, value] of Object.entries(weights ?? {})) {
    if (value !== void 0) w[key] = value;
  }
  // Cost and duration are unbounded above, so only the lower bound is enforced.
  const nonNegative = (value) => Number.isFinite(value) ? Math.max(0, value) : 0;
  return w.success * clamp01(score.success)
    + w.goalProgress * clamp01(score.goalProgress)
    + w.repoGroundedness * clamp01(score.repoGroundedness)
    + w.driftPenalty * clamp01(score.driftPenalty)
    + w.toolUseQuality * clamp01(score.toolUseQuality)
    + w.patchQuality * clamp01(score.patchQuality)
    + w.testReality * clamp01(score.testReality)
    + w.costUsd * nonNegative(score.costUsd)
    + w.wallSeconds * nonNegative(score.wallSeconds / 60);
}

/**
 * Clamp a value into [0, 1]; anything that is not a finite number becomes 0.
 *
 * @param {number} value
 * @returns {number}
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}
|
|
2149
|
+
|
|
2150
|
+
// src/run-critic.ts
|
|
2151
|
+
/**
 * Case-insensitive patterns RunCritic uses by default to flag text as "drift".
 * None of them carries the `g` or `y` flag.
 */
var DEFAULT_DRIFT_PATTERNS = [
  // Any http:// or https:// URL.
  /https?:\/\//i,
  // "title: ", "summary: " and "url: " field labels.
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  // The literal phrase "npm package usage".
  /\bnpm package usage\b/i,
  // The standalone word "news".
  /\bnews\b/i
];
|
|
2159
|
+
/**
 * Heuristic critic that derives a run-score record from a recorded trace
 * (run metadata, spans, events, artifacts and budget entries).
 */
var RunCritic = class {
  weights;
  driftPatterns;
  /**
   * @param {object} [options]
   * @param {object} [options.weights] - Weight overrides forwarded to `aggregateRunScore` by `rank`.
   * @param {RegExp[]} [options.driftPatterns] - Patterns marking text as drift;
   *   defaults to DEFAULT_DRIFT_PATTERNS.
   */
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }
  /**
   * Load everything recorded for `runId` from `store` and score it.
   *
   * @param {object} store - Must expose `getRun`, `spans`, `events`, `artifacts` and `budget`.
   * @param {string} runId
   * @returns {Promise<object>} The record produced by `scoreTrace`.
   * @throws {Error} When the store has no run with that id.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new Error(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }
  /**
   * Score an already-loaded trace.
   *
   * @param {{run: object, spans: object[], events: object[], artifacts: object[], budget: object[]}} trace
   * @returns {object} `{ success, goalProgress, repoGroundedness, driftPenalty,
   *   toolUseQuality, patchQuality, testReality, costUsd, wallSeconds, notes }`.
   */
  scoreTrace(trace) {
    const notes = [];
    const llmSpans = trace.spans.filter((s) => s.kind === "llm");
    const toolSpans = trace.spans.filter((s) => s.kind === "tool");
    const judgeSpans = trace.spans.filter((s) => s.kind === "judge");
    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
    // 1 for an explicit pass, 0.5 for a run that completed without one, 0 otherwise.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    if (!success) notes.push("run did not complete with pass=true");
    const judgeAverage = judgeSpans.length ? judgeSpans.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans.length : void 0;
    // Outcome scores above 1 are read as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;
    const successfulTools = toolSpans.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans.length === 0 ? 0 : successfulTools / toolSpans.length;
    if (toolSpans.length === 0) notes.push("no tool spans recorded");
    // Artifacts plus write/edit/patch/apply tool calls; four pieces of evidence saturate the score.
    const patchEvidence = trace.artifacts.length + toolSpans.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");
    const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
    // Prefer the mean sandbox pass ratio; otherwise give partial credit (0.4) when a tool
    // call's arguments mention a test/build command. In the regex only the first
    // alternative carries the leading \b and only the last the trailing \b.
    const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");
    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    // Groundedness and drift are complementary shares of the same signal total.
    const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
    const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
    // Use the largest consumed "usd" budget entry when there is one. A budget that only
    // tracks other dimensions says nothing about dollars, so fall back to the summed
    // LLM span costs instead of reporting 0.
    const usdConsumed = trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed);
    const costUsd = usdConsumed.length ? Math.max(...usdConsumed, 0) : llmSpans.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
    // Timestamps are divided by 1e3 to obtain seconds.
    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      costUsd,
      wallSeconds,
      notes
    };
  }
  /**
   * Collapse a score record into one number using this critic's weights.
   *
   * @param {object} score
   * @returns {number}
   */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }
  /**
   * Report whether `text` matches any drift pattern.
   *
   * @param {string} text
   * @returns {boolean}
   */
  isDrift(text) {
    return this.driftPatterns.some((pattern) => {
      // Global/sticky regexes remember lastIndex between test() calls; reset it so a
      // caller-supplied /g pattern gives the same answer on every call.
      pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
};
|
|
2224
|
+
/**
 * Bring a judge score into [0, 1]: values above 1 are divided by 10 first,
 * then the result is passed through `clamp01`.
 *
 * @param {number} score
 * @returns {number}
 */
function normalizeJudgeScore(score) {
  const scaled = score > 1 ? score / 10 : score;
  return clamp01(scaled);
}
|
|
2227
|
+
/**
 * Report whether `text` mentions repository-style markers: `src/` or `test(s)/`
 * paths, `package.json`, `tsconfig`, `.ts`/`.tsx` extensions, `git status`,
 * `pnpm `/`npm ` commands, or the vitest/pytest/jest runners (case-insensitive).
 *
 * @param {string} text
 * @returns {boolean}
 */
function looksRepoGrounded(text) {
  const repoMarkers = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoMarkers.test(text);
}
|
|
2168
2230
|
|
|
2169
|
-
// src/
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
"
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2231
|
+
// src/playbook.ts
|
|
2232
|
+
/**
 * Deduplicate playbook entries and keep the heaviest ones.
 *
 * Entries are grouped by their normalized instruction; within a group the entry
 * with the strictly greatest weight wins (the first one seen wins ties) and is
 * stored with a canonicalized instruction. Survivors are ordered by descending
 * weight and truncated to `options.maxEntries` (default 12). A missing weight
 * counts as 0.
 *
 * @param {Iterable<object>} entries - Candidate entries with `instruction` and optional `weight`.
 * @param {{maxEntries?: number}} [options]
 * @returns {{entries: object[]}}
 */
function distillPlaybook(entries, options = {}) {
  const limit = options.maxEntries ?? 12;
  const weightOf = (item) => item.weight ?? 0;
  const strongest = /* @__PURE__ */ new Map();
  for (const candidate of entries) {
    const dedupeKey = normalizeInstruction(candidate.instruction);
    const incumbent = strongest.get(dedupeKey);
    if (incumbent && !(weightOf(candidate) > weightOf(incumbent))) {
      continue;
    }
    strongest.set(dedupeKey, { ...candidate, instruction: canonicalInstruction(candidate.instruction) });
  }
  const ranked = [...strongest.values()];
  ranked.sort((a, b) => weightOf(b) - weightOf(a));
  return { entries: ranked.slice(0, limit) };
}
|
|
2245
|
+
/**
 * Render a playbook as Markdown: a "# Playbook" heading followed by one bullet per
 * entry with its rationale and, when present, category, evidence and source run.
 * The output is trimmed and ends with exactly one newline.
 *
 * @param {{entries: object[]}} playbook
 * @returns {string}
 */
function renderPlaybookMarkdown(playbook) {
  const out = ["# Playbook", ""];
  for (const { instruction, rationale, category, evidence, sourceRunId } of playbook.entries) {
    out.push(`- ${instruction}`, ` Rationale: ${rationale}`);
    if (category) {
      out.push(` Category: ${category}`);
    }
    if (evidence) {
      out.push(` Evidence: ${evidence}`);
    }
    if (sourceRunId) {
      out.push(` Source run: ${sourceRunId}`);
    }
    // Blank separator line after every entry.
    out.push("");
  }
  const body = out.join("\n").trim();
  return body + "\n";
}
|
|
2193
|
-
function
|
|
2194
|
-
return s
|
|
2257
|
+
/**
 * Build the comparison key for an instruction: trimmed, lower-cased, with every
 * run of whitespace collapsed to a single space.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeInstruction(value) {
  const trimmed = value.trim();
  const lowered = trimmed.toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
|
|
2196
|
-
function
|
|
2197
|
-
|
|
2260
|
+
/**
 * Produce the display form of an instruction: trimmed, whitespace runs collapsed
 * to single spaces, and the first character upper-cased. Casing of the remaining
 * text is preserved.
 *
 * @param {string} value
 * @returns {string}
 */
function canonicalInstruction(value) {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (collapsed.length === 0) {
    return collapsed;
  }
  const head = collapsed[0].toUpperCase();
  const tail = collapsed.slice(1);
  return head + tail;
}
|
|
2199
|
-
|
|
2200
|
-
|
|
2264
|
+
|
|
2265
|
+
// src/optimization-loop.ts
|
|
2266
|
+
/**
 * Runs a prompt optimizer over steering-bundle variants and maps the optimizer's
 * results back onto the original bundles.
 */
var OptimizationLoop = class {
  optimizer;
  /**
   * @param {object} [optimizer] - Object exposing `run(config)`; defaults to a new PromptOptimizer.
   */
  constructor(optimizer = new PromptOptimizer()) {
    this.optimizer = optimizer;
  }
  /**
   * Score every variant against every example and report the winner.
   *
   * @param {object} config
   * @param {object[]} config.variants - Steering bundles, each with a unique `id`.
   * @param {object[]} config.examples - Examples, each with a `scenarioId`.
   * @param {number} [config.trialsPerScenario] - Forwarded to the optimizer.
   * @param {Function} config.evaluate - `({ variant, example, trialIndex })` resolving to a run score.
   * @param {object} [config.scoreWeights] - Weight overrides for `aggregateRunScore`.
   * @returns {Promise<{winner: object, significant: boolean, reports: object[], pairwise: *}>}
   */
  async run(config) {
    const byId = new Map(config.variants.map((variant) => [variant.id, variant]));
    // Index examples once instead of scanning the list on every scoring call.
    // The first example per scenario id is kept, which is what a linear find() returns.
    const exampleByScenario = /* @__PURE__ */ new Map();
    for (const example of config.examples) {
      if (!exampleByScenario.has(example.scenarioId)) {
        exampleByScenario.set(example.scenarioId, example);
      }
    }
    const result = await this.optimizer.run({
      variants: config.variants.map((variant) => ({
        id: variant.id,
        prompt: renderSteeringText(variant),
        metadata: { bundle: variant }
      })),
      scenarioIds: config.examples.map((example) => example.scenarioId),
      trialsPerScenario: config.trialsPerScenario,
      scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
        const bundle = byId.get(variant.id);
        if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
        const example = exampleByScenario.get(scenarioId);
        if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
        const score = await config.evaluate({ variant: bundle, example, trialIndex });
        return aggregateRunScore(score, config.scoreWeights);
      }
    });
    return {
      winner: byId.get(result.winner.variantId),
      significant: result.winner.significant,
      reports: result.scores.map((score) => ({
        variantId: score.variantId,
        bundle: byId.get(score.variantId),
        mean: score.mean,
        ci95: score.ci95,
        scenarioScores: score.perScenario
      })),
      pairwise: result.pairwise
    };
  }
};
|
|
2304
|
+
|
|
2305
|
+
// src/steering-optimizer.ts
|
|
2306
|
+
/**
 * Recommends the steering variant with the highest mean aggregate score.
 */
var PairwiseSteeringOptimizer = class {
  /**
   * @param {object[]} rows - Scored runs, each with `variantId` and `score`.
   * @param {{weights?: object}} [config] - Optional aggregate weight overrides.
   * @returns {{backend: string, recommendedVariantId: *, rationale: string, rankings: object[]}}
   * @throws {Error} When ranking produces no variants.
   */
  optimize(rows, config = {}) {
    const rankings = rankRows(rows, config.weights);
    if (rankings.length === 0) {
      throw new Error("no steering optimization rows");
    }
    const [top] = rankings;
    const rationale = `Highest observed mean aggregate across ${rows.length} scored run(s).`;
    return {
      backend: "pairwise",
      recommendedVariantId: top.variantId,
      rationale,
      rankings
    };
  }
};
|
|
2318
|
+
/**
 * Steering optimizer that additionally trains a variant selector with AxGEPA from
 * `@ax-llm/ax`. The recommended variant always comes from the pairwise ranking;
 * when AxGEPA cannot run, the pairwise result is returned with `skipped: true`
 * and a rationale explaining why.
 */
var AxGepaSteeringOptimizer = class {
  config;
  /**
   * @param {object} config - Read fields: `weights`, `minRows`, `provider`, `apiKey`,
   *   `model`, `teacherModel`.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * @param {object[]} rows - Scored runs with `variantId`, `scenarioId`, `score`, optional `metadata`.
   * @returns {Promise<object>} The pairwise result re-labelled with backend "ax-gepa",
   *   plus either `skipped: true` or a `selector` description.
   */
  async optimize(rows) {
    const settings = this.config;
    // Computed first, so an empty row set fails with the pairwise optimizer's error.
    const fallback = new PairwiseSteeringOptimizer().optimize(rows, settings);
    const minRows = settings.minRows ?? 6;
    const variantIds = [...new Set(rows.map((row) => row.variantId))];
    const byScenario = collapseScenarioWinners(rows, settings.weights);
    // Every early exit shares the same shape: the fallback, re-labelled and flagged.
    const skip = (rationale) => ({
      ...fallback,
      backend: "ax-gepa",
      skipped: true,
      rationale
    });
    if (variantIds.length < 2 || byScenario.length < minRows) {
      return skip(`AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`);
    }
    let axLib;
    try {
      // The library is loaded on demand; a failed import means "skip", not an error.
      axLib = await import("@ax-llm/ax");
    } catch {
      return skip("AxGEPA unavailable: install @ax-llm/ax to enable selector optimization.");
    }
    const { ai, ax, AxGEPA } = axLib;
    const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
    const selector = ax(signature, {
      description: "Choose the best steering bundle variant for an autopilot task."
    });
    // First 80% of the scenario winners (at least one) train; the rest validate.
    const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8));
    const train = byScenario.slice(0, splitIndex);
    const validation = byScenario.slice(splitIndex);
    if (validation.length === 0) {
      return skip("AxGEPA skipped: no validation examples after split.");
    }
    const { provider, apiKey, model, teacherModel } = settings;
    const optimizer = new AxGEPA({
      studentAI: createAxService(ai, provider, apiKey, model),
      teacherAI: createAxService(ai, provider, apiKey, teacherModel ?? model),
      numTrials: 8,
      minibatch: true,
      minibatchSize: 4,
      earlyStoppingTrials: 3,
      sampleCount: 1
    });
    // Metric: 1 when the predicted variant equals the example's winning variant, else 0.
    const exactMatch = ({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0;
    const compiled = await optimizer.compile(selector, train, exactMatch, {
      validationExamples: validation,
      maxMetricCalls: 64
    });
    selector.applyOptimization(compiled.optimizedProgram);
    const selectorRationale = compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0;
    return {
      ...fallback,
      backend: "ax-gepa",
      rationale: `AxGEPA trained a variant selector from ${byScenario.length} scored scenario winner(s); default winner remains ${fallback.recommendedVariantId}.`,
      selector: {
        backend: "ax-gepa",
        signature,
        labels: variantIds,
        rationale: selectorRationale
      }
    };
  }
};
|
|
2395
|
+
/**
 * Group scored rows by variant and rank the variants by mean aggregate score.
 *
 * @param {Iterable<object>} rows - Rows with `variantId` and `score`.
 * @param {object} [weights] - Weight overrides forwarded to `aggregateRunScore`.
 * @returns {{variantId: *, mean: number, runs: number}[]} Sorted by descending mean.
 */
function rankRows(rows, weights) {
  const aggregatesByVariant = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const aggregates = aggregatesByVariant.get(row.variantId) ?? [];
    aggregates.push(aggregateRunScore(row.score, weights));
    aggregatesByVariant.set(row.variantId, aggregates);
  }
  const rankings = [];
  for (const [variantId, aggregates] of aggregatesByVariant) {
    const total = aggregates.reduce((sum, value) => sum + value, 0);
    rankings.push({ variantId, mean: total / aggregates.length, runs: aggregates.length });
  }
  rankings.sort((a, b) => b.mean - a.mean);
  return rankings;
}
|
|
2408
|
+
/**
 * Reduce scored rows to one training example per scenario, describing the row with
 * the highest aggregate score for that scenario.
 *
 * @param {Iterable<object>} rows - Rows with `scenarioId`, `variantId`, `score`, optional `metadata`.
 * @param {object} [weights] - Weight overrides forwarded to `aggregateRunScore`.
 * @returns {{task: string, split: string, seedPreview: string, variantId: *}[]}
 *   One example per scenario, in first-seen scenario order.
 */
function collapseScenarioWinners(rows, weights) {
  const rowsByScenario = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const group = rowsByScenario.get(row.scenarioId) ?? [];
    group.push(row);
    rowsByScenario.set(row.scenarioId, group);
  }
  const examples = [];
  for (const [scenarioId, scenarioRows] of rowsByScenario) {
    const scored = scenarioRows.map((row) => ({ row, aggregate: aggregateRunScore(row.score, weights) }));
    scored.sort((a, b) => b.aggregate - a.aggregate);
    const winner = scored[0].row;
    const meta = winner.metadata;
    examples.push({
      // Task label falls back from metadata.task to metadata.seed_preview to the scenario id.
      task: String(meta?.task ?? meta?.seed_preview ?? scenarioId),
      split: String(meta?.split ?? "train"),
      seedPreview: String(meta?.seed_preview ?? ""),
      variantId: winner.variantId
    });
  }
  return examples;
}
|
|
2202
|
-
function
|
|
2203
|
-
return
|
|
2425
|
+
/**
 * Build an AI service by calling `aiFactory` with the provider name, API key and model.
 *
 * @param {Function} aiFactory - Factory invoked with `{ name, apiKey, config: { model } }`.
 * @param {string} provider - Passed as `name`.
 * @param {string} apiKey
 * @param {string} model
 * @returns {*} Whatever `aiFactory` returns.
 */
function createAxService(aiFactory, provider, apiKey, model) {
  const serviceOptions = { name: provider, apiKey, config: { model } };
  return aiFactory(serviceOptions);
}
|
|
2205
2432
|
|
|
2206
2433
|
// src/trace/store.ts
|
|
@@ -2597,58 +2824,695 @@ function llmSpanFromProvider(args) {
|
|
|
2597
2824
|
};
|
|
2598
2825
|
}
|
|
2599
2826
|
|
|
2600
|
-
// src/
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
|
|
2610
|
-
|
|
2611
|
-
|
|
2612
|
-
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
|
|
2616
|
-
function groupBy(items, key) {
|
|
2617
|
-
const map = /* @__PURE__ */ new Map();
|
|
2618
|
-
for (const item of items) {
|
|
2619
|
-
const k = key(item);
|
|
2620
|
-
let bucket = map.get(k);
|
|
2621
|
-
if (!bucket) {
|
|
2622
|
-
bucket = [];
|
|
2623
|
-
map.set(k, bucket);
|
|
2827
|
+
// src/sandbox-harness.ts
|
|
2828
|
+
/**
 * Test-output parser for Vitest summaries such as `Tests  2 failed | 5 passed`.
 */
var vitestTestParser = {
  id: "vitest",
  /**
   * @param {string} stdout - Captured standard output of the test command.
   * @returns {{testsTotal: number, testsPassed: number} | undefined}
   *   Undefined when no "Tests <n> passed|failed" summary is found.
   */
  parse(stdout) {
    const summary = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
    if (!summary) return void 0;
    // The summary holds one or two "<count> <label>" pairs; the second is optional.
    const pairs = [[summary[1], summary[2]]];
    if (summary[3] && summary[4]) {
      pairs.push([summary[3], summary[4]]);
    }
    const tally = { passed: 0, failed: 0 };
    for (const [count, label] of pairs) {
      tally[label.toLowerCase()] += parseInt(count, 10);
    }
    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
  }
};
|
|
2847
|
+
/**
 * Test-output parser for pytest: reads the `collected N item(s)` line for the total
 * and the `N passed` token for the pass count.
 */
var pytestTestParser = {
  id: "pytest",
  /**
   * @param {string} stdout - Captured standard output of the test command.
   * @returns {{testsTotal: number, testsPassed: number} | undefined}
   *   Undefined when no "collected N items" line is found.
   */
  parse(stdout) {
    const collected = stdout.match(/collected\s+(\d+)\s+items?/i);
    if (!collected) return void 0;
    const passed = stdout.match(/(\d+)\s+passed/);
    return {
      testsTotal: parseInt(collected[1], 10),
      // A run where nothing passed has no "N passed" token; report 0 passes
      // instead of discarding the collected total.
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
|
|
2856
|
+
/**
 * Test-output parser for the Jest summary line, e.g.
 * `Tests:       1 failed, 2 skipped, 5 passed, 8 total`.
 */
var jestTestParser = {
  id: "jest",
  /**
   * @param {string} stdout - Captured standard output of the test command.
   * @returns {{testsTotal: number, testsPassed: number} | undefined}
   *   Undefined when no "Tests: ... N total" line is found.
   */
  parse(stdout) {
    // Capture everything between "Tests:" and "<n> total" on the same line, so the
    // counts can appear in any combination (failed, skipped, todo, passed).
    const summary = stdout.match(/Tests:\s+(.*?)(\d+)\s+total/i);
    if (!summary) return void 0;
    const passed = summary[1].match(/(\d+)\s+passed/i);
    return {
      testsTotal: parseInt(summary[2], 10),
      // An all-failing run has no "passed" count at all.
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
|
|
2864
|
+
/**
 * Combine several test parsers into one that tries each in order and returns the
 * first truthy result.
 *
 * @param {...{id: string, parse: Function}} parsers
 * @returns {{id: string, parse: Function}} Parser whose id is the member ids joined
 *   with "|"; `parse` yields undefined when no member produces a result.
 */
function composeParsers(...parsers) {
  const id = parsers.map((p) => p.id).join("|");
  return {
    id,
    parse(stdout, stderr, exitCode) {
      for (let i = 0; i < parsers.length; i += 1) {
        const outcome = parsers[i].parse(stdout, stderr, exitCode);
        if (outcome) {
          return outcome;
        }
      }
      return void 0;
    }
  };
}
|
|
2650
|
-
|
|
2651
|
-
|
|
2876
|
+
/**
 * Sandbox driver that runs each command through the system shell as a child process.
 */
var SubprocessSandboxDriver = class {
  id = "subprocess";
  /**
   * Run `command` and collect its output.
   *
   * @param {string} phase - Phase label echoed in the result; test parsing only runs for "test".
   * @param {string} command - Shell command line.
   * @param {object} config - Reads `cwd`, `env` (merged over `process.env`),
   *   `timeoutMs` (default 10 minutes) and `testParser`.
   * @returns {Promise<object>} `{ phase, exitCode, stdout, stderr, wallMs }` plus
   *   `testsTotal` / `testsPassed` when the process closes. The promise never rejects:
   *   a spawn error resolves with exit code 127 and the error text appended to stderr.
   */
  async exec(phase, command, config) {
    const { spawn } = await import("child_process");
    const start = Date.now();
    return await new Promise((resolve) => {
      const child = spawn(command, {
        shell: true,
        cwd: config.cwd,
        env: { ...process.env, ...config.env ?? {} }
      });
      let stdout = "";
      let stderr = "";
      // Decode through the stream's own UTF-8 decoder so a multi-byte character split
      // across two chunks is reassembled instead of becoming replacement characters.
      child.stdout?.setEncoding("utf8");
      child.stderr?.setEncoding("utf8");
      child.stdout?.on("data", (d) => {
        stdout += String(d);
      });
      child.stderr?.on("data", (d) => {
        stderr += String(d);
      });
      const timeout = setTimeout(() => {
        try {
          child.kill("SIGKILL");
        } catch {
          // Best effort: a failed kill is ignored and the "close" handler still reports the result.
        }
      }, config.timeoutMs ?? 10 * 6e4);
      child.on("close", (code) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        // A null exit code (process ended by a signal) is reported as 1.
        const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
        resolve({
          phase,
          exitCode: code ?? 1,
          stdout,
          stderr,
          wallMs,
          testsTotal: parsed?.testsTotal,
          testsPassed: parsed?.testsPassed
        });
      });
      child.on("error", (err) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
      });
    });
  }
};
|
|
2923
|
+
/**
 * Sandbox driver that wraps each command in `docker run --rm <image> sh -c <command>`
 * and delegates execution to SubprocessSandboxDriver.
 */
var DockerSandboxDriver = class {
  id = "docker";
  /**
   * @param {string} phase - Phase label forwarded to the subprocess driver.
   * @param {string} command - Command to run inside the container.
   * @param {object} config - Requires `image`; `env` entries become `-e KEY=VALUE` flags.
   * @returns {Promise<object>} Result of the subprocess driver's `exec`.
   * @throws {Error} When `config.image` is missing.
   */
  async exec(phase, command, config) {
    if (!config.image) {
      throw new Error("DockerSandboxDriver requires config.image");
    }
    const envFlags = [];
    for (const [key, value] of Object.entries(config.env ?? {})) {
      envFlags.push(`-e ${shellQuote(key)}=${shellQuote(value)}`);
    }
    const wrapped = `docker run --rm ${envFlags.join(" ")} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
    // env is already encoded as -e flags, so it is cleared for the outer docker process.
    const delegateConfig = { ...config, env: void 0 };
    return new SubprocessSandboxDriver().exec(phase, wrapped, delegateConfig);
  }
};
|
|
2933
|
+
/**
 * Quote a value for use as a single shell word. Values made only of letters,
 * digits and `_ - / . @ : =` are returned unchanged; anything else is wrapped in
 * single quotes with embedded single quotes rewritten as `'\''`.
 *
 * @param {string} v
 * @returns {string}
 */
function shellQuote(v) {
  const safeWord = /^[A-Za-z0-9_\-\/\.@:=]+$/;
  if (safeWord.test(v)) {
    return v;
  }
  const escaped = v.replace(/'/g, `'\\''`);
  return `'${escaped}'`;
}
|
|
2937
|
+
/**
 * Runs the setup → run → test phases of a sandbox config through a driver and
 * reports the outcome on a sandbox span obtained from the emitter.
 */
var SandboxHarness = class {
  driver;
  /**
   * @param {object} [driver] - Object with `id` and `exec(phase, command, config)`;
   *   defaults to a new SubprocessSandboxDriver.
   */
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }
  /**
   * @param {object} config - Reads `setupCommand`, `runCommand`, `testCommand` and `image`;
   *   the whole object is forwarded to the driver.
   * @param {object} emitter - Must expose `sandbox(info)` resolving to a handle with `end` and `fail`.
   * @returns {Promise<object>} `{ passed, totalWallMs, score }` plus the per-phase
   *   results (`setup`, `run`, `test`) for the phases that executed.
   * @throws Rethrows anything thrown while executing, after marking the span failed.
   */
  async run(config, emitter) {
    const phaseCommands = [config.setupCommand, config.runCommand, config.testCommand];
    const handle = await emitter.sandbox({
      name: `sandbox(${this.driver.id})`,
      image: config.image,
      command: phaseCommands.filter(Boolean).join(" && ")
    });
    const result = { passed: false, totalWallMs: 0, score: 0 };
    try {
      // Setup and run share one rule: a non-zero exit fails the span and stops early.
      for (const phase of ["setup", "run"]) {
        const command = config[`${phase}Command`];
        if (!command) continue;
        const outcome = await this.driver.exec(phase, command, config);
        result[phase] = outcome;
        result.totalWallMs += outcome.wallMs;
        if (outcome.exitCode !== 0) {
          await handle.fail(`${phase} failed (exit ${outcome.exitCode})`, {
            exitCode: outcome.exitCode,
            wallMs: result.totalWallMs
          });
          return result;
        }
      }
      if (!config.testCommand) {
        // Without a test phase, getting through setup/run counts as a full pass.
        result.passed = true;
        result.score = 1;
        await handle.end({ wallMs: result.totalWallMs });
        return result;
      }
      const testOutcome = await this.driver.exec("test", config.testCommand, config);
      result.test = testOutcome;
      result.totalWallMs += testOutcome.wallMs;
      const passed = testOutcome.exitCode === 0;
      result.passed = passed;
      // Prefer the parsed pass ratio; fall back to the exit code when no counts exist.
      const hasCounts = testOutcome.testsTotal !== void 0 && testOutcome.testsTotal > 0;
      if (hasCounts) {
        result.score = (testOutcome.testsPassed ?? 0) / testOutcome.testsTotal;
      } else {
        result.score = passed ? 1 : 0;
      }
      await handle.end({
        exitCode: testOutcome.exitCode,
        testsTotal: testOutcome.testsTotal,
        testsPassed: testOutcome.testsPassed,
        wallMs: result.totalWallMs,
        status: passed ? "ok" : "error"
      });
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
    return result;
  }
};
|
|
3001
|
+
|
|
3002
|
+
// src/judge-runner.ts
|
|
3003
|
+
/**
 * Executes a single judge spec inside a sandbox harness, recording it as its own
 * run in a fresh in-memory trace store.
 */
var JudgeRunner = class {
  driver;
  /**
   * @param {object} [driver] - Sandbox driver; defaults to a new SubprocessSandboxDriver.
   */
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }
  /**
   * @param {{id: string, kind: string, config: object}} spec
   * @returns {Promise<{id: string, kind: string, passed: boolean, score: number, summary: string, detail: object}>}
   */
  async run(spec) {
    const traceStore = new InMemoryTraceStore();
    const emitter = new TraceEmitter(traceStore, { runId: `judge-${spec.id}` });
    await emitter.startRun({
      scenarioId: spec.id,
      layer: "meta",
      projectId: "judge-runner"
    });
    const detail = await new SandboxHarness(this.driver).run(spec.config, emitter);
    const { passed, score } = detail;
    await emitter.endRun({ pass: passed, score, notes: `${spec.kind} judge` });
    const summary = renderJudgeSummary(spec.kind, detail);
    return { id: spec.id, kind: spec.kind, passed, score, summary, detail };
  }
};
|
|
3029
|
+
/**
 * Run a list of judge specs with one shared JudgeRunner.
 *
 * @param {object[]} specs - Judge specs.
 * @param {{driver?: object, parallel?: boolean}} [options] - Judges run concurrently
 *   unless `parallel` is exactly `false`, in which case they run one after another.
 * @returns {Promise<object[]>} Results in the same order as `specs`.
 */
async function runJudgeFleet(specs, options = {}) {
  const runner = new JudgeRunner(options.driver);
  const sequential = options.parallel === false;
  if (!sequential) {
    return await Promise.all(specs.map((spec) => runner.run(spec)));
  }
  const ordered = [];
  for (const spec of specs) {
    const outcome = await runner.run(spec);
    ordered.push(outcome);
  }
  return ordered;
}
|
|
3038
|
+
/**
 * Build a judge spec of kind "compiler".
 *
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: "compiler", config: object}}
 */
function compilerJudge(id, config) {
  const spec = { id, kind: "compiler", config };
  return spec;
}
|
|
3041
|
+
/**
 * Build a judge spec of kind "test".
 *
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: "test", config: object}}
 */
function testJudge(id, config) {
  const spec = { id, kind: "test", config };
  return spec;
}
|
|
3044
|
+
/**
 * Build a judge spec of kind "linter".
 *
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: "linter", config: object}}
 */
function linterJudge(id, config) {
  const spec = { id, kind: "linter", config };
  return spec;
}
|
|
3047
|
+
/**
 * Build a judge spec of kind "security".
 *
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: "security", config: object}}
 */
function securityJudge(id, config) {
  const spec = { id, kind: "security", config };
  return spec;
}
|
|
3050
|
+
/**
 * One-line human-readable summary of a judge result.
 *
 * @param {string} kind - Judge kind, used as the prefix.
 * @param {object} detail - Harness result; reads `passed` and `test.testsTotal` / `test.testsPassed`.
 * @returns {string} "<kind> judge failed", "<kind> judge passed <p>/<t> tests" when a
 *   non-zero test total is known, otherwise "<kind> judge passed".
 */
function renderJudgeSummary(kind, detail) {
  if (!detail.passed) {
    return `${kind} judge failed`;
  }
  const total = detail.test?.testsTotal;
  return total
    ? `${kind} judge passed ${detail.test.testsPassed}/${total} tests`
    : `${kind} judge passed`;
}
|
|
3055
|
+
|
|
3056
|
+
// src/dual-agent-bench.ts
|
|
3057
|
+
/**
 * Benchmark that alternates a proposer and a critic on each scenario until the
 * critic's convergence score reaches a threshold or the round budget is spent.
 */
var DualAgentBench = class {
  /**
   * @param {object} config
   * @param {object[]} config.scenarios - At least one scenario, each with an `id`.
   * @param {Function} config.propose - `({ scenario, roundIndex, priorProposal, priorCritique })`
   *   resolving to a proposal.
   * @param {Function} config.critique - `({ scenario, roundIndex, proposal })` resolving to
   *   `{ critique, convergenceScore }` with the score in [0, 1].
   * @param {number} [config.maxRounds=5]
   * @param {number} [config.convergenceThreshold=0.85]
   * @param {Function} [config.onRoundComplete] - Called (not awaited) after every round.
   * @returns {Promise<{scenarios: object[], aggregate: object, config: object}>}
   * @throws {Error} When there are no scenarios or a convergence score is not a finite
   *   number in [0, 1].
   */
  async run(config) {
    const maxRounds = config.maxRounds ?? 5;
    const convergenceThreshold = config.convergenceThreshold ?? 0.85;
    if (config.scenarios.length === 0) {
      throw new Error("DualAgentBench requires at least 1 scenario");
    }
    const scenarioResults = [];
    for (const scenario of config.scenarios) {
      const history = [];
      let roundsToConverge = null;
      let finalProposal = "";
      let finalScore = 0;
      let priorCritique;
      let roundIndex = 0;
      while (roundIndex < maxRounds && roundsToConverge === null) {
        const previous = history[history.length - 1];
        const proposal = await config.propose({
          scenario,
          roundIndex,
          priorProposal: previous?.proposal,
          priorCritique
        });
        const { critique, convergenceScore } = await config.critique({ scenario, roundIndex, proposal });
        const inRange = Number.isFinite(convergenceScore) && convergenceScore >= 0 && convergenceScore <= 1;
        if (!inRange) {
          throw new Error(
            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${roundIndex}`
          );
        }
        const round = { roundIndex, proposal, critique, convergenceScore };
        history.push(round);
        config.onRoundComplete?.({ scenarioId: scenario.id, round });
        finalProposal = proposal;
        finalScore = convergenceScore;
        priorCritique = critique;
        if (convergenceScore >= convergenceThreshold) {
          // Rounds are reported 1-based; setting this also ends the loop.
          roundsToConverge = roundIndex + 1;
        }
        roundIndex += 1;
      }
      scenarioResults.push({
        scenarioId: scenario.id,
        converged: roundsToConverge !== null,
        roundsToConverge,
        finalProposal,
        history,
        finalScore
      });
    }
    const total = scenarioResults.length;
    const convergedResults = scenarioResults.filter((entry) => entry.converged);
    const convergenceRate = total ? convergedResults.length / total : 0;
    let avgRoundsToConverge = null;
    if (convergedResults.length) {
      const roundSum = convergedResults.reduce((acc, entry) => acc + (entry.roundsToConverge ?? 0), 0);
      avgRoundsToConverge = roundSum / convergedResults.length;
    }
    const avgFinalScore = total ? scenarioResults.reduce((acc, entry) => acc + entry.finalScore, 0) / total : 0;
    return {
      scenarios: scenarioResults,
      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
      config: { maxRounds, convergenceThreshold }
    };
  }
};
|
|
3127
|
+
|
|
3128
|
+
// src/propose-review.ts
|
|
3129
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
|
|
3130
|
+
import { dirname } from "path";
|
|
3131
|
+
/**
 * Create a review-memory store backed by a private in-process array.
 *
 * @param {Iterable<object>} [initial=[]] - Entries to seed the store with; copied, never mutated.
 * @returns {{ load: () => Promise<object[]>, append: (entry: object) => Promise<void> }}
 *   `load` resolves to a fresh copy of the entries; `append` adds one entry at the end.
 */
function inMemoryReviewStore(initial = []) {
  const entries = [...initial];
  const load = async () => entries.slice();
  const append = async (entry) => {
    entries.push(entry);
  };
  return { load, append };
}
|
|
3142
|
+
/**
 * Create a review-memory store persisted as JSON Lines (one JSON object per
 * line) at `path`.
 *
 * @param {string} path - File to read from and append to; parent directories are created on append.
 * @returns {{ load: () => Promise<object[]>, append: (entry: object) => Promise<void> }}
 *   `load` resolves to every valid entry in file order (an empty array when
 *   the file does not exist); `append` writes one entry as a new line.
 */
function jsonlReviewStore(path) {
  return {
    async load() {
      if (!existsSync(path)) return [];
      const entries = [];
      for (const line of readFileSync(path, "utf8").split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        let parsed;
        try {
          parsed = JSON.parse(trimmed);
        } catch {
          // Best-effort read: a corrupt or partially written line must not
          // make the rest of the memory unreadable, so it is skipped.
          continue;
        }
        // Valid JSON that is not a plain object (null, numbers, strings,
        // arrays) cannot be a review entry; returning it would crash
        // consumers that read fields such as `nextShotInstruction`.
        if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
          entries.push(parsed);
        }
      }
      return entries;
    },
    async append(entry) {
      mkdirSync(dirname(path), { recursive: true });
      appendFileSync(path, JSON.stringify(entry) + "\n");
    }
  };
}
|
|
3164
|
+
// Instruction handed to the worker when the reviewer is unavailable and the
// memory holds no earlier instruction to reuse.
var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Drive a multi-shot propose -> verify -> review loop.
 *
 * Each shot calls `config.propose`, verifies the resulting state with
 * `config.verify`, and (when verification fails) asks `config.review` for the
 * next instruction. The loop stops when verification passes, the reviewer sets
 * `shouldContinue` to false, reviewer confidence stays at or below the floor
 * for `confidenceFloorWindow` consecutive shots, the shot budget is used up,
 * or the wall-clock timer has fired by the start of a shot.
 *
 * @param {object} config
 * @param {string} config.goal - Goal text passed to the worker and reviewer.
 * @param {*} config.initialState - State handed to the first shot.
 * @param {Function} config.propose - Async worker; resolves to `{ state, traceSummary }`.
 * @param {Function} config.verify - Async verifier; resolves to `{ pass, score?, failingLayers? }`.
 * @param {Function} config.review - Async reviewer; its result is validated with `coerceReview`.
 * @param {number} [config.maxShots=10]
 * @param {number} [config.maxWallMs=600000]
 * @param {number} [config.confidenceFloor=0.3]
 * @param {number} [config.confidenceFloorWindow=2] - 0 disables the low-confidence stop.
 * @param {object} [config.memory] - Review store with `load`/`append`; defaults to an in-memory store.
 * @param {string} [config.fallbackInstruction]
 * @param {object} [config.store] - When set, the run is recorded through a `TraceEmitter`.
 * @param {string} [config.scenarioId]
 * @param {string} [config.projectId]
 * @param {string} [config.variantId]
 * @returns {Promise<object>} `{ runId, completed, shots, finalState, finalVerification, failureClass, wallMs, score }`.
 * @throws Rethrows any error from `propose`, `verify` or the memory store
 *   after marking the shot span failed and closing the trace run.
 */
async function runProposeReview(config) {
  const maxShots = config.maxShots ?? 10;
  const maxWallMs = config.maxWallMs ?? 10 * 60 * 1e3;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
  const emitter = config.store ? new TraceEmitter(config.store) : null;
  if (emitter) {
    await emitter.startRun({
      scenarioId: config.scenarioId ?? "propose-review",
      projectId: config.projectId,
      variantId: config.variantId,
      layer: "meta",
      tags: {
        goal: config.goal.slice(0, 120),
        maxShots: String(maxShots)
      }
    });
  }
  const abort = new AbortController();
  const wallStart = Date.now();
  const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), maxWallMs);
  const shots = [];
  let state = config.initialState;
  let priorReview = null;
  let lastVerification = { pass: false };
  let failureClass;
  let completed = false;
  let lowConfidenceStreak = 0;
  // Score of the most recent verification: 1 on pass, the verifier's numeric
  // score when it supplied one, otherwise 0.
  const currentScore = () => lastVerification.pass ? 1 : typeof lastVerification.score === "number" ? lastVerification.score : 0;
  try {
    for (let shot = 1; shot <= maxShots; shot++) {
      if (abort.signal.aborted) {
        failureClass = "timeout";
        break;
      }
      const shotStart = Date.now();
      const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
      let traceSummary;
      let verification;
      let review;
      let reviewAvailable = true;
      let reviewError;
      try {
        const proposeOut = await config.propose({
          shot,
          goal: config.goal,
          state,
          priorReview,
          abortSignal: abort.signal,
          emitter: emitter ?? void 0
        });
        state = proposeOut.state;
        traceSummary = proposeOut.traceSummary;
        verification = await config.verify(state);
        lastVerification = verification;
        const memorySnapshot = await memory.load();
        const verificationDigest = {
          pass: verification.pass,
          score: verification.score,
          failingLayers: verification.failingLayers ?? []
        };
        if (verification.pass) {
          review = {
            observations: "verification passed \u2014 skipping reviewer LLM call",
            diagnosis: "no failures to diagnose",
            nextShotInstruction: "(done)",
            shouldContinue: false,
            confidence: 1
          };
        } else {
          try {
            review = await config.review({
              shot,
              goal: config.goal,
              state,
              verification,
              traceSummary,
              memory: memorySnapshot
            });
            review = coerceReview(review);
          } catch (err) {
            // A reviewer failure is not fatal: reuse the most recent stored
            // instruction (or the fallback) and keep going.
            reviewAvailable = false;
            reviewError = err instanceof Error ? err.message : String(err);
            const lastInstruction = memorySnapshot.length > 0 ? memorySnapshot[memorySnapshot.length - 1].nextShotInstruction : fallbackInstruction;
            review = {
              observations: "(reviewer unavailable \u2014 using last-known instruction)",
              diagnosis: reviewError,
              nextShotInstruction: lastInstruction,
              shouldContinue: true,
              confidence: 0.3
            };
          }
        }
        const entry = {
          shot,
          timestamp: Date.now(),
          ...review,
          verification: verificationDigest
        };
        await memory.append(entry);
      } catch (err) {
        // Any fatal error inside the shot (worker, verifier or memory store)
        // fails the span so it is not left open in the trace.
        await shotHandle?.fail(err instanceof Error ? err : String(err));
        // An error raised after the wall-clock timer fired is a timeout.
        failureClass = abort.signal.aborted ? "timeout" : "unknown";
        throw err;
      }
      const shotRecord = {
        shot,
        state,
        verification,
        traceSummary,
        review,
        reviewAvailable,
        reviewError,
        durationMs: Date.now() - shotStart
      };
      shots.push(shotRecord);
      await shotHandle?.end({
        attributes: {
          verificationPass: verification.pass,
          verificationScore: verification.score ?? null,
          reviewShouldContinue: review.shouldContinue,
          reviewConfidence: review.confidence,
          reviewAvailable
        }
      });
      if (verification.pass) {
        completed = true;
        break;
      }
      if (!review.shouldContinue) {
        break;
      }
      if (confidenceFloorWindow > 0 && review.confidence <= confidenceFloor) {
        lowConfidenceStreak += 1;
        if (lowConfidenceStreak >= confidenceFloorWindow) break;
      } else {
        lowConfidenceStreak = 0;
      }
      priorReview = review;
    }
    if (!completed && !failureClass) {
      failureClass = shots.length >= maxShots ? "budget_exceeded" : "unknown";
    }
  } catch (err) {
    // Close the trace run before propagating, so a crashed loop does not
    // leave a run that never ends.
    if (emitter) {
      try {
        await emitter.endRun({
          pass: false,
          score: currentScore(),
          failureClass: failureClass ?? "unknown",
          notes: `${shots.length} shot(s); aborted by error: ${err instanceof Error ? err.message : String(err)}`
        });
      } catch {
        // A secondary trace failure must not mask the original error.
      }
    }
    throw err;
  } finally {
    clearTimeout(wallTimer);
  }
  const score = currentScore();
  if (emitter) {
    await emitter.endRun({
      pass: completed,
      score,
      failureClass,
      notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
    });
  }
  return {
    runId: emitter?.runId ?? null,
    completed,
    shots,
    finalState: state,
    finalVerification: lastVerification,
    failureClass,
    wallMs: Date.now() - wallStart,
    score
  };
}
|
|
3338
|
+
var REVIEWER_SYSTEM_PROMPT = `You are a senior reviewer directing a multi-shot build loop.
|
|
3339
|
+
You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.
|
|
3340
|
+
You are blind to the worker's inner monologue. You see what it DID, not what it thought.
|
|
3341
|
+
Return STRICT JSON matching the schema. No prose outside the JSON.`;
|
|
3342
|
+
function createLlmReviewer(cfg) {
|
|
3343
|
+
const renderState = cfg.renderState ?? ((s) => safeJson(s));
|
|
3344
|
+
const renderTraceSummary = cfg.renderTraceSummary ?? ((s) => s === void 0 ? "(none)" : safeJson(s));
|
|
3345
|
+
const system = cfg.systemPromptAddendum ? `${REVIEWER_SYSTEM_PROMPT}
|
|
3346
|
+
|
|
3347
|
+
${cfg.systemPromptAddendum}` : REVIEWER_SYSTEM_PROMPT;
|
|
3348
|
+
return async (input) => {
|
|
3349
|
+
const memoryBlock = input.memory.length === 0 ? "(no prior shots \u2014 this is shot 1)" : input.memory.map((m) => [
|
|
3350
|
+
`shot ${m.shot} \u2014 verification.pass=${m.verification.pass}` + (typeof m.verification.score === "number" ? ` score=${m.verification.score.toFixed(2)}` : "") + ` confidence=${m.confidence.toFixed(2)} failing=[${(m.verification.failingLayers ?? []).join(",")}]`,
|
|
3351
|
+
` observations: ${m.observations.slice(0, 400)}`,
|
|
3352
|
+
` diagnosis: ${m.diagnosis.slice(0, 400)}`,
|
|
3353
|
+
` instruction given: ${m.nextShotInstruction.slice(0, 400)}`
|
|
3354
|
+
].join("\n")).join("\n\n");
|
|
3355
|
+
const user = [
|
|
3356
|
+
`=== GOAL ===`,
|
|
3357
|
+
input.goal,
|
|
3358
|
+
``,
|
|
3359
|
+
`=== SHOT NUMBER ===`,
|
|
3360
|
+
String(input.shot),
|
|
3361
|
+
``,
|
|
3362
|
+
`=== CURRENT STATE ===`,
|
|
3363
|
+
renderState(input.state),
|
|
3364
|
+
``,
|
|
3365
|
+
`=== TRACE SUMMARY ===`,
|
|
3366
|
+
renderTraceSummary(input.traceSummary),
|
|
3367
|
+
``,
|
|
3368
|
+
`=== VERIFICATION ===`,
|
|
3369
|
+
summarizeVerification(input.verification),
|
|
3370
|
+
``,
|
|
3371
|
+
`=== REVIEWER MEMORY (prior shots) ===`,
|
|
3372
|
+
memoryBlock,
|
|
3373
|
+
``,
|
|
3374
|
+
`=== YOUR TASK ===`,
|
|
3375
|
+
`Return STRICT JSON:`,
|
|
3376
|
+
`{`,
|
|
3377
|
+
` "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)`,
|
|
3378
|
+
` "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)`,
|
|
3379
|
+
` "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)`,
|
|
3380
|
+
` "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)`,
|
|
3381
|
+
` "confidence": number in [0,1]`,
|
|
3382
|
+
`}`
|
|
3383
|
+
].join("\n");
|
|
3384
|
+
const raw = await cfg.callJson({ system, user });
|
|
3385
|
+
return coerceReview(raw);
|
|
3386
|
+
};
|
|
3387
|
+
}
|
|
3388
|
+
/**
 * Validate a raw reviewer reply and normalise it into a review object.
 *
 * @param {*} raw - Value returned by the reviewer.
 * @returns {{ observations: string, diagnosis: string, nextShotInstruction: string,
 *   shouldContinue: boolean, confidence: number }} Review with confidence clamped to [0, 1].
 * @throws {Error} When `raw` is not an object, a required string field is
 *   missing or empty, `shouldContinue` is not a boolean, or `confidence` is
 *   not a finite number (or numeric string).
 */
function coerceReview(raw) {
  if (!raw || typeof raw !== "object") {
    throw new Error("reviewer returned non-object");
  }
  const observations = typeof raw.observations === "string" ? raw.observations : "";
  const diagnosis = typeof raw.diagnosis === "string" ? raw.diagnosis : "";
  const nextShotInstruction = typeof raw.nextShotInstruction === "string" ? raw.nextShotInstruction : "";
  if (!observations || !diagnosis || !nextShotInstruction) {
    throw new Error("reviewer missing required string fields");
  }
  if (typeof raw.shouldContinue !== "boolean") {
    throw new Error("reviewer missing shouldContinue boolean");
  }
  // Only numbers and non-blank numeric strings count as a confidence.
  // Number() alone would turn null, "", [] and booleans into 0 or 1, letting
  // a missing or garbage value pass as a real score.
  const { confidence } = raw;
  let confidenceRaw = NaN;
  if (typeof confidence === "number") {
    confidenceRaw = confidence;
  } else if (typeof confidence === "string" && confidence.trim() !== "") {
    confidenceRaw = Number(confidence);
  }
  if (!Number.isFinite(confidenceRaw)) {
    throw new Error("reviewer confidence not finite");
  }
  return {
    observations,
    diagnosis,
    nextShotInstruction,
    shouldContinue: raw.shouldContinue,
    confidence: Math.max(0, Math.min(1, confidenceRaw))
  };
}
|
|
3413
|
+
/**
 * Render a verification result as compact text for the reviewer prompt.
 *
 * The first line is `pass=<bool>` followed, when present, by ` score=<3dp>`
 * and ` failing=[a, b]`. When `v.details` is defined, a second part holds its
 * `safeJson` rendering truncated to 1500 characters.
 *
 * @param {{ pass: *, score?: number, failingLayers?: string[], details?: * }} v
 * @returns {string}
 */
function summarizeVerification(v) {
  const segments = [`pass=${v.pass}`];
  if (typeof v.score === "number") {
    segments.push(`score=${v.score.toFixed(3)}`);
  }
  if (v.failingLayers && v.failingLayers.length > 0) {
    segments.push(`failing=[${v.failingLayers.join(", ")}]`);
  }
  const header = segments.join(" ");
  if (v.details === void 0) {
    return header;
  }
  return `${header}\n${safeJson(v.details).slice(0, 1500)}`;
}
|
|
3419
|
+
/**
 * Serialise a value for display without ever throwing or returning a
 * non-string.
 *
 * @param {*} x - Any value.
 * @returns {string} Pretty-printed JSON (2-space indent) when the value is
 *   serialisable, otherwise `String(x)`.
 */
function safeJson(x) {
  try {
    const text = JSON.stringify(x, null, 2);
    // JSON.stringify returns undefined (not a string) for undefined,
    // functions and symbols; callers slice/join the result, so fall through
    // to String(x) in that case.
    if (typeof text === "string") return text;
  } catch {
    // Circular structures and BigInt make JSON.stringify throw; fall back to
    // String(x) below.
  }
  return String(x);
}
|
|
3426
|
+
|
|
3427
|
+
// src/trace/schema.ts
|
|
3428
|
+
// Version string of the trace schema.
var TRACE_SCHEMA_VERSION = "1.0.0";

// Every failure-class label, in declaration order ("success" first,
// "unknown" last).
var FAILURE_CLASSES = [
  "success",
  "reasoning_error", "tool_selection_error", "tool_argument_error", "tool_recovery_failure",
  "hallucination", "instruction_following", "safety_refusal_miss", "policy_violation",
  "budget_exceeded", "format_drift", "permission_escalation", "pii_leak",
  "cost_overrun", "timeout", "sandbox_failure",
  "unknown"
];
|
|
3448
|
+
/**
 * Tell whether a span's `kind` is "llm".
 *
 * @param {{ kind: * }} s - Span to test.
 * @returns {boolean}
 */
function isLlmSpan(s) {
  const { kind } = s;
  return kind === "llm";
}
|
|
3451
|
+
/**
 * Tell whether a span's `kind` is "tool".
 *
 * @param {{ kind: * }} s - Span to test.
 * @returns {boolean}
 */
function isToolSpan(s) {
  const { kind } = s;
  return kind === "tool";
}
|
|
3454
|
+
/**
 * Tell whether a span's `kind` is "retrieval".
 *
 * @param {{ kind: * }} s - Span to test.
 * @returns {boolean}
 */
function isRetrievalSpan(s) {
  const { kind } = s;
  return kind === "retrieval";
}
|
|
3457
|
+
/**
 * Tell whether a span's `kind` is "judge".
 *
 * @param {{ kind: * }} s - Span to test.
 * @returns {boolean}
 */
function isJudgeSpan(s) {
  const { kind } = s;
  return kind === "judge";
}
|
|
3460
|
+
/**
 * Tell whether a span's `kind` is "sandbox".
 *
 * @param {{ kind: * }} s - Span to test.
 * @returns {boolean}
 */
function isSandboxSpan(s) {
  const { kind } = s;
  return kind === "sandbox";
}
|
|
3463
|
+
|
|
3464
|
+
// src/trace/query.ts
|
|
3465
|
+
/**
 * List the runs recorded for one scenario.
 *
 * @param {{ listRuns: Function }} store - Trace store; queried with `{ scenarioId }`.
 * @param {*} scenarioId - Scenario to filter by.
 * @returns {Promise<*>} Whatever `store.listRuns` resolves to.
 */
async function runsForScenario(store, scenarioId) {
  const filter = { scenarioId };
  return store.listRuns(filter);
}
|
|
3468
|
+
/**
 * Fetch the spans of a run whose kind is "llm".
 *
 * @param {{ spans: Function }} store - Trace store; queried with `{ runId, kind: "llm" }`.
 * @param {*} runId - Run to read spans from.
 * @returns {Promise<object[]>} The returned spans, re-checked so only `kind === "llm"` remain.
 */
async function llmSpans(store, runId) {
  const query = { runId, kind: "llm" };
  const candidates = await store.spans(query);
  return candidates.filter((span) => span.kind === "llm");
}
|
|
3472
|
+
/**
 * Fetch the spans of a run whose kind is "tool".
 *
 * @param {{ spans: Function }} store - Trace store; queried with `{ runId, kind: "tool", toolName }`.
 * @param {*} runId - Run to read spans from.
 * @param {*} toolName - Passed to the store as-is, including when `undefined`.
 * @returns {Promise<object[]>} The returned spans, re-checked so only `kind === "tool"` remain.
 */
async function toolSpans(store, runId, toolName) {
  const query = { runId, kind: "tool", toolName };
  const candidates = await store.spans(query);
  return candidates.filter((span) => span.kind === "tool");
}
|
|
3476
|
+
/**
 * Fetch the spans of a run whose kind is "judge".
 *
 * @param {{ spans: Function }} store - Trace store; queried with `{ runId, kind: "judge" }`.
 * @param {*} runId - Run to read spans from.
 * @returns {Promise<object[]>} The returned spans, re-checked so only `kind === "judge"` remain.
 */
async function judgeSpans(store, runId) {
  const query = { runId, kind: "judge" };
  const candidates = await store.spans(query);
  return candidates.filter((span) => span.kind === "judge");
}
|
|
3480
|
+
/**
 * Group items into a Map keyed by the value `key` returns for each item.
 *
 * @param {Iterable<*>} items - Items to group; iterated once, in order.
 * @param {(item: *) => *} key - Computes the grouping key for an item.
 * @returns {Map<*, Array<*>>} Keys in first-seen order; each bucket keeps item order.
 */
function groupBy(items, key) {
  const groups = /* @__PURE__ */ new Map();
  for (const item of items) {
    const groupKey = key(item);
    const existing = groups.get(groupKey);
    if (existing) {
      existing.push(item);
    } else {
      groups.set(groupKey, [item]);
    }
  }
  return groups;
}
|
|
3493
|
+
/**
 * Produce the key-order-independent string form of a tool-call argument
 * value by delegating to `stableStringify`.
 *
 * @param {*} args - Argument value to serialise.
 * @returns {*} Whatever `stableStringify(args)` returns.
 */
function argHash(args) {
  const canonical = stableStringify(args);
  return canonical;
}
|
|
3496
|
+
/**
 * Serialise a value to JSON text with object keys sorted, so structurally
 * equal values always produce the same string regardless of key order.
 *
 * Follows JSON.stringify conventions for values JSON cannot represent:
 * inside arrays they become `null`, inside objects the key is omitted, and
 * objects exposing `toJSON` (e.g. Date) are serialised through it.
 *
 * @param {*} value - Value to serialise. Circular structures are not supported.
 * @returns {string|undefined} Canonical JSON text, or `undefined` when the
 *   top-level value itself has no JSON form (undefined, function, symbol).
 */
function stableStringify(value) {
  // Honour toJSON so values such as Date keep their content instead of
  // collapsing to "{}" (Date has no own enumerable keys).
  const resolved = value !== null && typeof value === "object" && typeof value.toJSON === "function" ? value.toJSON() : value;
  if (resolved === null || typeof resolved !== "object") return JSON.stringify(resolved);
  if (Array.isArray(resolved)) {
    // Array.from also visits holes. Slots without a JSON form become "null"
    // so that [undefined] and [] do not serialise to the same string.
    const items = Array.from(resolved, (item) => stableStringify(item) ?? "null");
    return `[${items.join(",")}]`;
  }
  const parts = [];
  for (const k of Object.keys(resolved).sort()) {
    const encoded = stableStringify(resolved[k]);
    // Skip keys whose value has no JSON form rather than emitting the
    // literal text "undefined".
    if (encoded !== void 0) parts.push(`${JSON.stringify(k)}:${encoded}`);
  }
  return `{${parts.join(",")}}`;
}
|
|
3503
|
+
/**
 * Sum token counts and cost over a list of LLM spans.
 *
 * @param {Array<{ inputTokens?: number, outputTokens?: number, cachedTokens?: number, costUsd?: number }>} spans
 *   Spans to total; a missing (null/undefined) field counts as 0.
 * @returns {{ inputTokens: number, outputTokens: number, cachedTokens: number, costUsd: number }}
 */
function aggregateLlm(spans) {
  let inputTokens = 0;
  let outputTokens = 0;
  let cachedTokens = 0;
  let costUsd = 0;
  spans.forEach((span) => {
    inputTokens += span.inputTokens ?? 0;
    outputTokens += span.outputTokens ?? 0;
    cachedTokens += span.cachedTokens ?? 0;
    costUsd += span.costUsd ?? 0;
  });
  return { inputTokens, outputTokens, cachedTokens, costUsd };
}
|
|
3514
|
+
function runFailureClass(run) {
|
|
3515
|
+
if (run.outcome?.failureClass) return run.outcome.failureClass;
|
|
2652
3516
|
if (run.status === "completed" && run.outcome?.pass !== false) return "success";
|
|
2653
3517
|
if (run.status === "aborted") return "budget_exceeded";
|
|
2654
3518
|
return "unknown";
|
|
@@ -2825,181 +3689,6 @@ function runToTraceId(run) {
|
|
|
2825
3689
|
return cleaned.slice(0, 32).padEnd(32, "0");
|
|
2826
3690
|
}
|
|
2827
3691
|
|
|
2828
|
-
// src/sandbox-harness.ts
|
|
2829
|
-
var vitestTestParser = {
|
|
2830
|
-
id: "vitest",
|
|
2831
|
-
parse(stdout) {
|
|
2832
|
-
const m = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
|
|
2833
|
-
if (!m) return void 0;
|
|
2834
|
-
let passed = 0;
|
|
2835
|
-
let failed = 0;
|
|
2836
|
-
const a = parseInt(m[1], 10);
|
|
2837
|
-
const aLabel = m[2].toLowerCase();
|
|
2838
|
-
if (aLabel === "passed") passed += a;
|
|
2839
|
-
else failed += a;
|
|
2840
|
-
if (m[3] && m[4]) {
|
|
2841
|
-
const b = parseInt(m[3], 10);
|
|
2842
|
-
if (m[4].toLowerCase() === "passed") passed += b;
|
|
2843
|
-
else failed += b;
|
|
2844
|
-
}
|
|
2845
|
-
return { testsTotal: passed + failed, testsPassed: passed };
|
|
2846
|
-
}
|
|
2847
|
-
};
|
|
2848
|
-
var pytestTestParser = {
|
|
2849
|
-
id: "pytest",
|
|
2850
|
-
parse(stdout) {
|
|
2851
|
-
const total = stdout.match(/collected\s+(\d+)\s+items?/i);
|
|
2852
|
-
const passed = stdout.match(/(\d+)\s+passed/);
|
|
2853
|
-
if (!total || !passed) return void 0;
|
|
2854
|
-
return { testsTotal: parseInt(total[1], 10), testsPassed: parseInt(passed[1], 10) };
|
|
2855
|
-
}
|
|
2856
|
-
};
|
|
2857
|
-
var jestTestParser = {
|
|
2858
|
-
id: "jest",
|
|
2859
|
-
parse(stdout) {
|
|
2860
|
-
const m = stdout.match(/Tests:\s+(?:(\d+)\s+failed[^,]*,\s*)?(\d+)\s+passed,\s+(\d+)\s+total/i);
|
|
2861
|
-
if (!m) return void 0;
|
|
2862
|
-
return { testsTotal: parseInt(m[3], 10), testsPassed: parseInt(m[2], 10) };
|
|
2863
|
-
}
|
|
2864
|
-
};
|
|
2865
|
-
function composeParsers(...parsers) {
|
|
2866
|
-
return {
|
|
2867
|
-
id: parsers.map((p) => p.id).join("|"),
|
|
2868
|
-
parse(stdout, stderr, exitCode) {
|
|
2869
|
-
for (const p of parsers) {
|
|
2870
|
-
const res = p.parse(stdout, stderr, exitCode);
|
|
2871
|
-
if (res) return res;
|
|
2872
|
-
}
|
|
2873
|
-
return void 0;
|
|
2874
|
-
}
|
|
2875
|
-
};
|
|
2876
|
-
}
|
|
2877
|
-
var SubprocessSandboxDriver = class {
|
|
2878
|
-
id = "subprocess";
|
|
2879
|
-
async exec(phase, command, config) {
|
|
2880
|
-
const { spawn } = await import("child_process");
|
|
2881
|
-
const start = Date.now();
|
|
2882
|
-
return await new Promise((resolve) => {
|
|
2883
|
-
const child = spawn(command, {
|
|
2884
|
-
shell: true,
|
|
2885
|
-
cwd: config.cwd,
|
|
2886
|
-
env: { ...process.env, ...config.env ?? {} }
|
|
2887
|
-
});
|
|
2888
|
-
let stdout = "";
|
|
2889
|
-
let stderr = "";
|
|
2890
|
-
child.stdout?.on("data", (d) => {
|
|
2891
|
-
stdout += String(d);
|
|
2892
|
-
});
|
|
2893
|
-
child.stderr?.on("data", (d) => {
|
|
2894
|
-
stderr += String(d);
|
|
2895
|
-
});
|
|
2896
|
-
const timeout = setTimeout(() => {
|
|
2897
|
-
try {
|
|
2898
|
-
child.kill("SIGKILL");
|
|
2899
|
-
} catch {
|
|
2900
|
-
}
|
|
2901
|
-
}, config.timeoutMs ?? 10 * 6e4);
|
|
2902
|
-
child.on("close", (code) => {
|
|
2903
|
-
clearTimeout(timeout);
|
|
2904
|
-
const wallMs = Date.now() - start;
|
|
2905
|
-
const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
|
|
2906
|
-
resolve({
|
|
2907
|
-
phase,
|
|
2908
|
-
exitCode: code ?? 1,
|
|
2909
|
-
stdout,
|
|
2910
|
-
stderr,
|
|
2911
|
-
wallMs,
|
|
2912
|
-
testsTotal: parsed?.testsTotal,
|
|
2913
|
-
testsPassed: parsed?.testsPassed
|
|
2914
|
-
});
|
|
2915
|
-
});
|
|
2916
|
-
child.on("error", (err) => {
|
|
2917
|
-
clearTimeout(timeout);
|
|
2918
|
-
const wallMs = Date.now() - start;
|
|
2919
|
-
resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
|
|
2920
|
-
});
|
|
2921
|
-
});
|
|
2922
|
-
}
|
|
2923
|
-
};
|
|
2924
|
-
var DockerSandboxDriver = class {
|
|
2925
|
-
id = "docker";
|
|
2926
|
-
async exec(phase, command, config) {
|
|
2927
|
-
if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
|
|
2928
|
-
const sub = new SubprocessSandboxDriver();
|
|
2929
|
-
const envArgs = Object.entries(config.env ?? {}).map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`).join(" ");
|
|
2930
|
-
const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
|
|
2931
|
-
return sub.exec(phase, wrapped, { ...config, env: void 0 });
|
|
2932
|
-
}
|
|
2933
|
-
};
|
|
2934
|
-
function shellQuote(v) {
|
|
2935
|
-
if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(v)) return v;
|
|
2936
|
-
return `'${v.replace(/'/g, `'\\''`)}'`;
|
|
2937
|
-
}
|
|
2938
|
-
var SandboxHarness = class {
|
|
2939
|
-
driver;
|
|
2940
|
-
constructor(driver = new SubprocessSandboxDriver()) {
|
|
2941
|
-
this.driver = driver;
|
|
2942
|
-
}
|
|
2943
|
-
async run(config, emitter) {
|
|
2944
|
-
const handle = await emitter.sandbox({
|
|
2945
|
-
name: `sandbox(${this.driver.id})`,
|
|
2946
|
-
image: config.image,
|
|
2947
|
-
command: [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ")
|
|
2948
|
-
});
|
|
2949
|
-
const result = { passed: false, totalWallMs: 0, score: 0 };
|
|
2950
|
-
try {
|
|
2951
|
-
if (config.setupCommand) {
|
|
2952
|
-
result.setup = await this.driver.exec("setup", config.setupCommand, config);
|
|
2953
|
-
result.totalWallMs += result.setup.wallMs;
|
|
2954
|
-
if (result.setup.exitCode !== 0) {
|
|
2955
|
-
await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
|
|
2956
|
-
exitCode: result.setup.exitCode,
|
|
2957
|
-
wallMs: result.totalWallMs
|
|
2958
|
-
});
|
|
2959
|
-
return result;
|
|
2960
|
-
}
|
|
2961
|
-
}
|
|
2962
|
-
if (config.runCommand) {
|
|
2963
|
-
result.run = await this.driver.exec("run", config.runCommand, config);
|
|
2964
|
-
result.totalWallMs += result.run.wallMs;
|
|
2965
|
-
if (result.run.exitCode !== 0) {
|
|
2966
|
-
await handle.fail(`run failed (exit ${result.run.exitCode})`, {
|
|
2967
|
-
exitCode: result.run.exitCode,
|
|
2968
|
-
wallMs: result.totalWallMs
|
|
2969
|
-
});
|
|
2970
|
-
return result;
|
|
2971
|
-
}
|
|
2972
|
-
}
|
|
2973
|
-
if (config.testCommand) {
|
|
2974
|
-
result.test = await this.driver.exec("test", config.testCommand, config);
|
|
2975
|
-
result.totalWallMs += result.test.wallMs;
|
|
2976
|
-
const passed = result.test.exitCode === 0;
|
|
2977
|
-
result.passed = passed;
|
|
2978
|
-
if (result.test.testsTotal !== void 0 && result.test.testsTotal > 0) {
|
|
2979
|
-
result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal;
|
|
2980
|
-
} else {
|
|
2981
|
-
result.score = passed ? 1 : 0;
|
|
2982
|
-
}
|
|
2983
|
-
await handle.end({
|
|
2984
|
-
exitCode: result.test.exitCode,
|
|
2985
|
-
testsTotal: result.test.testsTotal,
|
|
2986
|
-
testsPassed: result.test.testsPassed,
|
|
2987
|
-
wallMs: result.totalWallMs,
|
|
2988
|
-
status: passed ? "ok" : "error"
|
|
2989
|
-
});
|
|
2990
|
-
} else {
|
|
2991
|
-
result.passed = true;
|
|
2992
|
-
result.score = 1;
|
|
2993
|
-
await handle.end({ wallMs: result.totalWallMs });
|
|
2994
|
-
}
|
|
2995
|
-
} catch (err) {
|
|
2996
|
-
await handle.fail(err instanceof Error ? err : String(err));
|
|
2997
|
-
throw err;
|
|
2998
|
-
}
|
|
2999
|
-
return result;
|
|
3000
|
-
}
|
|
3001
|
-
};
|
|
3002
|
-
|
|
3003
3692
|
// src/test-graded-scenario.ts
|
|
3004
3693
|
async function runTestGradedScenario(scenario, store, options = {}) {
|
|
3005
3694
|
const emitter = new TraceEmitter(store);
|
|
@@ -5284,8 +5973,11 @@ async function scoreProject(store, projectId) {
|
|
|
5284
5973
|
const runtimeScore = runtimeScores.length > 0 ? runtimeScores.reduce((a, b) => a + b, 0) / runtimeScores.length : null;
|
|
5285
5974
|
const runtimePassed = runtime.filter((r) => r.outcome?.pass === true).length;
|
|
5286
5975
|
const runtimePassRate = runtime.length > 0 ? runtimePassed / runtime.length : null;
|
|
5976
|
+
const kind = runtime.length === 0 ? "scaffold-only" : "full";
|
|
5977
|
+
const complete = kind === "scaffold-only" ? metaScore !== null && buildScore !== null : metaScore !== null && buildScore !== null && runtimeScore !== null;
|
|
5287
5978
|
return {
|
|
5288
5979
|
projectId,
|
|
5980
|
+
kind,
|
|
5289
5981
|
builderRunId: builder?.runId,
|
|
5290
5982
|
metaScore,
|
|
5291
5983
|
buildRunId: build?.runId,
|
|
@@ -5293,7 +5985,7 @@ async function scoreProject(store, projectId) {
|
|
|
5293
5985
|
appRuntimeRunIds: runtime.map((r) => r.runId),
|
|
5294
5986
|
runtimeScore,
|
|
5295
5987
|
runtimePassRate,
|
|
5296
|
-
complete
|
|
5988
|
+
complete
|
|
5297
5989
|
};
|
|
5298
5990
|
}
|
|
5299
5991
|
async function scoreAllProjects(store) {
|
|
@@ -6987,6 +7679,7 @@ async function euAiActReport(ctx, signals) {
|
|
|
6987
7679
|
}
|
|
6988
7680
|
export {
|
|
6989
7681
|
AgentDriver,
|
|
7682
|
+
AxGepaSteeringOptimizer,
|
|
6990
7683
|
BenchmarkRunner,
|
|
6991
7684
|
BudgetBreachError,
|
|
6992
7685
|
BudgetGuard,
|
|
@@ -6998,6 +7691,7 @@ export {
|
|
|
6998
7691
|
DEFAULT_MUTATORS,
|
|
6999
7692
|
DEFAULT_REDACTION_RULES,
|
|
7000
7693
|
DEFAULT_RED_TEAM_CORPUS,
|
|
7694
|
+
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
7001
7695
|
Dataset,
|
|
7002
7696
|
DockerSandboxDriver,
|
|
7003
7697
|
DualAgentBench,
|
|
@@ -7011,15 +7705,19 @@ export {
|
|
|
7011
7705
|
InMemoryOutcomeStore,
|
|
7012
7706
|
InMemoryTraceStore,
|
|
7013
7707
|
InMemoryWorkspaceInspector,
|
|
7708
|
+
JudgeRunner,
|
|
7014
7709
|
MODEL_PRICING,
|
|
7015
7710
|
MetricsCollector,
|
|
7016
7711
|
OTEL_AGENT_EVAL_SCOPE,
|
|
7712
|
+
OptimizationLoop,
|
|
7713
|
+
PairwiseSteeringOptimizer,
|
|
7017
7714
|
PrmGrader,
|
|
7018
7715
|
ProductClient,
|
|
7019
7716
|
ProjectRegistry,
|
|
7020
7717
|
PromptOptimizer,
|
|
7021
7718
|
PromptRegistry,
|
|
7022
7719
|
REDACTION_VERSION,
|
|
7720
|
+
RunCritic,
|
|
7023
7721
|
SandboxHarness,
|
|
7024
7722
|
ScenarioRegistry,
|
|
7025
7723
|
SubprocessSandboxDriver,
|
|
@@ -7028,6 +7726,7 @@ export {
|
|
|
7028
7726
|
TraceEmitter,
|
|
7029
7727
|
adversarialJudge,
|
|
7030
7728
|
aggregateLlm,
|
|
7729
|
+
aggregateRunScore,
|
|
7031
7730
|
analyzeAntiSlop,
|
|
7032
7731
|
analyzeSeries,
|
|
7033
7732
|
argHash,
|
|
@@ -7044,6 +7743,7 @@ export {
|
|
|
7044
7743
|
causalAttribution,
|
|
7045
7744
|
checkCanaries,
|
|
7046
7745
|
checkSlos,
|
|
7746
|
+
clamp01,
|
|
7047
7747
|
classifyEuAiRisk,
|
|
7048
7748
|
classifyFailure,
|
|
7049
7749
|
codeExecutionJudge,
|
|
@@ -7052,6 +7752,7 @@ export {
|
|
|
7052
7752
|
collectionPreserved,
|
|
7053
7753
|
commitBisect,
|
|
7054
7754
|
compareToBaseline,
|
|
7755
|
+
compilerJudge,
|
|
7055
7756
|
composeParsers,
|
|
7056
7757
|
composeValidators,
|
|
7057
7758
|
computeToolUseMetrics,
|
|
@@ -7062,8 +7763,10 @@ export {
|
|
|
7062
7763
|
createAntiSlopJudge,
|
|
7063
7764
|
createCustomJudge,
|
|
7064
7765
|
createDomainExpertJudge,
|
|
7766
|
+
createLlmReviewer,
|
|
7065
7767
|
crossTraceDiff,
|
|
7066
7768
|
defaultJudges,
|
|
7769
|
+
distillPlaybook,
|
|
7067
7770
|
dominates,
|
|
7068
7771
|
estimateCost,
|
|
7069
7772
|
estimateTokens,
|
|
@@ -7085,6 +7788,7 @@ export {
|
|
|
7085
7788
|
groupBy,
|
|
7086
7789
|
hashContent,
|
|
7087
7790
|
hashScenarios,
|
|
7791
|
+
inMemoryReviewStore,
|
|
7088
7792
|
interRaterReliability,
|
|
7089
7793
|
iqr,
|
|
7090
7794
|
isJudgeSpan,
|
|
@@ -7096,14 +7800,17 @@ export {
|
|
|
7096
7800
|
jestTestParser,
|
|
7097
7801
|
jsonHasKeys,
|
|
7098
7802
|
jsonShape,
|
|
7803
|
+
jsonlReviewStore,
|
|
7099
7804
|
judgeAgreementView,
|
|
7100
7805
|
judgeSpans,
|
|
7101
7806
|
keyPreserved,
|
|
7807
|
+
linterJudge,
|
|
7102
7808
|
llmSpanFromProvider,
|
|
7103
7809
|
llmSpans,
|
|
7104
7810
|
loadScorerFromGrader,
|
|
7105
7811
|
lowercaseMutator,
|
|
7106
7812
|
mannWhitneyU,
|
|
7813
|
+
mergeSteeringBundle,
|
|
7107
7814
|
nistAiRmfReport,
|
|
7108
7815
|
nonRefusalRubric,
|
|
7109
7816
|
normalizeScores,
|
|
@@ -7131,6 +7838,8 @@ export {
|
|
|
7131
7838
|
regressionView,
|
|
7132
7839
|
renderMarkdown,
|
|
7133
7840
|
renderMarkdownReport,
|
|
7841
|
+
renderPlaybookMarkdown,
|
|
7842
|
+
renderSteeringText,
|
|
7134
7843
|
replayScorerOverCorpus,
|
|
7135
7844
|
replayTraceThroughJudge,
|
|
7136
7845
|
requiredSampleSize,
|
|
@@ -7142,6 +7851,8 @@ export {
|
|
|
7142
7851
|
runE2EWorkflow,
|
|
7143
7852
|
runExpectations,
|
|
7144
7853
|
runFailureClass,
|
|
7854
|
+
runJudgeFleet,
|
|
7855
|
+
runProposeReview,
|
|
7145
7856
|
runSelfPlay,
|
|
7146
7857
|
runTestGradedScenario,
|
|
7147
7858
|
runsForScenario,
|
|
@@ -7149,6 +7860,7 @@ export {
|
|
|
7149
7860
|
scoreContinuity,
|
|
7150
7861
|
scoreProject,
|
|
7151
7862
|
scoreRedTeamOutput,
|
|
7863
|
+
securityJudge,
|
|
7152
7864
|
selfPreference,
|
|
7153
7865
|
sentenceReorderMutator,
|
|
7154
7866
|
signManifest,
|
|
@@ -7156,6 +7868,7 @@ export {
|
|
|
7156
7868
|
statusAdvanced,
|
|
7157
7869
|
stuckLoopView,
|
|
7158
7870
|
summarize,
|
|
7871
|
+
testJudge,
|
|
7159
7872
|
textInSnapshot,
|
|
7160
7873
|
toLangfuseEnvelope,
|
|
7161
7874
|
toNdjson,
|