@tangle-network/agent-eval 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2094,113 +2094,340 @@ function flatSamples(score) {
2094
2094
  return out;
2095
2095
  }
2096
2096
 
2097
- // src/dual-agent-bench.ts
2098
- var DualAgentBench = class {
2099
- async run(config) {
2100
- const maxRounds = config.maxRounds ?? 5;
2101
- const threshold = config.convergenceThreshold ?? 0.85;
2102
- if (config.scenarios.length === 0) {
2103
- throw new Error("DualAgentBench requires at least 1 scenario");
2104
- }
2105
- const results = [];
2106
- for (const scenario of config.scenarios) {
2107
- const history = [];
2108
- let converged = false;
2109
- let roundsToConverge = null;
2110
- let finalProposal = "";
2111
- let lastScore = 0;
2112
- let priorCritique;
2113
- for (let r = 0; r < maxRounds; r++) {
2114
- const priorProposal = history[history.length - 1]?.proposal;
2115
- const proposal = await config.propose({
2116
- scenario,
2117
- roundIndex: r,
2118
- priorProposal,
2119
- priorCritique
2120
- });
2121
- const { critique, convergenceScore } = await config.critique({
2122
- scenario,
2123
- roundIndex: r,
2124
- proposal
2125
- });
2126
- if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
2127
- throw new Error(
2128
- `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
2129
- );
2130
- }
2131
- const round = {
2132
- roundIndex: r,
2133
- proposal,
2134
- critique,
2135
- convergenceScore
2136
- };
2137
- history.push(round);
2138
- config.onRoundComplete?.({ scenarioId: scenario.id, round });
2139
- finalProposal = proposal;
2140
- lastScore = convergenceScore;
2141
- priorCritique = critique;
2142
- if (convergenceScore >= threshold) {
2143
- converged = true;
2144
- roundsToConverge = r + 1;
2145
- break;
2146
- }
2147
- }
2148
- results.push({
2149
- scenarioId: scenario.id,
2150
- converged,
2151
- roundsToConverge,
2152
- finalProposal,
2153
- history,
2154
- finalScore: lastScore
2155
- });
2097
+ // src/steering.ts
2098
/**
 * Overlay a steering-bundle delta on a base bundle without mutating either input.
 *
 * - `coderPrompt` / `continuePrompt` are replaced only when the delta defines them.
 * - `reviewerPrompts`, `rolePrompts` and `metadata` are shallow-merged (delta wins).
 * - `skills` is replaced wholesale by the delta's list when present.
 *
 * @param {object} base - Bundle supplying the defaults.
 * @param {object} delta - Partial bundle whose defined fields override `base`.
 * @returns {object} A new bundle; nested maps and the skills list are fresh copies.
 */
function mergeSteeringBundle(base, delta) {
  const skills = delta.skills ?? base.skills;
  return {
    ...base,
    ...(delta.coderPrompt !== void 0 ? { coderPrompt: delta.coderPrompt } : {}),
    ...(delta.continuePrompt !== void 0 ? { continuePrompt: delta.continuePrompt } : {}),
    reviewerPrompts: {
      ...(base.reviewerPrompts ?? {}),
      ...(delta.reviewerPrompts ?? {})
    },
    // Copy the list so that mutating the merged bundle cannot leak back into
    // `base` or `delta` (every other nested field is already a fresh object).
    skills: Array.isArray(skills) ? [...skills] : skills,
    rolePrompts: {
      ...(base.rolePrompts ?? {}),
      ...(delta.rolePrompts ?? {})
    },
    metadata: {
      ...(base.metadata ?? {}),
      ...(delta.metadata ?? {})
    }
  };
}
2118
/**
 * Serialise a steering bundle into a newline-separated text form.
 *
 * Reviewer prompts are emitted in name order and skills in sorted order, so the
 * output does not depend on the insertion order of the inputs.
 *
 * @param {object} bundle - Bundle with `id` and optional prompts / skills.
 * @returns {string} One `key:value` entry per line.
 */
function renderSteeringText(bundle) {
  const out = [];
  out.push(`bundle:${bundle.id}`);
  if (bundle.coderPrompt) {
    out.push(`coder:${bundle.coderPrompt}`);
  }
  if (bundle.continuePrompt) {
    out.push(`continue:${bundle.continuePrompt}`);
  }

  const reviewerPrompts = bundle.reviewerPrompts ?? {};
  const reviewerNames = Object.keys(reviewerPrompts);
  reviewerNames.sort((left, right) => left.localeCompare(right));
  reviewerNames.forEach((name) => {
    out.push(`reviewer:${name}:${reviewerPrompts[name]}`);
  });

  const sortedSkills = [...(bundle.skills ?? [])];
  sortedSkills.sort();
  if (sortedSkills.length > 0) {
    out.push(`skills:${sortedSkills.join(",")}`);
  }
  return out.join("\n");
}
2128
+
2129
+ // src/run-score.ts
2130
/**
 * Default per-dimension weights used by `aggregateRunScore`.
 * Negative weights turn a dimension into a penalty.
 */
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  costUsd: -0.2,
  wallSeconds: -0.1
};

/**
 * Collapse a multi-dimensional run score into one weighted number.
 *
 * Quality dimensions are clamped to [0, 1]; cost is taken in USD and wall time
 * in minutes, both floored at zero. Non-finite cost / wall-time values count as
 * zero so that one missing measurement cannot turn the whole aggregate into NaN.
 *
 * @param {object} score - Run score with the dimensions named in the defaults.
 * @param {object} [weights] - Partial overrides; `undefined` entries are ignored.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS };
  for (const [dimension, weight] of Object.entries(weights ?? {})) {
    // An explicitly-undefined override must not erase the default weight.
    if (weight !== void 0) w[dimension] = weight;
  }
  const costUsd = nonNegativeFinite(score.costUsd);
  const wallMinutes = nonNegativeFinite(score.wallSeconds) / 60;
  return (
    w.success * clamp01(score.success) +
    w.goalProgress * clamp01(score.goalProgress) +
    w.repoGroundedness * clamp01(score.repoGroundedness) +
    w.driftPenalty * clamp01(score.driftPenalty) +
    w.toolUseQuality * clamp01(score.toolUseQuality) +
    w.patchQuality * clamp01(score.patchQuality) +
    w.testReality * clamp01(score.testReality) +
    w.costUsd * costUsd +
    w.wallSeconds * wallMinutes
  );
}

/**
 * Clamp a value into [0, 1]; anything that is not a finite number becomes 0.
 * @param {number} value
 * @returns {number}
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}

/**
 * Floor a value at 0; anything that is not a finite number becomes 0.
 * @param {number} value
 * @returns {number}
 */
function nonNegativeFinite(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
2149
+
2150
+ // src/run-critic.ts
2151
/**
 * Text patterns treated as "drift" signals when they show up in LLM output or
 * event payloads. Used when the caller supplies no patterns of their own.
 */
var DEFAULT_DRIFT_PATTERNS = [
  /https?:\/\//i,
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  /\bnpm package usage\b/i,
  /\bnews\b/i
];

/**
 * Heuristic scorer that turns a recorded run trace into a multi-dimensional
 * run score (success, progress, groundedness, drift, tool use, patch and test
 * evidence, cost, wall time) plus human-readable notes.
 */
var RunCritic = class {
  weights;
  driftPatterns;

  /**
   * @param {object} [options]
   * @param {object} [options.weights] - Weight overrides forwarded to `aggregateRunScore`.
   * @param {RegExp[]} [options.driftPatterns] - Replacement for the default drift patterns.
   */
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }

  /**
   * Load a run and its spans, events, artifacts and budget from the store and score it.
   * @param {object} store - Trace store exposing getRun / spans / events / artifacts / budget.
   * @param {string} runId
   * @returns {Promise<object>} The run score.
   * @throws {Error} When the run does not exist.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new Error(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }

  /**
   * Score an already-loaded trace.
   * @param {{run: object, spans: object[], events: object[], artifacts: object[], budget: object[]}} trace
   * @returns {object} Score dimensions plus a `notes` array explaining zeroed dimensions.
   */
  scoreTrace(trace) {
    const notes = [];
    const llmSpans2 = trace.spans.filter((s) => s.kind === "llm");
    const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
    const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");

    // pass=true -> 1, merely completed -> 0.5, anything else -> 0.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    if (!success) notes.push("run did not complete with pass=true");

    const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
    // Outcome scores above 1 are read as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;

    const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
    if (toolSpans2.length === 0) notes.push("no tool spans recorded");

    // Artifacts and edit-like tool calls both count as patch evidence; 4 or more saturate the score.
    const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");

    // Prefer real sandbox pass rates; otherwise give partial credit when a tool call mentions a test/build command.
    const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
    const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");

    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
    const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);

    // Use the budget ledger only when it actually tracks USD; a ledger that only
    // tracks other dimensions must not mask the cost recorded on LLM spans.
    const usdConsumed = trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed);
    const costUsd = usdConsumed.length ? Math.max(...usdConsumed, 0) : llmSpans2.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;

    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      costUsd,
      wallSeconds,
      notes
    };
  }

  /**
   * Collapse a run score into a single number using this critic's weights.
   * @param {object} score
   * @returns {number}
   */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }

  /**
   * Whether any drift pattern matches the text.
   * @param {string} text
   * @returns {boolean}
   */
  isDrift(text) {
    return this.driftPatterns.some((pattern) => {
      // Caller-supplied patterns may carry the g/y flag, which makes test()
      // resume from the previous match position; always start from the beginning.
      pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
};
2224
/**
 * Bring a judge score onto the [0, 1] scale.
 * Scores above 1 are divided by 10 first; the result is then clamped.
 *
 * @param {number} score - Raw judge score.
 * @returns {number} Score in [0, 1].
 */
function normalizeJudgeScore(score) {
  const alreadyUnitScale = !(score > 1);
  const scaled = alreadyUnitScale ? score : score / 10;
  return clamp01(scaled);
}
2227
/**
 * Heuristic: does the text mention repository paths, config files, TypeScript
 * extensions, or common package-manager / test-runner commands?
 *
 * @param {string} text - Text to inspect (e.g. an LLM output).
 * @returns {boolean} True when at least one repo-related marker is present.
 */
function looksRepoGrounded(text) {
  const repoMarkers = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoMarkers.test(text);
}
2168
2230
 
2169
- // src/trace/schema.ts
2170
- var TRACE_SCHEMA_VERSION = "1.0.0";
2171
- var FAILURE_CLASSES = [
2172
- "success",
2173
- "reasoning_error",
2174
- "tool_selection_error",
2175
- "tool_argument_error",
2176
- "tool_recovery_failure",
2177
- "hallucination",
2178
- "instruction_following",
2179
- "safety_refusal_miss",
2180
- "policy_violation",
2181
- "budget_exceeded",
2182
- "format_drift",
2183
- "permission_escalation",
2184
- "pii_leak",
2185
- "cost_overrun",
2186
- "timeout",
2187
- "sandbox_failure",
2188
- "unknown"
2189
- ];
2190
- function isLlmSpan(s) {
2191
- return s.kind === "llm";
2231
+ // src/playbook.ts
2232
/**
 * Deduplicate playbook entries by normalised instruction and keep the strongest.
 *
 * For each distinct instruction the entry with the highest weight survives (the
 * first one seen wins ties); survivors are ordered by descending weight and
 * truncated to `maxEntries`.
 *
 * @param {object[]} entries - Candidate entries with `instruction` and optional `weight`.
 * @param {{maxEntries?: number}} [options] - `maxEntries` defaults to 12.
 * @returns {{entries: object[]}} The distilled playbook.
 */
function distillPlaybook(entries, options = {}) {
  const limit = options.maxEntries ?? 12;
  const strongest = new Map();

  for (const candidate of entries) {
    const dedupeKey = normalizeInstruction(candidate.instruction);
    const incumbent = strongest.get(dedupeKey);
    const candidateWins = !incumbent || (candidate.weight ?? 0) > (incumbent.weight ?? 0);
    if (!candidateWins) continue;
    strongest.set(dedupeKey, {
      ...candidate,
      instruction: canonicalInstruction(candidate.instruction)
    });
  }

  const ranked = [...strongest.values()];
  ranked.sort((left, right) => (right.weight ?? 0) - (left.weight ?? 0));
  return { entries: ranked.slice(0, limit) };
}
2245
/**
 * Render a playbook as a Markdown bullet list.
 *
 * Each entry becomes a bullet with its rationale, followed by the optional
 * category, evidence and source-run lines when those fields are set.
 *
 * @param {{entries: object[]}} playbook
 * @returns {string} Markdown text ending in exactly one newline.
 */
function renderPlaybookMarkdown(playbook) {
  const optionalFields = [
    ["Category", "category"],
    ["Evidence", "evidence"],
    ["Source run", "sourceRunId"]
  ];
  const out = ["# Playbook", ""];

  playbook.entries.forEach((entry) => {
    out.push(`- ${entry.instruction}`, ` Rationale: ${entry.rationale}`);
    for (const [label, field] of optionalFields) {
      if (entry[field]) out.push(` ${label}: ${entry[field]}`);
    }
    out.push("");
  });

  const body = out.join("\n").trim();
  return body + "\n";
}
2193
- function isToolSpan(s) {
2194
- return s.kind === "tool";
2257
/**
 * Build the comparison key for an instruction: trimmed, lower-cased, and with
 * every whitespace run collapsed to a single space.
 *
 * @param {string} value - Raw instruction text.
 * @returns {string} Normalised key.
 */
function normalizeInstruction(value) {
  const lowered = value.trim().toLowerCase();
  const collapsed = lowered.replace(/\s+/g, " ");
  return collapsed;
}
2196
- function isRetrievalSpan(s) {
2197
- return s.kind === "retrieval";
2260
/**
 * Produce the display form of an instruction: trimmed, whitespace runs
 * collapsed to one space, and the first character upper-cased.
 *
 * @param {string} value - Raw instruction text.
 * @returns {string} Display form; an empty string stays empty.
 */
function canonicalInstruction(value) {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (collapsed.length === 0) {
    return collapsed;
  }
  const head = collapsed.charAt(0);
  const tail = collapsed.slice(1);
  return head.toUpperCase() + tail;
}
2199
- function isJudgeSpan(s) {
2200
- return s.kind === "judge";
2264
+
2265
+ // src/optimization-loop.ts
2266
/**
 * Runs a prompt-optimizer comparison over steering bundles.
 *
 * Each bundle is rendered to text and handed to the optimizer as a variant;
 * every (variant, scenario, trial) evaluation is scored through the caller's
 * `evaluate` callback and collapsed with `aggregateRunScore`.
 */
var OptimizationLoop = class {
  optimizer;

  /**
   * @param {object} [optimizer] - Object exposing `run(config)`; defaults to a new PromptOptimizer.
   */
  constructor(optimizer = new PromptOptimizer()) {
    this.optimizer = optimizer;
  }

  /**
   * @param {object} config - `variants`, `examples`, `trialsPerScenario`, `evaluate`, optional `scoreWeights`.
   * @returns {Promise<object>} Winning bundle, significance flag, per-variant reports and pairwise results.
   * @throws {Error} When the optimizer asks for an unknown variant or scenario.
   */
  async run(config) {
    const bundlesById = new Map();
    for (const bundle of config.variants) {
      bundlesById.set(bundle.id, bundle);
    }

    const promptVariants = config.variants.map((bundle) => ({
      id: bundle.id,
      prompt: renderSteeringText(bundle),
      metadata: { bundle }
    }));
    const scenarioIds = config.examples.map((example) => example.scenarioId);

    const scoreVariant = async ({ variant, scenarioId, trialIndex }) => {
      const bundle = bundlesById.get(variant.id);
      if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
      const example = config.examples.find((item) => item.scenarioId === scenarioId);
      if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
      const runScore = await config.evaluate({ variant: bundle, example, trialIndex });
      return aggregateRunScore(runScore, config.scoreWeights);
    };

    const outcome = await this.optimizer.run({
      variants: promptVariants,
      scenarioIds,
      trialsPerScenario: config.trialsPerScenario,
      scoreVariant
    });

    const toReport = (entry) => ({
      variantId: entry.variantId,
      bundle: bundlesById.get(entry.variantId),
      mean: entry.mean,
      ci95: entry.ci95,
      scenarioScores: entry.perScenario
    });

    return {
      winner: bundlesById.get(outcome.winner.variantId),
      significant: outcome.winner.significant,
      reports: outcome.scores.map(toReport),
      pairwise: outcome.pairwise
    };
  }
};
2304
+
2305
+ // src/steering-optimizer.ts
2306
/**
 * Baseline steering optimizer: recommends the variant with the highest mean
 * aggregate score across the supplied rows.
 */
var PairwiseSteeringOptimizer = class {
  /**
   * @param {object[]} rows - Scored runs carrying `variantId` and `score`.
   * @param {{weights?: object}} [config] - Optional weight overrides for aggregation.
   * @returns {object} Recommendation with backend id, rationale and full rankings.
   * @throws {Error} When there are no rows to rank.
   */
  optimize(rows, config = {}) {
    const rankings = rankRows(rows, config.weights);
    if (rankings.length === 0) {
      throw new Error("no steering optimization rows");
    }
    const [leader] = rankings;
    return {
      backend: "pairwise",
      recommendedVariantId: leader.variantId,
      rationale: `Highest observed mean aggregate across ${rows.length} scored run(s).`,
      rankings
    };
  }
};
2318
/**
 * Steering optimizer that trains a variant selector with AxGEPA from the
 * optional `@ax-llm/ax` package, on top of the pairwise ranking.
 *
 * The recommended variant and rankings always come from
 * PairwiseSteeringOptimizer; this class adds `backend: "ax-gepa"` plus either a
 * `skipped` flag with a reason or a `selector` description.
 */
var AxGepaSteeringOptimizer = class {
  /**
   * @param {object} config - Read here: `minRows`, `weights`, `provider`, `apiKey`,
   *   `model`, `teacherModel`. The whole object is also passed to the pairwise optimizer.
   */
  constructor(config) {
    this.config = config;
  }
  config;
  /**
   * @param {object[]} rows - Scored runs carrying `variantId`, `scenarioId`, `score`, optional `metadata`.
   * @returns {Promise<object>} Pairwise result extended with AxGEPA fields.
   */
  async optimize(rows) {
    // Computed first, so an empty `rows` throws from the pairwise optimizer before anything else runs.
    const fallback = new PairwiseSteeringOptimizer().optimize(rows, this.config);
    const minRows = this.config.minRows ?? 6;
    const variantIds = [...new Set(rows.map((row) => row.variantId))];
    // One training example per scenario: the best-scoring row of that scenario.
    const byScenario = collapseScenarioWinners(rows, this.config.weights);
    if (variantIds.length < 2 || byScenario.length < minRows) {
      return {
        ...fallback,
        backend: "ax-gepa",
        skipped: true,
        rationale: `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
      };
    }
    // The library is loaded lazily; any import failure degrades to the pairwise answer.
    let axLib;
    try {
      axLib = await import("@ax-llm/ax");
    } catch {
      return {
        ...fallback,
        backend: "ax-gepa",
        skipped: true,
        rationale: "AxGEPA unavailable: install @ax-llm/ax to enable selector optimization."
      };
    }
    const { ai, ax, AxGEPA } = axLib;
    // NOTE(review): variant ids are interpolated verbatim into the signature string;
    // ids containing quotes or commas would presumably break it - confirm against the ax signature syntax.
    const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
    const selector = ax(signature, {
      description: "Choose the best steering bundle variant for an autopilot task."
    });
    // First 80% of the scenario winners (at least one) train; the remainder validates.
    const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8));
    const train = byScenario.slice(0, splitIndex);
    const validation = byScenario.slice(splitIndex);
    if (!validation.length) {
      return {
        ...fallback,
        backend: "ax-gepa",
        skipped: true,
        rationale: "AxGEPA skipped: no validation examples after split."
      };
    }
    const optimizer = new AxGEPA({
      studentAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.model),
      teacherAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.teacherModel ?? this.config.model),
      numTrials: 8,
      minibatch: true,
      minibatchSize: 4,
      earlyStoppingTrials: 3,
      sampleCount: 1
    });
    // Metric: 1 when the predicted variant id equals the example's winner, else 0.
    // Errors thrown by compile() are not caught here and propagate to the caller.
    const compiled = await optimizer.compile(
      selector,
      train,
      (({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0),
      {
        validationExamples: validation,
        maxMetricCalls: 64
      }
    );
    selector.applyOptimization(compiled.optimizedProgram);
    // The trained selector object itself is not returned; only its description is.
    return {
      ...fallback,
      backend: "ax-gepa",
      rationale: `AxGEPA trained a variant selector from ${byScenario.length} scored scenario winner(s); default winner remains ${fallback.recommendedVariantId}.`,
      selector: {
        backend: "ax-gepa",
        signature,
        labels: variantIds,
        rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
      }
    };
  }
};
2395
/**
 * Rank variants by the mean aggregate score of their rows.
 *
 * @param {object[]} rows - Scored runs carrying `variantId` and `score`.
 * @param {object} [weights] - Weight overrides forwarded to `aggregateRunScore`.
 * @returns {{variantId: string, mean: number, runs: number}[]} Rankings, best mean first.
 */
function rankRows(rows, weights) {
  const aggregatesByVariant = new Map();
  for (const row of rows) {
    const aggregates = aggregatesByVariant.get(row.variantId) ?? [];
    aggregates.push(aggregateRunScore(row.score, weights));
    aggregatesByVariant.set(row.variantId, aggregates);
  }

  const rankings = [];
  for (const [variantId, aggregates] of aggregatesByVariant) {
    let total = 0;
    for (const aggregate of aggregates) {
      total += aggregate;
    }
    rankings.push({
      variantId,
      mean: total / aggregates.length,
      runs: aggregates.length
    });
  }
  rankings.sort((left, right) => right.mean - left.mean);
  return rankings;
}
2408
/**
 * Reduce scored rows to one example per scenario: the row with the highest
 * aggregate score, described by its task text, split, seed preview and variant.
 *
 * @param {object[]} rows - Scored runs carrying `scenarioId`, `variantId`, `score`, optional `metadata`.
 * @param {object} [weights] - Weight overrides forwarded to `aggregateRunScore`.
 * @returns {{task: string, split: string, seedPreview: string, variantId: string}[]}
 */
function collapseScenarioWinners(rows, weights) {
  const rowsByScenario = new Map();
  for (const row of rows) {
    const group = rowsByScenario.get(row.scenarioId) ?? [];
    group.push(row);
    rowsByScenario.set(row.scenarioId, group);
  }

  const winners = [];
  for (const [scenarioId, candidates] of rowsByScenario) {
    const scored = candidates.map((row) => ({
      row,
      aggregate: aggregateRunScore(row.score, weights)
    }));
    scored.sort((left, right) => right.aggregate - left.aggregate);
    const top = scored[0].row;
    const meta = top.metadata;
    winners.push({
      task: String(meta?.task ?? meta?.seed_preview ?? scenarioId),
      split: String(meta?.split ?? "train"),
      seedPreview: String(meta?.seed_preview ?? ""),
      variantId: top.variantId
    });
  }
  return winners;
}
2202
- function isSandboxSpan(s) {
2203
- return s.kind === "sandbox";
2425
/**
 * Build an AI service by calling the supplied factory with provider settings.
 *
 * @param {Function} aiFactory - Factory called with `{ name, apiKey, config: { model } }`.
 * @param {string} provider - Passed as `name`.
 * @param {string} apiKey
 * @param {string} model
 * @returns {*} Whatever the factory returns.
 */
function createAxService(aiFactory, provider, apiKey, model) {
  const serviceOptions = {
    name: provider,
    apiKey,
    config: { model }
  };
  return aiFactory(serviceOptions);
}
2205
2432
 
2206
2433
  // src/trace/store.ts
@@ -2597,58 +2824,695 @@ function llmSpanFromProvider(args) {
2597
2824
  };
2598
2825
  }
2599
2826
 
2600
- // src/trace/query.ts
2601
- async function runsForScenario(store, scenarioId) {
2602
- return store.listRuns({ scenarioId });
2603
- }
2604
- async function llmSpans(store, runId) {
2605
- const spans = await store.spans({ runId, kind: "llm" });
2606
- return spans.filter(isLlmSpan);
2607
- }
2608
- async function toolSpans(store, runId, toolName) {
2609
- const spans = await store.spans({ runId, kind: "tool", toolName });
2610
- return spans.filter(isToolSpan);
2611
- }
2612
- async function judgeSpans(store, runId) {
2613
- const spans = await store.spans({ runId, kind: "judge" });
2614
- return spans.filter(isJudgeSpan);
2615
- }
2616
- function groupBy(items, key) {
2617
- const map = /* @__PURE__ */ new Map();
2618
- for (const item of items) {
2619
- const k = key(item);
2620
- let bucket = map.get(k);
2621
- if (!bucket) {
2622
- bucket = [];
2623
- map.set(k, bucket);
2827
+ // src/sandbox-harness.ts
2828
/**
 * Test-output parser for the Vitest summary line, e.g.
 * `Tests  2 failed | 3 passed (5)` or `Tests  4 passed (4)`.
 */
var vitestTestParser = {
  id: "vitest",
  /**
   * @param {string} stdout - Captured test-run output.
   * @returns {{testsTotal: number, testsPassed: number} | undefined} Counts, or undefined when no summary is found.
   */
  parse(stdout) {
    const summary = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
    if (!summary) return void 0;

    // The summary carries one or two "<count> <label>" segments.
    const segments = [[summary[1], summary[2]]];
    if (summary[3] && summary[4]) {
      segments.push([summary[3], summary[4]]);
    }

    const tally = { passed: 0, failed: 0 };
    for (const [count, label] of segments) {
      const bucket = label.toLowerCase() === "passed" ? "passed" : "failed";
      tally[bucket] += parseInt(count, 10);
    }
    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
  }
};
2847
/**
 * Test-output parser for pytest: the total comes from the `collected N items`
 * line and the pass count from the `N passed` summary fragment.
 */
var pytestTestParser = {
  id: "pytest",
  /**
   * @param {string} stdout - Captured pytest output.
   * @returns {{testsTotal: number, testsPassed: number} | undefined}
   *   Counts, or undefined when no `collected N items` line is present.
   */
  parse(stdout) {
    const collected = stdout.match(/collected\s+(\d+)\s+items?/i);
    if (!collected) return void 0;
    // When every test fails the summary has no "N passed" fragment at all; that
    // means zero passes, not "unparseable" - keep the collected total.
    const passed = stdout.match(/(\d+)\s+passed/);
    return {
      testsTotal: parseInt(collected[1], 10),
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
2856
/**
 * Test-output parser for the Jest summary line, e.g.
 * `Tests:       1 failed, 1 skipped, 3 passed, 5 total`.
 */
var jestTestParser = {
  id: "jest",
  /**
   * @param {string} stdout - Captured Jest output.
   * @returns {{testsTotal: number, testsPassed: number} | undefined}
   *   Counts, or undefined when no `Tests: ... N total` line is present.
   */
  parse(stdout) {
    // Anchor on the total and read the pass count from the same line, so that
    // extra segments (skipped, todo) or a missing "passed" segment (every test
    // failed) do not make the summary unparseable.
    const summary = stdout.match(/Tests:\s+([^\n]*?)(\d+)\s+total/i);
    if (!summary) return void 0;
    const passed = summary[1].match(/(\d+)\s+passed/i);
    return {
      testsTotal: parseInt(summary[2], 10),
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
2864
/**
 * Chain several test-output parsers into one that returns the first truthy
 * result, trying the parsers in the order given.
 *
 * @param {...{id: string, parse: Function}} parsers
 * @returns {{id: string, parse: Function}} Combined parser whose id joins the member ids with "|".
 */
function composeParsers(...parsers) {
  const combinedId = parsers.map((parser) => parser.id).join("|");
  return {
    id: combinedId,
    parse(stdout, stderr, exitCode) {
      for (let index = 0; index < parsers.length; index += 1) {
        const outcome = parsers[index].parse(stdout, stderr, exitCode);
        if (outcome) {
          return outcome;
        }
      }
      return void 0;
    }
  };
}
2650
- function runFailureClass(run) {
2651
- if (run.outcome?.failureClass) return run.outcome.failureClass;
2876
/**
 * Sandbox driver that runs each phase command as a local shell subprocess.
 */
var SubprocessSandboxDriver = class {
  id = "subprocess";
  /**
   * Run one command and capture its output.
   *
   * The returned promise always resolves (spawn errors are reported through
   * `exitCode: 127` rather than by rejecting).
   *
   * @param {string} phase - Phase label echoed into the result; test parsing only runs for "test".
   * @param {string} command - Command line, executed through a shell.
   * @param {object} config - Reads `cwd`, `env`, `timeoutMs` and `testParser`.
   * @returns {Promise<object>} `{ phase, exitCode, stdout, stderr, wallMs, testsTotal?, testsPassed? }`.
   */
  async exec(phase, command, config) {
    const { spawn } = await import("child_process");
    const start = Date.now();
    return await new Promise((resolve) => {
      // config.env is layered on top of the parent process environment.
      const child = spawn(command, {
        shell: true,
        cwd: config.cwd,
        env: { ...process.env, ...config.env ?? {} }
      });
      let stdout = "";
      let stderr = "";
      child.stdout?.on("data", (d) => {
        stdout += String(d);
      });
      child.stderr?.on("data", (d) => {
        stderr += String(d);
      });
      // Hard kill once the timeout elapses (default 10 * 6e4 ms = 10 minutes).
      // The result does not carry a dedicated timed-out flag.
      const timeout = setTimeout(() => {
        try {
          child.kill("SIGKILL");
        } catch {
        }
      }, config.timeoutMs ?? 10 * 6e4);
      child.on("close", (code) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        // A null exit code is reported as 1.
        const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
        resolve({
          phase,
          exitCode: code ?? 1,
          stdout,
          stderr,
          wallMs,
          testsTotal: parsed?.testsTotal,
          testsPassed: parsed?.testsPassed
        });
      });
      // Failure reported by the child process object itself: exit code 127, error text appended to stderr.
      child.on("error", (err) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
      });
    });
  }
};
2923
/**
 * Sandbox driver that wraps each command in `docker run --rm <image> sh -c ...`
 * and delegates execution to a SubprocessSandboxDriver.
 */
var DockerSandboxDriver = class {
  id = "docker";

  /**
   * @param {string} phase - Phase label forwarded to the subprocess driver.
   * @param {string} command - Command to run inside the container.
   * @param {object} config - Must carry `image`; `env` entries become `-e` flags.
   * @returns {Promise<object>} The subprocess driver's result.
   * @throws {Error} When `config.image` is missing.
   */
  async exec(phase, command, config) {
    if (!config.image) {
      throw new Error("DockerSandboxDriver requires config.image");
    }
    const delegate = new SubprocessSandboxDriver();

    const envFlags = [];
    for (const [key, value] of Object.entries(config.env ?? {})) {
      envFlags.push(`-e ${shellQuote(key)}=${shellQuote(value)}`);
    }
    const dockerCommand = `docker run --rm ${envFlags.join(" ")} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;

    // The env has been turned into -e flags, so it is cleared for the delegate call.
    return delegate.exec(phase, dockerCommand, { ...config, env: void 0 });
  }
};
2933
/**
 * Quote a value for safe interpolation into a POSIX shell command line.
 *
 * Values made only of safe characters are returned bare; everything else
 * (including the empty string) is wrapped in single quotes, with embedded
 * single quotes escaped as `'\''`.
 *
 * @param {*} v - Value to quote; non-strings are converted with `String()`.
 * @returns {string} Shell-safe token.
 */
function shellQuote(v) {
  // Coerce first: callers pass env values that may be numbers or booleans,
  // which have no .replace() and must still come back as strings.
  const text = String(v);
  if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(text)) return text;
  return `'${text.replace(/'/g, `'\\''`)}'`;
}
2937
/**
 * Runs the setup -> run -> test phases of a sandbox config through a driver and
 * records the outcome on a sandbox span obtained from the trace emitter.
 */
var SandboxHarness = class {
  driver;

  /**
   * @param {object} [driver] - Object exposing `id` and `exec(phase, command, config)`;
   *   defaults to a new SubprocessSandboxDriver.
   */
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }

  /**
   * Execute the configured phases in order, stopping at the first failing
   * setup/run phase.
   *
   * @param {object} config - Reads `image`, `setupCommand`, `runCommand`, `testCommand`.
   * @param {object} emitter - Exposes `sandbox(info)` returning a handle with `end` / `fail`.
   * @returns {Promise<object>} `{ passed, score, totalWallMs, setup?, run?, test? }`.
   * @throws Re-throws any error raised by a phase after marking the span as failed.
   */
  async run(config, emitter) {
    const commandChain = [config.setupCommand, config.runCommand, config.testCommand]
      .filter(Boolean)
      .join(" && ");
    const handle = await emitter.sandbox({
      name: `sandbox(${this.driver.id})`,
      image: config.image,
      command: commandChain
    });
    const result = { passed: false, totalWallMs: 0, score: 0 };

    try {
      // Setup and run are gates: a non-zero exit ends the whole harness run.
      for (const phase of ["setup", "run"]) {
        const command = config[`${phase}Command`];
        if (!command) continue;
        const outcome = await this.driver.exec(phase, command, config);
        result[phase] = outcome;
        result.totalWallMs += outcome.wallMs;
        if (outcome.exitCode !== 0) {
          await handle.fail(`${phase} failed (exit ${outcome.exitCode})`, {
            exitCode: outcome.exitCode,
            wallMs: result.totalWallMs
          });
          return result;
        }
      }

      if (!config.testCommand) {
        // Nothing to test: getting this far counts as a full pass.
        result.passed = true;
        result.score = 1;
        await handle.end({ wallMs: result.totalWallMs });
        return result;
      }

      const testOutcome = await this.driver.exec("test", config.testCommand, config);
      result.test = testOutcome;
      result.totalWallMs += testOutcome.wallMs;
      const exitedCleanly = testOutcome.exitCode === 0;
      result.passed = exitedCleanly;

      // Prefer the parsed pass ratio; fall back to the exit code when no counts were parsed.
      const hasCounts = testOutcome.testsTotal !== void 0 && testOutcome.testsTotal > 0;
      if (hasCounts) {
        result.score = (testOutcome.testsPassed ?? 0) / testOutcome.testsTotal;
      } else {
        result.score = exitedCleanly ? 1 : 0;
      }

      await handle.end({
        exitCode: testOutcome.exitCode,
        testsTotal: testOutcome.testsTotal,
        testsPassed: testOutcome.testsPassed,
        wallMs: result.totalWallMs,
        status: exitedCleanly ? "ok" : "error"
      });
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
3001
+
3002
+ // src/judge-runner.ts
3003
/**
 * Runs a single judge spec through a SandboxHarness and records the run in a
 * fresh in-memory trace store.
 */
var JudgeRunner = class {
  driver;

  /**
   * @param {object} [driver] - Sandbox driver; defaults to a new SubprocessSandboxDriver.
   */
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }

  /**
   * @param {{id: string, kind: string, config: object}} spec - Judge to execute.
   * @returns {Promise<object>} `{ id, kind, passed, score, summary, detail }`.
   */
  async run(spec) {
    const traceStore = new InMemoryTraceStore();
    const emitter = new TraceEmitter(traceStore, { runId: `judge-${spec.id}` });
    await emitter.startRun({
      scenarioId: spec.id,
      layer: "meta",
      projectId: "judge-runner"
    });

    const detail = await new SandboxHarness(this.driver).run(spec.config, emitter);
    await emitter.endRun({
      pass: detail.passed,
      score: detail.score,
      notes: `${spec.kind} judge`
    });

    const verdict = {
      id: spec.id,
      kind: spec.kind,
      passed: detail.passed,
      score: detail.score,
      summary: renderJudgeSummary(spec.kind, detail),
      detail
    };
    return verdict;
  }
};
3029
/**
 * Run a list of judge specs with one shared JudgeRunner.
 *
 * Judges run concurrently unless `options.parallel` is exactly `false`, in
 * which case they run one after another. Results keep the order of `specs`.
 *
 * @param {object[]} specs - Judge specs.
 * @param {{driver?: object, parallel?: boolean}} [options]
 * @returns {Promise<object[]>} One judge result per spec.
 */
async function runJudgeFleet(specs, options = {}) {
  const runner = new JudgeRunner(options.driver);
  const sequential = options.parallel === false;

  if (!sequential) {
    const pending = specs.map((spec) => runner.run(spec));
    return await Promise.all(pending);
  }

  const collected = [];
  for (const spec of specs) {
    const verdict = await runner.run(spec);
    collected.push(verdict);
  }
  return collected;
}
3038
/**
 * Build a judge spec of kind "compiler".
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}}
 */
function compilerJudge(id, config) {
  const kind = "compiler";
  return { id, kind, config };
}

/**
 * Build a judge spec of kind "test".
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}}
 */
function testJudge(id, config) {
  const kind = "test";
  return { id, kind, config };
}

/**
 * Build a judge spec of kind "linter".
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}}
 */
function linterJudge(id, config) {
  const kind = "linter";
  return { id, kind, config };
}

/**
 * Build a judge spec of kind "security".
 * @param {string} id
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}}
 */
function securityJudge(id, config) {
  const kind = "security";
  return { id, kind, config };
}
3050
/**
 * One-line human-readable summary of a judge outcome.
 *
 * @param {string} kind - Judge kind, used as the message prefix.
 * @param {object} detail - Harness result; reads `passed` and `test.testsTotal` / `test.testsPassed`.
 * @returns {string} Failure message, pass message with counts, or plain pass message.
 */
function renderJudgeSummary(kind, detail) {
  const prefix = `${kind} judge`;
  if (!detail.passed) {
    return `${prefix} failed`;
  }
  const counts = detail.test;
  if (counts?.testsTotal) {
    return `${prefix} passed ${counts.testsPassed}/${counts.testsTotal} tests`;
  }
  return `${prefix} passed`;
}
3055
+
3056
+ // src/dual-agent-bench.ts
3057
/**
 * Propose/critique benchmark: for every scenario a proposer and a critic
 * alternate until the critic's convergence score reaches the threshold or the
 * round budget is spent.
 */
var DualAgentBench = class {
  /**
   * @param {object} config
   * @param {object[]} config.scenarios - Scenarios carrying an `id`; at least one is required.
   * @param {Function} config.propose - Async `({scenario, roundIndex, priorProposal, priorCritique}) => proposal`.
   * @param {Function} config.critique - Async `({scenario, roundIndex, proposal}) => ({critique, convergenceScore})`.
   * @param {number} [config.maxRounds=5]
   * @param {number} [config.convergenceThreshold=0.85]
   * @param {Function} [config.onRoundComplete] - Called after every round with `{scenarioId, round}`.
   * @returns {Promise<object>} Per-scenario results, aggregate statistics and the effective config.
   * @throws {Error} On an empty scenario list or a convergence score outside [0, 1].
   */
  async run(config) {
    const maxRounds = config.maxRounds ?? 5;
    const convergenceThreshold = config.convergenceThreshold ?? 0.85;
    if (config.scenarios.length === 0) {
      throw new Error("DualAgentBench requires at least 1 scenario");
    }

    // Runs the propose/critique loop for a single scenario.
    const debate = async (scenario) => {
      const history = [];
      let roundsToConverge = null;
      let finalProposal = "";
      let finalScore = 0;
      let priorCritique;

      for (let r = 0; r < maxRounds && roundsToConverge === null; r += 1) {
        const previousRound = history[history.length - 1];
        const proposal = await config.propose({
          scenario,
          roundIndex: r,
          priorProposal: previousRound?.proposal,
          priorCritique
        });
        const { critique, convergenceScore } = await config.critique({
          scenario,
          roundIndex: r,
          proposal
        });

        const inRange = Number.isFinite(convergenceScore) && convergenceScore >= 0 && convergenceScore <= 1;
        if (!inRange) {
          throw new Error(
            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
          );
        }

        const round = { roundIndex: r, proposal, critique, convergenceScore };
        history.push(round);
        config.onRoundComplete?.({ scenarioId: scenario.id, round });

        finalProposal = proposal;
        finalScore = convergenceScore;
        priorCritique = critique;
        if (convergenceScore >= convergenceThreshold) {
          roundsToConverge = r + 1;
        }
      }

      return {
        scenarioId: scenario.id,
        converged: roundsToConverge !== null,
        roundsToConverge,
        finalProposal,
        history,
        finalScore
      };
    };

    const scenarios = [];
    for (const scenario of config.scenarios) {
      scenarios.push(await debate(scenario));
    }

    const total = (items, pick) => items.reduce((acc, item) => acc + pick(item), 0);
    const convergedOnly = scenarios.filter((entry) => entry.converged);
    const convergenceRate = scenarios.length ? convergedOnly.length / scenarios.length : 0;
    const avgRoundsToConverge = convergedOnly.length
      ? total(convergedOnly, (entry) => entry.roundsToConverge ?? 0) / convergedOnly.length
      : null;
    const avgFinalScore = scenarios.length
      ? total(scenarios, (entry) => entry.finalScore) / scenarios.length
      : 0;

    return {
      scenarios,
      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
      config: { maxRounds, convergenceThreshold }
    };
  }
};
3127
+
3128
+ // src/propose-review.ts
3129
+ import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
3130
+ import { dirname } from "path";
3131
/**
 * Review store that keeps entries in a private in-process array.
 *
 * @param {Iterable<object>} [initial] - Entries to seed the store with (copied, not retained).
 * @returns {{load: Function, append: Function}} Async `load()` returns a copy of all
 *   entries; async `append(entry)` adds one.
 */
function inMemoryReviewStore(initial = []) {
  const log = [...initial];
  const snapshot = () => [...log];
  return {
    async load() {
      return snapshot();
    },
    async append(entry) {
      log.push(entry);
    }
  };
}
3142
/**
 * Create a review store persisted as a JSON Lines file (one JSON object per line).
 *
 * @param {string} path File to read from / append to. Parent directories are created on append.
 * @returns {{ load: () => Promise<object[]>, append: (entry: object) => Promise<void> }}
 *   `load` resolves to every well-formed object entry in the file ([] when the file is absent);
 *   `append` serializes one entry and adds it as a new line.
 */
function jsonlReviewStore(path) {
  return {
    async load() {
      if (!existsSync(path)) return [];
      const entries = [];
      for (const line of readFileSync(path, "utf8").split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        let parsed;
        try {
          parsed = JSON.parse(trimmed);
        } catch {
          // Deliberate best-effort: a torn or corrupt line (e.g. a crash mid-append)
          // must not make the whole memory file unreadable.
          continue;
        }
        // Valid JSON that is not an object (null, 42, "text", [..]) is not a review entry;
        // returning it would crash consumers that read entry fields.
        if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
          entries.push(parsed);
        }
      }
      return entries;
    },
    async append(entry) {
      mkdirSync(dirname(path), { recursive: true });
      appendFileSync(path, JSON.stringify(entry) + "\n");
    }
  };
}
3164
// Instruction handed to the worker when the reviewer fails and memory holds no earlier instruction.
var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Run a bounded propose -> verify -> review loop.
 *
 * Each shot calls `config.propose`, then `config.verify` on the resulting state. When
 * verification fails, `config.review` supplies the directive for the next shot; if the
 * reviewer throws, the latest instruction found in memory (or the fallback instruction)
 * is reused with confidence 0.3. Every shot is appended to the review memory.
 *
 * The loop stops when verification passes, the reviewer sets `shouldContinue` to false,
 * `confidenceFloorWindow` consecutive reviews are at or below `confidenceFloor`,
 * `maxShots` shots have run, or the wall-clock timer fired before a shot started.
 *
 * @param {object} config
 * @param {string} config.goal Goal text passed to propose/review.
 * @param {*} config.initialState State handed to the first shot.
 * @param {Function} config.propose async ({ shot, goal, state, priorReview, abortSignal, emitter }) => { state, traceSummary? }
 * @param {Function} config.verify async (state) => { pass, score?, failingLayers?, details? }
 * @param {Function} config.review async ({ shot, goal, state, verification, traceSummary, memory }) => review
 * @param {number} [config.maxShots=10]
 * @param {number} [config.maxWallMs=600000]
 * @param {number} [config.confidenceFloor=0.3]
 * @param {number} [config.confidenceFloorWindow=2] 0 disables the low-confidence stop.
 * @param {object} [config.memory] Review store with `load`/`append`; defaults to an in-memory store.
 * @param {string} [config.fallbackInstruction]
 * @param {object} [config.store] Trace store; when given, the run and each shot are traced.
 * @returns {Promise<object>} { runId, completed, shots, finalState, finalVerification, failureClass, wallMs, score }
 * @throws Rethrows any error raised inside the loop (propose, verify, memory). When tracing
 *   is enabled the trace run is closed with a failing outcome first.
 */
async function runProposeReview(config) {
  const maxShots = config.maxShots ?? 10;
  const maxWallMs = config.maxWallMs ?? 10 * 60 * 1e3;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
  const emitter = config.store ? new TraceEmitter(config.store) : null;
  if (emitter) {
    await emitter.startRun({
      scenarioId: config.scenarioId ?? "propose-review",
      projectId: config.projectId,
      variantId: config.variantId,
      layer: "meta",
      tags: {
        goal: config.goal.slice(0, 120),
        maxShots: String(maxShots)
      }
    });
  }
  const abort = new AbortController();
  const wallStart = Date.now();
  const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), maxWallMs);
  const shots = [];
  let state = config.initialState;
  let priorReview = null;
  let lastVerification = { pass: false };
  let failureClass;
  let completed = false;
  let lowConfidenceStreak = 0;
  // Score derived from the most recent verification: 1 on pass, else its numeric score, else 0.
  const currentScore = () => lastVerification.pass ? 1 : typeof lastVerification.score === "number" ? lastVerification.score : 0;
  try {
    for (let shot = 1; shot <= maxShots; shot++) {
      // The wall timer is only honoured between shots; a running shot sees it via abortSignal.
      if (abort.signal.aborted) {
        failureClass = "timeout";
        break;
      }
      const shotStart = Date.now();
      const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
      let proposeOut;
      try {
        proposeOut = await config.propose({
          shot,
          goal: config.goal,
          state,
          priorReview,
          abortSignal: abort.signal,
          emitter: emitter ?? void 0
        });
      } catch (err) {
        await shotHandle?.fail(err instanceof Error ? err : String(err));
        failureClass = "unknown";
        throw err;
      }
      state = proposeOut.state;
      const traceSummary = proposeOut.traceSummary;
      let verification;
      try {
        verification = await config.verify(state);
      } catch (err) {
        await shotHandle?.fail(err instanceof Error ? err : String(err));
        failureClass = "unknown";
        throw err;
      }
      lastVerification = verification;
      const memorySnapshot = await memory.load();
      const verificationDigest = {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers ?? []
      };
      let review;
      let reviewAvailable = true;
      let reviewError;
      if (verification.pass) {
        // Nothing to direct once verification passes, so the reviewer is not called.
        review = {
          observations: "verification passed \u2014 skipping reviewer LLM call",
          diagnosis: "no failures to diagnose",
          nextShotInstruction: "(done)",
          shouldContinue: false,
          confidence: 1
        };
      } else {
        try {
          review = await config.review({
            shot,
            goal: config.goal,
            state,
            verification,
            traceSummary,
            memory: memorySnapshot
          });
          review = coerceReview(review);
        } catch (err) {
          // Reviewer failure is not fatal: keep going on the last known instruction.
          reviewAvailable = false;
          reviewError = err instanceof Error ? err.message : String(err);
          const lastInstruction = memorySnapshot.length > 0 ? memorySnapshot[memorySnapshot.length - 1].nextShotInstruction : fallbackInstruction;
          review = {
            observations: "(reviewer unavailable \u2014 using last-known instruction)",
            diagnosis: reviewError,
            nextShotInstruction: lastInstruction,
            shouldContinue: true,
            confidence: 0.3
          };
        }
      }
      const entry = {
        shot,
        timestamp: Date.now(),
        ...review,
        verification: verificationDigest
      };
      await memory.append(entry);
      const shotRecord = {
        shot,
        state,
        verification,
        traceSummary,
        review,
        reviewAvailable,
        reviewError,
        durationMs: Date.now() - shotStart
      };
      shots.push(shotRecord);
      await shotHandle?.end({
        attributes: {
          verificationPass: verification.pass,
          verificationScore: verification.score ?? null,
          reviewShouldContinue: review.shouldContinue,
          reviewConfidence: review.confidence,
          reviewAvailable
        }
      });
      if (verification.pass) {
        completed = true;
        break;
      }
      if (!review.shouldContinue) {
        break;
      }
      if (confidenceFloorWindow > 0 && review.confidence <= confidenceFloor) {
        lowConfidenceStreak += 1;
        if (lowConfidenceStreak >= confidenceFloorWindow) break;
      } else {
        lowConfidenceStreak = 0;
      }
      priorReview = review;
    }
    if (!completed && !failureClass) {
      failureClass = shots.length >= maxShots ? "budget_exceeded" : "unknown";
    }
  } catch (err) {
    // The run was opened by startRun above; without this it would stay open forever
    // because the caller never gets the emitter (or a runId) back when we throw.
    if (emitter) {
      try {
        await emitter.endRun({
          pass: false,
          score: currentScore(),
          failureClass: abort.signal.aborted ? "timeout" : failureClass ?? "unknown",
          notes: `${shots.length} shot(s); aborted by error: ${err instanceof Error ? err.message : String(err)}`
        });
      } catch {
        // Best effort only: the original error is more useful to the caller than a tracing failure.
      }
    }
    throw err;
  } finally {
    clearTimeout(wallTimer);
  }
  const score = currentScore();
  if (emitter) {
    await emitter.endRun({
      pass: completed,
      score,
      failureClass,
      notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
    });
  }
  return {
    runId: emitter?.runId ?? null,
    completed,
    shots,
    finalState: state,
    finalVerification: lastVerification,
    failureClass,
    wallMs: Date.now() - wallStart,
    score
  };
}
3338
// Base system prompt for the reviewer model; an optional addendum is appended per reviewer.
var REVIEWER_SYSTEM_PROMPT = [
  "You are a senior reviewer directing a multi-shot build loop.",
  "You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.",
  "You are blind to the worker's inner monologue. You see what it DID, not what it thought.",
  "Return STRICT JSON matching the schema. No prose outside the JSON."
].join("\n");

/**
 * Build a reviewer function backed by a JSON-returning model call.
 *
 * @param {object} cfg
 * @param {Function} cfg.callJson async ({ system, user }) => raw review object.
 * @param {Function} [cfg.renderState] Turns the current state into prompt text (default: safeJson).
 * @param {Function} [cfg.renderTraceSummary] Turns the trace summary into prompt text
 *   (default: "(none)" when undefined, otherwise safeJson).
 * @param {string} [cfg.systemPromptAddendum] Extra text appended to the system prompt.
 * @returns {Function} async (input) => validated review, where input carries
 *   goal, shot, state, traceSummary, verification and memory.
 */
function createLlmReviewer(cfg) {
  const stateToText = cfg.renderState ?? ((value) => safeJson(value));
  const traceToText = cfg.renderTraceSummary ?? ((value) => value === void 0 ? "(none)" : safeJson(value));

  let system = REVIEWER_SYSTEM_PROMPT;
  if (cfg.systemPromptAddendum) {
    system = `${REVIEWER_SYSTEM_PROMPT}\n\n${cfg.systemPromptAddendum}`;
  }

  // One prior shot rendered as a short four-line digest; long fields are cut at 400 chars.
  const describeEntry = (m) => {
    const scorePart = typeof m.verification.score === "number" ? ` score=${m.verification.score.toFixed(2)}` : "";
    const headline = `shot ${m.shot} \u2014 verification.pass=${m.verification.pass}` + scorePart + ` confidence=${m.confidence.toFixed(2)} failing=[${(m.verification.failingLayers ?? []).join(",")}]`;
    const lines = [
      headline,
      ` observations: ${m.observations.slice(0, 400)}`,
      ` diagnosis: ${m.diagnosis.slice(0, 400)}`,
      ` instruction given: ${m.nextShotInstruction.slice(0, 400)}`
    ];
    return lines.join("\n");
  };

  return async (input) => {
    let memoryBlock;
    if (input.memory.length === 0) {
      memoryBlock = "(no prior shots \u2014 this is shot 1)";
    } else {
      memoryBlock = input.memory.map((m) => describeEntry(m)).join("\n\n");
    }

    const sections = [];
    sections.push(`=== GOAL ===`, input.goal, ``);
    sections.push(`=== SHOT NUMBER ===`, String(input.shot), ``);
    sections.push(`=== CURRENT STATE ===`, stateToText(input.state), ``);
    sections.push(`=== TRACE SUMMARY ===`, traceToText(input.traceSummary), ``);
    sections.push(`=== VERIFICATION ===`, summarizeVerification(input.verification), ``);
    sections.push(`=== REVIEWER MEMORY (prior shots) ===`, memoryBlock, ``);
    sections.push(
      `=== YOUR TASK ===`,
      `Return STRICT JSON:`,
      `{`,
      ` "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)`,
      ` "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)`,
      ` "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)`,
      ` "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)`,
      ` "confidence": number in [0,1]`,
      `}`
    );
    const user = sections.join("\n");

    const raw = await cfg.callJson({ system, user });
    return coerceReview(raw);
  };
}
3388
/**
 * Validate a raw reviewer response and reduce it to the five known review fields.
 *
 * @param {*} raw Value returned by the reviewer (normally a parsed JSON object).
 * @returns {{ observations: string, diagnosis: string, nextShotInstruction: string,
 *   shouldContinue: boolean, confidence: number }} Review with confidence clamped to [0, 1].
 * @throws {Error} When `raw` is not an object, a required string is missing or empty,
 *   `shouldContinue` is not a boolean, or `confidence` is not a finite number
 *   (numeric strings such as "0.7" are accepted).
 */
function coerceReview(raw) {
  if (!raw || typeof raw !== "object") {
    throw new Error("reviewer returned non-object");
  }
  const observations = typeof raw.observations === "string" ? raw.observations : "";
  const diagnosis = typeof raw.diagnosis === "string" ? raw.diagnosis : "";
  const nextShotInstruction = typeof raw.nextShotInstruction === "string" ? raw.nextShotInstruction : "";
  if (!observations || !diagnosis || !nextShotInstruction) {
    throw new Error("reviewer missing required string fields");
  }
  if (typeof raw.shouldContinue !== "boolean") {
    throw new Error("reviewer missing shouldContinue boolean");
  }
  // Number() alone would turn null, "", [] into 0 and true into 1, silently inventing a
  // confidence the reviewer never gave. Only real numbers and non-empty numeric strings count.
  const given = raw.confidence;
  let confidenceRaw = NaN;
  if (typeof given === "number") {
    confidenceRaw = given;
  } else if (typeof given === "string" && given.trim() !== "") {
    confidenceRaw = Number(given);
  }
  if (!Number.isFinite(confidenceRaw)) {
    throw new Error("reviewer confidence not finite");
  }
  return {
    observations,
    diagnosis,
    nextShotInstruction,
    shouldContinue: raw.shouldContinue,
    confidence: Math.max(0, Math.min(1, confidenceRaw))
  };
}
3413
/**
 * Render a verification result as prompt text.
 *
 * The first line is `pass=...`, followed by ` score=` (three decimals) when a numeric score
 * is present and ` failing=[...]` when failing layers are listed. When `v.details` is
 * defined, its JSON rendering (first 1500 characters) follows on a new line.
 *
 * @param {{ pass: *, score?: number, failingLayers?: string[], details?: * }} v
 * @returns {string}
 */
function summarizeVerification(v) {
  const pieces = [`pass=${v.pass}`];
  if (typeof v.score === "number") {
    pieces.push(` score=${v.score.toFixed(3)}`);
  }
  if (v.failingLayers && v.failingLayers.length > 0) {
    pieces.push(` failing=[${v.failingLayers.join(", ")}]`);
  }
  let summary = pieces.join("");
  if (v.details !== void 0) {
    summary += `\n${safeJson(v.details).slice(0, 1500)}`;
  }
  return summary;
}
3419
/**
 * Pretty-print a value as JSON, always returning a string.
 *
 * Falls back to `String(x)` when `JSON.stringify` throws (cycles, BigInt) and also when it
 * returns `undefined` (undefined, functions, symbols), so callers can safely use string
 * methods such as `.slice` on the result.
 *
 * @param {*} x Value to render.
 * @returns {string}
 */
function safeJson(x) {
  try {
    const text = JSON.stringify(x, null, 2);
    if (typeof text === "string") return text;
  } catch {
    // Not serializable; use the plain string form below.
  }
  return String(x);
}
3426
+
3427
+ // src/trace/schema.ts
3428
// Version string of the trace schema.
var TRACE_SCHEMA_VERSION = "1.0.0";

// Known failure-class labels. "success" comes first and "unknown" last; element order is
// part of the exported value and is kept as-is.
var FAILURE_CLASSES = [
  "success",
  "reasoning_error",
  "tool_selection_error", "tool_argument_error", "tool_recovery_failure",
  "hallucination",
  "instruction_following",
  "safety_refusal_miss",
  "policy_violation",
  "budget_exceeded",
  "format_drift",
  "permission_escalation",
  "pii_leak",
  "cost_overrun",
  "timeout",
  "sandbox_failure",
  "unknown"
];
3448
/** True when the span's `kind` is "llm". */
function isLlmSpan(s) {
  const { kind } = s;
  return kind === "llm";
}

/** True when the span's `kind` is "tool". */
function isToolSpan(s) {
  const { kind } = s;
  return kind === "tool";
}

/** True when the span's `kind` is "retrieval". */
function isRetrievalSpan(s) {
  const { kind } = s;
  return kind === "retrieval";
}

/** True when the span's `kind` is "judge". */
function isJudgeSpan(s) {
  const { kind } = s;
  return kind === "judge";
}

/** True when the span's `kind` is "sandbox". */
function isSandboxSpan(s) {
  const { kind } = s;
  return kind === "sandbox";
}
3463
+
3464
+ // src/trace/query.ts
3465
/**
 * List the runs recorded for one scenario.
 * @param {object} store Trace store exposing `listRuns(filter)`.
 * @param {string} scenarioId
 * @returns {Promise<*>} Whatever `store.listRuns({ scenarioId })` yields.
 */
async function runsForScenario(store, scenarioId) {
  const filter = { scenarioId };
  return store.listRuns(filter);
}

/**
 * Fetch the LLM spans of a run.
 * @param {object} store Trace store exposing `spans(filter)`.
 * @param {string} runId
 * @returns {Promise<object[]>} Spans from the store that pass `isLlmSpan`.
 */
async function llmSpans(store, runId) {
  const found = await store.spans({ runId, kind: "llm" });
  return found.filter((span) => isLlmSpan(span));
}

/**
 * Fetch the tool spans of a run, optionally restricted to one tool name.
 * @param {object} store Trace store exposing `spans(filter)`.
 * @param {string} runId
 * @param {string} [toolName] Passed through to the store filter as-is.
 * @returns {Promise<object[]>} Spans from the store that pass `isToolSpan`.
 */
async function toolSpans(store, runId, toolName) {
  const found = await store.spans({ runId, kind: "tool", toolName });
  return found.filter((span) => isToolSpan(span));
}

/**
 * Fetch the judge spans of a run.
 * @param {object} store Trace store exposing `spans(filter)`.
 * @param {string} runId
 * @returns {Promise<object[]>} Spans from the store that pass `isJudgeSpan`.
 */
async function judgeSpans(store, runId) {
  const found = await store.spans({ runId, kind: "judge" });
  return found.filter((span) => isJudgeSpan(span));
}
3480
/**
 * Bucket items by a computed key, keeping first-seen key order and item order.
 *
 * @param {Iterable<*>} items Items to group.
 * @param {Function} key Called once per item; its return value is the Map key.
 * @returns {Map<*, Array>} Map from key to the items that produced it.
 */
function groupBy(items, key) {
  const buckets = /* @__PURE__ */ new Map();
  for (const item of items) {
    const k = key(item);
    const existing = buckets.get(k);
    if (existing) {
      existing.push(item);
    } else {
      buckets.set(k, [item]);
    }
  }
  return buckets;
}
3493
/**
 * Produce a key for tool arguments that does not depend on object key order.
 * @param {*} args
 * @returns {string|undefined} The result of `stableStringify(args)`.
 */
function argHash(args) {
  return stableStringify(args);
}

/**
 * Serialize a value with object keys sorted at every nesting level.
 *
 * Primitives and null go through `JSON.stringify`; arrays and objects are assembled by
 * hand from their recursively serialized members.
 *
 * @param {*} value
 * @returns {string|undefined}
 */
function stableStringify(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    const items = value.map((element) => stableStringify(element));
    return `[${items.join(",")}]`;
  }
  const parts = [];
  for (const k of Object.keys(value).sort()) {
    parts.push(`${JSON.stringify(k)}:${stableStringify(value[k])}`);
  }
  return `{${parts.join(",")}}`;
}
3503
/**
 * Sum token counts and cost over a list of LLM spans.
 *
 * @param {Array<{ inputTokens?: number, outputTokens?: number, cachedTokens?: number, costUsd?: number }>} spans
 * @returns {{ inputTokens: number, outputTokens: number, cachedTokens: number, costUsd: number }}
 *   Totals; a field missing (null/undefined) on a span counts as 0.
 */
function aggregateLlm(spans) {
  let totals = { inputTokens: 0, outputTokens: 0, cachedTokens: 0, costUsd: 0 };
  for (const span of spans) {
    totals = {
      inputTokens: totals.inputTokens + (span.inputTokens ?? 0),
      outputTokens: totals.outputTokens + (span.outputTokens ?? 0),
      cachedTokens: totals.cachedTokens + (span.cachedTokens ?? 0),
      costUsd: totals.costUsd + (span.costUsd ?? 0)
    };
  }
  return totals;
}
3514
+ function runFailureClass(run) {
3515
+ if (run.outcome?.failureClass) return run.outcome.failureClass;
2652
3516
  if (run.status === "completed" && run.outcome?.pass !== false) return "success";
2653
3517
  if (run.status === "aborted") return "budget_exceeded";
2654
3518
  return "unknown";
@@ -2825,181 +3689,6 @@ function runToTraceId(run) {
2825
3689
  return cleaned.slice(0, 32).padEnd(32, "0");
2826
3690
  }
2827
3691
 
2828
- // src/sandbox-harness.ts
2829
/**
 * Test-output parser for the Vitest summary line, e.g. `Tests  1 failed | 5 passed (6)`.
 * `parse(stdout)` returns `{ testsTotal, testsPassed }` or `undefined` when no summary matches.
 * Only "passed" and "failed" counts are read; the total is their sum.
 */
var vitestTestParser = {
  id: "vitest",
  parse(stdout) {
    const m = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
    if (!m) return void 0;
    const tally = { passed: 0, failed: 0 };
    // The summary holds one or two "<count> <label>" pairs; the second pair is optional.
    const pairs = [
      [m[1], m[2]],
      [m[3], m[4]]
    ];
    for (const [count, label] of pairs) {
      if (count && label) {
        tally[label.toLowerCase() === "passed" ? "passed" : "failed"] += parseInt(count, 10);
      }
    }
    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
  }
};
2848
/**
 * Test-output parser for pytest: the total comes from `collected N item(s)` and the passed
 * count from the first `N passed`. `parse(stdout)` returns `undefined` unless both are found.
 */
var pytestTestParser = {
  id: "pytest",
  parse(stdout) {
    const collected = stdout.match(/collected\s+(\d+)\s+items?/i);
    const succeeded = stdout.match(/(\d+)\s+passed/);
    if (collected && succeeded) {
      const testsTotal = parseInt(collected[1], 10);
      const testsPassed = parseInt(succeeded[1], 10);
      return { testsTotal, testsPassed };
    }
    return void 0;
  }
};
2857
/**
 * Test-output parser for the Jest summary line, e.g.
 * `Tests:       1 failed, 1 skipped, 2 passed, 4 total`.
 *
 * `parse(stdout)` reads the total and the passed count from the first `Tests:` line that
 * carries a total. Other counts (failed, skipped, todo, ...) may appear in any combination,
 * and a summary without a `passed` count yields `testsPassed: 0`.
 * Returns `undefined` when no such line is present.
 */
var jestTestParser = {
  id: "jest",
  parse(stdout) {
    // Capture everything between "Tests:" and the total so the passed count can be looked
    // up independently of which other counters Jest printed.
    const summary = stdout.match(/Tests:([^\n]*?)(\d+)\s+total/i);
    if (!summary) return void 0;
    const passed = summary[1].match(/(\d+)\s+passed/i);
    return {
      testsTotal: parseInt(summary[2], 10),
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
2865
/**
 * Combine several test-output parsers into one that returns the first truthy result.
 *
 * @param {...{ id: string, parse: Function }} parsers Tried in the order given.
 * @returns {{ id: string, parse: Function }} Parser whose id is the member ids joined by "|";
 *   its `parse(stdout, stderr, exitCode)` yields `undefined` when no member matches.
 */
function composeParsers(...parsers) {
  const id = parsers.map((p) => p.id).join("|");
  return {
    id,
    parse(stdout, stderr, exitCode) {
      for (const candidate of parsers) {
        const outcome = candidate.parse(stdout, stderr, exitCode);
        if (outcome) {
          return outcome;
        }
      }
      return void 0;
    }
  };
}
2877
/**
 * Sandbox driver that runs each phase command as a local shell subprocess.
 */
var SubprocessSandboxDriver = class {
  id = "subprocess";

  /**
   * Run one command through the shell and capture its output.
   *
   * @param {string} phase Phase label copied into the result ("test" enables `config.testParser`).
   * @param {string} command Shell command line.
   * @param {object} config Reads `cwd`, `env` (merged over process.env), `timeoutMs`
   *   (default 10 minutes; the child is sent SIGKILL when it elapses) and `testParser`.
   * @returns {Promise<object>} Always resolves: `{ phase, exitCode, stdout, stderr, wallMs,
   *   testsTotal, testsPassed }` on close (exitCode 1 when the child reports none), or
   *   `{ phase, exitCode: 127, stdout, stderr, wallMs }` when spawning fails.
   */
  async exec(phase, command, config) {
    const childProcess = await import("child_process");
    const startedAt = Date.now();
    return await new Promise((resolve) => {
      const child = childProcess.spawn(command, {
        shell: true,
        cwd: config.cwd,
        env: { ...process.env, ...config.env ?? {} }
      });
      const captured = { stdout: "", stderr: "" };
      for (const channel of ["stdout", "stderr"]) {
        child[channel]?.on("data", (chunk) => {
          captured[channel] += String(chunk);
        });
      }
      const killTimer = setTimeout(() => {
        try {
          child.kill("SIGKILL");
        } catch {
          // Ignored, as in the original: nothing is done if kill() itself throws.
        }
      }, config.timeoutMs ?? 10 * 6e4);
      child.on("close", (code) => {
        clearTimeout(killTimer);
        const wallMs = Date.now() - startedAt;
        let parsed = void 0;
        if (phase === "test" && config.testParser) {
          parsed = config.testParser.parse(captured.stdout, captured.stderr, code ?? 1);
        }
        resolve({
          phase,
          exitCode: code ?? 1,
          stdout: captured.stdout,
          stderr: captured.stderr,
          wallMs,
          testsTotal: parsed?.testsTotal,
          testsPassed: parsed?.testsPassed
        });
      });
      child.on("error", (err) => {
        clearTimeout(killTimer);
        const wallMs = Date.now() - startedAt;
        resolve({
          phase,
          exitCode: 127,
          stdout: captured.stdout,
          stderr: captured.stderr + String(err),
          wallMs
        });
      });
    });
  }
};
2924
/**
 * Sandbox driver that wraps each command in `docker run --rm <image> sh -c <command>` and
 * delegates execution to a SubprocessSandboxDriver.
 */
var DockerSandboxDriver = class {
  id = "docker";

  /**
   * @param {string} phase Phase label forwarded to the subprocess driver.
   * @param {string} command Command to run inside the container.
   * @param {object} config Must carry `image`; `env` entries become `-e KEY=VALUE` flags
   *   and are not forwarded to the host subprocess environment.
   * @returns {Promise<object>} Result of the subprocess driver's `exec`.
   * @throws {Error} When `config.image` is missing.
   */
  async exec(phase, command, config) {
    if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
    const sub = new SubprocessSandboxDriver();
    const envFlags = [];
    for (const [k, v] of Object.entries(config.env ?? {})) {
      envFlags.push(`-e ${shellQuote(k)}=${shellQuote(v)}`);
    }
    const envArgs = envFlags.join(" ");
    const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
    const hostConfig = { ...config, env: void 0 };
    return sub.exec(phase, wrapped, hostConfig);
  }
};
2934
/**
 * Quote a value for use as a single POSIX shell word.
 *
 * Values made only of letters, digits and `_ - / . @ : =` are returned unchanged; anything
 * else (including the empty string) is wrapped in single quotes, with embedded single
 * quotes written as `'\''`.
 *
 * @param {string} v
 * @returns {string}
 */
function shellQuote(v) {
  const needsNoQuoting = /^[A-Za-z0-9_\-\/\.@:=]+$/.test(v);
  if (needsNoQuoting) {
    return v;
  }
  const escaped = v.replace(/'/g, `'\\''`);
  return `'${escaped}'`;
}
2938
/**
 * Runs a scenario's setup / run / test commands through a sandbox driver and records the
 * outcome on a sandbox trace span.
 */
var SandboxHarness = class {
  driver;

  /**
   * @param {object} [driver] Driver exposing `id` and `exec(phase, command, config)`;
   *   defaults to a SubprocessSandboxDriver.
   */
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }

  /**
   * Execute the configured phases in order: setup, run, test.
   *
   * A non-zero exit in setup or run fails the span and returns early with `passed: false`.
   * With a test command, `passed` follows the test exit code and `score` is
   * testsPassed / testsTotal when a positive total was parsed, otherwise 1 or 0.
   * Without a test command the result is `passed: true, score: 1`.
   *
   * @param {object} config Reads `setupCommand`, `runCommand`, `testCommand`, `image`;
   *   passed through to the driver unchanged.
   * @param {object} emitter Trace emitter exposing `sandbox(info)` returning a span handle.
   * @returns {Promise<object>} `{ passed, totalWallMs, score, setup?, run?, test? }`
   * @throws Rethrows driver/handle errors after failing the span.
   */
  async run(config, emitter) {
    const phaseCommands = [config.setupCommand, config.runCommand, config.testCommand];
    const handle = await emitter.sandbox({
      name: `sandbox(${this.driver.id})`,
      image: config.image,
      command: phaseCommands.filter(Boolean).join(" && ")
    });
    const result = { passed: false, totalWallMs: 0, score: 0 };
    try {
      // setup and run share one rule: a non-zero exit aborts the whole harness run.
      for (const phase of ["setup", "run"]) {
        const command = phase === "setup" ? config.setupCommand : config.runCommand;
        if (!command) continue;
        const outcome = await this.driver.exec(phase, command, config);
        result[phase] = outcome;
        result.totalWallMs += outcome.wallMs;
        if (outcome.exitCode !== 0) {
          await handle.fail(`${phase} failed (exit ${outcome.exitCode})`, {
            exitCode: outcome.exitCode,
            wallMs: result.totalWallMs
          });
          return result;
        }
      }
      if (!config.testCommand) {
        result.passed = true;
        result.score = 1;
        await handle.end({ wallMs: result.totalWallMs });
        return result;
      }
      const test = await this.driver.exec("test", config.testCommand, config);
      result.test = test;
      result.totalWallMs += test.wallMs;
      const passed = test.exitCode === 0;
      result.passed = passed;
      const hasCounts = test.testsTotal !== void 0 && test.testsTotal > 0;
      if (hasCounts) {
        result.score = (test.testsPassed ?? 0) / test.testsTotal;
      } else {
        result.score = passed ? 1 : 0;
      }
      await handle.end({
        exitCode: test.exitCode,
        testsTotal: test.testsTotal,
        testsPassed: test.testsPassed,
        wallMs: result.totalWallMs,
        status: passed ? "ok" : "error"
      });
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
    return result;
  }
};
3002
-
3003
3692
  // src/test-graded-scenario.ts
3004
3693
  async function runTestGradedScenario(scenario, store, options = {}) {
3005
3694
  const emitter = new TraceEmitter(store);
@@ -5284,8 +5973,11 @@ async function scoreProject(store, projectId) {
5284
5973
  const runtimeScore = runtimeScores.length > 0 ? runtimeScores.reduce((a, b) => a + b, 0) / runtimeScores.length : null;
5285
5974
  const runtimePassed = runtime.filter((r) => r.outcome?.pass === true).length;
5286
5975
  const runtimePassRate = runtime.length > 0 ? runtimePassed / runtime.length : null;
5976
+ const kind = runtime.length === 0 ? "scaffold-only" : "full";
5977
+ const complete = kind === "scaffold-only" ? metaScore !== null && buildScore !== null : metaScore !== null && buildScore !== null && runtimeScore !== null;
5287
5978
  return {
5288
5979
  projectId,
5980
+ kind,
5289
5981
  builderRunId: builder?.runId,
5290
5982
  metaScore,
5291
5983
  buildRunId: build?.runId,
@@ -5293,7 +5985,7 @@ async function scoreProject(store, projectId) {
5293
5985
  appRuntimeRunIds: runtime.map((r) => r.runId),
5294
5986
  runtimeScore,
5295
5987
  runtimePassRate,
5296
- complete: metaScore !== null && buildScore !== null && runtimeScore !== null
5988
+ complete
5297
5989
  };
5298
5990
  }
5299
5991
  async function scoreAllProjects(store) {
@@ -6987,6 +7679,7 @@ async function euAiActReport(ctx, signals) {
6987
7679
  }
6988
7680
  export {
6989
7681
  AgentDriver,
7682
+ AxGepaSteeringOptimizer,
6990
7683
  BenchmarkRunner,
6991
7684
  BudgetBreachError,
6992
7685
  BudgetGuard,
@@ -6998,6 +7691,7 @@ export {
6998
7691
  DEFAULT_MUTATORS,
6999
7692
  DEFAULT_REDACTION_RULES,
7000
7693
  DEFAULT_RED_TEAM_CORPUS,
7694
+ DEFAULT_RUN_SCORE_WEIGHTS,
7001
7695
  Dataset,
7002
7696
  DockerSandboxDriver,
7003
7697
  DualAgentBench,
@@ -7011,15 +7705,19 @@ export {
7011
7705
  InMemoryOutcomeStore,
7012
7706
  InMemoryTraceStore,
7013
7707
  InMemoryWorkspaceInspector,
7708
+ JudgeRunner,
7014
7709
  MODEL_PRICING,
7015
7710
  MetricsCollector,
7016
7711
  OTEL_AGENT_EVAL_SCOPE,
7712
+ OptimizationLoop,
7713
+ PairwiseSteeringOptimizer,
7017
7714
  PrmGrader,
7018
7715
  ProductClient,
7019
7716
  ProjectRegistry,
7020
7717
  PromptOptimizer,
7021
7718
  PromptRegistry,
7022
7719
  REDACTION_VERSION,
7720
+ RunCritic,
7023
7721
  SandboxHarness,
7024
7722
  ScenarioRegistry,
7025
7723
  SubprocessSandboxDriver,
@@ -7028,6 +7726,7 @@ export {
7028
7726
  TraceEmitter,
7029
7727
  adversarialJudge,
7030
7728
  aggregateLlm,
7729
+ aggregateRunScore,
7031
7730
  analyzeAntiSlop,
7032
7731
  analyzeSeries,
7033
7732
  argHash,
@@ -7044,6 +7743,7 @@ export {
7044
7743
  causalAttribution,
7045
7744
  checkCanaries,
7046
7745
  checkSlos,
7746
+ clamp01,
7047
7747
  classifyEuAiRisk,
7048
7748
  classifyFailure,
7049
7749
  codeExecutionJudge,
@@ -7052,6 +7752,7 @@ export {
7052
7752
  collectionPreserved,
7053
7753
  commitBisect,
7054
7754
  compareToBaseline,
7755
+ compilerJudge,
7055
7756
  composeParsers,
7056
7757
  composeValidators,
7057
7758
  computeToolUseMetrics,
@@ -7062,8 +7763,10 @@ export {
7062
7763
  createAntiSlopJudge,
7063
7764
  createCustomJudge,
7064
7765
  createDomainExpertJudge,
7766
+ createLlmReviewer,
7065
7767
  crossTraceDiff,
7066
7768
  defaultJudges,
7769
+ distillPlaybook,
7067
7770
  dominates,
7068
7771
  estimateCost,
7069
7772
  estimateTokens,
@@ -7085,6 +7788,7 @@ export {
7085
7788
  groupBy,
7086
7789
  hashContent,
7087
7790
  hashScenarios,
7791
+ inMemoryReviewStore,
7088
7792
  interRaterReliability,
7089
7793
  iqr,
7090
7794
  isJudgeSpan,
@@ -7096,14 +7800,17 @@ export {
7096
7800
  jestTestParser,
7097
7801
  jsonHasKeys,
7098
7802
  jsonShape,
7803
+ jsonlReviewStore,
7099
7804
  judgeAgreementView,
7100
7805
  judgeSpans,
7101
7806
  keyPreserved,
7807
+ linterJudge,
7102
7808
  llmSpanFromProvider,
7103
7809
  llmSpans,
7104
7810
  loadScorerFromGrader,
7105
7811
  lowercaseMutator,
7106
7812
  mannWhitneyU,
7813
+ mergeSteeringBundle,
7107
7814
  nistAiRmfReport,
7108
7815
  nonRefusalRubric,
7109
7816
  normalizeScores,
@@ -7131,6 +7838,8 @@ export {
7131
7838
  regressionView,
7132
7839
  renderMarkdown,
7133
7840
  renderMarkdownReport,
7841
+ renderPlaybookMarkdown,
7842
+ renderSteeringText,
7134
7843
  replayScorerOverCorpus,
7135
7844
  replayTraceThroughJudge,
7136
7845
  requiredSampleSize,
@@ -7142,6 +7851,8 @@ export {
7142
7851
  runE2EWorkflow,
7143
7852
  runExpectations,
7144
7853
  runFailureClass,
7854
+ runJudgeFleet,
7855
+ runProposeReview,
7145
7856
  runSelfPlay,
7146
7857
  runTestGradedScenario,
7147
7858
  runsForScenario,
@@ -7149,6 +7860,7 @@ export {
7149
7860
  scoreContinuity,
7150
7861
  scoreProject,
7151
7862
  scoreRedTeamOutput,
7863
+ securityJudge,
7152
7864
  selfPreference,
7153
7865
  sentenceReorderMutator,
7154
7866
  signManifest,
@@ -7156,6 +7868,7 @@ export {
7156
7868
  statusAdvanced,
7157
7869
  stuckLoopView,
7158
7870
  summarize,
7871
+ testJudge,
7159
7872
  textInSnapshot,
7160
7873
  toLangfuseEnvelope,
7161
7874
  toNdjson,