@tangle-network/agent-eval 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean3 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean4 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean3,
428
+ mean: mean4,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
516
+ const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
520
- const t = mean3 / se;
519
+ if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
520
+ const t = mean4 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean3 = n * (n + 1) / 4;
544
+ const mean4 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean3) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean4) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -2094,113 +2094,500 @@ function flatSamples(score) {
2094
2094
  return out;
2095
2095
  }
2096
2096
 
2097
- // src/dual-agent-bench.ts
2098
- var DualAgentBench = class {
2099
- async run(config) {
2100
- const maxRounds = config.maxRounds ?? 5;
2101
- const threshold = config.convergenceThreshold ?? 0.85;
2102
- if (config.scenarios.length === 0) {
2103
- throw new Error("DualAgentBench requires at least 1 scenario");
2097
+ // src/steering.ts
2098
+ function mergeSteeringBundle(base, delta) {
2099
+ return {
2100
+ ...base,
2101
+ ...delta.coderPrompt !== void 0 ? { coderPrompt: delta.coderPrompt } : {},
2102
+ ...delta.continuePrompt !== void 0 ? { continuePrompt: delta.continuePrompt } : {},
2103
+ reviewerPrompts: {
2104
+ ...base.reviewerPrompts ?? {},
2105
+ ...delta.reviewerPrompts ?? {}
2106
+ },
2107
+ skills: delta.skills ?? base.skills,
2108
+ rolePrompts: {
2109
+ ...base.rolePrompts ?? {},
2110
+ ...delta.rolePrompts ?? {}
2111
+ },
2112
+ metadata: {
2113
+ ...base.metadata ?? {},
2114
+ ...delta.metadata ?? {}
2104
2115
  }
2105
- const results = [];
2106
- for (const scenario of config.scenarios) {
2107
- const history = [];
2108
- let converged = false;
2109
- let roundsToConverge = null;
2110
- let finalProposal = "";
2111
- let lastScore = 0;
2112
- let priorCritique;
2113
- for (let r = 0; r < maxRounds; r++) {
2114
- const priorProposal = history[history.length - 1]?.proposal;
2115
- const proposal = await config.propose({
2116
- scenario,
2117
- roundIndex: r,
2118
- priorProposal,
2119
- priorCritique
2120
- });
2121
- const { critique, convergenceScore } = await config.critique({
2122
- scenario,
2123
- roundIndex: r,
2124
- proposal
2125
- });
2126
- if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
2127
- throw new Error(
2128
- `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
2129
- );
2130
- }
2131
- const round = {
2132
- roundIndex: r,
2133
- proposal,
2134
- critique,
2135
- convergenceScore
2136
- };
2137
- history.push(round);
2138
- config.onRoundComplete?.({ scenarioId: scenario.id, round });
2139
- finalProposal = proposal;
2140
- lastScore = convergenceScore;
2141
- priorCritique = critique;
2142
- if (convergenceScore >= threshold) {
2143
- converged = true;
2144
- roundsToConverge = r + 1;
2145
- break;
2146
- }
2116
+ };
2117
+ }
2118
/**
 * Renders a steering bundle as newline-separated `key:value` text.
 *
 * Output order: bundle id, coder prompt, continue prompt, reviewer prompts
 * (sorted by reviewer name via localeCompare), then a comma-joined sorted
 * skills list. Falsy prompts and an empty skills list are omitted.
 *
 * @param {object} bundle - Steering bundle; only `id`, `coderPrompt`,
 *   `continuePrompt`, `reviewerPrompts` and `skills` are read.
 * @returns {string} The rendered text.
 */
function renderSteeringText(bundle) {
  const reviewerLines = Object.entries(bundle.reviewerPrompts ?? {})
    .sort((left, right) => left[0].localeCompare(right[0]))
    .map(([reviewer, text]) => `reviewer:${reviewer}:${text}`);
  // Copy before sorting so the caller's skills array is never reordered.
  const orderedSkills = [...(bundle.skills ?? [])].sort();

  const parts = [`bundle:${bundle.id}`];
  if (bundle.coderPrompt) {
    parts.push(`coder:${bundle.coderPrompt}`);
  }
  if (bundle.continuePrompt) {
    parts.push(`continue:${bundle.continuePrompt}`);
  }
  parts.push(...reviewerLines);
  if (orderedSkills.length > 0) {
    parts.push(`skills:${orderedSkills.join(",")}`);
  }
  return parts.join("\n");
}
2128
+
2129
+ // src/run-score.ts
2130
/**
 * Default weight per run-score dimension. Negative weights are penalties.
 */
var DEFAULT_RUN_SCORE_WEIGHTS = {
  success: 4,
  goalProgress: 2,
  repoGroundedness: 1.5,
  driftPenalty: -1.5,
  toolUseQuality: 1,
  patchQuality: 1.25,
  testReality: 1.5,
  finalGate: 3,
  reviewerBlockers: -2,
  costUsd: -0.2,
  wallSeconds: -0.1
};

/**
 * Collapses a run score into a single weighted number.
 *
 * The nine quality dimensions are clamped to [0, 1]; `costUsd` is used as-is
 * (floored at 0) and `wallSeconds` is converted to minutes (floored at 0).
 * Missing or non-finite cost/wall values contribute 0 instead of turning the
 * whole aggregate into NaN, matching how `clamp01` treats the other fields.
 *
 * @param {object} score - Run score with the dimensions named in
 *   DEFAULT_RUN_SCORE_WEIGHTS.
 * @param {object} [weights] - Partial overrides; entries that are not finite
 *   numbers (e.g. `undefined`) fall back to the default weight.
 * @returns {number} Weighted sum.
 */
function aggregateRunScore(score, weights = {}) {
  const w = {};
  for (const [key, fallback] of Object.entries(DEFAULT_RUN_SCORE_WEIGHTS)) {
    const override = weights?.[key];
    // An explicit `undefined` override must not erase the default weight.
    w[key] = Number.isFinite(override) ? override : fallback;
  }
  // Term order is kept stable so floating-point results do not shift.
  return w.success * clamp01(score.success)
    + w.goalProgress * clamp01(score.goalProgress)
    + w.repoGroundedness * clamp01(score.repoGroundedness)
    + w.driftPenalty * clamp01(score.driftPenalty)
    + w.toolUseQuality * clamp01(score.toolUseQuality)
    + w.patchQuality * clamp01(score.patchQuality)
    + w.testReality * clamp01(score.testReality)
    + w.finalGate * clamp01(score.finalGate)
    + w.reviewerBlockers * clamp01(score.reviewerBlockers)
    + w.costUsd * nonNegativeOrZero(score.costUsd)
    + w.wallSeconds * nonNegativeOrZero(score.wallSeconds / 60);
}

/**
 * Clamps a value to [0, 1]; anything that is not a finite number becomes 0.
 *
 * @param {number} value
 * @returns {number}
 */
function clamp01(value) {
  if (!Number.isFinite(value)) return 0;
  return Math.max(0, Math.min(1, value));
}

/**
 * Floors a value at 0; anything that is not a finite number becomes 0.
 *
 * @param {number} value
 * @returns {number}
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
2151
+
2152
+ // src/run-critic.ts
2153
/**
 * Patterns whose match in LLM span output or in a serialized event payload is
 * counted as one drift signal by RunCritic.
 */
var DEFAULT_DRIFT_PATTERNS = [
  /https?:\/\//i,
  /\btitle:\s/i,
  /\bsummary:\s/i,
  /\burl:\s/i,
  /\bnpm package usage\b/i,
  /\bnews\b/i
];

/**
 * Derives a run score from a recorded trace (run, spans, events, artifacts,
 * budget) using simple evidence counts and ratios.
 *
 * Options: `weights` (forwarded to aggregateRunScore by `rank`) and
 * `driftPatterns` (defaults to DEFAULT_DRIFT_PATTERNS).
 */
var RunCritic = class {
  weights;
  driftPatterns;
  constructor(options = {}) {
    this.weights = options.weights;
    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
  }

  /**
   * Loads the trace for `runId` from `store` and scores it.
   *
   * @param {object} store - Must provide getRun, spans, events, artifacts, budget.
   * @param {string} runId
   * @returns {Promise<object>} The score produced by scoreTrace.
   * @throws {Error} When the store has no run with that id.
   */
  async score(store, runId) {
    const run = await store.getRun(runId);
    if (!run) throw new Error(`run ${runId} not found`);
    const [spans, events, artifacts, budget] = await Promise.all([
      store.spans({ runId }),
      store.events({ runId }),
      store.artifacts(runId),
      store.budget(runId)
    ]);
    return this.scoreTrace({ run, spans, events, artifacts, budget });
  }

  /**
   * Computes every score dimension plus human-readable notes for one trace.
   *
   * @param {object} trace - `{ run, spans, events, artifacts, budget }`.
   * @returns {object} Score fields and a `notes` string array.
   */
  scoreTrace(trace) {
    const notes = [];
    const llmSpans2 = trace.spans.filter((s) => s.kind === "llm");
    const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
    const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
    const finalGateSpans = judgeSpans2.filter(
      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
    );

    // 1 = explicit pass, 0.5 = completed without an explicit pass, 0 = otherwise.
    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
    // Also note the 0.5 case: such a run completed, but not with pass=true.
    if (success < 1) notes.push("run did not complete with pass=true");

    const judgeAverage = judgeSpans2.length
      ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length
      : void 0;
    // Outcome scores above 1 are read as percentages.
    const outcomeScore = typeof trace.run.outcome?.score === "number"
      ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score)
      : void 0;
    const goalProgress = outcomeScore ?? judgeAverage ?? success;

    const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
    const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
    if (toolSpans2.length === 0) notes.push("no tool spans recorded");

    const patchEvidence = trace.artifacts.length
      + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
    if (!patchQuality) notes.push("no artifact or edit evidence recorded");

    const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
    // Without sandbox test counts, a tool call whose args mention a test or
    // build command earns partial credit (0.4).
    // NOTE(review): `\b` binds only to the first and last alternatives of this
    // pattern; left unchanged to preserve matching behavior.
    const testReality = sandboxTests.length
      ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length
      : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
    if (!testReality) notes.push("no real test/build evidence recorded");

    const blockerSpans = judgeSpans2.filter((span) => isBlockingJudge(span));
    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
    // With no final-gate judgment, the gate falls back to the success value.
    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
    if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
    const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);

    const positiveGroundingSignals = patchEvidence + sandboxSpans.length
      + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
    const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length
      + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
    const totalSignals = positiveGroundingSignals + driftSignals;
    const repoGroundedness = totalSignals === 0 ? 0 : positiveGroundingSignals / totalSignals;
    const driftPenalty = totalSignals === 0 ? 0 : driftSignals / totalSignals;
    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);

    // Prefer the highest consumed "usd" budget entry. If the budget has no
    // "usd" entries (empty, or other dimensions only), sum LLM span costs
    // rather than reporting a cost of 0.
    const usdConsumed = trace.budget
      .filter((entry) => entry.dimension === "usd")
      .map((entry) => entry.consumed);
    const costUsd = usdConsumed.length
      ? Math.max(...usdConsumed, 0)
      : llmSpans2.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
    // Timestamps are divided by 1e3, i.e. treated as milliseconds.
    const wallSeconds = trace.run.endedAt && trace.run.startedAt
      ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3)
      : 0;

    return {
      success,
      goalProgress,
      repoGroundedness,
      driftPenalty,
      toolUseQuality,
      patchQuality,
      testReality,
      finalGate,
      reviewerBlockers,
      costUsd,
      wallSeconds,
      notes
    };
  }

  /**
   * Reduces a score to one number using this critic's weights.
   *
   * @param {object} score
   * @returns {number}
   */
  rank(score) {
    return aggregateRunScore(score, this.weights);
  }

  /**
   * Reports whether any drift pattern matches `text`.
   *
   * @param {string} text
   * @returns {boolean}
   */
  isDrift(text) {
    return this.driftPatterns.some((pattern) => {
      // Global/sticky regexes remember lastIndex between test() calls; reset
      // it so caller-supplied /g patterns give the same answer every time.
      if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
      return pattern.test(text);
    });
  }
};
2240
/**
 * Maps a judge score onto [0, 1].
 *
 * Scores greater than 1 are divided by 10 first; the result is then clamped
 * to [0, 1], and anything that is not a finite number becomes 0.
 *
 * @param {number} score
 * @returns {number}
 */
function normalizeJudgeScore(score) {
  const scaled = score > 1 ? score / 10 : score;
  if (!Number.isFinite(scaled)) {
    return 0;
  }
  return Math.max(0, Math.min(1, scaled));
}
2243
/**
 * Heuristic check for repository-related wording in a piece of text:
 * source/test directory fragments, well-known config file names, TypeScript
 * extensions, or common package-manager / test-runner commands.
 *
 * @param {string} text
 * @returns {boolean} True when the pattern matches anywhere (case-insensitive).
 */
function looksRepoGrounded(text) {
  const repoHint = /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i;
  return repoHint.test(text);
}
2246
/**
 * Decides whether a judge span counts as a blocking reviewer signal.
 *
 * A span blocks when its attributes say so (`blocking === true`,
 * `verdict === "BLOCKING"`, or a positive `blockingFindings` /
 * `highFindings` count), or when its score is low.
 *
 * The score check uses the same scale convention as judge-score
 * normalization elsewhere in this file: values above 1 are on a 0..10 scale,
 * values at or below 1 are already 0..1. Comparing the raw score against 2
 * would mark every 0..1-scale score, even a perfect one, as blocking.
 *
 * @param {object} span - Judge span; reads `attributes` and `score`.
 * @returns {boolean}
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;

  const raw = span.score;
  // A missing or NaN score carries no signal either way.
  if (typeof raw !== "number" || Number.isNaN(raw)) return false;
  const normalized = raw > 1 ? raw / 10 : raw;
  return normalized <= 0.2;
}

/**
 * True only for values of type number that are greater than zero.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function positiveNumber(value) {
  return typeof value === "number" && value > 0;
}
2252
+
2253
+ // src/playbook.ts
2254
/**
 * De-duplicates playbook entries and keeps the highest-weighted ones.
 *
 * Entries are grouped by their normalized instruction text; within a group
 * the entry with the strictly greatest weight wins (missing weight counts as
 * 0, earlier entry wins ties). Survivors get a canonicalized instruction, are
 * sorted by descending weight and truncated to `options.maxEntries`
 * (default 12).
 *
 * @param {Iterable<object>} entries
 * @param {{ maxEntries?: number }} [options]
 * @returns {{ entries: object[] }}
 */
function distillPlaybook(entries, options = {}) {
  const limit = options.maxEntries ?? 12;
  const weightOf = (item) => item.weight ?? 0;
  const strongest = /* @__PURE__ */ new Map();
  for (const candidate of entries) {
    const dedupeKey = normalizeInstruction(candidate.instruction);
    const incumbent = strongest.get(dedupeKey);
    const replaces = !incumbent || weightOf(candidate) > weightOf(incumbent);
    if (replaces) {
      strongest.set(dedupeKey, {
        ...candidate,
        instruction: canonicalInstruction(candidate.instruction)
      });
    }
  }
  const ranked = Array.from(strongest.values());
  ranked.sort((first, second) => weightOf(second) - weightOf(first));
  return { entries: ranked.slice(0, limit) };
}
2267
/**
 * Renders a playbook as a Markdown bullet list under a "# Playbook" heading.
 *
 * Each entry becomes a bullet with its instruction, a rationale line, and
 * optional category / evidence / source-run lines, followed by a blank line.
 * The result is trimmed and always ends with exactly one newline.
 *
 * @param {{ entries: object[] }} playbook
 * @returns {string}
 */
function renderPlaybookMarkdown(playbook) {
  const sections = playbook.entries.flatMap((item) => {
    const block = [`- ${item.instruction}`, ` Rationale: ${item.rationale}`];
    if (item.category) {
      block.push(` Category: ${item.category}`);
    }
    if (item.evidence) {
      block.push(` Evidence: ${item.evidence}`);
    }
    if (item.sourceRunId) {
      block.push(` Source run: ${item.sourceRunId}`);
    }
    block.push("");
    return block;
  });
  const document = ["# Playbook", "", ...sections].join("\n");
  return `${document.trim()}\n`;
}
2279
/**
 * Builds the de-duplication key for an instruction: trimmed, lower-cased,
 * with every run of whitespace collapsed to a single space.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeInstruction(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.split(/\s+/).join(" ");
}
2282
/**
 * Produces the display form of an instruction: trimmed, whitespace runs
 * collapsed to single spaces, and the first character upper-cased.
 *
 * @param {string} value
 * @returns {string} Empty string when nothing remains after trimming.
 */
function canonicalInstruction(value) {
  const collapsed = value.trim().replace(/\s+/g, " ");
  if (collapsed.length === 0) {
    return collapsed;
  }
  const head = collapsed.charAt(0).toUpperCase();
  const tail = collapsed.slice(1);
  return head + tail;
}
2286
+
2287
+ // src/optimization-loop.ts
2288
+ var OptimizationLoop = class {
2289
+ optimizer;
2290
+ constructor(optimizer = new PromptOptimizer()) {
2291
+ this.optimizer = optimizer;
2292
+ }
2293
+ async run(config) {
2294
+ const byId = new Map(config.variants.map((variant) => [variant.id, variant]));
2295
+ const result = await this.optimizer.run({
2296
+ variants: config.variants.map((variant) => ({
2297
+ id: variant.id,
2298
+ prompt: renderSteeringText(variant),
2299
+ metadata: { bundle: variant }
2300
+ })),
2301
+ scenarioIds: config.examples.map((example) => example.scenarioId),
2302
+ trialsPerScenario: config.trialsPerScenario,
2303
+ scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
2304
+ const bundle = byId.get(variant.id);
2305
+ if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
2306
+ const example = config.examples.find((item) => item.scenarioId === scenarioId);
2307
+ if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
2308
+ const score = await config.evaluate({ variant: bundle, example, trialIndex });
2309
+ return aggregateRunScore(score, config.scoreWeights);
2147
2310
  }
2148
- results.push({
2149
- scenarioId: scenario.id,
2150
- converged,
2151
- roundsToConverge,
2152
- finalProposal,
2153
- history,
2154
- finalScore: lastScore
2155
- });
2311
+ });
2312
+ return {
2313
+ winner: byId.get(result.winner.variantId),
2314
+ significant: result.winner.significant,
2315
+ reports: result.scores.map((score) => ({
2316
+ variantId: score.variantId,
2317
+ bundle: byId.get(score.variantId),
2318
+ mean: score.mean,
2319
+ ci95: score.ci95,
2320
+ scenarioScores: score.perScenario
2321
+ })),
2322
+ pairwise: result.pairwise
2323
+ };
2324
+ }
2325
+ };
2326
+
2327
+ // src/steering-optimizer.ts
2328
/**
 * Steering optimizer that recommends the variant with the highest mean
 * aggregate score among the supplied rows.
 */
var PairwiseSteeringOptimizer = class {
  /**
   * @param {object[]} rows - Scored runs; ranking is delegated to rankRows.
   * @param {{ weights?: object }} [config]
   * @returns {object} Recommendation with backend, winner id, rationale and
   *   the full ranking.
   * @throws {Error} When ranking yields no variants.
   */
  optimize(rows, config = {}) {
    const rankings = rankRows(rows, config.weights);
    if (rankings.length === 0) {
      throw new Error("no steering optimization rows");
    }
    const top = rankings[0];
    return {
      backend: "pairwise",
      recommendedVariantId: top.variantId,
      rationale: `Highest observed mean aggregate across ${rows.length} scored run(s).`,
      rankings
    };
  }
};
2340
+ var AxGepaSteeringOptimizer = class {
2341
+ constructor(config) {
2342
+ this.config = config;
2343
+ }
2344
+ config;
2345
+ async optimize(rows) {
2346
+ const fallback = new PairwiseSteeringOptimizer().optimize(rows, this.config);
2347
+ const minRows = this.config.minRows ?? 6;
2348
+ const variantIds = [...new Set(rows.map((row) => row.variantId))];
2349
+ const byScenario = collapseScenarioWinners(rows, this.config.weights);
2350
+ if (variantIds.length < 2 || byScenario.length < minRows) {
2351
+ return {
2352
+ ...fallback,
2353
+ backend: "ax-gepa",
2354
+ skipped: true,
2355
+ rationale: `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
2356
+ };
2156
2357
  }
2157
- const convergedResults = results.filter((r) => r.converged);
2158
- const convergenceRate = results.length ? convergedResults.length / results.length : 0;
2159
- const avgRoundsToConverge = convergedResults.length ? convergedResults.reduce((acc, r) => acc + (r.roundsToConverge ?? 0), 0) / convergedResults.length : null;
2160
- const avgFinalScore = results.length ? results.reduce((acc, r) => acc + r.finalScore, 0) / results.length : 0;
2358
+ let axLib;
2359
+ try {
2360
+ axLib = await import("@ax-llm/ax");
2361
+ } catch {
2362
+ return {
2363
+ ...fallback,
2364
+ backend: "ax-gepa",
2365
+ skipped: true,
2366
+ rationale: "AxGEPA unavailable: install @ax-llm/ax to enable selector optimization."
2367
+ };
2368
+ }
2369
+ const { ai, ax, AxGEPA } = axLib;
2370
+ const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`;
2371
+ const selector = ax(signature, {
2372
+ description: "Choose the best steering bundle variant for an autopilot task."
2373
+ });
2374
+ const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8));
2375
+ const train = byScenario.slice(0, splitIndex);
2376
+ const validation = byScenario.slice(splitIndex);
2377
+ if (!validation.length) {
2378
+ return {
2379
+ ...fallback,
2380
+ backend: "ax-gepa",
2381
+ skipped: true,
2382
+ rationale: "AxGEPA skipped: no validation examples after split."
2383
+ };
2384
+ }
2385
+ const optimizer = new AxGEPA({
2386
+ studentAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.model),
2387
+ teacherAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.teacherModel ?? this.config.model),
2388
+ numTrials: 8,
2389
+ minibatch: true,
2390
+ minibatchSize: 4,
2391
+ earlyStoppingTrials: 3,
2392
+ sampleCount: 1
2393
+ });
2394
+ const compiled = await optimizer.compile(
2395
+ selector,
2396
+ train,
2397
+ (({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0),
2398
+ {
2399
+ validationExamples: validation,
2400
+ maxMetricCalls: 64
2401
+ }
2402
+ );
2403
+ selector.applyOptimization(compiled.optimizedProgram);
2161
2404
  return {
2162
- scenarios: results,
2163
- aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
2164
- config: { maxRounds, convergenceThreshold: threshold }
2405
+ ...fallback,
2406
+ backend: "ax-gepa",
2407
+ rationale: `AxGEPA trained a variant selector from ${byScenario.length} scored scenario winner(s); default winner remains ${fallback.recommendedVariantId}.`,
2408
+ selector: {
2409
+ backend: "ax-gepa",
2410
+ signature,
2411
+ labels: variantIds,
2412
+ rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
2413
+ }
2165
2414
  };
2166
2415
  }
2167
2416
  };
2417
/**
 * Groups rows by variant, averages their aggregate run scores, and returns
 * one summary per variant sorted by descending mean.
 *
 * @param {object[]} rows - Each row supplies `variantId` and `score`.
 * @param {object} [weights] - Forwarded to aggregateRunScore.
 * @returns {{ variantId: string, mean: number, runs: number }[]}
 */
function rankRows(rows, weights) {
  const tallies = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const aggregate = aggregateRunScore(row.score, weights);
    const tally = tallies.get(row.variantId);
    if (tally) {
      // Summing in arrival order keeps floating-point results stable.
      tally.total += aggregate;
      tally.runs += 1;
    } else {
      tallies.set(row.variantId, { total: 0 + aggregate, runs: 1 });
    }
  }
  const summaries = [];
  for (const [variantId, { total, runs }] of tallies) {
    summaries.push({ variantId, mean: total / runs, runs });
  }
  return summaries.sort((first, second) => second.mean - first.mean);
}
2430
/**
 * Reduces rows to one training example per scenario: the row with the
 * highest aggregate score in that scenario.
 *
 * @param {object[]} rows - Each row supplies `scenarioId`, `variantId`,
 *   `score` and optional `metadata`.
 * @param {object} [weights] - Forwarded to aggregateRunScore.
 * @returns {{ task: string, split: string, seedPreview: string, variantId: string }[]}
 */
function collapseScenarioWinners(rows, weights) {
  const grouped = /* @__PURE__ */ new Map();
  for (const row of rows) {
    const existing = grouped.get(row.scenarioId);
    if (existing) {
      existing.push(row);
    } else {
      grouped.set(row.scenarioId, [row]);
    }
  }

  const winners = [];
  for (const [scenarioId, scenarioRows] of grouped) {
    const scored = scenarioRows.map((row) => ({
      row,
      aggregate: aggregateRunScore(row.score, weights)
    }));
    scored.sort((first, second) => second.aggregate - first.aggregate);
    const winner = scored[0].row;
    const meta = winner.metadata;
    winners.push({
      // Task label falls back to the seed preview, then to the scenario id.
      task: String(meta?.task ?? meta?.seed_preview ?? scenarioId),
      split: String(meta?.split ?? "train"),
      seedPreview: String(meta?.seed_preview ?? ""),
      variantId: winner.variantId
    });
  }
  return winners;
}
2447
/**
 * Calls the supplied AI factory with a provider name, API key and model.
 *
 * @param {Function} aiFactory - Invoked once with `{ name, apiKey, config }`.
 * @param {string} provider - Passed as `name`.
 * @param {string} apiKey
 * @param {string} model - Passed as `config.model`.
 * @returns {*} Whatever the factory returns.
 */
function createAxService(aiFactory, provider, apiKey, model) {
  const serviceOptions = {
    name: provider,
    apiKey,
    config: { model }
  };
  return aiFactory(serviceOptions);
}
2168
2454
 
2169
- // src/trace/schema.ts
2170
- var TRACE_SCHEMA_VERSION = "1.0.0";
2171
- var FAILURE_CLASSES = [
2172
- "success",
2173
- "reasoning_error",
2174
- "tool_selection_error",
2175
- "tool_argument_error",
2176
- "tool_recovery_failure",
2177
- "hallucination",
2178
- "instruction_following",
2179
- "safety_refusal_miss",
2180
- "policy_violation",
2181
- "budget_exceeded",
2182
- "format_drift",
2183
- "permission_escalation",
2184
- "pii_leak",
2185
- "cost_overrun",
2186
- "timeout",
2187
- "sandbox_failure",
2188
- "unknown"
2189
- ];
2190
- function isLlmSpan(s) {
2191
- return s.kind === "llm";
2455
+ // src/pareto.ts
2456
/**
 * Pareto dominance test: `a` dominates `b` when it is no worse on every
 * objective and strictly better on at least one.
 *
 * Returns false as soon as either candidate yields a non-finite value for an
 * objective. Any direction other than "maximize" is treated as minimize.
 *
 * @param {*} a
 * @param {*} b
 * @param {{ direction: string, value: Function }[]} objectives
 * @returns {boolean}
 */
function dominates(a, b, objectives) {
  let sawImprovement = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) {
      return false;
    }
    if (left === right) {
      continue;
    }
    // Values are finite and unequal, so `a` is either better or worse here.
    const leftWins = objective.direction === "maximize" ? left > right : left < right;
    if (!leftWins) {
      return false;
    }
    sawImprovement = true;
  }
  return sawImprovement;
}
2193
- function isToolSpan(s) {
2194
- return s.kind === "tool";
2469
/**
 * Splits candidates into the Pareto frontier and the dominated remainder.
 *
 * Candidates with a non-finite value on any objective are dropped first and
 * appear in neither list. `dominanceMap` records, for each frontier member,
 * which dominated candidates it dominates.
 *
 * @param {Array} candidates
 * @param {{ direction: string, value: Function }[]} objectives
 * @returns {{ frontier: Array, dominated: Array, dominanceMap: Array }}
 * @throws {Error} When `objectives` is empty.
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const isScorable = (candidate) => objectives.every((objective) => Number.isFinite(objective.value(candidate)));
  const valid = candidates.filter(isScorable);

  const frontier = [];
  const dominated = [];
  valid.forEach((candidate) => {
    const beaten = valid.some((rival) => rival !== candidate && dominates(rival, candidate, objectives));
    if (beaten) {
      dominated.push(candidate);
    } else {
      frontier.push(candidate);
    }
  });

  const dominanceMap = frontier.map((member) => {
    const victims = dominated.filter((loser) => dominates(member, loser, objectives));
    return { dominator: member, dominated: victims };
  });
  return { frontier, dominated, dominanceMap };
}
2196
- function isRetrievalSpan(s) {
2197
- return s.kind === "retrieval";
2489
+
2490
+ // src/harness-optimizer.ts
2491
/**
 * Default Pareto objectives over harness variant reports: maximize mean
 * aggregate score and pass rate, minimize mean cost and mean wall time.
 * Each row is [objective name, direction, report field read by `value`].
 */
var DEFAULT_HARNESS_OBJECTIVES = [
  ["aggregate", "maximize", "aggregateMean"],
  ["pass_rate", "maximize", "passRate"],
  ["cost", "minimize", "costUsdMean"],
  ["wall", "minimize", "wallSecondsMean"]
].map(([name, direction, field]) => ({
  name,
  direction,
  value: (r) => r[field]
}));
2497
/**
 * Runs every (variant, scenario, trial) job through the adapter, scores each
 * resulting trace, and selects a winning variant.
 *
 * Scoring uses `config.score` when provided, otherwise a RunCritic built with
 * `config.weights`. `config.onResult`, if present, is awaited after each job.
 * At most `config.parallelism` (default 1) jobs run at a time.
 *
 * @param {object} config - Reads adapter, variants, scenarios,
 *   trialsPerScenario, parallelism, weights, score, objectives, onResult.
 * @returns {Promise<{ results: object[], selection: object }>}
 */
async function runHarnessExperiment(config) {
  const jobs = buildJobs(config);
  const critic = new RunCritic({ weights: config.weights });
  const defaultScore = (trace) => critic.scoreTrace(trace);
  const score = config.score ?? defaultScore;

  const executeJob = async (request) => {
    const trace = await config.adapter.run(request);
    const runScore = await score(trace, request);
    const { variant, scenario, trialIndex } = request;
    const result = {
      variant,
      scenario,
      trialIndex,
      trace,
      score: runScore,
      aggregate: aggregateRunScore(runScore, config.weights)
    };
    await config.onResult?.(result);
    return result;
  };

  const results = await mapLimit(jobs, config.parallelism ?? 1, executeJob);
  const selection = selectHarnessVariant(results, config.objectives);
  return { results, selection };
}
2517
/**
 * Picks the winning variant: the Pareto-frontier report with the highest
 * mean aggregate. If the frontier is empty, all reports are considered.
 *
 * @param {object[]} results - Per-run harness results.
 * @param {object[]} [objectives] - Defaults to DEFAULT_HARNESS_OBJECTIVES.
 * @returns {{ winner: object, frontier: object, reports: object[] }}
 * @throws {Error} When there are no results or no winner can be chosen.
 */
function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
  const reports = summarizeHarnessResults(results);
  if (reports.length === 0) {
    throw new Error("selectHarnessVariant: no results");
  }
  const frontier = paretoFrontier(reports, objectives);
  const pool = frontier.frontier.length ? frontier.frontier : reports;
  // Sort a copy so neither the frontier nor the reports list is reordered.
  const byAggregateDesc = pool.slice().sort((first, second) => second.aggregateMean - first.aggregateMean);
  const winner = byAggregateDesc[0];
  if (!winner) {
    throw new Error("selectHarnessVariant: no winner");
  }
  return { winner, frontier, reports };
}
2526
/**
 * Groups harness results by variant id and computes per-variant means.
 *
 * @param {object[]} results - Each result supplies `variant`, `aggregate`
 *   and `score`.
 * @returns {object[]} One report per variant, sorted by descending
 *   `aggregateMean`.
 * @throws {Error} If a variant bucket has no usable first run.
 */
function summarizeHarnessResults(results) {
  const grouped = /* @__PURE__ */ new Map();
  for (const entry of results) {
    const bucket = grouped.get(entry.variant.id);
    if (bucket) {
      bucket.push(entry);
    } else {
      grouped.set(entry.variant.id, [entry]);
    }
  }

  const reports = [];
  for (const runs of grouped.values()) {
    const variant = runs[0]?.variant;
    if (!variant) {
      throw new Error("summarizeHarnessResults: empty variant bucket");
    }
    const scores = runs.map((r) => r.score);
    reports.push({
      variant,
      runs,
      aggregateMean: mean(runs.map((r) => r.aggregate)),
      passRate: mean(scores.map((s) => s.success)),
      costUsdMean: mean(scores.map((s) => s.costUsd)),
      wallSecondsMean: mean(scores.map((s) => s.wallSeconds)),
      scoreMean: meanRunScore(scores)
    });
  }
  return reports.sort((first, second) => second.aggregateMean - first.aggregateMean);
}
2545
/**
 * Expands an experiment config into one job per (variant, scenario, trial).
 *
 * `trialsPerScenario` is floored and forced to at least 1. A value that is
 * not a finite number (NaN, Infinity, a non-numeric string) falls back to 1;
 * otherwise NaN would silently yield zero jobs and Infinity would never
 * terminate.
 *
 * @param {object} config - Reads `variants`, `scenarios`, `trialsPerScenario`.
 * @returns {{ variant: *, scenario: *, trialIndex: number }[]} Jobs ordered
 *   variant-major, then scenario, then trial index.
 * @throws {Error} When `variants` or `scenarios` is empty.
 */
function buildJobs(config) {
  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
  const requested = Math.floor(config.trialsPerScenario ?? 1);
  const trials = Number.isFinite(requested) ? Math.max(1, requested) : 1;
  const jobs = [];
  for (const variant of config.variants) {
    for (const scenario of config.scenarios) {
      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
        jobs.push({ variant, scenario, trialIndex });
      }
    }
  }
  return jobs;
}
2199
- function isJudgeSpan(s) {
2200
- return s.kind === "judge";
2559
/**
 * Maps `fn` over `items` with at most `limit` calls in flight, preserving
 * input order in the result array.
 *
 * `limit` is floored and bounded to [1, items.length]. A limit that is NaN
 * after flooring (NaN, undefined, non-numeric) falls back to 1 worker;
 * otherwise no worker would start and every result slot would stay empty.
 *
 * @param {Array} items
 * @param {number} limit - Maximum concurrency.
 * @param {Function} fn - Async mapper, called with each item.
 * @returns {Promise<Array>} Results at the same indexes as their inputs.
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length);
  let next = 0;
  const floored = Math.floor(limit);
  const requested = Number.isNaN(floored) ? 1 : floored;
  const workerCount = Math.max(1, Math.min(requested, items.length));
  await Promise.all(Array.from({ length: workerCount }, async () => {
    // Each worker claims the next unclaimed index synchronously, so no index
    // is processed twice.
    while (next < items.length) {
      const index = next++;
      const item = items[index];
      // Items that are `undefined` are skipped and leave their slot unset.
      if (item === void 0) continue;
      results[index] = await fn(item);
    }
  }));
  return results;
}
2202
- function isSandboxSpan(s) {
2203
- return s.kind === "sandbox";
2573
/**
 * Arithmetic mean of a numeric array; 0 for an empty array.
 *
 * @param {number[]} values
 * @returns {number}
 */
function mean(values) {
  if (!values.length) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / values.length;
}
2576
/**
 * Averages a list of run scores field by field and concatenates their notes.
 *
 * @param {object[]} scores
 * @returns {object} A score whose numeric fields are the per-field means and
 *   whose `notes` is every input's notes in order.
 */
function meanRunScore(scores) {
  const numericFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const field of numericFields) {
    averaged[field] = mean(scores.map((s) => s[field]));
  }
  averaged.notes = scores.flatMap((s) => s.notes ?? []);
  return averaged;
}
2205
2592
 
2206
2593
  // src/trace/store.ts
@@ -2597,6 +2984,651 @@ function llmSpanFromProvider(args) {
2597
2984
  };
2598
2985
  }
2599
2986
 
2987
+ // src/sandbox-harness.ts
2988
/**
 * Test-output parser for Vitest's "Tests  N failed | M passed" summary line.
 *
 * `parse` returns `{ testsTotal, testsPassed }` built from the first one or
 * two passed/failed counts on that line, or `undefined` when no such line is
 * present in stdout.
 */
var vitestTestParser = {
  id: "vitest",
  parse(stdout) {
    const summary = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
    if (!summary) return void 0;
    const tally = { passed: 0, failed: 0 };
    // The second count/label pair is optional in the summary line.
    const pairs = [
      [summary[1], summary[2]],
      [summary[3], summary[4]]
    ];
    for (const [count, label] of pairs) {
      if (!count || !label) continue;
      tally[label.toLowerCase()] += parseInt(count, 10);
    }
    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
  }
};
3007
/**
 * Test-output parser for pytest.
 *
 * `parse` needs both a "collected N item(s)" line and an "M passed" count in
 * stdout; it returns `undefined` when either is missing.
 */
var pytestTestParser = {
  id: "pytest",
  parse(stdout) {
    const collectedMatch = /collected\s+(\d+)\s+items?/i.exec(stdout);
    const passedMatch = /(\d+)\s+passed/.exec(stdout);
    if (collectedMatch && passedMatch) {
      const testsTotal = parseInt(collectedMatch[1], 10);
      const testsPassed = parseInt(passedMatch[1], 10);
      return { testsTotal, testsPassed };
    }
    return void 0;
  }
};
3016
/**
 * Test-output parser for Jest's "Tests: ..., N total" summary line.
 *
 * `parse` returns `{ testsTotal, testsPassed }`, or `undefined` when stdout
 * has no "Tests:" summary line ending in a total.
 */
var jestTestParser = {
  id: "jest",
  parse(stdout) {
    // Capture everything between "Tests:" and the total on the same line.
    // The earlier pattern only accepted "[N failed,] M passed, T total", so
    // summaries that also listed skipped/todo counts, or that had no passed
    // count at all, were reported as unparseable.
    const summary = stdout.match(/Tests:\s+([^\n]*?)(\d+)\s+total/i);
    if (!summary) return void 0;
    const passed = summary[1].match(/(\d+)\s+passed/i);
    return {
      testsTotal: parseInt(summary[2], 10),
      // No "passed" entry on the summary line means nothing passed.
      testsPassed: passed ? parseInt(passed[1], 10) : 0
    };
  }
};
3024
/**
 * Chain several test-output parsers into one.
 *
 * The combined parser's id is the member ids joined with "|". Its `parse`
 * tries each member in order and returns the first truthy result, or
 * `undefined` when none of them produces one.
 *
 * @param {...object} parsers - Parsers exposing `id` and `parse(stdout, stderr, exitCode)`.
 * @returns {object} The combined parser.
 */
function composeParsers(...parsers) {
  const id = parsers.map((parser) => parser.id).join("|");
  const parse = (stdout, stderr, exitCode) => {
    for (let i = 0; i < parsers.length; i += 1) {
      const outcome = parsers[i].parse(stdout, stderr, exitCode);
      if (outcome) {
        return outcome;
      }
    }
    return void 0;
  };
  return { id, parse };
}
3036
/**
 * Sandbox driver that runs each phase command through a shell as a local
 * child process.
 *
 * Constructor options may provide a default `cwd` and `env`. Per-call
 * `config.cwd` replaces the default directory; `config.env` is layered over
 * the default env, which is itself layered over `process.env`.
 */
var SubprocessSandboxDriver = class {
  id = "subprocess";
  defaultCwd;
  defaultEnv;
  constructor(options = {}) {
    this.defaultCwd = options.cwd;
    this.defaultEnv = options.env;
  }
  /**
   * Run one command and collect its output.
   *
   * @param {string} phase - Phase label copied into the result; test parsing only runs for "test".
   * @param {string} command - Shell command line.
   * @param {object} config - Reads `cwd`, `env`, `timeoutMs` (default 10 minutes) and `testParser`.
   * @returns {Promise<object>} `{ phase, exitCode, stdout, stderr, wallMs }` plus
   *   `testsTotal`/`testsPassed` when a parser produced them. Never rejects:
   *   a spawn error resolves with exit code 127, and a missing exit code
   *   (for example after the timeout kill) is reported as 1.
   */
  async exec(phase, command, config) {
    const { spawn } = await import("child_process");
    const start = Date.now();
    const effectiveCwd = config.cwd ?? this.defaultCwd;
    const effectiveEnv = { ...process.env, ...(this.defaultEnv ?? {}), ...(config.env ?? {}) };
    return await new Promise((resolve) => {
      const child = spawn(command, {
        shell: true,
        cwd: effectiveCwd,
        env: effectiveEnv
      });
      let stdout = "";
      let stderr = "";
      // Decode through the stream's own UTF-8 decoder. Stringifying each
      // Buffer chunk separately corrupts any multi-byte character that
      // happens to be split across two chunks.
      child.stdout?.setEncoding("utf8");
      child.stderr?.setEncoding("utf8");
      child.stdout?.on("data", (chunk) => {
        stdout += chunk;
      });
      child.stderr?.on("data", (chunk) => {
        stderr += chunk;
      });
      const timeout = setTimeout(() => {
        try {
          child.kill("SIGKILL");
        } catch {
          // Best effort: the process may already have exited.
        }
      }, config.timeoutMs ?? 10 * 6e4);
      child.on("close", (code) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
        resolve({
          phase,
          exitCode: code ?? 1,
          stdout,
          stderr,
          wallMs,
          testsTotal: parsed?.testsTotal,
          testsPassed: parsed?.testsPassed
        });
      });
      child.on("error", (err) => {
        clearTimeout(timeout);
        const wallMs = Date.now() - start;
        resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
      });
    });
  }
};
3091
/**
 * Sandbox driver that wraps each command in `docker run --rm <image> sh -c ...`
 * and hands the wrapped command line to a SubprocessSandboxDriver.
 *
 * `config.env` entries become `-e KEY=VALUE` flags on the docker command and
 * are not passed on as the subprocess config's `env`.
 */
var DockerSandboxDriver = class {
  id = "docker";
  /**
   * @param {string} phase - Phase label forwarded to the subprocess driver.
   * @param {string} command - Command to run inside the container.
   * @param {object} config - Must provide `image`; `env` is optional.
   * @returns {Promise<object>} The subprocess driver's result.
   * @throws {Error} When `config.image` is missing.
   */
  async exec(phase, command, config) {
    if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
    const sub = new SubprocessSandboxDriver();
    const flags = [];
    for (const [key, value] of Object.entries(config.env ?? {})) {
      flags.push(`-e ${shellQuote(key)}=${shellQuote(value)}`);
    }
    const wrapped = [
      "docker run --rm",
      flags.join(" "),
      shellQuote(config.image),
      "sh -c",
      shellQuote(command)
    ].join(" ");
    const subConfig = { ...config, env: void 0 };
    return sub.exec(phase, wrapped, subConfig);
  }
};
3101
/**
 * Quote a value for use as a single shell word.
 *
 * Values made only of letters, digits and `_ - / . @ : =` are returned
 * unchanged. Anything else (including the empty string) is wrapped in single
 * quotes, with each embedded single quote rewritten as `'\''`.
 *
 * @param {string} v - Value to quote.
 * @returns {string} The shell-safe word.
 */
function shellQuote(v) {
  const needsNoQuoting = /^[A-Za-z0-9_\-\/\.@:=]+$/.test(v);
  if (needsNoQuoting) {
    return v;
  }
  const escaped = v.split("'").join("'\\''");
  return "'" + escaped + "'";
}
3105
/**
 * Runs the setup, run and test commands of a sandbox config through a driver
 * and records the outcome on a sandbox trace span.
 */
var SandboxHarness = class {
  driver;
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }
  /**
   * Execute the configured phases in order: setup, run, test.
   *
   * A non-zero exit from setup or run fails the span and returns right away
   * with `passed: false` and `score: 0`. With a test command, `passed`
   * follows its exit code and `score` is the passed/total ratio when the
   * driver reported a positive total, otherwise 1 or 0. Without a test
   * command the result is a pass with score 1.
   *
   * @param {object} config - Reads `setupCommand`, `runCommand`, `testCommand`, `image`; passed whole to the driver.
   * @param {object} emitter - Must provide `sandbox(info)` returning a handle with `end` and `fail`.
   * @returns {Promise<object>} `{ passed, totalWallMs, score }` plus per-phase results.
   * @throws Whatever the driver or span handle throws, after failing the span.
   */
  async run(config, emitter) {
    const commandLine = [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ");
    const handle = await emitter.sandbox({
      name: `sandbox(${this.driver.id})`,
      image: config.image,
      command: commandLine
    });
    const result = { passed: false, totalWallMs: 0, score: 0 };
    try {
      // setup and run share the same shape: execute, add wall time, bail on failure.
      for (const phase of ["setup", "run"]) {
        const command = config[`${phase}Command`];
        if (!command) continue;
        const outcome = await this.driver.exec(phase, command, config);
        result[phase] = outcome;
        result.totalWallMs += outcome.wallMs;
        if (outcome.exitCode === 0) continue;
        await handle.fail(`${phase} failed (exit ${outcome.exitCode})`, {
          exitCode: outcome.exitCode,
          wallMs: result.totalWallMs
        });
        return result;
      }
      if (!config.testCommand) {
        result.passed = true;
        result.score = 1;
        await handle.end({ wallMs: result.totalWallMs });
        return result;
      }
      const test = await this.driver.exec("test", config.testCommand, config);
      result.test = test;
      result.totalWallMs += test.wallMs;
      const passed = test.exitCode === 0;
      result.passed = passed;
      const hasCounts = test.testsTotal !== void 0 && test.testsTotal > 0;
      if (hasCounts) {
        result.score = (test.testsPassed ?? 0) / test.testsTotal;
      } else if (passed) {
        result.score = 1;
      } else {
        result.score = 0;
      }
      await handle.end({
        exitCode: test.exitCode,
        testsTotal: test.testsTotal,
        testsPassed: test.testsPassed,
        wallMs: result.totalWallMs,
        status: passed ? "ok" : "error"
      });
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
3169
+
3170
+ // src/judge-runner.ts
3171
/**
 * Runs a single judge spec inside a SandboxHarness, tracing it into a fresh
 * in-memory trace store.
 */
var JudgeRunner = class {
  driver;
  constructor(driver = new SubprocessSandboxDriver()) {
    this.driver = driver;
  }
  /**
   * Execute one judge spec.
   *
   * @param {object} spec - `{ id, kind, config }`; `config` is handed to the harness.
   * @returns {Promise<object>} `{ id, kind, passed, score, summary, detail }`,
   *   where `detail` is the harness result.
   */
  async run(spec) {
    const { id, kind } = spec;
    const emitter = new TraceEmitter(new InMemoryTraceStore(), { runId: `judge-${id}` });
    await emitter.startRun({
      scenarioId: id,
      layer: "meta",
      projectId: "judge-runner"
    });
    const detail = await new SandboxHarness(this.driver).run(spec.config, emitter);
    const { passed, score } = detail;
    await emitter.endRun({ pass: passed, score, notes: `${kind} judge` });
    const summary = renderJudgeSummary(kind, detail);
    return { id, kind, passed, score, summary, detail };
  }
};
3197
/**
 * Run a list of judge specs through one shared JudgeRunner.
 *
 * @param {Array<object>} specs - Judge specs to run.
 * @param {object} [options] - `driver` is forwarded to the JudgeRunner;
 *   `parallel: false` runs the specs one after another, anything else runs
 *   them all at once.
 * @returns {Promise<Array<object>>} Judge results in the same order as `specs`.
 */
async function runJudgeFleet(specs, options = {}) {
  const runner = new JudgeRunner(options.driver);
  const judge = (spec) => runner.run(spec);
  if (options.parallel !== false) {
    return await Promise.all(specs.map((spec) => judge(spec)));
  }
  const collected = [];
  for (const spec of specs) {
    const outcome = await judge(spec);
    collected.push(outcome);
  }
  return collected;
}
3206
/**
 * Build a judge spec of kind "compiler".
 *
 * @param {string} id - Spec identifier.
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}} The judge spec.
 */
function compilerJudge(id, config) {
  const kind = "compiler";
  return { id, kind, config };
}
3209
/**
 * Build a judge spec of kind "test".
 *
 * @param {string} id - Spec identifier.
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}} The judge spec.
 */
function testJudge(id, config) {
  const kind = "test";
  return { id, kind, config };
}
3212
/**
 * Build a judge spec of kind "linter".
 *
 * @param {string} id - Spec identifier.
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}} The judge spec.
 */
function linterJudge(id, config) {
  const kind = "linter";
  return { id, kind, config };
}
3215
/**
 * Build a judge spec of kind "security".
 *
 * @param {string} id - Spec identifier.
 * @param {object} config - Sandbox config for the judge.
 * @returns {{id: string, kind: string, config: object}} The judge spec.
 */
function securityJudge(id, config) {
  const kind = "security";
  return { id, kind, config };
}
3218
/**
 * One-line human-readable summary of a judge outcome.
 *
 * @param {string} kind - Judge kind, used as the sentence prefix.
 * @param {object} detail - Harness result; reads `passed` and the optional `test` counts.
 * @returns {string} "<kind> judge failed", "<kind> judge passed", or the
 *   passed form with "P/T tests" appended when a non-zero total is known.
 */
function renderJudgeSummary(kind, detail) {
  const label = `${kind} judge`;
  if (!detail.passed) {
    return `${label} failed`;
  }
  const counts = detail.test;
  if (counts?.testsTotal) {
    return `${label} passed ${counts.testsPassed}/${counts.testsTotal} tests`;
  }
  return `${label} passed`;
}
3223
+
3224
+ // src/dual-agent-bench.ts
3225
/**
 * Benchmark that alternates a proposer and a critic on each scenario until
 * the critic's convergence score reaches a threshold or the round budget is
 * used up.
 */
var DualAgentBench = class {
  /**
   * Run every scenario in sequence.
   *
   * @param {object} config - Reads `scenarios`, `propose`, `critique`,
   *   `maxRounds` (default 5), `convergenceThreshold` (default 0.85) and the
   *   optional `onRoundComplete` callback.
   * @returns {Promise<object>} `{ scenarios, aggregate, config }` with
   *   per-scenario histories and the aggregate convergence statistics.
   * @throws {Error} When there are no scenarios, or when a critique returns a
   *   convergence score that is not a finite number in [0, 1].
   */
  async run(config) {
    const roundLimit = config.maxRounds ?? 5;
    const cutoff = config.convergenceThreshold ?? 0.85;
    if (config.scenarios.length === 0) {
      throw new Error("DualAgentBench requires at least 1 scenario");
    }
    const outcomes = [];
    for (const scenario of config.scenarios) {
      const history = [];
      // Most recent completed round; it supplies the "prior" inputs of the next one.
      let latest;
      let roundsToConverge = null;
      for (let roundIndex = 0; roundIndex < roundLimit && roundsToConverge === null; roundIndex++) {
        const proposal = await config.propose({
          scenario,
          roundIndex,
          priorProposal: latest?.proposal,
          priorCritique: latest?.critique
        });
        const verdict = await config.critique({ scenario, roundIndex, proposal });
        const { critique, convergenceScore } = verdict;
        const inRange = Number.isFinite(convergenceScore) && convergenceScore >= 0 && convergenceScore <= 1;
        if (!inRange) {
          throw new Error(
            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${roundIndex}`
          );
        }
        const round = { roundIndex, proposal, critique, convergenceScore };
        history.push(round);
        config.onRoundComplete?.({ scenarioId: scenario.id, round });
        latest = round;
        if (convergenceScore >= cutoff) {
          roundsToConverge = roundIndex + 1;
        }
      }
      outcomes.push({
        scenarioId: scenario.id,
        converged: roundsToConverge !== null,
        roundsToConverge,
        finalProposal: latest ? latest.proposal : "",
        history,
        finalScore: latest ? latest.convergenceScore : 0
      });
    }
    const winners = outcomes.filter((outcome) => outcome.converged);
    let convergenceRate = 0;
    let avgFinalScore = 0;
    if (outcomes.length) {
      convergenceRate = winners.length / outcomes.length;
      avgFinalScore = outcomes.reduce((sum, outcome) => sum + outcome.finalScore, 0) / outcomes.length;
    }
    let avgRoundsToConverge = null;
    if (winners.length) {
      avgRoundsToConverge = winners.reduce((sum, outcome) => sum + (outcome.roundsToConverge ?? 0), 0) / winners.length;
    }
    return {
      scenarios: outcomes,
      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
      config: { maxRounds: roundLimit, convergenceThreshold: cutoff }
    };
  }
};
3295
+
3296
+ // src/propose-review.ts
3297
+ import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
3298
+ import { dirname } from "path";
3299
/**
 * Review store that keeps its entries in a private in-process array.
 *
 * @param {Array<object>} [initial] - Entries to start with; copied, so later
 *   changes to the caller's array do not affect the store.
 * @returns {{load: Function, append: Function}} `load()` resolves to a fresh
 *   copy of the entries; `append(entry)` adds one entry at the end.
 */
function inMemoryReviewStore(initial = []) {
  const entries = [...initial];
  const load = async () => entries.slice();
  const append = async (entry) => {
    entries.push(entry);
  };
  return { load, append };
}
3310
/**
 * Review store backed by a JSON-lines file (one JSON value per line).
 *
 * @param {string} path - Location of the file.
 * @returns {{load: Function, append: Function}} `load()` resolves to the
 *   parsed entries, or an empty list when the file does not exist; blank
 *   lines and lines that are not valid JSON are skipped. `append(entry)`
 *   creates the parent directory if needed and appends one serialized line.
 */
function jsonlReviewStore(path) {
  // Returns a zero- or one-element list so unparseable lines simply vanish
  // when the per-line lists are flattened.
  const parseLine = (line) => {
    try {
      return [JSON.parse(line)];
    } catch {
      return [];
    }
  };
  return {
    async load() {
      if (!existsSync(path)) return [];
      return readFileSync(path, "utf8")
        .split("\n")
        .map((line) => line.trim())
        .filter((line) => line !== "")
        .flatMap((line) => parseLine(line));
    },
    async append(entry) {
      mkdirSync(dirname(path), { recursive: true });
      const serialized = JSON.stringify(entry);
      appendFileSync(path, serialized + "\n");
    }
  };
}
3332
// Default next-shot instruction for runProposeReview. It is used when the
// reviewer call fails, the review memory holds no earlier entry to reuse, and
// the caller did not pass config.fallbackInstruction.
+ var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
3333
/**
 * Drive a multi-shot propose / verify / review loop.
 *
 * Each shot calls `config.propose`, verifies the returned state with
 * `config.verify`, and, when verification fails, asks `config.review` for the
 * next instruction. If the reviewer throws, the last instruction in memory
 * (or the fallback instruction) is reused with confidence 0.3. Every shot
 * appends an entry to the review memory.
 *
 * The loop ends when verification passes, the reviewer says to stop, the
 * review confidence stays at or below the floor for the configured number of
 * consecutive shots, the shot budget is spent, or the wall-clock abort signal
 * is already set at the start of a shot.
 *
 * @param {object} config - Reads `goal`, `initialState`, `propose`, `verify`,
 *   `review`, and the optional `maxShots` (10), `maxWallMs` (10 minutes),
 *   `confidenceFloor` (0.3), `confidenceFloorWindow` (2), `memory`,
 *   `fallbackInstruction`, `store`, `scenarioId`, `projectId`, `variantId`.
 * @returns {Promise<object>} `{ runId, completed, shots, finalState,
 *   finalVerification, failureClass, wallMs, score }`.
 * @throws Whatever `propose` or `verify` throws, after failing the shot span.
 */
async function runProposeReview(config) {
  const shotBudget = config.maxShots ?? 10;
  const wallBudgetMs = config.maxWallMs ?? 10 * 60 * 1e3;
  const floor = config.confidenceFloor ?? 0.3;
  const floorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
  const emitter = config.store ? new TraceEmitter(config.store) : null;
  if (emitter) {
    const tags = {
      goal: config.goal.slice(0, 120),
      maxShots: String(shotBudget)
    };
    await emitter.startRun({
      scenarioId: config.scenarioId ?? "propose-review",
      projectId: config.projectId,
      variantId: config.variantId,
      layer: "meta",
      tags
    });
  }
  const abort = new AbortController();
  const startedAt = Date.now();
  const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), wallBudgetMs);
  const asFailure = (err) => (err instanceof Error ? err : String(err));
  const shots = [];
  let state = config.initialState;
  let priorReview = null;
  let lastVerification = { pass: false };
  let failureClass;
  let completed = false;
  let lowConfidenceStreak = 0;
  try {
    for (let shot = 1; shot <= shotBudget; shot++) {
      if (abort.signal.aborted) {
        failureClass = "timeout";
        break;
      }
      const shotStartedAt = Date.now();
      const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
      // Shared failure path for propose/verify: fail the span, then classify.
      const failShot = async (err) => {
        await shotHandle?.fail(asFailure(err));
        failureClass = "unknown";
      };

      let proposeOut;
      try {
        proposeOut = await config.propose({
          shot,
          goal: config.goal,
          state,
          priorReview,
          abortSignal: abort.signal,
          emitter: emitter ?? void 0
        });
      } catch (err) {
        await failShot(err);
        throw err;
      }
      state = proposeOut.state;
      const traceSummary = proposeOut.traceSummary;

      let verification;
      try {
        verification = await config.verify(state);
      } catch (err) {
        await failShot(err);
        throw err;
      }
      lastVerification = verification;

      const memorySnapshot = await memory.load();
      const verificationDigest = {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers ?? []
      };

      let review;
      let reviewAvailable = true;
      let reviewError;
      if (!verification.pass) {
        try {
          const rawReview = await config.review({
            shot,
            goal: config.goal,
            state,
            verification,
            traceSummary,
            memory: memorySnapshot
          });
          review = coerceReview(rawReview);
        } catch (err) {
          // Reviewer failed or returned something unusable: keep the loop
          // going on the most recent instruction we have.
          reviewAvailable = false;
          reviewError = err instanceof Error ? err.message : String(err);
          let lastInstruction = fallbackInstruction;
          if (memorySnapshot.length > 0) {
            lastInstruction = memorySnapshot[memorySnapshot.length - 1].nextShotInstruction;
          }
          review = {
            observations: "(reviewer unavailable \u2014 using last-known instruction)",
            diagnosis: reviewError,
            nextShotInstruction: lastInstruction,
            shouldContinue: true,
            confidence: 0.3
          };
        }
      } else {
        review = {
          observations: "verification passed \u2014 skipping reviewer LLM call",
          diagnosis: "no failures to diagnose",
          nextShotInstruction: "(done)",
          shouldContinue: false,
          confidence: 1
        };
      }

      await memory.append({
        shot,
        timestamp: Date.now(),
        ...review,
        verification: verificationDigest
      });
      shots.push({
        shot,
        state,
        verification,
        traceSummary,
        review,
        reviewAvailable,
        reviewError,
        durationMs: Date.now() - shotStartedAt
      });
      await shotHandle?.end({
        attributes: {
          verificationPass: verification.pass,
          verificationScore: verification.score ?? null,
          reviewShouldContinue: review.shouldContinue,
          reviewConfidence: review.confidence,
          reviewAvailable
        }
      });

      if (verification.pass) {
        completed = true;
        break;
      }
      if (!review.shouldContinue) {
        break;
      }
      const lowConfidence = floorWindow > 0 && review.confidence <= floor;
      lowConfidenceStreak = lowConfidence ? lowConfidenceStreak + 1 : 0;
      if (lowConfidence && lowConfidenceStreak >= floorWindow) {
        break;
      }
      priorReview = review;
    }
    if (!completed && !failureClass) {
      failureClass = shots.length >= shotBudget ? "budget_exceeded" : "unknown";
    }
  } finally {
    clearTimeout(wallTimer);
  }

  let score = 0;
  if (lastVerification.pass) {
    score = 1;
  } else if (typeof lastVerification.score === "number") {
    score = lastVerification.score;
  }
  if (emitter) {
    await emitter.endRun({
      pass: completed,
      score,
      failureClass,
      notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
    });
  }
  return {
    runId: emitter?.runId ?? null,
    completed,
    shots,
    finalState: state,
    finalVerification: lastVerification,
    failureClass,
    wallMs: Date.now() - startedAt,
    score
  };
}
3506
// Base system prompt for the reviewer built by createLlmReviewer. When the
// caller supplies cfg.systemPromptAddendum, createLlmReviewer appends it
// after a blank line; otherwise this text is sent unchanged.
+ var REVIEWER_SYSTEM_PROMPT = `You are a senior reviewer directing a multi-shot build loop.
3507
+ You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.
3508
+ You are blind to the worker's inner monologue. You see what it DID, not what it thought.
3509
+ Return STRICT JSON matching the schema. No prose outside the JSON.`;
3510
/**
 * Build a reviewer function backed by an LLM JSON call.
 *
 * The returned function renders the goal, shot number, current state, trace
 * summary, verification and prior review memory into one user prompt, sends
 * it with the system prompt through `cfg.callJson`, and validates the reply
 * with `coerceReview`.
 *
 * @param {object} cfg - `callJson({ system, user })` is required. Optional:
 *   `renderState`, `renderTraceSummary` (both default to JSON rendering, with
 *   "(none)" for an undefined trace summary) and `systemPromptAddendum`.
 * @returns {(input: object) => Promise<object>} Reviewer taking
 *   `{ shot, goal, state, verification, traceSummary, memory }`.
 */
function createLlmReviewer(cfg) {
  const renderState = cfg.renderState ?? ((s) => safeJson(s));
  const renderTraceSummary = cfg.renderTraceSummary ?? ((s) => s === void 0 ? "(none)" : safeJson(s));
  let system = REVIEWER_SYSTEM_PROMPT;
  if (cfg.systemPromptAddendum) {
    system = `${REVIEWER_SYSTEM_PROMPT}\n\n${cfg.systemPromptAddendum}`;
  }
  // Prior-shot text fields are clipped so the memory block stays bounded.
  const clip = (text) => text.slice(0, 400);
  const describeShot = (m) => {
    const verification = m.verification;
    const scorePart = typeof verification.score === "number" ? ` score=${verification.score.toFixed(2)}` : "";
    const failing = (verification.failingLayers ?? []).join(",");
    const headline = `shot ${m.shot} \u2014 verification.pass=${verification.pass}` + scorePart + ` confidence=${m.confidence.toFixed(2)} failing=[${failing}]`;
    return [
      headline,
      ` observations: ${clip(m.observations)}`,
      ` diagnosis: ${clip(m.diagnosis)}`,
      ` instruction given: ${clip(m.nextShotInstruction)}`
    ].join("\n");
  };
  const taskLines = [
    `=== YOUR TASK ===`,
    `Return STRICT JSON:`,
    `{`,
    ` "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)`,
    ` "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)`,
    ` "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)`,
    ` "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)`,
    ` "confidence": number in [0,1]`,
    `}`
  ];
  return async (input) => {
    let memoryBlock = "(no prior shots \u2014 this is shot 1)";
    if (input.memory.length !== 0) {
      memoryBlock = input.memory.map((m) => describeShot(m)).join("\n\n");
    }
    const user = [
      `=== GOAL ===`,
      input.goal,
      ``,
      `=== SHOT NUMBER ===`,
      String(input.shot),
      ``,
      `=== CURRENT STATE ===`,
      renderState(input.state),
      ``,
      `=== TRACE SUMMARY ===`,
      renderTraceSummary(input.traceSummary),
      ``,
      `=== VERIFICATION ===`,
      summarizeVerification(input.verification),
      ``,
      `=== REVIEWER MEMORY (prior shots) ===`,
      memoryBlock,
      ``,
      ...taskLines
    ].join("\n");
    const raw = await cfg.callJson({ system, user });
    return coerceReview(raw);
  };
}
3556
/**
 * Validate and normalize a raw reviewer reply.
 *
 * @param {any} raw - Value returned by the reviewer.
 * @returns {object} `{ observations, diagnosis, nextShotInstruction,
 *   shouldContinue, confidence }` with confidence clamped to [0, 1].
 * @throws {Error} When `raw` is not an object, a required text field is
 *   missing, empty or not a string, `shouldContinue` is not a boolean, or
 *   `confidence` does not convert to a finite number.
 */
function coerceReview(raw) {
  const isObject = Boolean(raw) && typeof raw === "object";
  if (!isObject) {
    throw new Error("reviewer returned non-object");
  }
  // Non-string values are treated the same as an empty string.
  const textOf = (key) => (typeof raw[key] === "string" ? raw[key] : "");
  const observations = textOf("observations");
  const diagnosis = textOf("diagnosis");
  const nextShotInstruction = textOf("nextShotInstruction");
  const texts = [observations, diagnosis, nextShotInstruction];
  if (texts.some((text) => text === "")) {
    throw new Error("reviewer missing required string fields");
  }
  const { shouldContinue } = raw;
  if (typeof shouldContinue !== "boolean") {
    throw new Error("reviewer missing shouldContinue boolean");
  }
  const numericConfidence = Number(raw.confidence);
  if (!Number.isFinite(numericConfidence)) {
    throw new Error("reviewer confidence not finite");
  }
  const confidence = Math.max(0, Math.min(1, numericConfidence));
  return { observations, diagnosis, nextShotInstruction, shouldContinue, confidence };
}
3581
/**
 * Render a verification result as text for the reviewer prompt.
 *
 * @param {object} v - Verification result; reads `pass`, `score`,
 *   `failingLayers` and `details`.
 * @returns {string} A header line with pass, score (three decimals, if
 *   numeric) and failing layers (if any), followed on a new line by at most
 *   1500 characters of JSON-rendered `details` when `details` is defined.
 */
function summarizeVerification(v) {
  const parts = [`pass=${v.pass}`];
  if (typeof v.score === "number") {
    parts.push(` score=${v.score.toFixed(3)}`);
  }
  if (v.failingLayers && v.failingLayers.length > 0) {
    parts.push(` failing=[${v.failingLayers.join(", ")}]`);
  }
  const header = parts.join("");
  if (v.details === void 0) {
    return header;
  }
  return header + "\n" + safeJson(v.details).slice(0, 1500);
}
3587
/**
 * Pretty-print a value as JSON with two-space indentation, falling back to
 * `String(x)` when serialization throws.
 *
 * @param {any} x - Value to render.
 * @returns {string} The rendered text.
 */
function safeJson(x) {
  let rendered;
  try {
    rendered = JSON.stringify(x, null, 2);
  } catch {
    rendered = String(x);
  }
  return rendered;
}
3594
+
3595
+ // src/trace/schema.ts
3596
// Version string of the trace schema.
+ var TRACE_SCHEMA_VERSION = "1.0.0";
3597
// List of failure-class labels. runProposeReview in this file assigns
// "timeout", "budget_exceeded" and "unknown" from this list as its
// failureClass.
+ var FAILURE_CLASSES = [
3598
+ "success",
3599
+ "reasoning_error",
3600
+ "tool_selection_error",
3601
+ "tool_argument_error",
3602
+ "tool_recovery_failure",
3603
+ "hallucination",
3604
+ "instruction_following",
3605
+ "safety_refusal_miss",
3606
+ "policy_violation",
3607
+ "budget_exceeded",
3608
+ "format_drift",
3609
+ "permission_escalation",
3610
+ "pii_leak",
3611
+ "cost_overrun",
3612
+ "timeout",
3613
+ "sandbox_failure",
3614
+ "unknown"
3615
+ ];
3616
/**
 * Check whether a span's kind is "llm".
 *
 * @param {{kind: string}} s - Span to test.
 * @returns {boolean} True when `s.kind` is exactly "llm".
 */
function isLlmSpan(s) {
  const { kind } = s;
  return kind === "llm";
}
3619
/**
 * Check whether a span's kind is "tool".
 *
 * @param {{kind: string}} s - Span to test.
 * @returns {boolean} True when `s.kind` is exactly "tool".
 */
function isToolSpan(s) {
  const { kind } = s;
  return kind === "tool";
}
3622
/**
 * Check whether a span's kind is "retrieval".
 *
 * @param {{kind: string}} s - Span to test.
 * @returns {boolean} True when `s.kind` is exactly "retrieval".
 */
function isRetrievalSpan(s) {
  const { kind } = s;
  return kind === "retrieval";
}
3625
/**
 * Check whether a span's kind is "judge".
 *
 * @param {{kind: string}} s - Span to test.
 * @returns {boolean} True when `s.kind` is exactly "judge".
 */
function isJudgeSpan(s) {
  const { kind } = s;
  return kind === "judge";
}
3628
/**
 * Check whether a span's kind is "sandbox".
 *
 * @param {{kind: string}} s - Span to test.
 * @returns {boolean} True when `s.kind` is exactly "sandbox".
 */
function isSandboxSpan(s) {
  const { kind } = s;
  return kind === "sandbox";
}
3631
+
2600
3632
  // src/trace/query.ts
2601
3633
  async function runsForScenario(store, scenarioId) {
2602
3634
  return store.listRuns({ scenarioId });
@@ -2825,181 +3857,6 @@ function runToTraceId(run) {
2825
3857
  return cleaned.slice(0, 32).padEnd(32, "0");
2826
3858
  }
2827
3859
 
2828
- // src/sandbox-harness.ts
2829
- var vitestTestParser = {
2830
- id: "vitest",
2831
- parse(stdout) {
2832
- const m = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
2833
- if (!m) return void 0;
2834
- let passed = 0;
2835
- let failed = 0;
2836
- const a = parseInt(m[1], 10);
2837
- const aLabel = m[2].toLowerCase();
2838
- if (aLabel === "passed") passed += a;
2839
- else failed += a;
2840
- if (m[3] && m[4]) {
2841
- const b = parseInt(m[3], 10);
2842
- if (m[4].toLowerCase() === "passed") passed += b;
2843
- else failed += b;
2844
- }
2845
- return { testsTotal: passed + failed, testsPassed: passed };
2846
- }
2847
- };
2848
- var pytestTestParser = {
2849
- id: "pytest",
2850
- parse(stdout) {
2851
- const total = stdout.match(/collected\s+(\d+)\s+items?/i);
2852
- const passed = stdout.match(/(\d+)\s+passed/);
2853
- if (!total || !passed) return void 0;
2854
- return { testsTotal: parseInt(total[1], 10), testsPassed: parseInt(passed[1], 10) };
2855
- }
2856
- };
2857
- var jestTestParser = {
2858
- id: "jest",
2859
- parse(stdout) {
2860
- const m = stdout.match(/Tests:\s+(?:(\d+)\s+failed[^,]*,\s*)?(\d+)\s+passed,\s+(\d+)\s+total/i);
2861
- if (!m) return void 0;
2862
- return { testsTotal: parseInt(m[3], 10), testsPassed: parseInt(m[2], 10) };
2863
- }
2864
- };
2865
- function composeParsers(...parsers) {
2866
- return {
2867
- id: parsers.map((p) => p.id).join("|"),
2868
- parse(stdout, stderr, exitCode) {
2869
- for (const p of parsers) {
2870
- const res = p.parse(stdout, stderr, exitCode);
2871
- if (res) return res;
2872
- }
2873
- return void 0;
2874
- }
2875
- };
2876
- }
2877
- var SubprocessSandboxDriver = class {
2878
- id = "subprocess";
2879
- async exec(phase, command, config) {
2880
- const { spawn } = await import("child_process");
2881
- const start = Date.now();
2882
- return await new Promise((resolve) => {
2883
- const child = spawn(command, {
2884
- shell: true,
2885
- cwd: config.cwd,
2886
- env: { ...process.env, ...config.env ?? {} }
2887
- });
2888
- let stdout = "";
2889
- let stderr = "";
2890
- child.stdout?.on("data", (d) => {
2891
- stdout += String(d);
2892
- });
2893
- child.stderr?.on("data", (d) => {
2894
- stderr += String(d);
2895
- });
2896
- const timeout = setTimeout(() => {
2897
- try {
2898
- child.kill("SIGKILL");
2899
- } catch {
2900
- }
2901
- }, config.timeoutMs ?? 10 * 6e4);
2902
- child.on("close", (code) => {
2903
- clearTimeout(timeout);
2904
- const wallMs = Date.now() - start;
2905
- const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
2906
- resolve({
2907
- phase,
2908
- exitCode: code ?? 1,
2909
- stdout,
2910
- stderr,
2911
- wallMs,
2912
- testsTotal: parsed?.testsTotal,
2913
- testsPassed: parsed?.testsPassed
2914
- });
2915
- });
2916
- child.on("error", (err) => {
2917
- clearTimeout(timeout);
2918
- const wallMs = Date.now() - start;
2919
- resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
2920
- });
2921
- });
2922
- }
2923
- };
2924
- var DockerSandboxDriver = class {
2925
- id = "docker";
2926
- async exec(phase, command, config) {
2927
- if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
2928
- const sub = new SubprocessSandboxDriver();
2929
- const envArgs = Object.entries(config.env ?? {}).map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`).join(" ");
2930
- const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
2931
- return sub.exec(phase, wrapped, { ...config, env: void 0 });
2932
- }
2933
- };
2934
- function shellQuote(v) {
2935
- if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(v)) return v;
2936
- return `'${v.replace(/'/g, `'\\''`)}'`;
2937
- }
2938
- var SandboxHarness = class {
2939
- driver;
2940
- constructor(driver = new SubprocessSandboxDriver()) {
2941
- this.driver = driver;
2942
- }
2943
- async run(config, emitter) {
2944
- const handle = await emitter.sandbox({
2945
- name: `sandbox(${this.driver.id})`,
2946
- image: config.image,
2947
- command: [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ")
2948
- });
2949
- const result = { passed: false, totalWallMs: 0, score: 0 };
2950
- try {
2951
- if (config.setupCommand) {
2952
- result.setup = await this.driver.exec("setup", config.setupCommand, config);
2953
- result.totalWallMs += result.setup.wallMs;
2954
- if (result.setup.exitCode !== 0) {
2955
- await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
2956
- exitCode: result.setup.exitCode,
2957
- wallMs: result.totalWallMs
2958
- });
2959
- return result;
2960
- }
2961
- }
2962
- if (config.runCommand) {
2963
- result.run = await this.driver.exec("run", config.runCommand, config);
2964
- result.totalWallMs += result.run.wallMs;
2965
- if (result.run.exitCode !== 0) {
2966
- await handle.fail(`run failed (exit ${result.run.exitCode})`, {
2967
- exitCode: result.run.exitCode,
2968
- wallMs: result.totalWallMs
2969
- });
2970
- return result;
2971
- }
2972
- }
2973
- if (config.testCommand) {
2974
- result.test = await this.driver.exec("test", config.testCommand, config);
2975
- result.totalWallMs += result.test.wallMs;
2976
- const passed = result.test.exitCode === 0;
2977
- result.passed = passed;
2978
- if (result.test.testsTotal !== void 0 && result.test.testsTotal > 0) {
2979
- result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal;
2980
- } else {
2981
- result.score = passed ? 1 : 0;
2982
- }
2983
- await handle.end({
2984
- exitCode: result.test.exitCode,
2985
- testsTotal: result.test.testsTotal,
2986
- testsPassed: result.test.testsPassed,
2987
- wallMs: result.totalWallMs,
2988
- status: passed ? "ok" : "error"
2989
- });
2990
- } else {
2991
- result.passed = true;
2992
- result.score = 1;
2993
- await handle.end({ wallMs: result.totalWallMs });
2994
- }
2995
- } catch (err) {
2996
- await handle.fail(err instanceof Error ? err : String(err));
2997
- throw err;
2998
- }
2999
- return result;
3000
- }
3001
- };
3002
-
3003
3860
  // src/test-graded-scenario.ts
3004
3861
  async function runTestGradedScenario(scenario, store, options = {}) {
3005
3862
  const emitter = new TraceEmitter(store);
@@ -3619,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
3619
4476
  if (s.baseline.length < 2 || s.candidate.length < 2) {
3620
4477
  throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
3621
4478
  }
3622
- const bMean = mean(s.baseline);
3623
- const cMean = mean(s.candidate);
4479
+ const bMean = mean2(s.baseline);
4480
+ const cMean = mean2(s.candidate);
3624
4481
  const delta = cMean - bMean;
3625
4482
  const d = cohensD(s.baseline, s.candidate);
3626
4483
  const { t, df, p } = welchsTTest(s.baseline, s.candidate);
@@ -3659,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
3659
4516
  hasUnstable: metrics.some((m) => m.verdict === "unstable")
3660
4517
  };
3661
4518
  }
3662
- function mean(xs) {
4519
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Numbers to average.
 * @returns {number} Sum divided by count; NaN for an empty list (0 / 0).
 */
function mean2(xs) {
  let total = 0;
  xs.forEach((x) => {
    total = total + x;
  });
  return total / xs.length;
}
3665
4522
  function iqr(xs) {
@@ -3675,8 +4532,8 @@ function iqr(xs) {
3675
4532
  }
3676
4533
  function welchsTTest(a, b) {
3677
4534
  if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
3678
- const mA = mean(a);
3679
- const mB = mean(b);
4535
+ const mA = mean2(a);
4536
+ const mB = mean2(b);
3680
4537
  const vA = variance(a, mA);
3681
4538
  const vB = variance(b, mB);
3682
4539
  const seSquared = vA / a.length + vB / b.length;
@@ -4032,41 +4889,6 @@ function assertNonNegative(n, name) {
4032
4889
  }
4033
4890
  }
4034
4891
 
4035
- // src/pareto.ts
4036
- function dominates(a, b, objectives) {
4037
- let strictlyBetter = false;
4038
- for (const obj of objectives) {
4039
- const av = obj.value(a);
4040
- const bv = obj.value(b);
4041
- if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
4042
- const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
4043
- const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
4044
- if (aIsWorse) return false;
4045
- if (aIsBetter) strictlyBetter = true;
4046
- }
4047
- return strictlyBetter;
4048
- }
4049
- function paretoFrontier(candidates, objectives) {
4050
- if (objectives.length === 0) {
4051
- throw new Error("paretoFrontier: at least 1 objective required");
4052
- }
4053
- const valid = candidates.filter(
4054
- (c) => objectives.every((o) => Number.isFinite(o.value(c)))
4055
- );
4056
- const frontier = [];
4057
- const dominated = [];
4058
- for (const c of valid) {
4059
- const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
4060
- if (isDominated) dominated.push(c);
4061
- else frontier.push(c);
4062
- }
4063
- const dominanceMap = frontier.map((d) => ({
4064
- dominator: d,
4065
- dominated: dominated.filter((x) => dominates(d, x, objectives))
4066
- }));
4067
- return { frontier, dominated, dominanceMap };
4068
- }
4069
-
4070
4892
  // src/series-convergence.ts
4071
4893
  function analyzeSeries(values, options = {}) {
4072
4894
  const window = options.window ?? 5;
@@ -4076,10 +4898,10 @@ function analyzeSeries(values, options = {}) {
4076
4898
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
4077
4899
  }
4078
4900
  const tail = values.slice(-window);
4079
- const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
4080
- const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
4901
+ const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
4902
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
4081
4903
  const stdDev = Math.sqrt(variance2);
4082
- const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
4904
+ const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
4083
4905
  const cv = stdDev / refMean;
4084
4906
  const stable = tail.length >= window && cv <= stableCv;
4085
4907
  let tailRun = 0;
@@ -4100,7 +4922,7 @@ function analyzeSeries(values, options = {}) {
4100
4922
  } else {
4101
4923
  state = "noisy";
4102
4924
  }
4103
- return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
4925
+ return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
4104
4926
  }
4105
4927
 
4106
4928
  // src/state-continuity.ts
@@ -5028,12 +5850,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
5028
5850
  variantScores.push({ mutator: id, score, mutated });
5029
5851
  all.push(score);
5030
5852
  }
5031
- const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
5032
- const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
5853
+ const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
5854
+ const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
5033
5855
  const stdDev = Math.sqrt(variance2);
5034
- const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
5856
+ const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
5035
5857
  const robustness = Math.max(0, 1 - stdDev / ref);
5036
- return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
5858
+ return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
5037
5859
  }
5038
5860
  var lowercaseMutator = (p) => p.toLowerCase();
5039
5861
  var sentenceReorderMutator = (p, seed) => {
@@ -5284,8 +6106,11 @@ async function scoreProject(store, projectId) {
5284
6106
  const runtimeScore = runtimeScores.length > 0 ? runtimeScores.reduce((a, b) => a + b, 0) / runtimeScores.length : null;
5285
6107
  const runtimePassed = runtime.filter((r) => r.outcome?.pass === true).length;
5286
6108
  const runtimePassRate = runtime.length > 0 ? runtimePassed / runtime.length : null;
6109
+ const kind = runtime.length === 0 ? "scaffold-only" : "full";
6110
+ const complete = kind === "scaffold-only" ? metaScore !== null && buildScore !== null : metaScore !== null && buildScore !== null && runtimeScore !== null;
5287
6111
  return {
5288
6112
  projectId,
6113
+ kind,
5289
6114
  builderRunId: builder?.runId,
5290
6115
  metaScore,
5291
6116
  buildRunId: build?.runId,
@@ -5293,7 +6118,7 @@ async function scoreProject(store, projectId) {
5293
6118
  appRuntimeRunIds: runtime.map((r) => r.runId),
5294
6119
  runtimeScore,
5295
6120
  runtimePassRate,
5296
- complete: metaScore !== null && buildScore !== null && runtimeScore !== null
6121
+ complete
5297
6122
  };
5298
6123
  }
5299
6124
  async function scoreAllProjects(store) {
@@ -5715,8 +6540,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
5715
6540
  function toBin(chunk, lower, upper) {
5716
6541
  const xs = chunk.map((c) => c.x);
5717
6542
  const ys = chunk.map((c) => c.y);
5718
- const evalMean = mean2(xs);
5719
- const outcomeMean = mean2(ys);
6543
+ const evalMean = mean3(xs);
6544
+ const outcomeMean = mean3(ys);
5720
6545
  return {
5721
6546
  lower: lower ?? Math.min(...xs),
5722
6547
  upper: upper ?? Math.max(...xs),
@@ -5726,7 +6551,7 @@ function toBin(chunk, lower, upper) {
5726
6551
  gap: Math.abs(outcomeMean - evalMean)
5727
6552
  };
5728
6553
  }
5729
- function mean2(xs) {
6554
+ function mean3(xs) {
5730
6555
  return xs.reduce((a, b) => a + b, 0) / xs.length;
5731
6556
  }
5732
6557
  function defaultExtract4(metric) {
@@ -5951,8 +6776,8 @@ async function prmBestOfN(store, grader, runIds) {
5951
6776
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
5952
6777
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
5953
6778
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
5954
- const mean3 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
5955
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / graded.length;
6779
+ const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6780
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
5956
6781
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
5957
6782
  }
5958
6783
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -5974,8 +6799,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
5974
6799
  const ranked = [...byRun.values()].sort(
5975
6800
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
5976
6801
  );
5977
- const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
5978
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
6802
+ const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
6803
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
5979
6804
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
5980
6805
  }
5981
6806
 
@@ -6505,8 +7330,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
6505
7330
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
6506
7331
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
6507
7332
  if (scores.length < 3) continue;
6508
- const mean3 = scores.reduce((a, b) => a + b, 0) / scores.length;
6509
- const variance2 = scores.reduce((a, b) => a + (b - mean3) ** 2, 0) / scores.length;
7333
+ const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7334
+ const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
6510
7335
  if (variance2 > varianceThreshold) {
6511
7336
  targets.push({
6512
7337
  reason: "high-variance",
@@ -6987,6 +7812,7 @@ async function euAiActReport(ctx, signals) {
6987
7812
  }
6988
7813
  export {
6989
7814
  AgentDriver,
7815
+ AxGepaSteeringOptimizer,
6990
7816
  BenchmarkRunner,
6991
7817
  BudgetBreachError,
6992
7818
  BudgetGuard,
@@ -6995,9 +7821,11 @@ export {
6995
7821
  CostTracker,
6996
7822
  DEFAULT_AGENT_SLOS,
6997
7823
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
7824
+ DEFAULT_HARNESS_OBJECTIVES,
6998
7825
  DEFAULT_MUTATORS,
6999
7826
  DEFAULT_REDACTION_RULES,
7000
7827
  DEFAULT_RED_TEAM_CORPUS,
7828
+ DEFAULT_RUN_SCORE_WEIGHTS,
7001
7829
  Dataset,
7002
7830
  DockerSandboxDriver,
7003
7831
  DualAgentBench,
@@ -7011,15 +7839,19 @@ export {
7011
7839
  InMemoryOutcomeStore,
7012
7840
  InMemoryTraceStore,
7013
7841
  InMemoryWorkspaceInspector,
7842
+ JudgeRunner,
7014
7843
  MODEL_PRICING,
7015
7844
  MetricsCollector,
7016
7845
  OTEL_AGENT_EVAL_SCOPE,
7846
+ OptimizationLoop,
7847
+ PairwiseSteeringOptimizer,
7017
7848
  PrmGrader,
7018
7849
  ProductClient,
7019
7850
  ProjectRegistry,
7020
7851
  PromptOptimizer,
7021
7852
  PromptRegistry,
7022
7853
  REDACTION_VERSION,
7854
+ RunCritic,
7023
7855
  SandboxHarness,
7024
7856
  ScenarioRegistry,
7025
7857
  SubprocessSandboxDriver,
@@ -7028,6 +7860,7 @@ export {
7028
7860
  TraceEmitter,
7029
7861
  adversarialJudge,
7030
7862
  aggregateLlm,
7863
+ aggregateRunScore,
7031
7864
  analyzeAntiSlop,
7032
7865
  analyzeSeries,
7033
7866
  argHash,
@@ -7044,6 +7877,7 @@ export {
7044
7877
  causalAttribution,
7045
7878
  checkCanaries,
7046
7879
  checkSlos,
7880
+ clamp01,
7047
7881
  classifyEuAiRisk,
7048
7882
  classifyFailure,
7049
7883
  codeExecutionJudge,
@@ -7052,6 +7886,7 @@ export {
7052
7886
  collectionPreserved,
7053
7887
  commitBisect,
7054
7888
  compareToBaseline,
7889
+ compilerJudge,
7055
7890
  composeParsers,
7056
7891
  composeValidators,
7057
7892
  computeToolUseMetrics,
@@ -7062,8 +7897,10 @@ export {
7062
7897
  createAntiSlopJudge,
7063
7898
  createCustomJudge,
7064
7899
  createDomainExpertJudge,
7900
+ createLlmReviewer,
7065
7901
  crossTraceDiff,
7066
7902
  defaultJudges,
7903
+ distillPlaybook,
7067
7904
  dominates,
7068
7905
  estimateCost,
7069
7906
  estimateTokens,
@@ -7085,6 +7922,7 @@ export {
7085
7922
  groupBy,
7086
7923
  hashContent,
7087
7924
  hashScenarios,
7925
+ inMemoryReviewStore,
7088
7926
  interRaterReliability,
7089
7927
  iqr,
7090
7928
  isJudgeSpan,
@@ -7096,14 +7934,17 @@ export {
7096
7934
  jestTestParser,
7097
7935
  jsonHasKeys,
7098
7936
  jsonShape,
7937
+ jsonlReviewStore,
7099
7938
  judgeAgreementView,
7100
7939
  judgeSpans,
7101
7940
  keyPreserved,
7941
+ linterJudge,
7102
7942
  llmSpanFromProvider,
7103
7943
  llmSpans,
7104
7944
  loadScorerFromGrader,
7105
7945
  lowercaseMutator,
7106
7946
  mannWhitneyU,
7947
+ mergeSteeringBundle,
7107
7948
  nistAiRmfReport,
7108
7949
  nonRefusalRubric,
7109
7950
  normalizeScores,
@@ -7131,6 +7972,8 @@ export {
7131
7972
  regressionView,
7132
7973
  renderMarkdown,
7133
7974
  renderMarkdownReport,
7975
+ renderPlaybookMarkdown,
7976
+ renderSteeringText,
7134
7977
  replayScorerOverCorpus,
7135
7978
  replayTraceThroughJudge,
7136
7979
  requiredSampleSize,
@@ -7142,6 +7985,9 @@ export {
7142
7985
  runE2EWorkflow,
7143
7986
  runExpectations,
7144
7987
  runFailureClass,
7988
+ runHarnessExperiment,
7989
+ runJudgeFleet,
7990
+ runProposeReview,
7145
7991
  runSelfPlay,
7146
7992
  runTestGradedScenario,
7147
7993
  runsForScenario,
@@ -7149,6 +7995,8 @@ export {
7149
7995
  scoreContinuity,
7150
7996
  scoreProject,
7151
7997
  scoreRedTeamOutput,
7998
+ securityJudge,
7999
+ selectHarnessVariant,
7152
8000
  selfPreference,
7153
8001
  sentenceReorderMutator,
7154
8002
  signManifest,
@@ -7156,6 +8004,8 @@ export {
7156
8004
  statusAdvanced,
7157
8005
  stuckLoopView,
7158
8006
  summarize,
8007
+ summarizeHarnessResults,
8008
+ testJudge,
7159
8009
  textInSnapshot,
7160
8010
  toLangfuseEnvelope,
7161
8011
  toNdjson,