npm - @tangle-network/agent-eval - Versions diffs - 0.6.0 → 0.7.1 - Mend

@tangle-network/agent-eval 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.js CHANGED

@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
   const n = scores.length;
-  const mean3 = scores.reduce((a, b) => a + b, 0) / n;
+  const mean4 = scores.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean3,
+    mean: mean4,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
+  const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
-  const t = mean3 / se;
+  if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
+  const t = mean4 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean3 = n * (n + 1) / 4;
+  const mean4 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean3) / Math.sqrt(variance2);
+  const z = (wPlus - mean4) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -2094,113 +2094,500 @@ function flatSamples(score) {
   return out;
 }
-// src/dual-agent-bench.ts
-var DualAgentBench = class {
-  async run(config) {
-    const maxRounds = config.maxRounds ?? 5;
-    const threshold = config.convergenceThreshold ?? 0.85;
-    if (config.scenarios.length === 0) {
-      throw new Error("DualAgentBench requires at least 1 scenario");
+// src/steering.ts
+function mergeSteeringBundle(base, delta) { // Builds a new bundle from `base` overlaid with `delta`; only spreads are used, so neither argument is reassigned here.
+  return {
+    ...base,
+    ...delta.coderPrompt !== void 0 ? { coderPrompt: delta.coderPrompt } : {}, // override only when delta explicitly defines it
+    ...delta.continuePrompt !== void 0 ? { continuePrompt: delta.continuePrompt } : {},
+    reviewerPrompts: { // key-wise shallow merge; delta keys win on conflict
+      ...base.reviewerPrompts ?? {},
+      ...delta.reviewerPrompts ?? {}
+    },
+    skills: delta.skills ?? base.skills, // replaced wholesale when delta supplies skills, not concatenated
+    rolePrompts: { // same key-wise merge as reviewerPrompts
+      ...base.rolePrompts ?? {},
+      ...delta.rolePrompts ?? {}
+    },
+    metadata: { // same key-wise merge as reviewerPrompts
+      ...base.metadata ?? {},
+      ...delta.metadata ?? {}
     }
-    const results = [];
-    for (const scenario of config.scenarios) {
-      const history = [];
-      let converged = false;
-      let roundsToConverge = null;
-      let finalProposal = "";
-      let lastScore = 0;
-      let priorCritique;
-      for (let r = 0; r < maxRounds; r++) {
-        const priorProposal = history[history.length - 1]?.proposal;
-        const proposal = await config.propose({
-          scenario,
-          roundIndex: r,
-          priorProposal,
-          priorCritique
-        });
-        const { critique, convergenceScore } = await config.critique({
-          scenario,
-          roundIndex: r,
-          proposal
-        });
-        if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
-          throw new Error(
-            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
-          );
-        }
-        const round = {
-          roundIndex: r,
-          proposal,
-          critique,
-          convergenceScore
-        };
-        history.push(round);
-        config.onRoundComplete?.({ scenarioId: scenario.id, round });
-        finalProposal = proposal;
-        lastScore = convergenceScore;
-        priorCritique = critique;
-        if (convergenceScore >= threshold) {
-          converged = true;
-          roundsToConverge = r + 1;
-          break;
-        }
+  };
+}
+function renderSteeringText(bundle) { // Serializes a steering bundle to newline-joined `key:value` lines; returns the joined string.
+  const lines = [`bundle:${bundle.id}`];
+  if (bundle.coderPrompt) lines.push(`coder:${bundle.coderPrompt}`); // empty/undefined prompts are omitted
+  if (bundle.continuePrompt) lines.push(`continue:${bundle.continuePrompt}`);
+  const reviewers = Object.entries(bundle.reviewerPrompts ?? {}).sort(([a], [b]) => a.localeCompare(b)); // sorted by reviewer name so output does not depend on key order
+  for (const [name, prompt] of reviewers) lines.push(`reviewer:${name}:${prompt}`);
+  const skills = [...bundle.skills ?? []].sort(); // sorts a copy; bundle.skills itself is left untouched
+  if (skills.length) lines.push(`skills:${skills.join(",")}`);
+  return lines.join("\n"); // note: rolePrompts and metadata are not part of the rendered text
+}
+// src/run-score.ts
+var DEFAULT_RUN_SCORE_WEIGHTS = { // Default per-dimension weights used by aggregateRunScore; negative weights act as penalties.
+  success: 4,
+  goalProgress: 2,
+  repoGroundedness: 1.5,
+  driftPenalty: -1.5,
+  toolUseQuality: 1,
+  patchQuality: 1.25,
+  testReality: 1.5,
+  finalGate: 3,
+  reviewerBlockers: -2,
+  costUsd: -0.2, // applied to the raw non-negative USD amount (not clamped to 1)
+  wallSeconds: -0.1 // applied to wall time in minutes (seconds / 60), despite the key name
+};
+function aggregateRunScore(score, weights = {}) { // Returns the weighted sum of a run score's dimensions as a single number.
+  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights }; // caller-supplied weights override defaults per key
+  return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60); // NOTE(review): cost and wall terms bypass clamp01, so a NaN/undefined costUsd or wallSeconds makes the whole sum NaN — confirm inputs are always finite
+}
+function clamp01(value) { // Clamps a number into [0, 1].
+  if (!Number.isFinite(value)) return 0; // NaN, ±Infinity and non-number inputs all map to 0
+  return Math.max(0, Math.min(1, value));
+}
+// src/run-critic.ts
+var DEFAULT_DRIFT_PATTERNS = [ // Regexes RunCritic uses to flag "drift" text when options.driftPatterns is not provided.
+  /https?:\/\//i,
+  /\btitle:\s/i,
+  /\bsummary:\s/i,
+  /\burl:\s/i,
+  /\bnpm package usage\b/i,
+  /\bnews\b/i
+];
+var RunCritic = class { // Derives a multi-dimensional run score from a recorded trace (run, spans, events, artifacts, budget).
+  weights;
+  driftPatterns;
+  constructor(options = {}) {
+    this.weights = options.weights; // may be undefined; aggregateRunScore then uses its defaults
+    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
+  }
+  async score(store, runId) { // Loads the run's trace from `store` and scores it; throws if the run is missing.
+    const run = await store.getRun(runId);
+    if (!run) throw new Error(`run ${runId} not found`);
+    const [spans, events, artifacts, budget] = await Promise.all([ // the four lookups are issued concurrently
+      store.spans({ runId }),
+      store.events({ runId }),
+      store.artifacts(runId),
+      store.budget(runId)
+    ]);
+    return this.scoreTrace({ run, spans, events, artifacts, budget });
+  }
+  scoreTrace(trace) { // Synchronous scoring of an already-loaded trace; returns the score fields plus human-readable notes.
+    const notes = [];
+    const llmSpans2 = trace.spans.filter((s) => s.kind === "llm");
+    const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
+    const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
+    const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
+    const finalGateSpans = judgeSpans2.filter( // judge spans marked as the final gate by dimension or attribute
+      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
+    );
+    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0; // 1 = passed, 0.5 = completed without pass, 0 = otherwise
+    if (!success) notes.push("run did not complete with pass=true"); // only fires for success === 0; the 0.5 case adds no note
+    const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0; // mean normalized judge score, or undefined with no judges
+    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0; // values above 1 are treated as percentages
+    const goalProgress = outcomeScore ?? judgeAverage ?? success; // preference order: run outcome, judge average, success
+    const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
+    const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length; // fraction of tool spans not in error; 0 when none recorded
+    if (toolSpans2.length === 0) notes.push("no tool spans recorded");
+    const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length; // artifacts plus tool calls whose name suggests an edit
+    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0; // saturates at 4 pieces of evidence
+    if (!patchQuality) notes.push("no artifact or edit evidence recorded");
+    const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
+    const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0; // mean pass ratio over sandbox test runs; else a flat 0.4 if any tool args mention a test/build command; else 0
+    if (!testReality) notes.push("no real test/build evidence recorded");
+    const blockerSpans = judgeSpans2.filter(
+      (span) => isBlockingJudge(span)
+    );
+    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
+    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success; // any blocking final-gate judge => 0, none => 1; without final-gate spans fall back to success
+    if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
+    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
+    const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0; // share of judge spans that are blocking
+    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
+    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
+    const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length; // drift is checked on LLM outputs and on stringified event payloads
+    const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals); // grounded share of all signals
+    const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals); // complementary drift share; both are 0 when there are no signals
+    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
+    const costUsd = trace.budget.length ? Math.max(...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed), 0) : llmSpans2.reduce((sum, span) => sum + (span.costUsd ?? 0), 0); // with any budget entries: max "usd" consumed (0 if none are usd); otherwise sum of LLM span costs
+    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0; // presumably millisecond timestamps, hence / 1e3 — TODO confirm
+    return {
+      success,
+      goalProgress,
+      repoGroundedness,
+      driftPenalty,
+      toolUseQuality,
+      patchQuality,
+      testReality,
+      finalGate,
+      reviewerBlockers,
+      costUsd,
+      wallSeconds,
+      notes
+    };
+  }
+  rank(score) { // Collapses a score into one number using this critic's weights.
+    return aggregateRunScore(score, this.weights);
+  }
+  isDrift(text) { // True when any configured drift pattern matches the text.
+    return this.driftPatterns.some((pattern) => pattern.test(text));
+  }
+};
+function normalizeJudgeScore(score) { // Maps a judge score into [0, 1]; a non-finite/undefined score yields 0 via clamp01.
+  return score > 1 ? clamp01(score / 10) : clamp01(score); // values above 1 are divided by 10 (presumably a 0–10 scale — confirm)
+}
+function looksRepoGrounded(text) { // Heuristic: true when text mentions repo paths, TS files, package/tsconfig files, or common git/package-manager/test commands.
+  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
+}
+function isBlockingJudge(span) { // A judge span is blocking if any attribute flags it, or if its raw score is <= 2.
+  return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2; // NOTE(review): the raw-score test is applied before any normalization, so a span scored on a 0–1 scale (which normalizeJudgeScore accepts) always satisfies it — confirm the intended scale
+}
+function positiveNumber(value) { // True only for values of type number that are greater than 0 (numeric strings are rejected).
+  return typeof value === "number" && value > 0;
+}
+// src/playbook.ts
+function distillPlaybook(entries, options = {}) { // Deduplicates playbook entries by normalized instruction and keeps the top-weighted ones; returns { entries }.
+  const maxEntries = options.maxEntries ?? 12;
+  const byInstruction = /* @__PURE__ */ new Map();
+  for (const entry of entries) {
+    const key = normalizeInstruction(entry.instruction); // case- and whitespace-insensitive dedupe key
+    const existing = byInstruction.get(key);
+    if (!existing || (entry.weight ?? 0) > (existing.weight ?? 0)) { // strictly greater weight wins, so ties keep the first entry seen; missing weight counts as 0
+      byInstruction.set(key, { ...entry, instruction: canonicalInstruction(entry.instruction) });
+    }
+  }
+  const distilled = [...byInstruction.values()].sort((a, b) => (b.weight ?? 0) - (a.weight ?? 0)).slice(0, maxEntries); // descending by weight, then truncated
+  return { entries: distilled };
+}
+function renderPlaybookMarkdown(playbook) { // Renders playbook.entries as a Markdown bullet list under a "# Playbook" heading.
+  const lines = ["# Playbook", ""];
+  for (const entry of playbook.entries) {
+    lines.push(`- ${entry.instruction}`);
+    lines.push(`  Rationale: ${entry.rationale}`); // emitted unconditionally, unlike the optional fields below
+    if (entry.category) lines.push(`  Category: ${entry.category}`);
+    if (entry.evidence) lines.push(`  Evidence: ${entry.evidence}`);
+    if (entry.sourceRunId) lines.push(`  Source run: ${entry.sourceRunId}`);
+    lines.push(""); // blank line between entries
+  }
+  return lines.join("\n").trim() + "\n"; // output always ends with exactly one newline
+}
+function normalizeInstruction(value) { // Dedupe key: trimmed, lower-cased, with whitespace runs collapsed to single spaces.
+  return value.trim().toLowerCase().replace(/\s+/g, " ");
+}
+function canonicalInstruction(value) { // Display form: trimmed, whitespace collapsed, first character upper-cased.
+  const normalized = value.trim().replace(/\s+/g, " ");
+  return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1); // empty string is returned as-is
+}
+// src/optimization-loop.ts
+var OptimizationLoop = class { // Adapts steering-bundle variants to the injected optimizer's variant/scenario interface.
+  optimizer;
+  constructor(optimizer = new PromptOptimizer()) { // PromptOptimizer is defined outside this view
+    this.optimizer = optimizer;
+  }
+  async run(config) { // Scores every variant on every example via config.evaluate and reports the optimizer's winner.
+    const byId = new Map(config.variants.map((variant) => [variant.id, variant])); // lookup from variant id back to the full bundle
+    const result = await this.optimizer.run({
+      variants: config.variants.map((variant) => ({
+        id: variant.id,
+        prompt: renderSteeringText(variant), // the bundle's text rendering stands in as the "prompt"
+        metadata: { bundle: variant }
+      })),
+      scenarioIds: config.examples.map((example) => example.scenarioId),
+      trialsPerScenario: config.trialsPerScenario,
+      scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
+        const bundle = byId.get(variant.id);
+        if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
+        const example = config.examples.find((item) => item.scenarioId === scenarioId); // first example with a matching scenarioId
+        if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
+        const score = await config.evaluate({ variant: bundle, example, trialIndex });
+        return aggregateRunScore(score, config.scoreWeights); // the optimizer only sees the scalar aggregate
       }
-      results.push({
-        scenarioId: scenario.id,
-        converged,
-        roundsToConverge,
-        finalProposal,
-        history,
-        finalScore: lastScore
-      });
+    });
+    return {
+      winner: byId.get(result.winner.variantId), // mapped back to the original bundle object
+      significant: result.winner.significant,
+      reports: result.scores.map((score) => ({
+        variantId: score.variantId,
+        bundle: byId.get(score.variantId),
+        mean: score.mean,
+        ci95: score.ci95,
+        scenarioScores: score.perScenario
+      })),
+      pairwise: result.pairwise
+    };
+  }
+};
+// src/steering-optimizer.ts
+var PairwiseSteeringOptimizer = class { // Recommends the variant with the highest mean aggregate score.
+  optimize(rows, config = {}) { // Throws when `rows` produces no rankings (i.e. it is empty).
+    const ranked = rankRows(rows, config.weights); // sorted descending by mean aggregate
+    if (!ranked.length) throw new Error("no steering optimization rows");
+    return {
+      backend: "pairwise",
+      recommendedVariantId: ranked[0].variantId,
+      rationale: `Highest observed mean aggregate across ${rows.length} scored run(s).`,
+      rankings: ranked
+    };
+  }
+};
+var AxGepaSteeringOptimizer = class { // Trains an AxGEPA variant selector when enough data exists; otherwise returns the pairwise result marked as skipped.
+  constructor(config) {
+    this.config = config;
+  }
+  config;
+  async optimize(rows) {
+    const fallback = new PairwiseSteeringOptimizer().optimize(rows, this.config); // computed first, so empty `rows` throws here
+    const minRows = this.config.minRows ?? 6;
+    const variantIds = [...new Set(rows.map((row) => row.variantId))]; // distinct variant ids in first-seen order
+    const byScenario = collapseScenarioWinners(rows, this.config.weights); // one training example per scenario
+    if (variantIds.length < 2 || byScenario.length < minRows) { // not enough variety or data to train a selector
+      return {
+        ...fallback,
+        backend: "ax-gepa",
+        skipped: true,
+        rationale: `AxGEPA skipped: need >=2 variants and >=${minRows} scenario winners, got ${variantIds.length} variant(s) and ${byScenario.length} scenario winner(s).`
+      };
     }
-    const convergedResults = results.filter((r) => r.converged);
-    const convergenceRate = results.length ? convergedResults.length / results.length : 0;
-    const avgRoundsToConverge = convergedResults.length ? convergedResults.reduce((acc, r) => acc + (r.roundsToConverge ?? 0), 0) / convergedResults.length : null;
-    const avgFinalScore = results.length ? results.reduce((acc, r) => acc + r.finalScore, 0) / results.length : 0;
+    let axLib;
+    try {
+      axLib = await import("@ax-llm/ax"); // loaded lazily, only when this backend actually runs
+    } catch { // any import failure is reported as "unavailable" rather than rethrown
+      return {
+        ...fallback,
+        backend: "ax-gepa",
+        skipped: true,
+        rationale: "AxGEPA unavailable: install @ax-llm/ax to enable selector optimization."
+      };
+    }
+    const { ai, ax, AxGEPA } = axLib;
+    const signature = `task:string, split:string, seedPreview:string -> variantId:class "${variantIds.join(", ")}", rationale:string`; // output class labels are the observed variant ids
+    const selector = ax(signature, {
+      description: "Choose the best steering bundle variant for an autopilot task."
+    });
+    const splitIndex = Math.max(1, Math.floor(byScenario.length * 0.8)); // 80/20 train/validation split with at least one training example
+    const train = byScenario.slice(0, splitIndex);
+    const validation = byScenario.slice(splitIndex);
+    if (!validation.length) {
+      return {
+        ...fallback,
+        backend: "ax-gepa",
+        skipped: true,
+        rationale: "AxGEPA skipped: no validation examples after split."
+      };
+    }
+    const optimizer = new AxGEPA({
+      studentAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.model),
+      teacherAI: createAxService(ai, this.config.provider, this.config.apiKey, this.config.teacherModel ?? this.config.model), // teacher falls back to the student model
+      numTrials: 8,
+      minibatch: true,
+      minibatchSize: 4,
+      earlyStoppingTrials: 3,
+      sampleCount: 1
+    });
+    const compiled = await optimizer.compile(
+      selector,
+      train,
+      (({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0), // metric: exact match on the winning variant id
+      {
+        validationExamples: validation,
+        maxMetricCalls: 64
+      }
+    );
+    selector.applyOptimization(compiled.optimizedProgram);
     return {
-      scenarios: results,
-      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
-      config: { maxRounds, convergenceThreshold: threshold }
+      ...fallback, // recommendedVariantId and rankings still come from the pairwise result
+      backend: "ax-gepa",
+      rationale: `AxGEPA trained a variant selector from ${byScenario.length} scored scenario winner(s); default winner remains ${fallback.recommendedVariantId}.`,
+      selector: { // describes the trained selector; the selector object itself is not returned
+        backend: "ax-gepa",
+        signature,
+        labels: variantIds,
+        rationale: compiled.bestScore !== void 0 ? `bestScore=${compiled.bestScore}` : void 0
+      }
     };
   }
 };
+function rankRows(rows, weights) { // Groups rows by variantId and returns [{ variantId, mean, runs }] sorted by mean aggregate, highest first.
+  const buckets = /* @__PURE__ */ new Map();
+  for (const row of rows) {
+    const values = buckets.get(row.variantId) ?? [];
+    values.push(aggregateRunScore(row.score, weights)); // each row contributes its scalar aggregate
+    buckets.set(row.variantId, values);
+  }
+  return [...buckets.entries()].map(([variantId, values]) => ({
+    variantId,
+    mean: values.reduce((sum, value) => sum + value, 0) / values.length, // buckets are never empty, so no divide-by-zero
+    runs: values.length
+  })).sort((a, b) => b.mean - a.mean);
+}
+function collapseScenarioWinners(rows, weights) { // Produces one selector training example per scenario, labelled with that scenario's best-scoring variant.
+  const byScenario = /* @__PURE__ */ new Map();
+  for (const row of rows) {
+    const bucket = byScenario.get(row.scenarioId) ?? [];
+    bucket.push(row);
+    byScenario.set(row.scenarioId, bucket);
+  }
+  return [...byScenario.entries()].map(([scenarioId, scenarioRows]) => { // output follows scenario first-seen order
+    const best = scenarioRows.map((row) => ({ row, aggregate: aggregateRunScore(row.score, weights) })).sort((a, b) => b.aggregate - a.aggregate)[0]; // row with the highest aggregate in this scenario
+    return {
+      task: String(best.row.metadata?.task ?? best.row.metadata?.seed_preview ?? scenarioId), // falls back to the seed preview, then the scenario id
+      split: String(best.row.metadata?.split ?? "train"),
+      seedPreview: String(best.row.metadata?.seed_preview ?? ""),
+      variantId: best.row.variantId
+    };
+  });
+}
+function createAxService(aiFactory, provider, apiKey, model) { // Calls the supplied factory with { name, apiKey, config: { model } } and returns its result.
+  return aiFactory({
+    name: provider,
+    apiKey,
+    config: { model }
+  });
+}
-// src/trace/schema.ts
-var TRACE_SCHEMA_VERSION = "1.0.0";
-var FAILURE_CLASSES = [
-  "success",
-  "reasoning_error",
-  "tool_selection_error",
-  "tool_argument_error",
-  "tool_recovery_failure",
-  "hallucination",
-  "instruction_following",
-  "safety_refusal_miss",
-  "policy_violation",
-  "budget_exceeded",
-  "format_drift",
-  "permission_escalation",
-  "pii_leak",
-  "cost_overrun",
-  "timeout",
-  "sandbox_failure",
-  "unknown"
-];
-function isLlmSpan(s) {
-  return s.kind === "llm";
+// src/pareto.ts
+function dominates(a, b, objectives) { // True when `a` is no worse than `b` on every objective and strictly better on at least one.
+  let strictlyBetter = false;
+  for (const obj of objectives) {
+    const av = obj.value(a);
+    const bv = obj.value(b);
+    if (!Number.isFinite(av) || !Number.isFinite(bv)) return false; // non-finite values never dominate or get dominated
+    const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv; // any direction other than "maximize" is treated as minimize
+    const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
+    if (aIsWorse) return false;
+    if (aIsBetter) strictlyBetter = true;
+  }
+  return strictlyBetter; // all-equal candidates do not dominate each other
 }
-function isToolSpan(s) {
-  return s.kind === "tool";
+function paretoFrontier(candidates, objectives) { // Splits candidates into non-dominated (frontier) and dominated sets; throws when no objectives are given.
+  if (objectives.length === 0) {
+    throw new Error("paretoFrontier: at least 1 objective required");
+  }
+  const valid = candidates.filter( // candidates with any non-finite objective value are dropped from both output sets
+    (c) => objectives.every((o) => Number.isFinite(o.value(c)))
+  );
+  const frontier = [];
+  const dominated = [];
+  for (const c of valid) {
+    const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives)); // compared by identity against every other valid candidate
+    if (isDominated) dominated.push(c);
+    else frontier.push(c);
+  }
+  const dominanceMap = frontier.map((d) => ({ // for each frontier member, the dominated candidates it beats
+    dominator: d,
+    dominated: dominated.filter((x) => dominates(d, x, objectives))
+  }));
+  return { frontier, dominated, dominanceMap };
 }
-function isRetrievalSpan(s) {
-  return s.kind === "retrieval";
+// src/harness-optimizer.ts
+var DEFAULT_HARNESS_OBJECTIVES = [ // Default Pareto objectives over per-variant reports: maximize aggregate and pass rate, minimize cost and wall time.
+  { name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
+  { name: "pass_rate", direction: "maximize", value: (r) => r.passRate },
+  { name: "cost", direction: "minimize", value: (r) => r.costUsdMean },
+  { name: "wall", direction: "minimize", value: (r) => r.wallSecondsMean }
+];
+async function runHarnessExperiment(config) { // Runs every variant × scenario × trial through config.adapter, scores each trace, and selects a winner.
+  const jobs = buildJobs(config); // throws when variants or scenarios are empty
+  const critic = new RunCritic({ weights: config.weights });
+  const score = config.score ?? ((trace) => critic.scoreTrace(trace)); // a custom scorer replaces the built-in critic
+  const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => { // sequential unless parallelism is set
+    const trace = await config.adapter.run(request);
+    const runScore = await score(trace, request);
+    const result = {
+      variant: request.variant,
+      scenario: request.scenario,
+      trialIndex: request.trialIndex,
+      trace,
+      score: runScore,
+      aggregate: aggregateRunScore(runScore, config.weights)
+    };
+    await config.onResult?.(result); // optional per-result callback, awaited before the worker takes its next job
+    return result;
+  });
+  return { results, selection: selectHarnessVariant(results, config.objectives) };
+}
+function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) { // Picks the highest-aggregate variant on the Pareto frontier; throws when there are no results.
+  const reports = summarizeHarnessResults(results);
+  if (reports.length === 0) throw new Error("selectHarnessVariant: no results");
+  const frontier = paretoFrontier(reports, objectives);
+  const candidates = frontier.frontier.length ? frontier.frontier : reports; // falls back to all reports when the frontier is empty
+  const winner = [...candidates].sort((a, b) => b.aggregateMean - a.aggregateMean)[0]; // sorts a copy, leaving the candidate list untouched
+  if (!winner) throw new Error("selectHarnessVariant: no winner");
+  return { winner, frontier, reports };
+}
+function summarizeHarnessResults(results) { // Groups results by variant.id and returns per-variant mean reports, sorted by aggregateMean descending.
+  const byVariant = /* @__PURE__ */ new Map();
+  for (const result of results) {
+    byVariant.set(result.variant.id, [...byVariant.get(result.variant.id) ?? [], result]);
+  }
+  return [...byVariant.values()].map((runs) => {
+    const variant = runs[0]?.variant; // the first run's variant object represents the bucket
+    if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
+    return {
+      variant,
+      runs,
+      aggregateMean: mean(runs.map((r) => r.aggregate)),
+      passRate: mean(runs.map((r) => r.score.success)), // mean of score.success; RunCritic can emit 0.5 there, so this is not strictly a pass fraction
+      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
+      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
+      scoreMean: meanRunScore(runs.map((r) => r.score))
+    };
+  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
+}
+function buildJobs(config) { // Expands variants × scenarios × trials into a flat job list; throws when variants or scenarios are empty.
+  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
+  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
+  const trials = Math.max(1, Math.floor(config.trialsPerScenario ?? 1)); // at least one trial; fractional counts are floored
+  const jobs = [];
+  for (const variant of config.variants) { // ordering: variant-major, then scenario, then trial index
+    for (const scenario of config.scenarios) {
+      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
+        jobs.push({ variant, scenario, trialIndex });
+      }
+    }
+  }
+  return jobs;
 }
-function isJudgeSpan(s) {
-  return s.kind === "judge";
+async function mapLimit(items, limit, fn) { // Maps `fn` over `items` with at most `limit` calls in flight; results keep input order.
+  const results = new Array(items.length);
+  let next = 0; // shared cursor: the next unclaimed index
+  const workerCount = Math.max(1, Math.min(Math.floor(limit), items.length)); // NOTE(review): a NaN limit propagates to NaN here, so Array.from would create no workers — confirm callers always pass a number
+  await Promise.all(Array.from({ length: workerCount }, async () => { // rejects as soon as any fn call rejects
+    while (next < items.length) {
+      const index = next++; // index is claimed synchronously, before the await below
+      const item = items[index];
+      if (item === void 0) continue; // undefined items are skipped and their result slot stays unset
+      results[index] = await fn(item);
+    }
+  }));
+  return results;
 }
-function isSandboxSpan(s) {
-  return s.kind === "sandbox";
+function mean(values) { // Arithmetic mean; returns 0 for an empty array.
+  return values.length ? values.reduce((sum, value) => sum + value, 0) / values.length : 0;
+}
+function meanRunScore(scores) { // Field-wise mean of run scores; notes from all scores are concatenated.
+  return {
+    success: mean(scores.map((s) => s.success)),
+    goalProgress: mean(scores.map((s) => s.goalProgress)),
+    repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
+    driftPenalty: mean(scores.map((s) => s.driftPenalty)),
+    toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
+    patchQuality: mean(scores.map((s) => s.patchQuality)),
+    testReality: mean(scores.map((s) => s.testReality)),
+    finalGate: mean(scores.map((s) => s.finalGate)),
+    reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
+    costUsd: mean(scores.map((s) => s.costUsd)),
+    wallSeconds: mean(scores.map((s) => s.wallSeconds)),
+    notes: scores.flatMap((s) => s.notes ?? []) // duplicates are kept; a missing notes array counts as empty
+  };
 }
 // src/trace/store.ts
@@ -2597,6 +2984,651 @@ function llmSpanFromProvider(args) {
   };
 }
+// src/sandbox-harness.ts
+/**
+ * Test-output parser for a Vitest-style summary line such as
+ * `Tests  2 failed | 3 passed`.
+ *
+ * `parse(stdout)` reads the first one or two `<count> passed|failed` groups
+ * after `Tests` and returns `{ testsTotal, testsPassed }`, or `undefined` when
+ * no summary line is found.
+ */
+var vitestTestParser = {
+  id: "vitest",
+  parse(stdout) {
+    const summary = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
+    if (!summary) return void 0;
+    const tally = { passed: 0, failed: 0 };
+    const groups = [[summary[1], summary[2]]];
+    // The second `| <count> <label>` group is optional.
+    if (summary[3] && summary[4]) groups.push([summary[3], summary[4]]);
+    for (const [count, label] of groups) {
+      const bucket = label.toLowerCase() === "passed" ? "passed" : "failed";
+      tally[bucket] += parseInt(count, 10);
+    }
+    return { testsTotal: tally.passed + tally.failed, testsPassed: tally.passed };
+  }
+};
+/**
+ * Test-output parser for pytest-style output.
+ *
+ * `parse(stdout)` needs both a `collected N item(s)` line and an `N passed`
+ * count; if either is missing it returns `undefined`. Otherwise it returns
+ * `{ testsTotal, testsPassed }`.
+ */
+var pytestTestParser = {
+  id: "pytest",
+  parse(stdout) {
+    const [collectedMatch, passedMatch] = [
+      /collected\s+(\d+)\s+items?/i,
+      /(\d+)\s+passed/
+    ].map((pattern) => stdout.match(pattern));
+    if (collectedMatch === null || passedMatch === null) return void 0;
+    const testsTotal = Number.parseInt(collectedMatch[1], 10);
+    const testsPassed = Number.parseInt(passedMatch[1], 10);
+    return { testsTotal, testsPassed };
+  }
+};
+/**
+ * Test-output parser for a Jest-style summary line such as
+ * `Tests: 1 failed, 2 passed, 3 total`.
+ *
+ * `parse(stdout)` returns `{ testsTotal, testsPassed }` taken from the
+ * `passed` and `total` counts, or `undefined` when the line does not match.
+ */
+var jestTestParser = {
+  id: "jest",
+  parse(stdout) {
+    const summary = stdout.match(/Tests:\s+(?:(\d+)\s+failed[^,]*,\s*)?(\d+)\s+passed,\s+(\d+)\s+total/i);
+    if (summary === null) return void 0;
+    // Group 1 (the optional failed count) is matched but not reported.
+    const [, , passedText, totalText] = summary;
+    return {
+      testsTotal: Number.parseInt(totalText, 10),
+      testsPassed: Number.parseInt(passedText, 10)
+    };
+  }
+};
+/**
+ * Combines several test-output parsers into one.
+ *
+ * The combined parser tries each parser in the order given and returns the
+ * first truthy result; later parsers are not called once one succeeds. Its
+ * `id` is the member ids joined with "|".
+ *
+ * @param {...object} parsers - Parsers exposing `id` and `parse(stdout, stderr, exitCode)`.
+ * @returns {object} A parser with the same `id` / `parse` shape.
+ */
+function composeParsers(...parsers) {
+  const combinedId = parsers.map(({ id }) => id).join("|");
+  return {
+    id: combinedId,
+    parse(stdout, stderr, exitCode) {
+      for (let i = 0; i < parsers.length; i += 1) {
+        const outcome = parsers[i].parse(stdout, stderr, exitCode);
+        if (outcome) return outcome;
+      }
+      return void 0;
+    }
+  };
+}
+/**
+ * Sandbox driver that runs each command as a local shell subprocess.
+ *
+ * Constructor options `cwd` and `env` are defaults; per-call `config.cwd`
+ * replaces the default cwd, and `config.env` is layered over the default env,
+ * which is itself layered over `process.env`.
+ */
+var SubprocessSandboxDriver = class {
+  id = "subprocess";
+  defaultCwd;
+  defaultEnv;
+  constructor(options = {}) {
+    const { cwd, env } = options;
+    this.defaultCwd = cwd;
+    this.defaultEnv = env;
+  }
+  /**
+   * Runs `command` through the shell and resolves with its outcome.
+   *
+   * The returned promise never rejects: a spawn error resolves with exit code
+   * 127 and the error text appended to stderr. After `config.timeoutMs`
+   * (default 10 minutes) the child is sent SIGKILL. A missing exit code is
+   * reported as 1. `config.testParser` is consulted only for the "test" phase.
+   *
+   * @param {string} phase - Phase label echoed back on the result.
+   * @param {string} command - Shell command line to execute.
+   * @param {object} config - Reads `cwd`, `env`, `timeoutMs`, `testParser`.
+   * @returns {Promise<object>} `{ phase, exitCode, stdout, stderr, wallMs, testsTotal?, testsPassed? }`.
+   */
+  async exec(phase, command, config) {
+    const { spawn } = await import("child_process");
+    const startedAt = Date.now();
+    const spawnOptions = {
+      shell: true,
+      cwd: config.cwd ?? this.defaultCwd,
+      env: { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} }
+    };
+    return await new Promise((resolve) => {
+      const child = spawn(command, spawnOptions);
+      const outChunks = [];
+      const errChunks = [];
+      child.stdout?.on("data", (chunk) => {
+        outChunks.push(String(chunk));
+      });
+      child.stderr?.on("data", (chunk) => {
+        errChunks.push(String(chunk));
+      });
+      const killTimer = setTimeout(() => {
+        try {
+          child.kill("SIGKILL");
+        } catch {
+          // Best-effort kill; a failure here is deliberately ignored.
+        }
+      }, config.timeoutMs ?? 10 * 6e4);
+      child.on("close", (code) => {
+        clearTimeout(killTimer);
+        // Wall time is taken before parsing so parser cost is not counted.
+        const wallMs = Date.now() - startedAt;
+        const exitCode = code ?? 1;
+        const stdout = outChunks.join("");
+        const stderr = errChunks.join("");
+        let parsed;
+        if (phase === "test" && config.testParser) {
+          parsed = config.testParser.parse(stdout, stderr, exitCode);
+        }
+        resolve({
+          phase,
+          exitCode,
+          stdout,
+          stderr,
+          wallMs,
+          testsTotal: parsed?.testsTotal,
+          testsPassed: parsed?.testsPassed
+        });
+      });
+      child.on("error", (err) => {
+        clearTimeout(killTimer);
+        const wallMs = Date.now() - startedAt;
+        resolve({
+          phase,
+          exitCode: 127,
+          stdout: outChunks.join(""),
+          stderr: errChunks.join("") + String(err),
+          wallMs
+        });
+      });
+    });
+  }
+};
+/**
+ * Sandbox driver that wraps each command in `docker run --rm <image> sh -c …`
+ * and hands the wrapped command line to a SubprocessSandboxDriver.
+ */
+var DockerSandboxDriver = class {
+  id = "docker";
+  /**
+   * Runs `command` inside a throwaway container of `config.image`.
+   *
+   * `config.env` entries become `-e KEY=VALUE` flags on the docker command and
+   * are not passed to the host subprocess environment.
+   *
+   * @param {string} phase - Phase label forwarded to the subprocess driver.
+   * @param {string} command - Command line to run via `sh -c` in the container.
+   * @param {object} config - Must include `image`; `env` is optional.
+   * @returns {Promise<object>} The subprocess driver's result for the docker command.
+   * @throws {Error} When `config.image` is missing.
+   */
+  async exec(phase, command, config) {
+    const { image, env } = config;
+    if (!image) throw new Error("DockerSandboxDriver requires config.image");
+    const envFlags = [];
+    for (const [key, value] of Object.entries(env ?? {})) {
+      envFlags.push(`-e ${shellQuote(key)}=${shellQuote(value)}`);
+    }
+    const dockerCommand = `docker run --rm ${envFlags.join(" ")} ${shellQuote(image)} sh -c ${shellQuote(command)}`;
+    const hostDriver = new SubprocessSandboxDriver();
+    return hostDriver.exec(phase, dockerCommand, { ...config, env: void 0 });
+  }
+};
+/**
+ * Quotes a string for use as a single shell word.
+ *
+ * Strings made only of letters, digits and `_ - / . @ : =` are returned
+ * unchanged. Anything else (including the empty string) is wrapped in single
+ * quotes, with each embedded single quote rewritten as `'\''`.
+ *
+ * @param {string} v - Value to quote.
+ * @returns {string} The shell-safe word.
+ */
+function shellQuote(v) {
+  const isSafeBareWord = /^[A-Za-z0-9_\-\/\.@:=]+$/.test(v);
+  if (isSafeBareWord) return v;
+  const escaped = v.split("'").join("'\\''");
+  return `'${escaped}'`;
+}
+/**
+ * Runs the setup, run and test phases of a sandbox config through a driver,
+ * recording the outcome on a sandbox span obtained from the trace emitter.
+ */
+var SandboxHarness = class {
+  driver;
+  constructor(driver = new SubprocessSandboxDriver()) {
+    this.driver = driver;
+  }
+  /**
+   * Executes the configured phases in order: setup, run, test.
+   *
+   * A non-zero exit from setup or run fails the span and returns early with
+   * `passed: false` and `score: 0`. With a test command, `passed` follows the
+   * test exit code, and `score` is testsPassed / testsTotal when the driver
+   * reported a positive total, otherwise 1 or 0 by exit code. With no test
+   * command the result is `passed: true`, `score: 1`.
+   *
+   * @param {object} config - Reads `setupCommand`, `runCommand`, `testCommand`,
+   *   `image`; the whole object is also forwarded to the driver.
+   * @param {object} emitter - Must provide `sandbox(...)`, returning a handle
+   *   with `end` and `fail`.
+   * @returns {Promise<object>} `{ passed, totalWallMs, score, setup?, run?, test? }`.
+   * @throws Rethrows any error from the driver or the handle, after failing the span.
+   */
+  async run(config, emitter) {
+    const plannedCommands = [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean);
+    const handle = await emitter.sandbox({
+      name: `sandbox(${this.driver.id})`,
+      image: config.image,
+      command: plannedCommands.join(" && ")
+    });
+    const result = { passed: false, totalWallMs: 0, score: 0 };
+    try {
+      // setup and run share the same rule: non-zero exit aborts the whole run.
+      for (const phase of ["setup", "run"]) {
+        const command = config[`${phase}Command`];
+        if (!command) continue;
+        const outcome = await this.driver.exec(phase, command, config);
+        result[phase] = outcome;
+        result.totalWallMs += outcome.wallMs;
+        if (outcome.exitCode !== 0) {
+          await handle.fail(`${phase} failed (exit ${outcome.exitCode})`, {
+            exitCode: outcome.exitCode,
+            wallMs: result.totalWallMs
+          });
+          return result;
+        }
+      }
+      if (!config.testCommand) {
+        result.passed = true;
+        result.score = 1;
+        await handle.end({ wallMs: result.totalWallMs });
+        return result;
+      }
+      const testOutcome = await this.driver.exec("test", config.testCommand, config);
+      result.test = testOutcome;
+      result.totalWallMs += testOutcome.wallMs;
+      const passed = testOutcome.exitCode === 0;
+      result.passed = passed;
+      const { testsTotal, testsPassed } = testOutcome;
+      const hasCounts = testsTotal !== void 0 && testsTotal > 0;
+      if (hasCounts) {
+        result.score = (testsPassed ?? 0) / testsTotal;
+      } else {
+        result.score = passed ? 1 : 0;
+      }
+      await handle.end({
+        exitCode: testOutcome.exitCode,
+        testsTotal,
+        testsPassed,
+        wallMs: result.totalWallMs,
+        status: passed ? "ok" : "error"
+      });
+    } catch (err) {
+      await handle.fail(err instanceof Error ? err : String(err));
+      throw err;
+    }
+    return result;
+  }
+};
+// src/judge-runner.ts
+/**
+ * Runs one judge spec through a SandboxHarness and reports a verdict.
+ * Each run is traced into its own in-memory trace store.
+ */
+var JudgeRunner = class {
+  driver;
+  constructor(driver = new SubprocessSandboxDriver()) {
+    this.driver = driver;
+  }
+  /**
+   * Executes `spec.config` in the sandbox and summarizes the outcome.
+   *
+   * @param {object} spec - `{ id, kind, config }`.
+   * @returns {Promise<object>} `{ id, kind, passed, score, summary, detail }`,
+   *   where `detail` is the raw harness result.
+   */
+  async run(spec) {
+    const { id, kind, config } = spec;
+    const emitter = new TraceEmitter(new InMemoryTraceStore(), { runId: `judge-${id}` });
+    await emitter.startRun({
+      scenarioId: id,
+      layer: "meta",
+      projectId: "judge-runner"
+    });
+    const detail = await new SandboxHarness(this.driver).run(config, emitter);
+    const { passed, score } = detail;
+    await emitter.endRun({ pass: passed, score, notes: `${kind} judge` });
+    const summary = renderJudgeSummary(kind, detail);
+    return { id, kind, passed, score, summary, detail };
+  }
+};
+/**
+ * Runs a list of judge specs with one shared JudgeRunner.
+ *
+ * Specs run concurrently unless `options.parallel` is exactly `false`, in
+ * which case they run one after another. Results keep the order of `specs`.
+ *
+ * @param {Array<object>} specs - Judge specs to run.
+ * @param {object} [options] - `driver` is passed to JudgeRunner; `parallel` selects the mode.
+ * @returns {Promise<Array<object>>} One result per spec.
+ */
+async function runJudgeFleet(specs, options = {}) {
+  const runner = new JudgeRunner(options.driver);
+  const runOne = (spec) => runner.run(spec);
+  if (options.parallel !== false) {
+    return await Promise.all(specs.map(runOne));
+  }
+  const collected = [];
+  for (const spec of specs) {
+    collected.push(await runOne(spec));
+  }
+  return collected;
+}
+/** Builds a judge spec of kind "compiler" from an id and a sandbox config. */
+function compilerJudge(id, config) {
+  const kind = "compiler";
+  return { id, kind, config };
+}
+/** Builds a judge spec of kind "test" from an id and a sandbox config. */
+function testJudge(id, config) {
+  const kind = "test";
+  return { id, kind, config };
+}
+/** Builds a judge spec of kind "linter" from an id and a sandbox config. */
+function linterJudge(id, config) {
+  const kind = "linter";
+  return { id, kind, config };
+}
+/** Builds a judge spec of kind "security" from an id and a sandbox config. */
+function securityJudge(id, config) {
+  const kind = "security";
+  return { id, kind, config };
+}
+/**
+ * Renders a one-line, human-readable verdict for a judge run.
+ *
+ * @param {string} kind - Judge kind used as the message prefix.
+ * @param {object} detail - Harness result; reads `passed` and `test`.
+ * @returns {string} "<kind> judge failed", "<kind> judge passed", or the
+ *   passed form with "P/T tests" when a non-zero test total is present.
+ */
+function renderJudgeSummary(kind, detail) {
+  const prefix = `${kind} judge`;
+  if (!detail.passed) return `${prefix} failed`;
+  const test = detail.test;
+  if (!test?.testsTotal) return `${prefix} passed`;
+  return `${prefix} passed ${test.testsPassed}/${test.testsTotal} tests`;
+}
+// src/dual-agent-bench.ts
+/**
+ * Benchmark that alternates a proposer and a critic per scenario until the
+ * critic's convergence score reaches a threshold or the round budget runs out.
+ */
+var DualAgentBench = class {
+  /**
+   * Runs every scenario sequentially and aggregates convergence statistics.
+   *
+   * @param {object} config - Reads `scenarios`, `propose`, `critique`, and the
+   *   optional `maxRounds` (default 5), `convergenceThreshold` (default 0.85)
+   *   and `onRoundComplete`.
+   * @returns {Promise<object>} `{ scenarios, aggregate, config }`; `aggregate`
+   *   holds `convergenceRate`, `avgRoundsToConverge` (null when nothing
+   *   converged) and `avgFinalScore`.
+   * @throws {Error} When `scenarios` is empty, or when a critique returns a
+   *   convergenceScore that is not a finite number in [0, 1].
+   */
+  async run(config) {
+    const maxRounds = config.maxRounds ?? 5;
+    const threshold = config.convergenceThreshold ?? 0.85;
+    if (config.scenarios.length === 0) {
+      throw new Error("DualAgentBench requires at least 1 scenario");
+    }
+    const results = [];
+    for (const scenario of config.scenarios) {
+      const history = [];
+      let roundsToConverge = null;
+      let finalProposal = "";
+      let finalScore = 0;
+      let priorCritique;
+      // The loop ends as soon as a round converges (roundsToConverge is set).
+      for (let roundIndex = 0; roundIndex < maxRounds && roundsToConverge === null; roundIndex++) {
+        const previousRound = history[history.length - 1];
+        const priorProposal = previousRound?.proposal;
+        const proposal = await config.propose({ scenario, roundIndex, priorProposal, priorCritique });
+        const verdict = await config.critique({ scenario, roundIndex, proposal });
+        const { critique, convergenceScore } = verdict;
+        const scoreIsValid = Number.isFinite(convergenceScore) && convergenceScore >= 0 && convergenceScore <= 1;
+        if (!scoreIsValid) {
+          throw new Error(
+            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${roundIndex}`
+          );
+        }
+        const round = { roundIndex, proposal, critique, convergenceScore };
+        history.push(round);
+        config.onRoundComplete?.({ scenarioId: scenario.id, round });
+        finalProposal = proposal;
+        finalScore = convergenceScore;
+        priorCritique = critique;
+        if (convergenceScore >= threshold) {
+          roundsToConverge = roundIndex + 1;
+        }
+      }
+      results.push({
+        scenarioId: scenario.id,
+        converged: roundsToConverge !== null,
+        roundsToConverge,
+        finalProposal,
+        history,
+        finalScore
+      });
+    }
+    let convergedCount = 0;
+    let roundsSum = 0;
+    let scoreSum = 0;
+    for (const result of results) {
+      scoreSum += result.finalScore;
+      if (result.converged) {
+        convergedCount += 1;
+        roundsSum += result.roundsToConverge ?? 0;
+      }
+    }
+    const convergenceRate = results.length ? convergedCount / results.length : 0;
+    const avgRoundsToConverge = convergedCount ? roundsSum / convergedCount : null;
+    const avgFinalScore = results.length ? scoreSum / results.length : 0;
+    return {
+      scenarios: results,
+      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
+      config: { maxRounds, convergenceThreshold: threshold }
+    };
+  }
+};
+// src/propose-review.ts
+import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
+import { dirname } from "path";
+/**
+ * Creates a review-memory store backed by a private in-process array.
+ *
+ * `initial` is copied, so later changes to the caller's array do not affect
+ * the store, and each `load()` returns a fresh copy of the current entries.
+ *
+ * @param {Array<object>} [initial] - Entries to seed the store with.
+ * @returns {{load: () => Promise<Array<object>>, append: (entry: object) => Promise<void>}}
+ */
+function inMemoryReviewStore(initial = []) {
+  const entries = [...initial];
+  const load = async () => entries.slice();
+  const append = async (entry) => {
+    entries.push(entry);
+  };
+  return { load, append };
+}
+/**
+ * Creates a review-memory store persisted as a JSON Lines file at `path`.
+ *
+ * `load()` returns [] when the file does not exist, and skips blank lines and
+ * lines that are not valid JSON. `append()` creates the parent directory if
+ * needed and writes the entry as one JSON line.
+ *
+ * @param {string} path - Location of the JSONL file.
+ * @returns {{load: () => Promise<Array<any>>, append: (entry: any) => Promise<void>}}
+ */
+function jsonlReviewStore(path) {
+  // Malformed lines are dropped on purpose so one bad line cannot block a load.
+  const parseLine = (text) => {
+    try {
+      return { ok: true, value: JSON.parse(text) };
+    } catch {
+      return { ok: false };
+    }
+  };
+  return {
+    async load() {
+      if (!existsSync(path)) return [];
+      const entries = [];
+      const lines = readFileSync(path, "utf8").split("\n");
+      for (const rawLine of lines) {
+        const text = rawLine.trim();
+        if (text === "") continue;
+        const parsed = parseLine(text);
+        if (parsed.ok) entries.push(parsed.value);
+      }
+      return entries;
+    },
+    async append(entry) {
+      mkdirSync(dirname(path), { recursive: true });
+      appendFileSync(path, `${JSON.stringify(entry)}\n`);
+    }
+  };
+}
+// Next-shot instruction used by runProposeReview when the reviewer call fails
+// and review memory holds no earlier instruction to reuse.
+var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
+/**
+ * Drives a multi-shot propose -> verify -> review loop.
+ *
+ * Each shot calls `config.propose`, then `config.verify` on the resulting
+ * state, then (only when verification failed) `config.review`. The review is
+ * appended to `memory` and carried into the next shot as `priorReview`.
+ *
+ * The loop stops when verification passes, the reviewer sets
+ * `shouldContinue: false`, review confidence stays at or below
+ * `confidenceFloor` for `confidenceFloorWindow` consecutive shots, `maxShots`
+ * is reached, or the wall-clock abort signal is found set at the start of a shot.
+ *
+ * @param {object} config - Reads `goal`, `initialState`, `propose`, `verify`,
+ *   `review`, and the optional `maxShots` (10), `maxWallMs` (10 minutes),
+ *   `confidenceFloor` (0.3), `confidenceFloorWindow` (2), `memory` (in-memory
+ *   store), `fallbackInstruction`, `store`, `scenarioId`, `projectId`, `variantId`.
+ * @returns {Promise<object>} `{ runId, completed, shots, finalState,
+ *   finalVerification, failureClass, wallMs, score }`.
+ * @throws Rethrows any error raised by `config.propose` or `config.verify`.
+ */
+async function runProposeReview(config) {
+  const maxShots = config.maxShots ?? 10;
+  const maxWallMs = config.maxWallMs ?? 10 * 60 * 1e3;
+  const confidenceFloor = config.confidenceFloor ?? 0.3;
+  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
+  const memory = config.memory ?? inMemoryReviewStore();
+  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
+  // Tracing is optional: an emitter exists only when a trace store was supplied.
+  const emitter = config.store ? new TraceEmitter(config.store) : null;
+  if (emitter) {
+    await emitter.startRun({
+      scenarioId: config.scenarioId ?? "propose-review",
+      projectId: config.projectId,
+      variantId: config.variantId,
+      layer: "meta",
+      tags: {
+        goal: config.goal.slice(0, 120),
+        maxShots: String(maxShots)
+      }
+    });
+  }
+  // The wall-clock budget is enforced through an AbortController. The signal is
+  // handed to propose() and otherwise checked only at the top of each shot.
+  const abort = new AbortController();
+  const wallStart = Date.now();
+  const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), maxWallMs);
+  const shots = [];
+  let state = config.initialState;
+  let priorReview = null;
+  let lastVerification = { pass: false };
+  let failureClass;
+  let completed = false;
+  let lowConfidenceStreak = 0;
+  try {
+    for (let shot = 1; shot <= maxShots; shot++) {
+      if (abort.signal.aborted) {
+        failureClass = "timeout";
+        break;
+      }
+      const shotStart = Date.now();
+      const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
+      let proposeOut;
+      try {
+        proposeOut = await config.propose({
+          shot,
+          goal: config.goal,
+          state,
+          priorReview,
+          abortSignal: abort.signal,
+          emitter: emitter ?? void 0
+        });
+      } catch (err) {
+        // NOTE(review): this rethrow leaves the function before emitter.endRun
+        // below is reached, so the trace run is not ended on this path.
+        await shotHandle?.fail(err instanceof Error ? err : String(err));
+        failureClass = "unknown";
+        throw err;
+      }
+      state = proposeOut.state;
+      const traceSummary = proposeOut.traceSummary;
+      let verification;
+      try {
+        verification = await config.verify(state);
+      } catch (err) {
+        await shotHandle?.fail(err instanceof Error ? err : String(err));
+        failureClass = "unknown";
+        throw err;
+      }
+      lastVerification = verification;
+      const memorySnapshot = await memory.load();
+      // Only this reduced view of the verification is stored in review memory.
+      const verificationDigest = {
+        pass: verification.pass,
+        score: verification.score,
+        failingLayers: verification.failingLayers ?? []
+      };
+      let review;
+      let reviewAvailable = true;
+      let reviewError;
+      if (verification.pass) {
+        // Passing shots get a synthetic review; config.review is not called.
+        review = {
+          observations: "verification passed \u2014 skipping reviewer LLM call",
+          diagnosis: "no failures to diagnose",
+          nextShotInstruction: "(done)",
+          shouldContinue: false,
+          confidence: 1
+        };
+      } else {
+        try {
+          review = await config.review({
+            shot,
+            goal: config.goal,
+            state,
+            verification,
+            traceSummary,
+            memory: memorySnapshot
+          });
+          review = coerceReview(review);
+        } catch (err) {
+          // Reviewer failures (including a malformed review rejected by
+          // coerceReview) do not abort the loop: reuse the most recent
+          // instruction from memory, or the fallback instruction.
+          reviewAvailable = false;
+          reviewError = err instanceof Error ? err.message : String(err);
+          const lastInstruction = memorySnapshot.length > 0 ? memorySnapshot[memorySnapshot.length - 1].nextShotInstruction : fallbackInstruction;
+          review = {
+            observations: "(reviewer unavailable \u2014 using last-known instruction)",
+            diagnosis: reviewError,
+            nextShotInstruction: lastInstruction,
+            shouldContinue: true,
+            confidence: 0.3
+          };
+        }
+      }
+      const entry = {
+        shot,
+        timestamp: Date.now(),
+        ...review,
+        verification: verificationDigest
+      };
+      await memory.append(entry);
+      const shotRecord = {
+        shot,
+        state,
+        verification,
+        traceSummary,
+        review,
+        reviewAvailable,
+        reviewError,
+        durationMs: Date.now() - shotStart
+      };
+      shots.push(shotRecord);
+      await shotHandle?.end({
+        attributes: {
+          verificationPass: verification.pass,
+          verificationScore: verification.score ?? null,
+          reviewShouldContinue: review.shouldContinue,
+          reviewConfidence: review.confidence,
+          reviewAvailable
+        }
+      });
+      if (verification.pass) {
+        completed = true;
+        break;
+      }
+      if (!review.shouldContinue) {
+        break;
+      }
+      // Stop after `confidenceFloorWindow` consecutive low-confidence reviews;
+      // a window of 0 or less disables this check.
+      if (confidenceFloorWindow > 0 && review.confidence <= confidenceFloor) {
+        lowConfidenceStreak += 1;
+        if (lowConfidenceStreak >= confidenceFloorWindow) break;
+      } else {
+        lowConfidenceStreak = 0;
+      }
+      priorReview = review;
+    }
+    // An unfinished run with no failure class yet is "budget_exceeded" when
+    // every shot was used, and "unknown" when the loop stopped early.
+    if (!completed && !failureClass) {
+      failureClass = shots.length >= maxShots ? "budget_exceeded" : "unknown";
+    }
+  } finally {
+    clearTimeout(wallTimer);
+  }
+  // Score is 1 on a pass, else the last verification's numeric score, else 0.
+  const score = lastVerification.pass ? 1 : typeof lastVerification.score === "number" ? lastVerification.score : 0;
+  if (emitter) {
+    await emitter.endRun({
+      pass: completed,
+      score,
+      failureClass,
+      notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
+    });
+  }
+  return {
+    runId: emitter?.runId ?? null,
+    completed,
+    shots,
+    finalState: state,
+    finalVerification: lastVerification,
+    failureClass,
+    wallMs: Date.now() - wallStart,
+    score
+  };
+}
+// Base system prompt for reviewers built by createLlmReviewer; an optional
+// addendum is appended to it on a new line.
+var REVIEWER_SYSTEM_PROMPT = `You are a senior reviewer directing a multi-shot build loop.
+You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.
+You are blind to the worker's inner monologue. You see what it DID, not what it thought.
+Return STRICT JSON matching the schema. No prose outside the JSON.`;
+/**
+ * Builds a reviewer function backed by a JSON-returning LLM call.
+ *
+ * The returned function assembles a user prompt from the goal, shot number,
+ * rendered state, trace summary, verification summary and prior review
+ * memory, sends it with the system prompt through `cfg.callJson`, and
+ * validates the reply with `coerceReview`.
+ *
+ * @param {object} cfg - Requires `callJson({ system, user })`. Optional:
+ *   `renderState` and `renderTraceSummary` (both default to JSON rendering;
+ *   an undefined trace summary renders as "(none)") and `systemPromptAddendum`.
+ * @returns {(input: object) => Promise<object>} Reviewer reading `goal`,
+ *   `shot`, `state`, `traceSummary`, `verification` and `memory` from its input.
+ */
+function createLlmReviewer(cfg) {
+  const renderState = cfg.renderState ?? ((s) => safeJson(s));
+  const renderTraceSummary = cfg.renderTraceSummary ?? ((s) => s === void 0 ? "(none)" : safeJson(s));
+  const system = cfg.systemPromptAddendum ? `${REVIEWER_SYSTEM_PROMPT}
+${cfg.systemPromptAddendum}` : REVIEWER_SYSTEM_PROMPT;
+  return async (input) => {
+    // One paragraph per prior shot; free-text fields are cut to 400 characters.
+    const memoryBlock = input.memory.length === 0 ? "(no prior shots \u2014 this is shot 1)" : input.memory.map((m) => [
+      `shot ${m.shot} \u2014 verification.pass=${m.verification.pass}` + (typeof m.verification.score === "number" ? ` score=${m.verification.score.toFixed(2)}` : "") + ` confidence=${m.confidence.toFixed(2)} failing=[${(m.verification.failingLayers ?? []).join(",")}]`,
+      `  observations: ${m.observations.slice(0, 400)}`,
+      `  diagnosis: ${m.diagnosis.slice(0, 400)}`,
+      `  instruction given: ${m.nextShotInstruction.slice(0, 400)}`
+    ].join("\n")).join("\n\n");
+    const user = [
+      `=== GOAL ===`,
+      input.goal,
+      ``,
+      `=== SHOT NUMBER ===`,
+      String(input.shot),
+      ``,
+      `=== CURRENT STATE ===`,
+      renderState(input.state),
+      ``,
+      `=== TRACE SUMMARY ===`,
+      renderTraceSummary(input.traceSummary),
+      ``,
+      `=== VERIFICATION ===`,
+      summarizeVerification(input.verification),
+      ``,
+      `=== REVIEWER MEMORY (prior shots) ===`,
+      memoryBlock,
+      ``,
+      `=== YOUR TASK ===`,
+      `Return STRICT JSON:`,
+      `{`,
+      `  "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)`,
+      `  "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)`,
+      `  "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)`,
+      `  "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)`,
+      `  "confidence": number in [0,1]`,
+      `}`
+    ].join("\n");
+    const raw = await cfg.callJson({ system, user });
+    // Throws on a malformed reply.
+    return coerceReview(raw);
+  };
+}
+/**
+ * Validates a raw reviewer reply and normalizes it to the review shape.
+ *
+ * Unknown properties are dropped and `confidence` is converted with
+ * `Number()` and clamped to [0, 1].
+ *
+ * @param {any} raw - Reply to validate.
+ * @returns {object} `{ observations, diagnosis, nextShotInstruction, shouldContinue, confidence }`.
+ * @throws {Error} When `raw` is not an object, any of the three text fields is
+ *   missing or empty, `shouldContinue` is not a boolean, or `confidence` is
+ *   not a finite number.
+ */
+function coerceReview(raw) {
+  if (!raw || typeof raw !== "object") {
+    throw new Error("reviewer returned non-object");
+  }
+  // Non-string values count as missing.
+  const asText = (value) => typeof value === "string" ? value : "";
+  const observations = asText(raw.observations);
+  const diagnosis = asText(raw.diagnosis);
+  const nextShotInstruction = asText(raw.nextShotInstruction);
+  const hasAllText = observations !== "" && diagnosis !== "" && nextShotInstruction !== "";
+  if (!hasAllText) {
+    throw new Error("reviewer missing required string fields");
+  }
+  const { shouldContinue } = raw;
+  if (typeof shouldContinue !== "boolean") {
+    throw new Error("reviewer missing shouldContinue boolean");
+  }
+  const confidence = Number(raw.confidence);
+  if (!Number.isFinite(confidence)) {
+    throw new Error("reviewer confidence not finite");
+  }
+  const clamped = Math.min(1, Math.max(0, confidence));
+  return { observations, diagnosis, nextShotInstruction, shouldContinue, confidence: clamped };
+}
+/**
+ * Renders a verification result as compact text for the reviewer prompt.
+ *
+ * The first line is `pass=…`, followed by ` score=…` (three decimals) when
+ * the score is a number and ` failing=[…]` when there are failing layers.
+ * When `details` is defined, its JSON rendering, cut to 1500 characters,
+ * follows on a new line.
+ *
+ * @param {object} v - Reads `pass`, `score`, `failingLayers`, `details`.
+ * @returns {string} The summary text.
+ */
+function summarizeVerification(v) {
+  const parts = [`pass=${v.pass}`];
+  if (typeof v.score === "number") {
+    parts.push(` score=${v.score.toFixed(3)}`);
+  }
+  if (v.failingLayers && v.failingLayers.length > 0) {
+    parts.push(` failing=[${v.failingLayers.join(", ")}]`);
+  }
+  if (v.details !== void 0) {
+    parts.push(`\n${safeJson(v.details).slice(0, 1500)}`);
+  }
+  return parts.join("");
+}
+/**
+ * Renders any value as text, preferring pretty-printed JSON (2-space indent).
+ *
+ * Always returns a string: it falls back to `String(x)` when JSON
+ * serialization throws (cycles, BigInt) and also when it yields no text at
+ * all (undefined, functions, symbols).
+ *
+ * @param {any} x - Value to render.
+ * @returns {string} JSON text, or the `String(x)` fallback.
+ */
+function safeJson(x) {
+  try {
+    // JSON.stringify returns undefined, not a string, for undefined, functions
+    // and symbols; callers call .slice() on the result, so coerce to text.
+    return JSON.stringify(x, null, 2) ?? String(x);
+  } catch {
+    return String(x);
+  }
+}
+// src/trace/schema.ts
+// Version string of the trace schema.
+var TRACE_SCHEMA_VERSION = "1.0.0";
+// Recognized failure-class labels for a run outcome. "success" is the
+// non-failure entry and "unknown" is the catch-all.
+var FAILURE_CLASSES = [
+  "success",
+  "reasoning_error",
+  "tool_selection_error",
+  "tool_argument_error",
+  "tool_recovery_failure",
+  "hallucination",
+  "instruction_following",
+  "safety_refusal_miss",
+  "policy_violation",
+  "budget_exceeded",
+  "format_drift",
+  "permission_escalation",
+  "pii_leak",
+  "cost_overrun",
+  "timeout",
+  "sandbox_failure",
+  "unknown"
+];
+/** Returns true when the span's `kind` is "llm". */
+function isLlmSpan(s) {
+  const { kind } = s;
+  return kind === "llm";
+}
+/** Returns true when the span's `kind` is "tool". */
+function isToolSpan(s) {
+  const { kind } = s;
+  return kind === "tool";
+}
+/** Returns true when the span's `kind` is "retrieval". */
+function isRetrievalSpan(s) {
+  const { kind } = s;
+  return kind === "retrieval";
+}
+/** Returns true when the span's `kind` is "judge". */
+function isJudgeSpan(s) {
+  const { kind } = s;
+  return kind === "judge";
+}
+/** Returns true when the span's `kind` is "sandbox". */
+function isSandboxSpan(s) {
+  const { kind } = s;
+  return kind === "sandbox";
+}
 // src/trace/query.ts
 async function runsForScenario(store, scenarioId) {
   return store.listRuns({ scenarioId });
@@ -2825,181 +3857,6 @@ function runToTraceId(run) {
   return cleaned.slice(0, 32).padEnd(32, "0");
 }
-// src/sandbox-harness.ts
-var vitestTestParser = {
-  id: "vitest",
-  parse(stdout) {
-    const m = stdout.match(/Tests\s+(\d+)\s+(passed|failed)(?:\s*\|\s*(\d+)\s+(passed|failed))?/i);
-    if (!m) return void 0;
-    let passed = 0;
-    let failed = 0;
-    const a = parseInt(m[1], 10);
-    const aLabel = m[2].toLowerCase();
-    if (aLabel === "passed") passed += a;
-    else failed += a;
-    if (m[3] && m[4]) {
-      const b = parseInt(m[3], 10);
-      if (m[4].toLowerCase() === "passed") passed += b;
-      else failed += b;
-    }
-    return { testsTotal: passed + failed, testsPassed: passed };
-  }
-};
-var pytestTestParser = {
-  id: "pytest",
-  parse(stdout) {
-    const total = stdout.match(/collected\s+(\d+)\s+items?/i);
-    const passed = stdout.match(/(\d+)\s+passed/);
-    if (!total || !passed) return void 0;
-    return { testsTotal: parseInt(total[1], 10), testsPassed: parseInt(passed[1], 10) };
-  }
-};
-var jestTestParser = {
-  id: "jest",
-  parse(stdout) {
-    const m = stdout.match(/Tests:\s+(?:(\d+)\s+failed[^,]*,\s*)?(\d+)\s+passed,\s+(\d+)\s+total/i);
-    if (!m) return void 0;
-    return { testsTotal: parseInt(m[3], 10), testsPassed: parseInt(m[2], 10) };
-  }
-};
-function composeParsers(...parsers) {
-  return {
-    id: parsers.map((p) => p.id).join("|"),
-    parse(stdout, stderr, exitCode) {
-      for (const p of parsers) {
-        const res = p.parse(stdout, stderr, exitCode);
-        if (res) return res;
-      }
-      return void 0;
-    }
-  };
-}
-var SubprocessSandboxDriver = class {
-  id = "subprocess";
-  async exec(phase, command, config) {
-    const { spawn } = await import("child_process");
-    const start = Date.now();
-    return await new Promise((resolve) => {
-      const child = spawn(command, {
-        shell: true,
-        cwd: config.cwd,
-        env: { ...process.env, ...config.env ?? {} }
-      });
-      let stdout = "";
-      let stderr = "";
-      child.stdout?.on("data", (d) => {
-        stdout += String(d);
-      });
-      child.stderr?.on("data", (d) => {
-        stderr += String(d);
-      });
-      const timeout = setTimeout(() => {
-        try {
-          child.kill("SIGKILL");
-        } catch {
-        }
-      }, config.timeoutMs ?? 10 * 6e4);
-      child.on("close", (code) => {
-        clearTimeout(timeout);
-        const wallMs = Date.now() - start;
-        const parsed = phase === "test" && config.testParser ? config.testParser.parse(stdout, stderr, code ?? 1) : void 0;
-        resolve({
-          phase,
-          exitCode: code ?? 1,
-          stdout,
-          stderr,
-          wallMs,
-          testsTotal: parsed?.testsTotal,
-          testsPassed: parsed?.testsPassed
-        });
-      });
-      child.on("error", (err) => {
-        clearTimeout(timeout);
-        const wallMs = Date.now() - start;
-        resolve({ phase, exitCode: 127, stdout, stderr: stderr + String(err), wallMs });
-      });
-    });
-  }
-};
-var DockerSandboxDriver = class {
-  id = "docker";
-  async exec(phase, command, config) {
-    if (!config.image) throw new Error("DockerSandboxDriver requires config.image");
-    const sub = new SubprocessSandboxDriver();
-    const envArgs = Object.entries(config.env ?? {}).map(([k, v]) => `-e ${shellQuote(k)}=${shellQuote(v)}`).join(" ");
-    const wrapped = `docker run --rm ${envArgs} ${shellQuote(config.image)} sh -c ${shellQuote(command)}`;
-    return sub.exec(phase, wrapped, { ...config, env: void 0 });
-  }
-};
-function shellQuote(v) {
-  if (/^[A-Za-z0-9_\-\/\.@:=]+$/.test(v)) return v;
-  return `'${v.replace(/'/g, `'\\''`)}'`;
-}
-var SandboxHarness = class {
-  driver;
-  constructor(driver = new SubprocessSandboxDriver()) {
-    this.driver = driver;
-  }
-  async run(config, emitter) {
-    const handle = await emitter.sandbox({
-      name: `sandbox(${this.driver.id})`,
-      image: config.image,
-      command: [config.setupCommand, config.runCommand, config.testCommand].filter(Boolean).join(" && ")
-    });
-    const result = { passed: false, totalWallMs: 0, score: 0 };
-    try {
-      if (config.setupCommand) {
-        result.setup = await this.driver.exec("setup", config.setupCommand, config);
-        result.totalWallMs += result.setup.wallMs;
-        if (result.setup.exitCode !== 0) {
-          await handle.fail(`setup failed (exit ${result.setup.exitCode})`, {
-            exitCode: result.setup.exitCode,
-            wallMs: result.totalWallMs
-          });
-          return result;
-        }
-      }
-      if (config.runCommand) {
-        result.run = await this.driver.exec("run", config.runCommand, config);
-        result.totalWallMs += result.run.wallMs;
-        if (result.run.exitCode !== 0) {
-          await handle.fail(`run failed (exit ${result.run.exitCode})`, {
-            exitCode: result.run.exitCode,
-            wallMs: result.totalWallMs
-          });
-          return result;
-        }
-      }
-      if (config.testCommand) {
-        result.test = await this.driver.exec("test", config.testCommand, config);
-        result.totalWallMs += result.test.wallMs;
-        const passed = result.test.exitCode === 0;
-        result.passed = passed;
-        if (result.test.testsTotal !== void 0 && result.test.testsTotal > 0) {
-          result.score = (result.test.testsPassed ?? 0) / result.test.testsTotal;
-        } else {
-          result.score = passed ? 1 : 0;
-        }
-        await handle.end({
-          exitCode: result.test.exitCode,
-          testsTotal: result.test.testsTotal,
-          testsPassed: result.test.testsPassed,
-          wallMs: result.totalWallMs,
-          status: passed ? "ok" : "error"
-        });
-      } else {
-        result.passed = true;
-        result.score = 1;
-        await handle.end({ wallMs: result.totalWallMs });
-      }
-    } catch (err) {
-      await handle.fail(err instanceof Error ? err : String(err));
-      throw err;
-    }
-    return result;
-  }
-};
 // src/test-graded-scenario.ts
 async function runTestGradedScenario(scenario, store, options = {}) {
   const emitter = new TraceEmitter(store);
@@ -3619,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
     if (s.baseline.length < 2 || s.candidate.length < 2) {
       throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
     }
-    const bMean = mean(s.baseline);
-    const cMean = mean(s.candidate);
+    const bMean = mean2(s.baseline);
+    const cMean = mean2(s.candidate);
     const delta = cMean - bMean;
     const d = cohensD(s.baseline, s.candidate);
     const { t, df, p } = welchsTTest(s.baseline, s.candidate);
@@ -3659,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
     hasUnstable: metrics.some((m) => m.verdict === "unstable")
   };
 }
-function mean(xs) {
+function mean2(xs) {
   return xs.reduce((a, b) => a + b, 0) / xs.length;
 }
 function iqr(xs) {
@@ -3675,8 +4532,8 @@ function iqr(xs) {
 }
 function welchsTTest(a, b) {
   if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
-  const mA = mean(a);
-  const mB = mean(b);
+  const mA = mean2(a);
+  const mB = mean2(b);
   const vA = variance(a, mA);
   const vB = variance(b, mB);
   const seSquared = vA / a.length + vB / b.length;
@@ -4032,41 +4889,6 @@ function assertNonNegative(n, name) {
   }
 }
-// src/pareto.ts
-function dominates(a, b, objectives) {
-  let strictlyBetter = false;
-  for (const obj of objectives) {
-    const av = obj.value(a);
-    const bv = obj.value(b);
-    if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
-    const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
-    const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
-    if (aIsWorse) return false;
-    if (aIsBetter) strictlyBetter = true;
-  }
-  return strictlyBetter;
-}
-function paretoFrontier(candidates, objectives) {
-  if (objectives.length === 0) {
-    throw new Error("paretoFrontier: at least 1 objective required");
-  }
-  const valid = candidates.filter(
-    (c) => objectives.every((o) => Number.isFinite(o.value(c)))
-  );
-  const frontier = [];
-  const dominated = [];
-  for (const c of valid) {
-    const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
-    if (isDominated) dominated.push(c);
-    else frontier.push(c);
-  }
-  const dominanceMap = frontier.map((d) => ({
-    dominator: d,
-    dominated: dominated.filter((x) => dominates(d, x, objectives))
-  }));
-  return { frontier, dominated, dominanceMap };
-}
 // src/series-convergence.ts
 function analyzeSeries(values, options = {}) {
   const window = options.window ?? 5;
@@ -4076,10 +4898,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
+  const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
+  const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -4100,7 +4922,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -5028,12 +5850,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
+  const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
+  const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -5284,8 +6106,11 @@ async function scoreProject(store, projectId) {
   const runtimeScore = runtimeScores.length > 0 ? runtimeScores.reduce((a, b) => a + b, 0) / runtimeScores.length : null;
   const runtimePassed = runtime.filter((r) => r.outcome?.pass === true).length;
   const runtimePassRate = runtime.length > 0 ? runtimePassed / runtime.length : null;
+  const kind = runtime.length === 0 ? "scaffold-only" : "full";
+  const complete = kind === "scaffold-only" ? metaScore !== null && buildScore !== null : metaScore !== null && buildScore !== null && runtimeScore !== null;
   return {
     projectId,
+    kind,
     builderRunId: builder?.runId,
     metaScore,
     buildRunId: build?.runId,
@@ -5293,7 +6118,7 @@ async function scoreProject(store, projectId) {
     appRuntimeRunIds: runtime.map((r) => r.runId),
     runtimeScore,
     runtimePassRate,
-    complete: metaScore !== null && buildScore !== null && runtimeScore !== null
+    complete
   };
 }
 async function scoreAllProjects(store) {
@@ -5715,8 +6540,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
 function toBin(chunk, lower, upper) {
   const xs = chunk.map((c) => c.x);
   const ys = chunk.map((c) => c.y);
-  const evalMean = mean2(xs);
-  const outcomeMean = mean2(ys);
+  const evalMean = mean3(xs);
+  const outcomeMean = mean3(ys);
   return {
     lower: lower ?? Math.min(...xs),
     upper: upper ?? Math.max(...xs),
@@ -5726,7 +6551,7 @@ function toBin(chunk, lower, upper) {
     gap: Math.abs(outcomeMean - evalMean)
   };
 }
-function mean2(xs) {
+function mean3(xs) {
   return xs.reduce((a, b) => a + b, 0) / xs.length;
 }
 function defaultExtract4(metric) {
@@ -5951,8 +6776,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean3 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / graded.length;
+  const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -5974,8 +6799,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
+  const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -6505,8 +7330,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores.length < 3) continue;
-    const mean3 = scores.reduce((a, b) => a + b, 0) / scores.length;
-    const variance2 = scores.reduce((a, b) => a + (b - mean3) ** 2, 0) / scores.length;
+    const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
+    const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -6987,6 +7812,7 @@ async function euAiActReport(ctx, signals) {
 }
 export {
   AgentDriver,
+  AxGepaSteeringOptimizer,
   BenchmarkRunner,
   BudgetBreachError,
   BudgetGuard,
@@ -6995,9 +7821,11 @@ export {
   CostTracker,
   DEFAULT_AGENT_SLOS,
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
+  DEFAULT_HARNESS_OBJECTIVES,
   DEFAULT_MUTATORS,
   DEFAULT_REDACTION_RULES,
   DEFAULT_RED_TEAM_CORPUS,
+  DEFAULT_RUN_SCORE_WEIGHTS,
   Dataset,
   DockerSandboxDriver,
   DualAgentBench,
@@ -7011,15 +7839,19 @@ export {
   InMemoryOutcomeStore,
   InMemoryTraceStore,
   InMemoryWorkspaceInspector,
+  JudgeRunner,
   MODEL_PRICING,
   MetricsCollector,
   OTEL_AGENT_EVAL_SCOPE,
+  OptimizationLoop,
+  PairwiseSteeringOptimizer,
   PrmGrader,
   ProductClient,
   ProjectRegistry,
   PromptOptimizer,
   PromptRegistry,
   REDACTION_VERSION,
+  RunCritic,
   SandboxHarness,
   ScenarioRegistry,
   SubprocessSandboxDriver,
@@ -7028,6 +7860,7 @@ export {
   TraceEmitter,
   adversarialJudge,
   aggregateLlm,
+  aggregateRunScore,
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
@@ -7044,6 +7877,7 @@ export {
   causalAttribution,
   checkCanaries,
   checkSlos,
+  clamp01,
   classifyEuAiRisk,
   classifyFailure,
   codeExecutionJudge,
@@ -7052,6 +7886,7 @@ export {
   collectionPreserved,
   commitBisect,
   compareToBaseline,
+  compilerJudge,
   composeParsers,
   composeValidators,
   computeToolUseMetrics,
@@ -7062,8 +7897,10 @@ export {
   createAntiSlopJudge,
   createCustomJudge,
   createDomainExpertJudge,
+  createLlmReviewer,
   crossTraceDiff,
   defaultJudges,
+  distillPlaybook,
   dominates,
   estimateCost,
   estimateTokens,
@@ -7085,6 +7922,7 @@ export {
   groupBy,
   hashContent,
   hashScenarios,
+  inMemoryReviewStore,
   interRaterReliability,
   iqr,
   isJudgeSpan,
@@ -7096,14 +7934,17 @@ export {
   jestTestParser,
   jsonHasKeys,
   jsonShape,
+  jsonlReviewStore,
   judgeAgreementView,
   judgeSpans,
   keyPreserved,
+  linterJudge,
   llmSpanFromProvider,
   llmSpans,
   loadScorerFromGrader,
   lowercaseMutator,
   mannWhitneyU,
+  mergeSteeringBundle,
   nistAiRmfReport,
   nonRefusalRubric,
   normalizeScores,
@@ -7131,6 +7972,8 @@ export {
   regressionView,
   renderMarkdown,
   renderMarkdownReport,
+  renderPlaybookMarkdown,
+  renderSteeringText,
   replayScorerOverCorpus,
   replayTraceThroughJudge,
   requiredSampleSize,
@@ -7142,6 +7985,9 @@ export {
   runE2EWorkflow,
   runExpectations,
   runFailureClass,
+  runHarnessExperiment,
+  runJudgeFleet,
+  runProposeReview,
   runSelfPlay,
   runTestGradedScenario,
   runsForScenario,
@@ -7149,6 +7995,8 @@ export {
   scoreContinuity,
   scoreProject,
   scoreRedTeamOutput,
+  securityJudge,
+  selectHarnessVariant,
   selfPreference,
   sentenceReorderMutator,
   signManifest,
@@ -7156,6 +8004,8 @@ export {
   statusAdvanced,
   stuckLoopView,
   summarize,
+  summarizeHarnessResults,
+  testJudge,
   textInSnapshot,
   toLangfuseEnvelope,
   toNdjson,