npm - kairn-cli - Versions diffs - 2.1.0 → 2.2.1 - Mend

kairn-cli 2.1.0 → 2.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/cli.js CHANGED Viewed

@@ -221,7 +221,7 @@ var ui = {
   // Key-value pairs
   kv: (key, value) => `  ${chalk.cyan(key.padEnd(14))} ${value}`,
   // File list
-  file: (path23) => chalk.dim(`    ${path23}`),
+  file: (path24) => chalk.dim(`    ${path24}`),
   // Tool display
   tool: (name, reason) => `    ${warmStone("\u25CF")} ${chalk.bold(name)}
       ${chalk.dim(reason)}`,
@@ -3694,9 +3694,9 @@ var keysCommand = new Command10("keys").description("Add or update API keys for
 import { Command as Command11 } from "commander";
 import chalk14 from "chalk";
 import ora2 from "ora";
-import fs22 from "fs/promises";
-import path22 from "path";
-import { parse as yamlParse } from "yaml";
+import fs23 from "fs/promises";
+import path23 from "path";
+import { parse as yamlParse2 } from "yaml";
 import { confirm as confirm3, select as select4 } from "@inquirer/prompts";
 // src/evolve/init.ts
@@ -3741,30 +3741,55 @@ var EVAL_TEMPLATES = {
     name: "Documentation",
     description: "Can the agent write and update docs?",
     bestFor: ["content", "api-building", "full-stack"]
+  },
+  "convention-adherence": {
+    id: "convention-adherence",
+    name: "Convention Adherence",
+    description: "Does the agent follow all project conventions defined in CLAUDE.md?",
+    bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
+  },
+  "workflow-compliance": {
+    id: "workflow-compliance",
+    name: "Workflow Compliance",
+    description: "Does the agent use the project workflow commands and skills?",
+    bestFor: ["feature-development", "full-stack", "tdd", "qa"]
+  },
+  "rule-compliance": {
+    id: "rule-compliance",
+    name: "Rule Compliance",
+    description: "Does the agent follow all project rules without violations?",
+    bestFor: ["feature-development", "backend", "maintenance", "architecture"]
   }
 };
 function selectTemplatesForWorkflow(workflowType) {
   const mapping = {
-    "feature-development": ["add-feature", "test-writing", "documentation"],
-    "api-building": ["add-feature", "fix-bug", "test-writing"],
-    "full-stack": ["add-feature", "fix-bug", "test-writing"],
-    "maintenance": ["fix-bug", "refactor", "test-writing"],
-    "debugging": ["fix-bug", "test-writing"],
-    "qa": ["fix-bug", "test-writing", "add-feature"],
-    "architecture": ["refactor", "test-writing", "config-change"],
-    "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
-    "devops": ["config-change", "fix-bug"],
-    "infrastructure": ["config-change", "refactor"],
-    "tdd": ["test-writing", "add-feature", "fix-bug"],
-    "content": ["documentation", "add-feature"],
-    "research": ["documentation", "add-feature"]
+    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
+    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
+    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
+    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
+    "debugging": ["fix-bug", "test-writing", "rule-compliance"],
+    "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
+    "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
+    "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
+    "devops": ["config-change", "fix-bug", "rule-compliance"],
+    "infrastructure": ["config-change", "refactor", "convention-adherence"],
+    "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
+    "content": ["documentation", "add-feature", "convention-adherence"],
+    "research": ["documentation", "add-feature", "convention-adherence"]
   };
-  return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
+  return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
 }
 var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
 Each task must be realistic and testable against the actual project. Avoid generic placeholders.
+IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
+- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
+- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
+- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
+These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
 Return a JSON object with a "tasks" array. Each task has:
 - id: kebab-case identifier (e.g., "add-health-endpoint")
 - template: which eval template this instantiates
@@ -4117,6 +4142,27 @@ async function writeIterationLog(workspacePath, log) {
     "utf-8"
   );
 }
+async function loadIterationLog(workspacePath, iteration) {
+  const iterDir = path17.join(workspacePath, "iterations", iteration.toString());
+  try {
+    await fs17.access(iterDir);
+  } catch {
+    return null;
+  }
+  const scoresStr = await fs17.readFile(path17.join(iterDir, "scores.json"), "utf-8").catch(() => "{}");
+  const reasoning = await fs17.readFile(path17.join(iterDir, "proposer_reasoning.md"), "utf-8").catch(() => "");
+  const diffPatch = await fs17.readFile(path17.join(iterDir, "mutation_diff.patch"), "utf-8").catch(() => "");
+  const scoresData = JSON.parse(scoresStr);
+  const proposal = reasoning ? { reasoning, mutations: [], expectedImpact: {} } : null;
+  return {
+    iteration,
+    score: scoresData.score ?? 0,
+    taskResults: scoresData.taskResults ?? {},
+    proposal,
+    diffPatch: diffPatch || null,
+    timestamp: ""
+  };
+}
 // src/evolve/exec.ts
 import { exec } from "child_process";
@@ -4169,7 +4215,8 @@ ${msg}`);
       details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
     };
   }
-  const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
+  const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
+  const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
   const passed = !hasErrors;
   return {
     pass: passed,
@@ -4297,24 +4344,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
 // src/evolve/runner.ts
 var execAsync2 = promisify2(exec2);
-async function runTask(task, harnessPath, traceDir, iteration) {
+var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
+async function createIsolatedWorkspace(projectRoot, harnessPath) {
+  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
+  try {
+    await execAsync2("git rev-parse --is-inside-work-tree", {
+      cwd: projectRoot,
+      timeout: 5e3
+    });
+    const tmpDir2 = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
+    await execAsync2(`git worktree add --detach "${tmpDir2}" HEAD`, {
+      cwd: projectRoot,
+      timeout: 3e4
+    });
+    await fs18.rm(path18.join(tmpDir2, ".claude"), { recursive: true, force: true });
+    await copyDir(harnessPath, path18.join(tmpDir2, ".claude"));
+    return { workDir: tmpDir2, isWorktree: true };
+  } catch {
+  }
+  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
+  await copyProjectDir(projectRoot, tmpDir);
+  await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
+  await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
+  return { workDir: tmpDir, isWorktree: false };
+}
+async function copyProjectDir(src, dest) {
+  await fs18.mkdir(dest, { recursive: true });
+  let entries;
+  try {
+    entries = await fs18.readdir(src, { withFileTypes: true });
+  } catch {
+    return;
+  }
+  for (const entry of entries) {
+    if (COPY_SKIP_DIRS.has(entry.name)) continue;
+    const srcPath = path18.join(src, entry.name);
+    const destPath = path18.join(dest, entry.name);
+    if (entry.isDirectory()) {
+      await copyDir(srcPath, destPath);
+    } else {
+      await fs18.copyFile(srcPath, destPath);
+    }
+  }
+}
+async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
+  if (isWorktree) {
+    try {
+      await execAsync2(`git worktree remove "${workDir}" --force`, {
+        cwd: projectRoot,
+        timeout: 1e4
+      });
+    } catch {
+      await fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
+      });
+      await execAsync2("git worktree prune", {
+        cwd: projectRoot,
+        timeout: 5e3
+      }).catch(() => {
+      });
+    }
+  } else {
+    await fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
+    });
+  }
+}
+async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
   await fs18.mkdir(traceDir, { recursive: true });
   const startedAt = (/* @__PURE__ */ new Date()).toISOString();
   const startMs = Date.now();
-  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
+  const root = projectRoot ?? process.cwd();
+  const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
   try {
-    await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
     let setupStderr = "";
     if (task.setup.trim()) {
       try {
-        await execAsync2(task.setup, { cwd: tmpDir, timeout: 6e4 });
+        await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
       } catch (err) {
         setupStderr = err instanceof Error ? err.message : String(err);
       }
     }
-    const filesBefore = await snapshotFileList(tmpDir);
-    const spawnResult = await spawnClaude(task.description, tmpDir, task.timeout);
-    const filesAfter = await snapshotFileList(tmpDir);
+    const filesBefore = await snapshotFileList(workDir);
+    const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
+    const filesAfter = await snapshotFileList(workDir);
     const filesChanged = diffFileLists(filesBefore, filesAfter);
     const toolCalls = parseToolCalls(spawnResult.stdout);
     const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -4338,8 +4449,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
       traceDir
     };
   } finally {
-    await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
-    });
+    await cleanupIsolatedWorkspace(workDir, isWorktree, root);
   }
 }
 async function spawnClaude(instruction, cwd, timeoutSec) {
@@ -4437,8 +4547,9 @@ function parseToolCalls(stdout) {
     return [];
   }
 }
-async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
+async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
   const results = {};
+  const projectRoot = path18.resolve(workspacePath, "..");
   for (const task of tasks) {
     const traceDir = path18.join(
       workspacePath,
@@ -4446,7 +4557,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
       iteration.toString(),
       task.id
     );
-    const taskResult = await runTask(task, harnessPath, traceDir, iteration);
+    onProgress?.({ type: "task-start", iteration, taskId: task.id });
+    const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
     let score = taskResult.score;
     if (config) {
       const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
@@ -4455,6 +4567,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
       await writeScore(traceDir, score);
     }
     results[task.id] = score;
+    onProgress?.({
+      type: "task-scored",
+      iteration,
+      taskId: task.id,
+      score: score.score ?? (score.pass ? 100 : 0)
+    });
   }
   const scores = Object.values(results);
   const total = scores.reduce(
@@ -4516,7 +4634,8 @@ Return a JSON object:
 - Prefer ADDITIVE changes over replacements when possible.
 Return ONLY valid JSON.`;
-var STDOUT_TRUNCATION_LIMIT = 2e3;
+var STDOUT_TRUNCATION_LIMIT = 1e3;
+var MAX_CONTEXT_CHARS = 1e5;
 async function readHarnessFiles(harnessPath) {
   const result = {};
   async function walk(dir, prefix) {
@@ -4550,26 +4669,25 @@ function truncateStdout(stdout, limit) {
 ${stdout.slice(-limit)}`;
 }
 function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
-  const sections = [];
-  sections.push("## Current Harness Files\n");
+  const harnessSection = ["## Current Harness Files\n"];
   const fileEntries = Object.entries(harnessFiles);
   if (fileEntries.length === 0) {
-    sections.push("(No harness files found)\n");
+    harnessSection.push("(No harness files found)\n");
   } else {
     for (const [filePath, content] of fileEntries) {
-      sections.push(`### ${filePath}
+      harnessSection.push(`### ${filePath}
 \`\`\`
 ${content}
 \`\`\`
 `);
     }
   }
-  sections.push("## Task Definitions\n");
+  const taskSection = ["## Task Definitions\n"];
   if (tasks.length === 0) {
-    sections.push("(No tasks defined)\n");
+    taskSection.push("(No tasks defined)\n");
   } else {
     for (const task of tasks) {
-      sections.push(
+      taskSection.push(
         `### Task: ${task.id}
 - Template: ${task.template}
 - Description: ${task.description}
@@ -4579,15 +4697,27 @@ ${content}
       );
     }
   }
-  sections.push("## Execution Traces\n");
-  if (traces.length === 0) {
-    sections.push("(No traces available)\n");
-  } else {
+  const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
+  const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
+  if (remainingBudget <= 0) {
+    return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
+  }
+  const traceBudget = Math.floor(remainingBudget * 0.7);
+  const historyBudget = remainingBudget - traceBudget;
+  const traceSection = buildTraceSection(traces, traceBudget);
+  const historySection = buildHistorySection(history, historyBudget);
+  return fixedContent + "\n" + traceSection + "\n" + historySection;
+}
+function buildTraceSection(traces, budget) {
+  if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
+  let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
+  for (let attempt = 0; attempt < 4; attempt++) {
+    const parts = ["## Execution Traces\n"];
     for (const trace of traces) {
       const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
-      const truncatedStdout = truncateStdout(trace.stdout, STDOUT_TRUNCATION_LIMIT);
+      const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
       const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => `  - ${f}: ${action}`).join("\n");
-      sections.push(
+      parts.push(
         `### Trace: ${trace.taskId}
 - Pass: ${trace.score.pass}
 - Score: ${scoreNum}
@@ -4595,36 +4725,55 @@ ${content}
 ` : "") + `- Duration: ${trace.timing.durationMs}ms
 - Files changed:
 ${filesChangedList || "  (none)"}
-- Stdout (last ${STDOUT_TRUNCATION_LIMIT} chars):
+- Stdout (last ${stdoutLimit} chars):
 \`\`\`
 ${truncatedStdout}
 \`\`\`
 `
       );
     }
+    const result = parts.join("\n");
+    if (result.length <= budget) return result;
+    stdoutLimit = Math.floor(stdoutLimit / 2);
   }
-  sections.push("## Iteration History\n");
-  if (history.length === 0) {
-    sections.push("(No previous iterations)\n");
-  } else {
-    for (const log of history) {
+  const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
+  for (const trace of traces) {
+    const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
+    summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
+`);
+  }
+  return summary.join("\n");
+}
+function buildHistorySection(history, budget) {
+  if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
+  let entries = [...history];
+  while (entries.length > 0) {
+    const parts = ["## Iteration History\n"];
+    if (entries.length < history.length) {
+      parts.push(`(Showing ${entries.length}/${history.length} most recent iterations)
+`);
+    }
+    for (const log of entries) {
       const taskScores = Object.entries(log.taskResults).map(([id, s]) => `  - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
-      sections.push(
+      parts.push(
         `### Iteration ${log.iteration} \u2014 Score: ${log.score}
 - Task results:
 ${taskScores}
 `
       );
       if (log.proposal) {
-        sections.push(
+        parts.push(
           `- Proposal reasoning: ${log.proposal.reasoning}
 - Mutations: ${log.proposal.mutations.length} change(s)
 `
         );
       }
     }
+    const result = parts.join("\n");
+    if (result.length <= budget) return result;
+    entries = entries.slice(1);
   }
-  return sections.join("\n");
+  return "## Iteration History\n\n(History omitted to fit context budget)\n";
 }
 function parseProposerResponse(raw) {
   let cleaned = raw.trim();
@@ -4836,7 +4985,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
       harnessPath,
       workspacePath,
       iter,
-      kairnConfig
+      kairnConfig,
+      onProgress
     );
     onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
     if (iter === 0) baselineScore = aggregate;
@@ -4914,7 +5064,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
         kairnConfig,
         evolveConfig.proposerModel
       );
-    } catch {
+    } catch (err) {
+      const errMsg = err instanceof Error ? err.message : String(err);
+      onProgress?.({
+        type: "proposer-error",
+        iteration: iter,
+        message: `Proposer failed: ${errMsg}`
+      });
       const nextIterDir2 = path21.join(
         workspacePath,
         "iterations",
@@ -4978,6 +5134,215 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
   };
 }
+// src/evolve/report.ts
+import fs22 from "fs/promises";
+import path22 from "path";
+// src/evolve/diagnosis.ts
+function numericScore(s) {
+  return s.score ?? (s.pass ? 100 : 0);
+}
+function diagnoseCounterfactuals(iterations, _tasks) {
+  const entries = [];
+  for (let i = 1; i < iterations.length; i++) {
+    const prev = iterations[i - 1];
+    const curr = iterations[i];
+    if (!curr.proposal && !prev.proposal) continue;
+    const proposal = prev.proposal;
+    if (!proposal || proposal.mutations.length === 0) continue;
+    const mutationSummary = proposal.mutations.map((m) => `${m.action} in ${m.file}: ${m.rationale}`).join("; ");
+    const helpedTasks = [];
+    const hurtTasks = [];
+    const allTaskIds = /* @__PURE__ */ new Set([
+      ...Object.keys(prev.taskResults),
+      ...Object.keys(curr.taskResults)
+    ]);
+    let netDelta = 0;
+    for (const taskId of allTaskIds) {
+      const prevScore = prev.taskResults[taskId] ? numericScore(prev.taskResults[taskId]) : 0;
+      const currScore = curr.taskResults[taskId] ? numericScore(curr.taskResults[taskId]) : 0;
+      const delta = currScore - prevScore;
+      if (delta > 0) {
+        helpedTasks.push({ taskId, delta });
+      } else if (delta < 0) {
+        hurtTasks.push({ taskId, delta });
+      }
+      netDelta += delta;
+    }
+    entries.push({
+      iteration: i,
+      mutationSummary,
+      helpedTasks,
+      hurtTasks,
+      netScoreDelta: netDelta
+    });
+  }
+  return { entries };
+}
+// src/evolve/report.ts
+import { parse as yamlParse } from "yaml";
+function numericScore2(s) {
+  return s.score ?? (s.pass ? 100 : 0);
+}
+async function loadAllIterations(workspacePath) {
+  const iterDir = path22.join(workspacePath, "iterations");
+  let entries;
+  try {
+    entries = await fs22.readdir(iterDir);
+  } catch {
+    return [];
+  }
+  const iterations = [];
+  const iterNums = entries.map((e) => parseInt(e, 10)).filter((n) => !isNaN(n)).sort((a, b) => a - b);
+  for (const n of iterNums) {
+    const log = await loadIterationLog(workspacePath, n);
+    if (log) iterations.push(log);
+  }
+  return iterations;
+}
+async function loadTasks(workspacePath) {
+  try {
+    const content = await fs22.readFile(path22.join(workspacePath, "tasks.yaml"), "utf-8");
+    const parsed = yamlParse(content);
+    return parsed?.tasks ?? [];
+  } catch {
+    return [];
+  }
+}
+function buildLeaderboard(iterations, tasks) {
+  const taskIds = tasks.map((t) => t.id);
+  return taskIds.map((taskId) => {
+    const scores = {};
+    let bestScore = -1;
+    let bestIteration = 0;
+    for (const iter of iterations) {
+      const s = iter.taskResults[taskId];
+      if (s) {
+        const score = numericScore2(s);
+        scores[iter.iteration] = score;
+        if (score > bestScore) {
+          bestScore = score;
+          bestIteration = iter.iteration;
+        }
+      }
+    }
+    return { taskId, scores, bestIteration, bestScore };
+  });
+}
+function iterationStatus(iter, bestIteration) {
+  if (iter.iteration === 0) return "baseline";
+  if (!iter.proposal && !iter.diffPatch) return "rollback";
+  if (iter.score >= 100) return "perfect";
+  if (iter.iteration === bestIteration) return "best";
+  return "evaluated";
+}
+async function generateMarkdownReport(workspacePath) {
+  const iterations = await loadAllIterations(workspacePath);
+  const tasks = await loadTasks(workspacePath);
+  if (iterations.length === 0) {
+    return "# Evolution Report\n\nNo iterations found. Run `kairn evolve run` first.\n";
+  }
+  const baselineScore = iterations[0].score;
+  const bestIter = iterations.reduce((best, curr) => curr.score > best.score ? curr : best, iterations[0]);
+  const improvement = bestIter.score - baselineScore;
+  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
+  const leaderboard = buildLeaderboard(iterations, tasks);
+  const lines = [];
+  lines.push("# Evolution Report");
+  lines.push("");
+  lines.push("## Overview");
+  lines.push("");
+  lines.push(`| Metric | Value |`);
+  lines.push(`|--------|-------|`);
+  lines.push(`| Total iterations | ${iterations.length} |`);
+  lines.push(`| Baseline score | ${baselineScore.toFixed(1)}% |`);
+  lines.push(`| Best score | ${bestIter.score.toFixed(1)}% |`);
+  lines.push(`| Best iteration | ${bestIter.iteration} |`);
+  lines.push(`| Improvement | ${improvement >= 0 ? "+" : ""}${improvement.toFixed(1)} points |`);
+  lines.push("");
+  lines.push("## Iterations");
+  lines.push("");
+  lines.push("| Iter | Score | Mutations | Status |");
+  lines.push("|------|-------|-----------|--------|");
+  for (const iter of iterations) {
+    const mutations = iter.proposal?.mutations.length ?? 0;
+    const mutStr = mutations > 0 ? mutations.toString() : "-";
+    const status = iterationStatus(iter, bestIter.iteration);
+    lines.push(`| ${iter.iteration} | ${iter.score.toFixed(1)}% | ${mutStr} | ${status} |`);
+  }
+  lines.push("");
+  if (leaderboard.length > 0) {
+    lines.push("## Leaderboard");
+    lines.push("");
+    const iterNums = iterations.map((i) => i.iteration);
+    const headerCols = ["Task", ...iterNums.map((n) => `Iter ${n}`), "Best"];
+    lines.push(`| ${headerCols.join(" | ")} |`);
+    lines.push(`| ${headerCols.map(() => "---").join(" | ")} |`);
+    for (const entry of leaderboard) {
+      const scoreCols = iterNums.map((n) => {
+        const s = entry.scores[n];
+        return s !== void 0 ? `${s.toFixed(0)}%` : "-";
+      });
+      lines.push(`| ${entry.taskId} | ${scoreCols.join(" | ")} | ${entry.bestScore.toFixed(0)}% (iter ${entry.bestIteration}) |`);
+    }
+    lines.push("");
+  }
+  if (counterfactuals.entries.length > 0) {
+    lines.push("## Counterfactual Diagnosis");
+    lines.push("");
+    for (const entry of counterfactuals.entries) {
+      const sign = entry.netScoreDelta >= 0 ? "+" : "";
+      lines.push(`### Iteration ${entry.iteration} (net ${sign}${entry.netScoreDelta.toFixed(1)} points)`);
+      lines.push("");
+      lines.push(`**Mutations:** ${entry.mutationSummary}`);
+      lines.push("");
+      if (entry.helpedTasks.length > 0) {
+        lines.push("**Helped:**");
+        for (const t of entry.helpedTasks) {
+          lines.push(`- ${t.taskId}: +${t.delta.toFixed(1)}`);
+        }
+        lines.push("");
+      }
+      if (entry.hurtTasks.length > 0) {
+        lines.push("**Hurt:**");
+        for (const t of entry.hurtTasks) {
+          lines.push(`- ${t.taskId}: ${t.delta.toFixed(1)}`);
+        }
+        lines.push("");
+      }
+    }
+  }
+  return lines.join("\n");
+}
+async function generateJsonReport(workspacePath) {
+  const iterations = await loadAllIterations(workspacePath);
+  const tasks = await loadTasks(workspacePath);
+  const baselineScore = iterations.length > 0 ? iterations[0].score : 0;
+  const bestIter = iterations.length > 0 ? iterations.reduce((best, curr) => curr.score > best.score ? curr : best, iterations[0]) : { score: 0, iteration: 0 };
+  const improvement = bestIter.score - baselineScore;
+  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
+  const leaderboard = buildLeaderboard(iterations, tasks);
+  return {
+    overview: {
+      title: "Evolution Report",
+      totalIterations: iterations.length,
+      baselineScore,
+      bestScore: bestIter.score,
+      bestIteration: bestIter.iteration,
+      improvement
+    },
+    iterations: iterations.map((iter) => ({
+      iteration: iter.iteration,
+      score: iter.score,
+      mutationCount: iter.proposal?.mutations.length ?? 0,
+      status: iterationStatus(iter, bestIter.iteration)
+    })),
+    leaderboard,
+    counterfactuals
+  };
+}
 // src/commands/evolve.ts
 var DEFAULT_CONFIG = {
   model: "claude-sonnet-4-6",
@@ -4988,8 +5353,8 @@ var DEFAULT_CONFIG = {
 };
 async function loadEvolveConfigFromWorkspace(workspacePath) {
   try {
-    const configStr = await fs22.readFile(path22.join(workspacePath, "config.yaml"), "utf-8");
-    const parsed = yamlParse(configStr);
+    const configStr = await fs23.readFile(path23.join(workspacePath, "config.yaml"), "utf-8");
+    const parsed = yamlParse2(configStr);
     return {
       model: parsed.model ?? DEFAULT_CONFIG.model,
       proposerModel: parsed.proposer_model ?? DEFAULT_CONFIG.proposerModel,
@@ -5006,9 +5371,9 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
   try {
     const projectRoot = process.cwd();
     console.log(ui.section("Evolve Init"));
-    const claudeDir = path22.join(projectRoot, ".claude");
+    const claudeDir = path23.join(projectRoot, ".claude");
     try {
-      await fs22.access(claudeDir);
+      await fs23.access(claudeDir);
     } catch {
       console.log(ui.error("No .claude/ directory found. Run kairn describe first."));
       process.exit(1);
@@ -5058,7 +5423,7 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
           if (config) {
             let claudeMd = "";
             try {
-              claudeMd = await fs22.readFile(path22.join(claudeDir, "CLAUDE.md"), "utf-8");
+              claudeMd = await fs23.readFile(path23.join(claudeDir, "CLAUDE.md"), "utf-8");
             } catch {
             }
             const profile = await buildProjectProfile(projectRoot);
@@ -5089,16 +5454,16 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
 evolveCommand.command("baseline").description("Snapshot current .claude/ directory as baseline").action(async () => {
   try {
     const projectRoot = process.cwd();
-    const workspace = path22.join(projectRoot, ".kairn-evolve");
+    const workspace = path23.join(projectRoot, ".kairn-evolve");
     console.log(ui.section("Evolve Baseline"));
     try {
-      await fs22.access(workspace);
+      await fs23.access(workspace);
     } catch {
       console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
       process.exit(1);
     }
     await snapshotBaseline(projectRoot, workspace);
-    const baselineDir = path22.join(workspace, "baseline");
+    const baselineDir = path23.join(workspace, "baseline");
     const fileCount = await countFiles(baselineDir);
     console.log(ui.success(`Baseline snapshot created (${fileCount} files)`));
   } catch (err) {
@@ -5110,23 +5475,23 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
 evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").action(async (options) => {
   try {
     const projectRoot = process.cwd();
-    const workspace = path22.join(projectRoot, ".kairn-evolve");
+    const workspace = path23.join(projectRoot, ".kairn-evolve");
     console.log(ui.section("Evolve Run"));
     try {
-      await fs22.access(workspace);
+      await fs23.access(workspace);
     } catch {
       console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
       process.exit(1);
     }
-    const tasksPath = path22.join(workspace, "tasks.yaml");
+    const tasksPath = path23.join(workspace, "tasks.yaml");
     let tasksContent;
     try {
-      tasksContent = await fs22.readFile(tasksPath, "utf-8");
+      tasksContent = await fs23.readFile(tasksPath, "utf-8");
     } catch {
       console.log(ui.error("No tasks.yaml found. Run kairn evolve init first."));
       process.exit(1);
     }
-    const parsed = yamlParse(tasksContent);
+    const parsed = yamlParse2(tasksContent);
     if (!parsed?.tasks || parsed.tasks.length === 0) {
       console.log(ui.error("No tasks found in tasks.yaml"));
       process.exit(1);
@@ -5140,15 +5505,15 @@ evolveCommand.command("run").description("Run tasks against the current harness"
       console.log(ui.info(`Running ${tasksToRun.length} task(s)...`));
       console.log("");
       const config = await loadConfig();
-      const harnessPath = path22.join(projectRoot, ".claude");
+      const harnessPath = path23.join(projectRoot, ".claude");
       const results = [];
       for (const task of tasksToRun) {
-        const traceDir = path22.join(workspace, "traces", "0", task.id);
+        const traceDir = path23.join(workspace, "traces", "0", task.id);
         const spinner = ora2(`Running: ${task.id}`).start();
         const result = await runTask(task, harnessPath, traceDir, 0);
         if (config) {
-          const stdout = await fs22.readFile(path22.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
-          const stderr = await fs22.readFile(path22.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
+          const stdout = await fs23.readFile(path23.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
+          const stderr = await fs23.readFile(path23.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
           const score = await scoreTask(task, traceDir, stdout, stderr, config);
           result.score = score;
           await writeScore(traceDir, score);
@@ -5177,7 +5542,7 @@ evolveCommand.command("run").description("Run tasks against the current harness"
       }
       evolveConfig.maxIterations = iterations;
       try {
-        await fs22.access(path22.join(workspace, "iterations", "0", "harness"));
+        await fs23.access(path23.join(workspace, "iterations", "0", "harness"));
       } catch {
         console.log(ui.error("No baseline harness found. Run kairn evolve baseline first."));
         process.exit(1);
@@ -5204,6 +5569,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
           case "perfect-score":
             console.log(chalk14.green("  Perfect score. Stopping."));
             break;
+          case "proposer-error":
+            console.log(chalk14.yellow(`  Warning: ${event.message ?? "Proposer failed"}`));
+            break;
+          case "task-start":
+            console.log(chalk14.dim(`    Running: ${event.taskId ?? "unknown"}...`));
+            break;
+          case "task-scored": {
+            const taskScore = event.score ?? 0;
+            const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
+            console.log(`    ${taskStatus}  ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
+            break;
+          }
           case "complete":
             break;
         }
@@ -5238,13 +5615,107 @@ evolveCommand.command("run").description("Run tasks against the current harness"
     process.exit(1);
   }
 });
+evolveCommand.command("report").description("Generate a summary report of the evolution run").option("--json", "Output machine-readable JSON instead of Markdown").action(async (options) => { // evolve report: prints a report built from the .kairn-evolve workspace under the current working directory
+  try {
+    const projectRoot = process.cwd();
+    const workspace = path23.join(projectRoot, ".kairn-evolve");
+    try {
+      await fs23.access(workspace); // rejects when the workspace directory is missing or not accessible
+    } catch {
+      console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
+      process.exit(1);
+    }
+    if (options.json) { // --json: result of generateJsonReport, pretty-printed with a 2-space indent
+      const report = await generateJsonReport(workspace);
+      console.log(JSON.stringify(report, null, 2));
+    } else { // default: whatever generateMarkdownReport returns is printed unchanged
+      const markdown = await generateMarkdownReport(workspace);
+      console.log(markdown);
+    }
+  } catch (err) { // any failure above is reported through ui.error and the process exits with code 1
+    const msg = err instanceof Error ? err.message : String(err); // non-Error throwables are stringified
+    console.log(ui.error(msg));
+    process.exit(1);
+  }
+});
+evolveCommand.command("diff <iter1> <iter2>").description("Show harness changes between two iterations").action(async (iter1Str, iter2Str) => { // evolve diff: prints a colorized harness diff, then a per-task score table when both iteration logs load
+  try {
+    const projectRoot = process.cwd();
+    const workspace = path23.join(projectRoot, ".kairn-evolve");
+    const iter1 = parseInt(iter1Str, 10); // NOTE(review): parseInt ignores trailing non-digits ("3abc" -> 3), so such input passes the NaN check below
+    const iter2 = parseInt(iter2Str, 10);
+    if (isNaN(iter1) || isNaN(iter2)) {
+      console.log(ui.error("Both arguments must be integers (iteration numbers)"));
+      process.exit(1);
+    }
+    const harness1 = path23.join(workspace, "iterations", iter1.toString(), "harness"); // path uses the parsed number, so "03" resolves to iterations/3
+    const harness2 = path23.join(workspace, "iterations", iter2.toString(), "harness");
+    try {
+      await fs23.access(harness1); // both harness directories must be accessible before diffing
+    } catch {
+      console.log(ui.error(`Iteration ${iter1} harness not found at ${harness1}`));
+      process.exit(1);
+    }
+    try {
+      await fs23.access(harness2);
+    } catch {
+      console.log(ui.error(`Iteration ${iter2} harness not found at ${harness2}`));
+      process.exit(1);
+    }
+    console.log(ui.section(`Diff: Iteration ${iter1} \u2192 ${iter2}`));
+    const diffPatch = await generateDiff2(harness1, harness2);
+    if (!diffPatch) { // a falsy result (e.g. an empty string) is reported as no changes
+      console.log(chalk14.dim("  No harness changes between these iterations."));
+    } else {
+      for (const line of diffPatch.split("\n")) {
+        if (line.startsWith("---") || line.startsWith("+++")) { // tested first so these lines print bold instead of red/green
+          console.log(chalk14.bold(line));
+        } else if (line.startsWith("+")) { // remaining "+" lines are printed green
+          console.log(chalk14.green(line));
+        } else if (line.startsWith("-")) { // remaining "-" lines are printed red
+          console.log(chalk14.red(line));
+        } else { // everything else is printed uncolored
+          console.log(line);
+        }
+      }
+    }
+    const [log1, log2] = await Promise.all([ // both iteration logs are loaded concurrently
+      loadIterationLog(workspace, iter1),
+      loadIterationLog(workspace, iter2)
+    ]);
+    if (log1 && log2) { // the score table is skipped unless both logs are truthy
+      console.log("");
+      console.log(ui.section("Score Comparison"));
+      console.log("");
+      console.log("  Task                          Iter " + iter1 + "    Iter " + iter2 + "    Delta");
+      const allTaskIds = /* @__PURE__ */ new Set([ // union of task ids present in either log
+        ...Object.keys(log1.taskResults),
+        ...Object.keys(log2.taskResults)
+      ]);
+      for (const taskId of [...allTaskIds].sort()) { // rows in default string sort order of task id
+        const s1 = log1.taskResults[taskId];
+        const s2 = log2.taskResults[taskId];
+        const score1 = s1 ? s1.score ?? (s1.pass ? 100 : 0) : 0; // no entry -> 0; entry without score -> 100 if pass, else 0
+        const score2 = s2 ? s2.score ?? (s2.pass ? 100 : 0) : 0;
+        const delta = score2 - score1;
+        const deltaStr = delta > 0 ? chalk14.green(`+${delta.toFixed(0)}`) : delta < 0 ? chalk14.red(delta.toFixed(0).toString()) : chalk14.dim("0"); // gains green with explicit "+", losses red, no change dim
+        const name = taskId.padEnd(30); // pads to 30 chars; longer ids are not truncated
+        console.log(`  ${name}  ${score1.toFixed(0).padStart(5)}%    ${score2.toFixed(0).padStart(5)}%    ${deltaStr}`);
+      }
+    }
+  } catch (err) { // any failure above is reported through ui.error and the process exits with code 1
+    const msg = err instanceof Error ? err.message : String(err);
+    console.log(ui.error(msg));
+    process.exit(1);
+  }
+});
 async function countFiles(dir) {
   let count = 0;
   try {
-    const entries = await fs22.readdir(dir, { withFileTypes: true });
+    const entries = await fs23.readdir(dir, { withFileTypes: true });
     for (const entry of entries) {
       if (entry.isDirectory()) {
-        count += await countFiles(path22.join(dir, entry.name));
+        count += await countFiles(path23.join(dir, entry.name));
       } else {
         count++;
       }