kairn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -221,7 +221,7 @@ var ui = {
221
221
  // Key-value pairs
222
222
  kv: (key, value) => ` ${chalk.cyan(key.padEnd(14))} ${value}`,
223
223
  // File list
224
- file: (path23) => chalk.dim(` ${path23}`),
224
+ file: (path24) => chalk.dim(` ${path24}`),
225
225
  // Tool display
226
226
  tool: (name, reason) => ` ${warmStone("\u25CF")} ${chalk.bold(name)}
227
227
  ${chalk.dim(reason)}`,
@@ -3694,9 +3694,9 @@ var keysCommand = new Command10("keys").description("Add or update API keys for
3694
3694
  import { Command as Command11 } from "commander";
3695
3695
  import chalk14 from "chalk";
3696
3696
  import ora2 from "ora";
3697
- import fs22 from "fs/promises";
3698
- import path22 from "path";
3699
- import { parse as yamlParse } from "yaml";
3697
+ import fs23 from "fs/promises";
3698
+ import path23 from "path";
3699
+ import { parse as yamlParse2 } from "yaml";
3700
3700
  import { confirm as confirm3, select as select4 } from "@inquirer/prompts";
3701
3701
 
3702
3702
  // src/evolve/init.ts
@@ -3741,30 +3741,55 @@ var EVAL_TEMPLATES = {
3741
3741
  name: "Documentation",
3742
3742
  description: "Can the agent write and update docs?",
3743
3743
  bestFor: ["content", "api-building", "full-stack"]
3744
+ },
3745
+ "convention-adherence": {
3746
+ id: "convention-adherence",
3747
+ name: "Convention Adherence",
3748
+ description: "Does the agent follow all project conventions defined in CLAUDE.md?",
3749
+ bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
3750
+ },
3751
+ "workflow-compliance": {
3752
+ id: "workflow-compliance",
3753
+ name: "Workflow Compliance",
3754
+ description: "Does the agent use the project workflow commands and skills?",
3755
+ bestFor: ["feature-development", "full-stack", "tdd", "qa"]
3756
+ },
3757
+ "rule-compliance": {
3758
+ id: "rule-compliance",
3759
+ name: "Rule Compliance",
3760
+ description: "Does the agent follow all project rules without violations?",
3761
+ bestFor: ["feature-development", "backend", "maintenance", "architecture"]
3744
3762
  }
3745
3763
  };
3746
3764
  function selectTemplatesForWorkflow(workflowType) {
3747
3765
  const mapping = {
3748
- "feature-development": ["add-feature", "test-writing", "documentation"],
3749
- "api-building": ["add-feature", "fix-bug", "test-writing"],
3750
- "full-stack": ["add-feature", "fix-bug", "test-writing"],
3751
- "maintenance": ["fix-bug", "refactor", "test-writing"],
3752
- "debugging": ["fix-bug", "test-writing"],
3753
- "qa": ["fix-bug", "test-writing", "add-feature"],
3754
- "architecture": ["refactor", "test-writing", "config-change"],
3755
- "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
3756
- "devops": ["config-change", "fix-bug"],
3757
- "infrastructure": ["config-change", "refactor"],
3758
- "tdd": ["test-writing", "add-feature", "fix-bug"],
3759
- "content": ["documentation", "add-feature"],
3760
- "research": ["documentation", "add-feature"]
3766
+ "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
3767
+ "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3768
+ "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3769
+ "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
3770
+ "debugging": ["fix-bug", "test-writing", "rule-compliance"],
3771
+ "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
3772
+ "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
3773
+ "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
3774
+ "devops": ["config-change", "fix-bug", "rule-compliance"],
3775
+ "infrastructure": ["config-change", "refactor", "convention-adherence"],
3776
+ "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
3777
+ "content": ["documentation", "add-feature", "convention-adherence"],
3778
+ "research": ["documentation", "add-feature", "convention-adherence"]
3761
3779
  };
3762
- return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
3780
+ return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
3763
3781
  }
3764
3782
  var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
3765
3783
 
3766
3784
  Each task must be realistic and testable against the actual project. Avoid generic placeholders.
3767
3785
 
3786
+ IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
3787
+ - convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
3788
+ - workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
3789
+ - rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
3790
+
3791
+ These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
3792
+
3768
3793
  Return a JSON object with a "tasks" array. Each task has:
3769
3794
  - id: kebab-case identifier (e.g., "add-health-endpoint")
3770
3795
  - template: which eval template this instantiates
@@ -4117,6 +4142,27 @@ async function writeIterationLog(workspacePath, log) {
4117
4142
  "utf-8"
4118
4143
  );
4119
4144
  }
4145
/**
 * Rebuild an iteration log from the files stored under
 * `<workspacePath>/iterations/<iteration>/`.
 *
 * Each file is optional: a missing or unreadable `scores.json`,
 * `proposer_reasoning.md` or `mutation_diff.patch` falls back to an empty
 * value. A `scores.json` that is not valid JSON (or is not a JSON object) is
 * treated the same way as a missing one instead of throwing.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @param {number} iteration - Iteration number to load.
 * @returns {Promise<object|null>} The log, or `null` when the iteration
 *   directory is not accessible.
 */
async function loadIterationLog(workspacePath, iteration) {
  const iterDir = path17.join(workspacePath, "iterations", iteration.toString());
  try {
    await fs17.access(iterDir);
  } catch {
    return null;
  }
  const readOr = (fileName, fallback) =>
    fs17.readFile(path17.join(iterDir, fileName), "utf-8").catch(() => fallback);
  // The three reads are independent, so run them together.
  const [scoresStr, reasoning, diffPatch] = await Promise.all([
    readOr("scores.json", "{}"),
    readOr("proposer_reasoning.md", ""),
    readOr("mutation_diff.patch", "")
  ]);
  let scoresData = {};
  try {
    const parsed = JSON.parse(scoresStr);
    if (parsed !== null && typeof parsed === "object") {
      scoresData = parsed;
    }
  } catch {
    // Corrupt or truncated scores.json: keep the empty default so one bad
    // iteration cannot abort loading of all the others.
  }
  // Only the reasoning text is read back here; mutations are not restored, so
  // the proposal always carries an empty mutation list.
  const proposal = reasoning ? { reasoning, mutations: [], expectedImpact: {} } : null;
  return {
    iteration,
    score: scoresData.score ?? 0,
    taskResults: scoresData.taskResults ?? {},
    proposal,
    diffPatch: diffPatch || null,
    timestamp: ""
  };
}
4120
4166
 
4121
4167
  // src/evolve/exec.ts
4122
4168
  import { exec } from "child_process";
@@ -4169,7 +4215,8 @@ ${msg}`);
4169
4215
  details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
4170
4216
  };
4171
4217
  }
4172
- const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
4218
+ const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
4219
+ const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
4173
4220
  const passed = !hasErrors;
4174
4221
  return {
4175
4222
  pass: passed,
@@ -4297,24 +4344,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
4297
4344
 
4298
4345
  // src/evolve/runner.ts
4299
4346
  var execAsync2 = promisify2(exec2);
4300
- async function runTask(task, harnessPath, traceDir, iteration) {
4347
+ var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
4348
/**
 * Create a throwaway copy of the project with the given harness installed as
 * its `.claude/` directory.
 *
 * First tries a detached `git worktree` of HEAD in the OS temp directory. If
 * any step of that fails, falls back to copying the project directory into a
 * fresh temp directory.
 *
 * @param {string} projectRoot - Project directory to isolate.
 * @param {string} harnessPath - Harness directory copied in as `.claude/`.
 * @returns {Promise<{workDir: string, isWorktree: boolean}>} Where the
 *   workspace lives and whether it was created through `git worktree`.
 * @throws Re-throws the copy error when the fallback itself fails (after
 *   removing the partially populated temp directory).
 */
async function createIsolatedWorkspace(projectRoot, harnessPath) {
  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
  let worktreeAttempted = false;
  try {
    await execAsync2("git rev-parse --is-inside-work-tree", {
      cwd: projectRoot,
      timeout: 5e3
    });
    worktreeAttempted = true;
    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
      cwd: projectRoot,
      timeout: 3e4
    });
    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
    return { workDir: worktreeDir, isWorktree: true };
  } catch {
    // Worktree isolation is best-effort; fall through to the plain copy.
    // If the worktree was (partly) created, unregister and delete it first so
    // a failed harness copy does not leave a stale worktree behind.
    if (worktreeAttempted) {
      await cleanupIsolatedWorkspace(worktreeDir, true, projectRoot);
    }
  }
  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-cp-"));
  try {
    await copyProjectDir(projectRoot, tmpDir);
    await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
  } catch (err) {
    // The caller never learns about tmpDir when we throw, so remove it here.
    await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
    });
    throw err;
  }
  return { workDir: tmpDir, isWorktree: false };
}
4371
/**
 * Copy the top level of a project directory into `dest`, skipping every entry
 * whose name is in `COPY_SKIP_DIRS`.
 *
 * Directories are delegated to `copyDir`, regular files are copied, and
 * symbolic links are recreated as links (pointing at the same target string)
 * rather than passed to `copyFile`, which fails for links to directories.
 * Other entry types (sockets, FIFOs, devices) are skipped.
 *
 * If `src` cannot be listed the function returns after creating `dest`,
 * leaving it empty.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dest - Directory to copy into (created if missing).
 * @returns {Promise<void>}
 */
async function copyProjectDir(src, dest) {
  await fs18.mkdir(dest, { recursive: true });
  let entries;
  try {
    entries = await fs18.readdir(src, { withFileTypes: true });
  } catch {
    return;
  }
  for (const entry of entries) {
    if (COPY_SKIP_DIRS.has(entry.name)) continue;
    const srcPath = path18.join(src, entry.name);
    const destPath = path18.join(dest, entry.name);
    if (entry.isSymbolicLink()) {
      // Dirent reports links as neither file nor directory; keep them as links.
      const target = await fs18.readlink(srcPath);
      await fs18.symlink(target, destPath);
    } else if (entry.isDirectory()) {
      await copyDir(srcPath, destPath);
    } else if (entry.isFile()) {
      await fs18.copyFile(srcPath, destPath);
    }
  }
}
4390
/**
 * Remove a workspace produced by `createIsolatedWorkspace`. Never throws:
 * every failure along the way is ignored.
 *
 * @param {string} workDir - Workspace directory to remove.
 * @param {boolean} isWorktree - Whether `workDir` is a git worktree.
 * @param {string} projectRoot - Repository the worktree was created from.
 * @returns {Promise<void>}
 */
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
  const deleteDir = () => fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
  });
  if (!isWorktree) {
    await deleteDir();
    return;
  }
  try {
    await execAsync2(`git worktree remove "${workDir}" --force`, {
      cwd: projectRoot,
      timeout: 1e4
    });
  } catch {
    // git could not remove it: delete the directory ourselves, then let git
    // drop the now-dangling worktree record.
    await deleteDir();
    await execAsync2("git worktree prune", {
      cwd: projectRoot,
      timeout: 5e3
    }).catch(() => {
    });
  }
}
4411
+ async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
4301
4412
  await fs18.mkdir(traceDir, { recursive: true });
4302
4413
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4303
4414
  const startMs = Date.now();
4304
- const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
4415
+ const root = projectRoot ?? process.cwd();
4416
+ const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
4305
4417
  try {
4306
- await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
4307
4418
  let setupStderr = "";
4308
4419
  if (task.setup.trim()) {
4309
4420
  try {
4310
- await execAsync2(task.setup, { cwd: tmpDir, timeout: 6e4 });
4421
+ await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
4311
4422
  } catch (err) {
4312
4423
  setupStderr = err instanceof Error ? err.message : String(err);
4313
4424
  }
4314
4425
  }
4315
- const filesBefore = await snapshotFileList(tmpDir);
4316
- const spawnResult = await spawnClaude(task.description, tmpDir, task.timeout);
4317
- const filesAfter = await snapshotFileList(tmpDir);
4426
+ const filesBefore = await snapshotFileList(workDir);
4427
+ const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
4428
+ const filesAfter = await snapshotFileList(workDir);
4318
4429
  const filesChanged = diffFileLists(filesBefore, filesAfter);
4319
4430
  const toolCalls = parseToolCalls(spawnResult.stdout);
4320
4431
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -4338,8 +4449,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
4338
4449
  traceDir
4339
4450
  };
4340
4451
  } finally {
4341
- await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
4342
- });
4452
+ await cleanupIsolatedWorkspace(workDir, isWorktree, root);
4343
4453
  }
4344
4454
  }
4345
4455
  async function spawnClaude(instruction, cwd, timeoutSec) {
@@ -4437,8 +4547,9 @@ function parseToolCalls(stdout) {
4437
4547
  return [];
4438
4548
  }
4439
4549
  }
4440
- async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
4550
+ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
4441
4551
  const results = {};
4552
+ const projectRoot = path18.resolve(workspacePath, "..");
4442
4553
  for (const task of tasks) {
4443
4554
  const traceDir = path18.join(
4444
4555
  workspacePath,
@@ -4446,7 +4557,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4446
4557
  iteration.toString(),
4447
4558
  task.id
4448
4559
  );
4449
- const taskResult = await runTask(task, harnessPath, traceDir, iteration);
4560
+ onProgress?.({ type: "task-start", iteration, taskId: task.id });
4561
+ const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
4450
4562
  let score = taskResult.score;
4451
4563
  if (config) {
4452
4564
  const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
@@ -4455,6 +4567,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4455
4567
  await writeScore(traceDir, score);
4456
4568
  }
4457
4569
  results[task.id] = score;
4570
+ onProgress?.({
4571
+ type: "task-scored",
4572
+ iteration,
4573
+ taskId: task.id,
4574
+ score: score.score ?? (score.pass ? 100 : 0)
4575
+ });
4458
4576
  }
4459
4577
  const scores = Object.values(results);
4460
4578
  const total = scores.reduce(
@@ -4516,7 +4634,8 @@ Return a JSON object:
4516
4634
  - Prefer ADDITIVE changes over replacements when possible.
4517
4635
 
4518
4636
  Return ONLY valid JSON.`;
4519
- var STDOUT_TRUNCATION_LIMIT = 2e3;
4637
+ var STDOUT_TRUNCATION_LIMIT = 1e3;
4638
+ var MAX_CONTEXT_CHARS = 1e5;
4520
4639
  async function readHarnessFiles(harnessPath) {
4521
4640
  const result = {};
4522
4641
  async function walk(dir, prefix) {
@@ -4550,26 +4669,25 @@ function truncateStdout(stdout, limit) {
4550
4669
  ${stdout.slice(-limit)}`;
4551
4670
  }
4552
4671
  function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
4553
- const sections = [];
4554
- sections.push("## Current Harness Files\n");
4672
+ const harnessSection = ["## Current Harness Files\n"];
4555
4673
  const fileEntries = Object.entries(harnessFiles);
4556
4674
  if (fileEntries.length === 0) {
4557
- sections.push("(No harness files found)\n");
4675
+ harnessSection.push("(No harness files found)\n");
4558
4676
  } else {
4559
4677
  for (const [filePath, content] of fileEntries) {
4560
- sections.push(`### ${filePath}
4678
+ harnessSection.push(`### ${filePath}
4561
4679
  \`\`\`
4562
4680
  ${content}
4563
4681
  \`\`\`
4564
4682
  `);
4565
4683
  }
4566
4684
  }
4567
- sections.push("## Task Definitions\n");
4685
+ const taskSection = ["## Task Definitions\n"];
4568
4686
  if (tasks.length === 0) {
4569
- sections.push("(No tasks defined)\n");
4687
+ taskSection.push("(No tasks defined)\n");
4570
4688
  } else {
4571
4689
  for (const task of tasks) {
4572
- sections.push(
4690
+ taskSection.push(
4573
4691
  `### Task: ${task.id}
4574
4692
  - Template: ${task.template}
4575
4693
  - Description: ${task.description}
@@ -4579,15 +4697,27 @@ ${content}
4579
4697
  );
4580
4698
  }
4581
4699
  }
4582
- sections.push("## Execution Traces\n");
4583
- if (traces.length === 0) {
4584
- sections.push("(No traces available)\n");
4585
- } else {
4700
+ const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
4701
+ const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
4702
+ if (remainingBudget <= 0) {
4703
+ return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
4704
+ }
4705
+ const traceBudget = Math.floor(remainingBudget * 0.7);
4706
+ const historyBudget = remainingBudget - traceBudget;
4707
+ const traceSection = buildTraceSection(traces, traceBudget);
4708
+ const historySection = buildHistorySection(history, historyBudget);
4709
+ return fixedContent + "\n" + traceSection + "\n" + historySection;
4710
+ }
4711
+ function buildTraceSection(traces, budget) {
4712
+ if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4713
+ let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4714
+ for (let attempt = 0; attempt < 4; attempt++) {
4715
+ const parts = ["## Execution Traces\n"];
4586
4716
  for (const trace of traces) {
4587
4717
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4588
- const truncatedStdout = truncateStdout(trace.stdout, STDOUT_TRUNCATION_LIMIT);
4718
+ const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4589
4719
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
4590
- sections.push(
4720
+ parts.push(
4591
4721
  `### Trace: ${trace.taskId}
4592
4722
  - Pass: ${trace.score.pass}
4593
4723
  - Score: ${scoreNum}
@@ -4595,36 +4725,55 @@ ${content}
4595
4725
  ` : "") + `- Duration: ${trace.timing.durationMs}ms
4596
4726
  - Files changed:
4597
4727
  ${filesChangedList || " (none)"}
4598
- - Stdout (last ${STDOUT_TRUNCATION_LIMIT} chars):
4728
+ - Stdout (last ${stdoutLimit} chars):
4599
4729
  \`\`\`
4600
4730
  ${truncatedStdout}
4601
4731
  \`\`\`
4602
4732
  `
4603
4733
  );
4604
4734
  }
4735
+ const result = parts.join("\n");
4736
+ if (result.length <= budget) return result;
4737
+ stdoutLimit = Math.floor(stdoutLimit / 2);
4605
4738
  }
4606
- sections.push("## Iteration History\n");
4607
- if (history.length === 0) {
4608
- sections.push("(No previous iterations)\n");
4609
- } else {
4610
- for (const log of history) {
4739
+ const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
4740
+ for (const trace of traces) {
4741
+ const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4742
+ summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
4743
+ `);
4744
+ }
4745
+ return summary.join("\n");
4746
+ }
4747
/**
 * Render the "Iteration History" part of the proposer prompt within a
 * character budget.
 *
 * Tries the full history first; while the text is longer than `budget`, the
 * oldest remaining iteration is dropped and a "(Showing x/y ...)" note is
 * added. If not even a single iteration fits, a fixed placeholder is returned.
 *
 * @param {Array<object>} history - Iteration logs, oldest first.
 * @param {number} budget - Maximum length of the returned text in characters.
 * @returns {string} Markdown text for the history section.
 */
function buildHistorySection(history, budget) {
  if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
  const total = history.length;
  // Render every iteration once; each one contributes one or two fragments.
  const fragmentsPerLog = history.map((log) => {
    const taskScores = Object.entries(log.taskResults)
      .map(([id, s]) => {
        const value = s.score !== undefined ? s.score : s.pass ? 100 : 0;
        return ` - ${id}: ${value} (pass=${s.pass})`;
      })
      .join("\n");
    const fragments = [
      `### Iteration ${log.iteration} \u2014 Score: ${log.score}\n- Task results:\n${taskScores}\n`
    ];
    if (log.proposal) {
      fragments.push(
        `- Proposal reasoning: ${log.proposal.reasoning}\n- Mutations: ${log.proposal.mutations.length} change(s)\n`
      );
    }
    return fragments;
  });
  for (let dropped = 0; dropped < total; dropped++) {
    const pieces = ["## Iteration History\n"];
    if (dropped > 0) {
      pieces.push(`(Showing ${total - dropped}/${total} most recent iterations)\n`);
    }
    pieces.push(...fragmentsPerLog.slice(dropped).flat());
    const text = pieces.join("\n");
    if (text.length <= budget) return text;
  }
  return "## Iteration History\n\n(History omitted to fit context budget)\n";
}
4629
4778
  function parseProposerResponse(raw) {
4630
4779
  let cleaned = raw.trim();
@@ -4836,7 +4985,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4836
4985
  harnessPath,
4837
4986
  workspacePath,
4838
4987
  iter,
4839
- kairnConfig
4988
+ kairnConfig,
4989
+ onProgress
4840
4990
  );
4841
4991
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
4842
4992
  if (iter === 0) baselineScore = aggregate;
@@ -4914,7 +5064,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4914
5064
  kairnConfig,
4915
5065
  evolveConfig.proposerModel
4916
5066
  );
4917
- } catch {
5067
+ } catch (err) {
5068
+ const errMsg = err instanceof Error ? err.message : String(err);
5069
+ onProgress?.({
5070
+ type: "proposer-error",
5071
+ iteration: iter,
5072
+ message: `Proposer failed: ${errMsg}`
5073
+ });
4918
5074
  const nextIterDir2 = path21.join(
4919
5075
  workspacePath,
4920
5076
  "iterations",
@@ -4978,6 +5134,215 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4978
5134
  };
4979
5135
  }
4980
5136
 
5137
+ // src/evolve/report.ts
5138
+ import fs22 from "fs/promises";
5139
+ import path22 from "path";
5140
+
5141
+ // src/evolve/diagnosis.ts
5142
/**
 * Reduce a task score record to a number: the explicit `score` when present,
 * otherwise 100 for a pass and 0 for a fail.
 *
 * @param {{score?: number, pass?: boolean}} s - Task score record.
 * @returns {number}
 */
function numericScore(s) {
  return s.score ?? (s.pass ? 100 : 0);
}

/**
 * Attribute per-task score changes between consecutive iterations to the
 * proposal recorded on the earlier one.
 *
 * For each adjacent pair (prev, curr), the pair is skipped unless `prev` has
 * a proposal with at least one mutation. A task missing from either side is
 * scored as 0 on that side.
 *
 * @param {Array<object>} iterations - Iteration logs in ascending order.
 * @param {Array<object>} _tasks - Unused; kept for interface compatibility.
 * @returns {{entries: Array<object>}} One entry per diagnosed iteration with
 *   the mutation summary, helped/hurt tasks and the summed score delta.
 */
function diagnoseCounterfactuals(iterations, _tasks) {
  const entries = [];
  for (let i = 1; i < iterations.length; i++) {
    const prev = iterations[i - 1];
    const curr = iterations[i];
    const mutations = prev.proposal?.mutations ?? [];
    if (mutations.length === 0) continue;
    const mutationSummary = mutations.map((m) => `${m.action} in ${m.file}: ${m.rationale}`).join("; ");
    const helpedTasks = [];
    const hurtTasks = [];
    const allTaskIds = new Set([
      ...Object.keys(prev.taskResults),
      ...Object.keys(curr.taskResults)
    ]);
    let netDelta = 0;
    for (const taskId of allTaskIds) {
      const prevScore = prev.taskResults[taskId] ? numericScore(prev.taskResults[taskId]) : 0;
      const currScore = curr.taskResults[taskId] ? numericScore(curr.taskResults[taskId]) : 0;
      const delta = currScore - prevScore;
      if (delta > 0) {
        helpedTasks.push({ taskId, delta });
      } else if (delta < 0) {
        hurtTasks.push({ taskId, delta });
      }
      netDelta += delta;
    }
    entries.push({
      // Report the iteration's own number; the array index is only right when
      // the loaded iterations are contiguous and start at 0.
      iteration: curr.iteration ?? i,
      mutationSummary,
      helpedTasks,
      hurtTasks,
      netScoreDelta: netDelta
    });
  }
  return { entries };
}
5182
+
5183
+ // src/evolve/report.ts
5184
+ import { parse as yamlParse } from "yaml";
5185
/**
 * Numeric value of a task score record: its `score` when that is neither
 * null nor undefined, else 100 for a pass and 0 otherwise.
 *
 * @param {{score?: number, pass?: boolean}} s - Task score record.
 * @returns {number}
 */
function numericScore2(s) {
  const explicit = s.score;
  if (explicit !== null && explicit !== undefined) {
    return explicit;
  }
  return s.pass ? 100 : 0;
}
5188
/**
 * Load every iteration log found under `<workspacePath>/iterations/`, in
 * ascending iteration order.
 *
 * Only entries whose name consists solely of decimal digits are treated as
 * iterations, so stray names such as `3-backup` are not mistaken for
 * iteration 3. Iterations whose log cannot be loaded are omitted.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<Array<object>>} Loaded logs; empty when the iterations
 *   directory cannot be read.
 */
async function loadAllIterations(workspacePath) {
  const iterDir = path22.join(workspacePath, "iterations");
  let entries;
  try {
    entries = await fs22.readdir(iterDir);
  } catch {
    return [];
  }
  // The Set collapses names that denote the same number (e.g. "1" and "01").
  const iterNums = [
    ...new Set(entries.filter((e) => /^\d+$/.test(e)).map((e) => Number.parseInt(e, 10)))
  ].sort((a, b) => a - b);
  const iterations = [];
  for (const n of iterNums) {
    const log = await loadIterationLog(workspacePath, n);
    if (log) iterations.push(log);
  }
  return iterations;
}
5204
/**
 * Read the task list from `<workspacePath>/tasks.yaml`.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<Array<object>>} The `tasks` array from the file, or an
 *   empty array when the file is missing, cannot be parsed, or its `tasks`
 *   entry is not an array.
 */
async function loadTasks(workspacePath) {
  try {
    const content = await fs22.readFile(path22.join(workspacePath, "tasks.yaml"), "utf-8");
    const parsed = yamlParse(content);
    // Callers iterate the result with array methods, so never hand back a
    // scalar or mapping from a malformed file.
    return Array.isArray(parsed?.tasks) ? parsed.tasks : [];
  } catch {
    return [];
  }
}
5213
/**
 * Build one leaderboard row per task: its score in every iteration that has
 * a result for it, plus the best score and the iteration that first reached
 * it.
 *
 * A task with no result in any iteration keeps `bestScore: -1` and
 * `bestIteration: 0`.
 *
 * @param {Array<object>} iterations - Iteration logs.
 * @param {Array<{id: string}>} tasks - Task definitions; row order follows it.
 * @returns {Array<{taskId: string, scores: object, bestIteration: number, bestScore: number}>}
 */
function buildLeaderboard(iterations, tasks) {
  const rows = [];
  for (const task of tasks) {
    const taskId = task.id;
    const scores = {};
    let bestScore = -1;
    let bestIteration = 0;
    iterations.forEach((iter) => {
      const result = iter.taskResults[taskId];
      if (!result) return;
      // Same rule as numericScore2: explicit score, else pass => 100, fail => 0.
      const value = result.score ?? (result.pass ? 100 : 0);
      scores[iter.iteration] = value;
      if (value > bestScore) {
        bestScore = value;
        bestIteration = iter.iteration;
      }
    });
    rows.push({ taskId, scores, bestIteration, bestScore });
  }
  return rows;
}
5233
/**
 * Classify an iteration for the report tables.
 *
 * Checked in this order: iteration 0 is "baseline"; an iteration with neither
 * a proposal nor a diff patch is "rollback"; a score of 100 or more is
 * "perfect"; the best-scoring iteration is "best"; anything else is
 * "evaluated".
 *
 * @param {object} iter - Iteration log.
 * @param {number} bestIteration - Number of the best-scoring iteration.
 * @returns {string} Status label.
 */
function iterationStatus(iter, bestIteration) {
  const isBaseline = iter.iteration === 0;
  if (isBaseline) {
    return "baseline";
  }
  const hasChange = Boolean(iter.proposal) || Boolean(iter.diffPatch);
  if (!hasChange) {
    return "rollback";
  }
  if (iter.score >= 100) {
    return "perfect";
  }
  return iter.iteration === bestIteration ? "best" : "evaluated";
}
5240
/**
 * Produce the Markdown evolution report for a workspace.
 *
 * Sections, in order: overview table, per-iteration table, per-task
 * leaderboard (only when there are tasks) and counterfactual diagnosis (only
 * when there are entries). A task that has no score in any iteration shows
 * "-" in the "Best" column.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<string>} Report text; a short "no iterations" notice when
 *   nothing has been run yet.
 */
async function generateMarkdownReport(workspacePath) {
  const iterations = await loadAllIterations(workspacePath);
  const tasks = await loadTasks(workspacePath);
  if (iterations.length === 0) {
    return "# Evolution Report\n\nNo iterations found. Run `kairn evolve run` first.\n";
  }
  const baselineScore = iterations[0].score;
  const bestIter = iterations.reduce((best, curr) => (curr.score > best.score ? curr : best), iterations[0]);
  const improvement = bestIter.score - baselineScore;
  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
  const leaderboard = buildLeaderboard(iterations, tasks);
  const lines = [
    "# Evolution Report",
    "",
    "## Overview",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Total iterations | ${iterations.length} |`,
    `| Baseline score | ${baselineScore.toFixed(1)}% |`,
    `| Best score | ${bestIter.score.toFixed(1)}% |`,
    `| Best iteration | ${bestIter.iteration} |`,
    `| Improvement | ${improvement >= 0 ? "+" : ""}${improvement.toFixed(1)} points |`,
    "",
    "## Iterations",
    "",
    "| Iter | Score | Mutations | Status |",
    "|------|-------|-----------|--------|"
  ];
  for (const iter of iterations) {
    const mutations = iter.proposal?.mutations.length ?? 0;
    const mutStr = mutations > 0 ? mutations.toString() : "-";
    const status = iterationStatus(iter, bestIter.iteration);
    lines.push(`| ${iter.iteration} | ${iter.score.toFixed(1)}% | ${mutStr} | ${status} |`);
  }
  lines.push("");
  if (leaderboard.length > 0) {
    lines.push("## Leaderboard", "");
    const iterNums = iterations.map((i) => i.iteration);
    const headerCols = ["Task", ...iterNums.map((n) => `Iter ${n}`), "Best"];
    lines.push(`| ${headerCols.join(" | ")} |`);
    lines.push(`| ${headerCols.map(() => "---").join(" | ")} |`);
    for (const entry of leaderboard) {
      const scoreCols = iterNums.map((n) => {
        const s = entry.scores[n];
        return s !== undefined ? `${s.toFixed(0)}%` : "-";
      });
      // A negative bestScore is the leaderboard's "never scored" marker, not a
      // real result, so render it like any other missing cell.
      const bestCol = entry.bestScore >= 0 ? `${entry.bestScore.toFixed(0)}% (iter ${entry.bestIteration})` : "-";
      lines.push(`| ${entry.taskId} | ${scoreCols.join(" | ")} | ${bestCol} |`);
    }
    lines.push("");
  }
  if (counterfactuals.entries.length > 0) {
    lines.push("## Counterfactual Diagnosis", "");
    for (const entry of counterfactuals.entries) {
      const sign = entry.netScoreDelta >= 0 ? "+" : "";
      lines.push(`### Iteration ${entry.iteration} (net ${sign}${entry.netScoreDelta.toFixed(1)} points)`);
      lines.push("");
      lines.push(`**Mutations:** ${entry.mutationSummary}`);
      lines.push("");
      if (entry.helpedTasks.length > 0) {
        lines.push("**Helped:**");
        for (const t of entry.helpedTasks) {
          lines.push(`- ${t.taskId}: +${t.delta.toFixed(1)}`);
        }
        lines.push("");
      }
      if (entry.hurtTasks.length > 0) {
        lines.push("**Hurt:**");
        for (const t of entry.hurtTasks) {
          // Deltas here are negative, so the sign is already part of the number.
          lines.push(`- ${t.taskId}: ${t.delta.toFixed(1)}`);
        }
        lines.push("");
      }
    }
  }
  return lines.join("\n");
}
5318
/**
 * Produce the evolution report for a workspace as a plain object: overview
 * numbers, one summary per iteration, the per-task leaderboard and the
 * counterfactual diagnosis.
 *
 * With no iterations on disk, the baseline score, best score and best
 * iteration are all reported as 0.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<object>} Report data.
 */
async function generateJsonReport(workspacePath) {
  const iterations = await loadAllIterations(workspacePath);
  const tasks = await loadTasks(workspacePath);
  let baselineScore = 0;
  let bestIter = { score: 0, iteration: 0 };
  if (iterations.length > 0) {
    baselineScore = iterations[0].score;
    bestIter = iterations[0];
    // Strict ">" keeps the earliest iteration when scores tie.
    for (const candidate of iterations) {
      if (candidate.score > bestIter.score) {
        bestIter = candidate;
      }
    }
  }
  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
  const leaderboard = buildLeaderboard(iterations, tasks);
  const overview = {
    title: "Evolution Report",
    totalIterations: iterations.length,
    baselineScore,
    bestScore: bestIter.score,
    bestIteration: bestIter.iteration,
    improvement: bestIter.score - baselineScore
  };
  const iterationSummaries = iterations.map((iter) => {
    const mutationCount = iter.proposal?.mutations.length ?? 0;
    return {
      iteration: iter.iteration,
      score: iter.score,
      mutationCount,
      status: iterationStatus(iter, bestIter.iteration)
    };
  });
  return {
    overview,
    iterations: iterationSummaries,
    leaderboard,
    counterfactuals
  };
}
5345
+
4981
5346
  // src/commands/evolve.ts
4982
5347
  var DEFAULT_CONFIG = {
4983
5348
  model: "claude-sonnet-4-6",
@@ -4988,8 +5353,8 @@ var DEFAULT_CONFIG = {
4988
5353
  };
4989
5354
  async function loadEvolveConfigFromWorkspace(workspacePath) {
4990
5355
  try {
4991
- const configStr = await fs22.readFile(path22.join(workspacePath, "config.yaml"), "utf-8");
4992
- const parsed = yamlParse(configStr);
5356
+ const configStr = await fs23.readFile(path23.join(workspacePath, "config.yaml"), "utf-8");
5357
+ const parsed = yamlParse2(configStr);
4993
5358
  return {
4994
5359
  model: parsed.model ?? DEFAULT_CONFIG.model,
4995
5360
  proposerModel: parsed.proposer_model ?? DEFAULT_CONFIG.proposerModel,
@@ -5006,9 +5371,9 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
5006
5371
  try {
5007
5372
  const projectRoot = process.cwd();
5008
5373
  console.log(ui.section("Evolve Init"));
5009
- const claudeDir = path22.join(projectRoot, ".claude");
5374
+ const claudeDir = path23.join(projectRoot, ".claude");
5010
5375
  try {
5011
- await fs22.access(claudeDir);
5376
+ await fs23.access(claudeDir);
5012
5377
  } catch {
5013
5378
  console.log(ui.error("No .claude/ directory found. Run kairn describe first."));
5014
5379
  process.exit(1);
@@ -5058,7 +5423,7 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
5058
5423
  if (config) {
5059
5424
  let claudeMd = "";
5060
5425
  try {
5061
- claudeMd = await fs22.readFile(path22.join(claudeDir, "CLAUDE.md"), "utf-8");
5426
+ claudeMd = await fs23.readFile(path23.join(claudeDir, "CLAUDE.md"), "utf-8");
5062
5427
  } catch {
5063
5428
  }
5064
5429
  const profile = await buildProjectProfile(projectRoot);
@@ -5089,16 +5454,16 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
5089
5454
  evolveCommand.command("baseline").description("Snapshot current .claude/ directory as baseline").action(async () => {
5090
5455
  try {
5091
5456
  const projectRoot = process.cwd();
5092
- const workspace = path22.join(projectRoot, ".kairn-evolve");
5457
+ const workspace = path23.join(projectRoot, ".kairn-evolve");
5093
5458
  console.log(ui.section("Evolve Baseline"));
5094
5459
  try {
5095
- await fs22.access(workspace);
5460
+ await fs23.access(workspace);
5096
5461
  } catch {
5097
5462
  console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
5098
5463
  process.exit(1);
5099
5464
  }
5100
5465
  await snapshotBaseline(projectRoot, workspace);
5101
- const baselineDir = path22.join(workspace, "baseline");
5466
+ const baselineDir = path23.join(workspace, "baseline");
5102
5467
  const fileCount = await countFiles(baselineDir);
5103
5468
  console.log(ui.success(`Baseline snapshot created (${fileCount} files)`));
5104
5469
  } catch (err) {
@@ -5110,23 +5475,23 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
5110
5475
  evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").action(async (options) => {
5111
5476
  try {
5112
5477
  const projectRoot = process.cwd();
5113
- const workspace = path22.join(projectRoot, ".kairn-evolve");
5478
+ const workspace = path23.join(projectRoot, ".kairn-evolve");
5114
5479
  console.log(ui.section("Evolve Run"));
5115
5480
  try {
5116
- await fs22.access(workspace);
5481
+ await fs23.access(workspace);
5117
5482
  } catch {
5118
5483
  console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
5119
5484
  process.exit(1);
5120
5485
  }
5121
- const tasksPath = path22.join(workspace, "tasks.yaml");
5486
+ const tasksPath = path23.join(workspace, "tasks.yaml");
5122
5487
  let tasksContent;
5123
5488
  try {
5124
- tasksContent = await fs22.readFile(tasksPath, "utf-8");
5489
+ tasksContent = await fs23.readFile(tasksPath, "utf-8");
5125
5490
  } catch {
5126
5491
  console.log(ui.error("No tasks.yaml found. Run kairn evolve init first."));
5127
5492
  process.exit(1);
5128
5493
  }
5129
- const parsed = yamlParse(tasksContent);
5494
+ const parsed = yamlParse2(tasksContent);
5130
5495
  if (!parsed?.tasks || parsed.tasks.length === 0) {
5131
5496
  console.log(ui.error("No tasks found in tasks.yaml"));
5132
5497
  process.exit(1);
@@ -5140,15 +5505,15 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5140
5505
  console.log(ui.info(`Running ${tasksToRun.length} task(s)...`));
5141
5506
  console.log("");
5142
5507
  const config = await loadConfig();
5143
- const harnessPath = path22.join(projectRoot, ".claude");
5508
+ const harnessPath = path23.join(projectRoot, ".claude");
5144
5509
  const results = [];
5145
5510
  for (const task of tasksToRun) {
5146
- const traceDir = path22.join(workspace, "traces", "0", task.id);
5511
+ const traceDir = path23.join(workspace, "traces", "0", task.id);
5147
5512
  const spinner = ora2(`Running: ${task.id}`).start();
5148
5513
  const result = await runTask(task, harnessPath, traceDir, 0);
5149
5514
  if (config) {
5150
- const stdout = await fs22.readFile(path22.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
5151
- const stderr = await fs22.readFile(path22.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
5515
+ const stdout = await fs23.readFile(path23.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
5516
+ const stderr = await fs23.readFile(path23.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
5152
5517
  const score = await scoreTask(task, traceDir, stdout, stderr, config);
5153
5518
  result.score = score;
5154
5519
  await writeScore(traceDir, score);
@@ -5177,7 +5542,7 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5177
5542
  }
5178
5543
  evolveConfig.maxIterations = iterations;
5179
5544
  try {
5180
- await fs22.access(path22.join(workspace, "iterations", "0", "harness"));
5545
+ await fs23.access(path23.join(workspace, "iterations", "0", "harness"));
5181
5546
  } catch {
5182
5547
  console.log(ui.error("No baseline harness found. Run kairn evolve baseline first."));
5183
5548
  process.exit(1);
@@ -5204,6 +5569,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5204
5569
  case "perfect-score":
5205
5570
  console.log(chalk14.green(" Perfect score. Stopping."));
5206
5571
  break;
5572
+ case "proposer-error":
5573
+ console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
5574
+ break;
5575
+ case "task-start":
5576
+ console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
5577
+ break;
5578
+ case "task-scored": {
5579
+ const taskScore = event.score ?? 0;
5580
+ const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
5581
+ console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
5582
+ break;
5583
+ }
5207
5584
  case "complete":
5208
5585
  break;
5209
5586
  }
@@ -5238,13 +5615,107 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5238
5615
  process.exit(1);
5239
5616
  }
5240
5617
  });
5618
+ evolveCommand.command("report").description("Generate a summary report of the evolution run").option("--json", "Output machine-readable JSON instead of Markdown").action(async (options) => {
5619
+ try {
5620
+ const projectRoot = process.cwd();
5621
+ const workspace = path23.join(projectRoot, ".kairn-evolve");
5622
+ try {
5623
+ await fs23.access(workspace);
5624
+ } catch {
5625
+ console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
5626
+ process.exit(1);
5627
+ }
5628
+ if (options.json) {
5629
+ const report = await generateJsonReport(workspace);
5630
+ console.log(JSON.stringify(report, null, 2));
5631
+ } else {
5632
+ const markdown = await generateMarkdownReport(workspace);
5633
+ console.log(markdown);
5634
+ }
5635
+ } catch (err) {
5636
+ const msg = err instanceof Error ? err.message : String(err);
5637
+ console.log(ui.error(msg));
5638
+ process.exit(1);
5639
+ }
5640
+ });
5641
+ evolveCommand.command("diff <iter1> <iter2>").description("Show harness changes between two iterations").action(async (iter1Str, iter2Str) => {
5642
+ try {
5643
+ const projectRoot = process.cwd();
5644
+ const workspace = path23.join(projectRoot, ".kairn-evolve");
5645
+ const iter1 = parseInt(iter1Str, 10);
5646
+ const iter2 = parseInt(iter2Str, 10);
5647
+ if (isNaN(iter1) || isNaN(iter2)) {
5648
+ console.log(ui.error("Both arguments must be integers (iteration numbers)"));
5649
+ process.exit(1);
5650
+ }
5651
+ const harness1 = path23.join(workspace, "iterations", iter1.toString(), "harness");
5652
+ const harness2 = path23.join(workspace, "iterations", iter2.toString(), "harness");
5653
+ try {
5654
+ await fs23.access(harness1);
5655
+ } catch {
5656
+ console.log(ui.error(`Iteration ${iter1} harness not found at ${harness1}`));
5657
+ process.exit(1);
5658
+ }
5659
+ try {
5660
+ await fs23.access(harness2);
5661
+ } catch {
5662
+ console.log(ui.error(`Iteration ${iter2} harness not found at ${harness2}`));
5663
+ process.exit(1);
5664
+ }
5665
+ console.log(ui.section(`Diff: Iteration ${iter1} \u2192 ${iter2}`));
5666
+ const diffPatch = await generateDiff2(harness1, harness2);
5667
+ if (!diffPatch) {
5668
+ console.log(chalk14.dim(" No harness changes between these iterations."));
5669
+ } else {
5670
+ for (const line of diffPatch.split("\n")) {
5671
+ if (line.startsWith("---") || line.startsWith("+++")) {
5672
+ console.log(chalk14.bold(line));
5673
+ } else if (line.startsWith("+")) {
5674
+ console.log(chalk14.green(line));
5675
+ } else if (line.startsWith("-")) {
5676
+ console.log(chalk14.red(line));
5677
+ } else {
5678
+ console.log(line);
5679
+ }
5680
+ }
5681
+ }
5682
+ const [log1, log2] = await Promise.all([
5683
+ loadIterationLog(workspace, iter1),
5684
+ loadIterationLog(workspace, iter2)
5685
+ ]);
5686
+ if (log1 && log2) {
5687
+ console.log("");
5688
+ console.log(ui.section("Score Comparison"));
5689
+ console.log("");
5690
+ console.log(" Task Iter " + iter1 + " Iter " + iter2 + " Delta");
5691
+ const allTaskIds = /* @__PURE__ */ new Set([
5692
+ ...Object.keys(log1.taskResults),
5693
+ ...Object.keys(log2.taskResults)
5694
+ ]);
5695
+ for (const taskId of [...allTaskIds].sort()) {
5696
+ const s1 = log1.taskResults[taskId];
5697
+ const s2 = log2.taskResults[taskId];
5698
+ const score1 = s1 ? s1.score ?? (s1.pass ? 100 : 0) : 0;
5699
+ const score2 = s2 ? s2.score ?? (s2.pass ? 100 : 0) : 0;
5700
+ const delta = score2 - score1;
5701
+ const deltaStr = delta > 0 ? chalk14.green(`+${delta.toFixed(0)}`) : delta < 0 ? chalk14.red(delta.toFixed(0).toString()) : chalk14.dim("0");
5702
+ const name = taskId.padEnd(30);
5703
+ console.log(` ${name} ${score1.toFixed(0).padStart(5)}% ${score2.toFixed(0).padStart(5)}% ${deltaStr}`);
5704
+ }
5705
+ }
5706
+ } catch (err) {
5707
+ const msg = err instanceof Error ? err.message : String(err);
5708
+ console.log(ui.error(msg));
5709
+ process.exit(1);
5710
+ }
5711
+ });
5241
5712
  async function countFiles(dir) {
5242
5713
  let count = 0;
5243
5714
  try {
5244
- const entries = await fs22.readdir(dir, { withFileTypes: true });
5715
+ const entries = await fs23.readdir(dir, { withFileTypes: true });
5245
5716
  for (const entry of entries) {
5246
5717
  if (entry.isDirectory()) {
5247
- count += await countFiles(path22.join(dir, entry.name));
5718
+ count += await countFiles(path23.join(dir, entry.name));
5248
5719
  } else {
5249
5720
  count++;
5250
5721
  }