kairn-cli 2.2.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3741,30 +3741,55 @@ var EVAL_TEMPLATES = {
3741
3741
  name: "Documentation",
3742
3742
  description: "Can the agent write and update docs?",
3743
3743
  bestFor: ["content", "api-building", "full-stack"]
3744
+ },
3745
+ "convention-adherence": {
3746
+ id: "convention-adherence",
3747
+ name: "Convention Adherence",
3748
+ description: "Does the agent follow all project conventions defined in CLAUDE.md?",
3749
+ bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
3750
+ },
3751
+ "workflow-compliance": {
3752
+ id: "workflow-compliance",
3753
+ name: "Workflow Compliance",
3754
+ description: "Does the agent use the project workflow commands and skills?",
3755
+ bestFor: ["feature-development", "full-stack", "tdd", "qa"]
3756
+ },
3757
+ "rule-compliance": {
3758
+ id: "rule-compliance",
3759
+ name: "Rule Compliance",
3760
+ description: "Does the agent follow all project rules without violations?",
3761
+ bestFor: ["feature-development", "backend", "maintenance", "architecture"]
3744
3762
  }
3745
3763
  };
3746
3764
  function selectTemplatesForWorkflow(workflowType) {
3747
3765
  const mapping = {
3748
- "feature-development": ["add-feature", "test-writing", "documentation"],
3749
- "api-building": ["add-feature", "fix-bug", "test-writing"],
3750
- "full-stack": ["add-feature", "fix-bug", "test-writing"],
3751
- "maintenance": ["fix-bug", "refactor", "test-writing"],
3752
- "debugging": ["fix-bug", "test-writing"],
3753
- "qa": ["fix-bug", "test-writing", "add-feature"],
3754
- "architecture": ["refactor", "test-writing", "config-change"],
3755
- "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
3756
- "devops": ["config-change", "fix-bug"],
3757
- "infrastructure": ["config-change", "refactor"],
3758
- "tdd": ["test-writing", "add-feature", "fix-bug"],
3759
- "content": ["documentation", "add-feature"],
3760
- "research": ["documentation", "add-feature"]
3766
+ "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
3767
+ "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3768
+ "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3769
+ "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
3770
+ "debugging": ["fix-bug", "test-writing", "rule-compliance"],
3771
+ "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
3772
+ "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
3773
+ "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
3774
+ "devops": ["config-change", "fix-bug", "rule-compliance"],
3775
+ "infrastructure": ["config-change", "refactor", "convention-adherence"],
3776
+ "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
3777
+ "content": ["documentation", "add-feature", "convention-adherence"],
3778
+ "research": ["documentation", "add-feature", "convention-adherence"]
3761
3779
  };
3762
- return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
3780
+ return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
3763
3781
  }
3764
3782
  var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
3765
3783
 
3766
3784
  Each task must be realistic and testable against the actual project. Avoid generic placeholders.
3767
3785
 
3786
+ IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
3787
+ - convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
3788
+ - workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
3789
+ - rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
3790
+
3791
+ These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
3792
+
3768
3793
  Return a JSON object with a "tasks" array. Each task has:
3769
3794
  - id: kebab-case identifier (e.g., "add-health-endpoint")
3770
3795
  - template: which eval template this instantiates
@@ -4190,7 +4215,8 @@ ${msg}`);
4190
4215
  details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
4191
4216
  };
4192
4217
  }
4193
- const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
4218
+ const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
4219
+ const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
4194
4220
  const passed = !hasErrors;
4195
4221
  return {
4196
4222
  pass: passed,
@@ -4318,24 +4344,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
4318
4344
 
4319
4345
  // src/evolve/runner.ts
4320
4346
  var execAsync2 = promisify2(exec2);
4321
- async function runTask(task, harnessPath, traceDir, iteration) {
4347
+ var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
4348
/**
 * Create a throwaway workspace for one task run, with the harness installed
 * as its `.claude` directory.
 *
 * Strategy:
 *  1. If `projectRoot` is inside a git work tree, add a detached worktree of
 *     HEAD under the OS temp directory.
 *  2. Otherwise, or if any step of (1) fails, copy the project into a fresh
 *     temp directory with `copyProjectDir`.
 * In both cases any existing `.claude` in the workspace is removed and
 * replaced by a copy of `harnessPath`.
 *
 * @param {string} projectRoot - Directory of the project to isolate.
 * @param {string} harnessPath - Directory whose contents become `.claude`.
 * @returns {Promise<{workDir: string, isWorktree: boolean}>} The workspace
 *   path and whether it was created through `git worktree`.
 * @throws Rethrows the underlying error when the copy fallback itself fails;
 *   the partially populated temp directory is removed first.
 */
async function createIsolatedWorkspace(projectRoot, harnessPath) {
  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
  let worktreeAttempted = false;
  try {
    await execAsync2("git rev-parse --is-inside-work-tree", {
      cwd: projectRoot,
      timeout: 5e3
    });
    worktreeAttempted = true;
    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
      cwd: projectRoot,
      timeout: 3e4
    });
    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
    return { workDir: worktreeDir, isWorktree: true };
  } catch {
    // Fall back to a plain copy below. If the worktree was (or may have been)
    // created before the failure, remove it so neither the temp directory nor
    // the repository's worktree registration is left behind.
    if (worktreeAttempted) {
      await fs18.rm(worktreeDir, { recursive: true, force: true }).catch(() => {
      });
      await execAsync2("git worktree prune", {
        cwd: projectRoot,
        timeout: 5e3
      }).catch(() => {
      });
    }
  }
  const copyRoot = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
  try {
    await copyProjectDir(projectRoot, copyRoot);
    await fs18.rm(path18.join(copyRoot, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(copyRoot, ".claude"));
  } catch (err) {
    // Nothing will ever clean this directory up if we fail here, so do it now.
    await fs18.rm(copyRoot, { recursive: true, force: true }).catch(() => {
    });
    throw err;
  }
  return { workDir: copyRoot, isWorktree: false };
}
4371
/**
 * Copy the entries of `src` into `dest`, leaving out any top-level entry whose
 * name is in COPY_SKIP_DIRS.
 *
 * Directories are delegated to `copyDir`; regular files are copied with
 * `fs.copyFile`. Symbolic links are resolved first so that a link to a
 * directory is copied as a directory instead of being passed to `copyFile`
 * (which would fail), and a dangling link is skipped. Entries that are
 * neither files nor directories are skipped.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dest - Directory to copy into; created if missing.
 * @returns {Promise<void>}
 */
async function copyProjectDir(src, dest) {
  await fs18.mkdir(dest, { recursive: true });
  let entries;
  try {
    entries = await fs18.readdir(src, { withFileTypes: true });
  } catch {
    // Best effort: an unreadable or missing source leaves `dest` created but empty.
    return;
  }
  for (const entry of entries) {
    if (COPY_SKIP_DIRS.has(entry.name)) continue;
    const srcPath = path18.join(src, entry.name);
    const destPath = path18.join(dest, entry.name);
    let isDir = entry.isDirectory();
    let isFile = entry.isFile();
    if (entry.isSymbolicLink()) {
      // Dirent reports the link itself; stat() follows it to the target.
      const target = await fs18.stat(srcPath).catch(() => null);
      if (target === null) continue;
      isDir = target.isDirectory();
      isFile = target.isFile();
    }
    if (isDir) {
      await copyDir(srcPath, destPath);
    } else if (isFile) {
      await fs18.copyFile(srcPath, destPath);
    }
  }
}
4390
/**
 * Dispose of a workspace produced by `createIsolatedWorkspace`.
 *
 * A copied workspace is simply deleted. A git worktree is removed through
 * `git worktree remove --force`; if that command fails, the directory is
 * deleted directly and `git worktree prune` is run. Failures of the deletion
 * and of the prune are ignored.
 *
 * @param {string} workDir - Workspace directory to remove.
 * @param {boolean} isWorktree - Whether `workDir` was created via `git worktree`.
 * @param {string} projectRoot - Repository directory the git commands run in.
 * @returns {Promise<void>}
 */
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
  const ignoreFailure = () => {
  };
  const deleteWorkDir = () => fs18.rm(workDir, { recursive: true, force: true }).catch(ignoreFailure);

  if (!isWorktree) {
    await deleteWorkDir();
    return;
  }

  const removeOptions = { cwd: projectRoot, timeout: 1e4 };
  try {
    await execAsync2(`git worktree remove "${workDir}" --force`, removeOptions);
  } catch {
    // git could not remove it: delete the directory ourselves, then let git
    // drop the stale worktree record.
    await deleteWorkDir();
    const pruneOptions = { cwd: projectRoot, timeout: 5e3 };
    await execAsync2("git worktree prune", pruneOptions).catch(ignoreFailure);
  }
}
4411
+ async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
4322
4412
  await fs18.mkdir(traceDir, { recursive: true });
4323
4413
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4324
4414
  const startMs = Date.now();
4325
- const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
4415
+ const root = projectRoot ?? process.cwd();
4416
+ const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
4326
4417
  try {
4327
- await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
4328
4418
  let setupStderr = "";
4329
4419
  if (task.setup.trim()) {
4330
4420
  try {
4331
- await execAsync2(task.setup, { cwd: tmpDir, timeout: 6e4 });
4421
+ await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
4332
4422
  } catch (err) {
4333
4423
  setupStderr = err instanceof Error ? err.message : String(err);
4334
4424
  }
4335
4425
  }
4336
- const filesBefore = await snapshotFileList(tmpDir);
4337
- const spawnResult = await spawnClaude(task.description, tmpDir, task.timeout);
4338
- const filesAfter = await snapshotFileList(tmpDir);
4426
+ const filesBefore = await snapshotFileList(workDir);
4427
+ const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
4428
+ const filesAfter = await snapshotFileList(workDir);
4339
4429
  const filesChanged = diffFileLists(filesBefore, filesAfter);
4340
4430
  const toolCalls = parseToolCalls(spawnResult.stdout);
4341
4431
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -4359,8 +4449,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
4359
4449
  traceDir
4360
4450
  };
4361
4451
  } finally {
4362
- await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
4363
- });
4452
+ await cleanupIsolatedWorkspace(workDir, isWorktree, root);
4364
4453
  }
4365
4454
  }
4366
4455
  async function spawnClaude(instruction, cwd, timeoutSec) {
@@ -4458,8 +4547,9 @@ function parseToolCalls(stdout) {
4458
4547
  return [];
4459
4548
  }
4460
4549
  }
4461
- async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
4550
+ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
4462
4551
  const results = {};
4552
+ const projectRoot = path18.resolve(workspacePath, "..");
4463
4553
  for (const task of tasks) {
4464
4554
  const traceDir = path18.join(
4465
4555
  workspacePath,
@@ -4467,7 +4557,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4467
4557
  iteration.toString(),
4468
4558
  task.id
4469
4559
  );
4470
- const taskResult = await runTask(task, harnessPath, traceDir, iteration);
4560
+ onProgress?.({ type: "task-start", iteration, taskId: task.id });
4561
+ const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
4471
4562
  let score = taskResult.score;
4472
4563
  if (config) {
4473
4564
  const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
@@ -4476,6 +4567,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4476
4567
  await writeScore(traceDir, score);
4477
4568
  }
4478
4569
  results[task.id] = score;
4570
+ onProgress?.({
4571
+ type: "task-scored",
4572
+ iteration,
4573
+ taskId: task.id,
4574
+ score: score.score ?? (score.pass ? 100 : 0)
4575
+ });
4479
4576
  }
4480
4577
  const scores = Object.values(results);
4481
4578
  const total = scores.reduce(
@@ -4537,7 +4634,8 @@ Return a JSON object:
4537
4634
  - Prefer ADDITIVE changes over replacements when possible.
4538
4635
 
4539
4636
  Return ONLY valid JSON.`;
4540
- var STDOUT_TRUNCATION_LIMIT = 2e3;
4637
+ var STDOUT_TRUNCATION_LIMIT = 1e3;
4638
+ var MAX_CONTEXT_CHARS = 1e5;
4541
4639
  async function readHarnessFiles(harnessPath) {
4542
4640
  const result = {};
4543
4641
  async function walk(dir, prefix) {
@@ -4571,26 +4669,25 @@ function truncateStdout(stdout, limit) {
4571
4669
  ${stdout.slice(-limit)}`;
4572
4670
  }
4573
4671
  function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
4574
- const sections = [];
4575
- sections.push("## Current Harness Files\n");
4672
+ const harnessSection = ["## Current Harness Files\n"];
4576
4673
  const fileEntries = Object.entries(harnessFiles);
4577
4674
  if (fileEntries.length === 0) {
4578
- sections.push("(No harness files found)\n");
4675
+ harnessSection.push("(No harness files found)\n");
4579
4676
  } else {
4580
4677
  for (const [filePath, content] of fileEntries) {
4581
- sections.push(`### ${filePath}
4678
+ harnessSection.push(`### ${filePath}
4582
4679
  \`\`\`
4583
4680
  ${content}
4584
4681
  \`\`\`
4585
4682
  `);
4586
4683
  }
4587
4684
  }
4588
- sections.push("## Task Definitions\n");
4685
+ const taskSection = ["## Task Definitions\n"];
4589
4686
  if (tasks.length === 0) {
4590
- sections.push("(No tasks defined)\n");
4687
+ taskSection.push("(No tasks defined)\n");
4591
4688
  } else {
4592
4689
  for (const task of tasks) {
4593
- sections.push(
4690
+ taskSection.push(
4594
4691
  `### Task: ${task.id}
4595
4692
  - Template: ${task.template}
4596
4693
  - Description: ${task.description}
@@ -4600,15 +4697,27 @@ ${content}
4600
4697
  );
4601
4698
  }
4602
4699
  }
4603
- sections.push("## Execution Traces\n");
4604
- if (traces.length === 0) {
4605
- sections.push("(No traces available)\n");
4606
- } else {
4700
+ const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
4701
+ const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
4702
+ if (remainingBudget <= 0) {
4703
+ return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
4704
+ }
4705
+ const traceBudget = Math.floor(remainingBudget * 0.7);
4706
+ const historyBudget = remainingBudget - traceBudget;
4707
+ const traceSection = buildTraceSection(traces, traceBudget);
4708
+ const historySection = buildHistorySection(history, historyBudget);
4709
+ return fixedContent + "\n" + traceSection + "\n" + historySection;
4710
+ }
4711
+ function buildTraceSection(traces, budget) {
4712
+ if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4713
+ let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4714
+ for (let attempt = 0; attempt < 4; attempt++) {
4715
+ const parts = ["## Execution Traces\n"];
4607
4716
  for (const trace of traces) {
4608
4717
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4609
- const truncatedStdout = truncateStdout(trace.stdout, STDOUT_TRUNCATION_LIMIT);
4718
+ const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4610
4719
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
4611
- sections.push(
4720
+ parts.push(
4612
4721
  `### Trace: ${trace.taskId}
4613
4722
  - Pass: ${trace.score.pass}
4614
4723
  - Score: ${scoreNum}
@@ -4616,36 +4725,55 @@ ${content}
4616
4725
  ` : "") + `- Duration: ${trace.timing.durationMs}ms
4617
4726
  - Files changed:
4618
4727
  ${filesChangedList || " (none)"}
4619
- - Stdout (last ${STDOUT_TRUNCATION_LIMIT} chars):
4728
+ - Stdout (last ${stdoutLimit} chars):
4620
4729
  \`\`\`
4621
4730
  ${truncatedStdout}
4622
4731
  \`\`\`
4623
4732
  `
4624
4733
  );
4625
4734
  }
4735
+ const result = parts.join("\n");
4736
+ if (result.length <= budget) return result;
4737
+ stdoutLimit = Math.floor(stdoutLimit / 2);
4626
4738
  }
4627
- sections.push("## Iteration History\n");
4628
- if (history.length === 0) {
4629
- sections.push("(No previous iterations)\n");
4630
- } else {
4631
- for (const log of history) {
4739
+ const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
4740
+ for (const trace of traces) {
4741
+ const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4742
+ summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
4743
+ `);
4744
+ }
4745
+ return summary.join("\n");
4746
+ }
4747
+ function buildHistorySection(history, budget) {
4748
+ if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
4749
+ let entries = [...history];
4750
+ while (entries.length > 0) {
4751
+ const parts = ["## Iteration History\n"];
4752
+ if (entries.length < history.length) {
4753
+ parts.push(`(Showing ${entries.length}/${history.length} most recent iterations)
4754
+ `);
4755
+ }
4756
+ for (const log of entries) {
4632
4757
  const taskScores = Object.entries(log.taskResults).map(([id, s]) => ` - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
4633
- sections.push(
4758
+ parts.push(
4634
4759
  `### Iteration ${log.iteration} \u2014 Score: ${log.score}
4635
4760
  - Task results:
4636
4761
  ${taskScores}
4637
4762
  `
4638
4763
  );
4639
4764
  if (log.proposal) {
4640
- sections.push(
4765
+ parts.push(
4641
4766
  `- Proposal reasoning: ${log.proposal.reasoning}
4642
4767
  - Mutations: ${log.proposal.mutations.length} change(s)
4643
4768
  `
4644
4769
  );
4645
4770
  }
4646
4771
  }
4772
+ const result = parts.join("\n");
4773
+ if (result.length <= budget) return result;
4774
+ entries = entries.slice(1);
4647
4775
  }
4648
- return sections.join("\n");
4776
+ return "## Iteration History\n\n(History omitted to fit context budget)\n";
4649
4777
  }
4650
4778
  function parseProposerResponse(raw) {
4651
4779
  let cleaned = raw.trim();
@@ -4857,7 +4985,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4857
4985
  harnessPath,
4858
4986
  workspacePath,
4859
4987
  iter,
4860
- kairnConfig
4988
+ kairnConfig,
4989
+ onProgress
4861
4990
  );
4862
4991
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
4863
4992
  if (iter === 0) baselineScore = aggregate;
@@ -4935,7 +5064,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4935
5064
  kairnConfig,
4936
5065
  evolveConfig.proposerModel
4937
5066
  );
4938
- } catch {
5067
+ } catch (err) {
5068
+ const errMsg = err instanceof Error ? err.message : String(err);
5069
+ onProgress?.({
5070
+ type: "proposer-error",
5071
+ iteration: iter,
5072
+ message: `Proposer failed: ${errMsg}`
5073
+ });
4939
5074
  const nextIterDir2 = path21.join(
4940
5075
  workspacePath,
4941
5076
  "iterations",
@@ -5434,6 +5569,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5434
5569
  case "perfect-score":
5435
5570
  console.log(chalk14.green(" Perfect score. Stopping."));
5436
5571
  break;
5572
+ case "proposer-error":
5573
+ console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
5574
+ break;
5575
+ case "task-start":
5576
+ console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
5577
+ break;
5578
+ case "task-scored": {
5579
+ const taskScore = event.score ?? 0;
5580
+ const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
5581
+ console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
5582
+ break;
5583
+ }
5437
5584
  case "complete":
5438
5585
  break;
5439
5586
  }