npm - kairn-cli - Versions diffs - 2.2.0 → 2.2.2 - Mend

kairn-cli 2.2.0 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/cli.js CHANGED Viewed

@@ -1237,22 +1237,29 @@ function classifyError(err, provider) {
 }
 async function callLLM(config, userMessage, options) {
   const maxTokens = options.maxTokens ?? 8192;
-  const systemPrompt = options.systemPrompt;
+  const { systemPrompt } = options;
+  const jsonMode = options.jsonMode ?? false;
   const providerName = getProviderName(config.provider);
   if (config.provider === "anthropic") {
     const client2 = new Anthropic2({ apiKey: config.api_key });
+    const messages = [
+      { role: "user", content: userMessage }
+    ];
+    if (jsonMode) {
+      messages.push({ role: "assistant", content: "{" });
+    }
     try {
       const response = await client2.messages.create({
         model: config.model,
         max_tokens: maxTokens,
         system: systemPrompt,
-        messages: [{ role: "user", content: userMessage }]
+        messages
       });
       const textBlock = response.content.find((block) => block.type === "text");
       if (!textBlock || textBlock.type !== "text") {
         throw new Error("No text response from compiler LLM");
       }
-      return textBlock.text;
+      return jsonMode ? `{${textBlock.text}` : textBlock.text;
     } catch (err) {
       throw new Error(classifyError(err, providerName));
     }
@@ -1268,7 +1275,8 @@ async function callLLM(config, userMessage, options) {
       messages: [
         { role: "system", content: systemPrompt },
         { role: "user", content: userMessage }
-      ]
+      ],
+      ...jsonMode ? { response_format: { type: "json_object" } } : {}
     });
     const text = response.choices[0]?.message?.content;
     if (!text) {
@@ -3741,30 +3749,55 @@ var EVAL_TEMPLATES = {
     name: "Documentation",
     description: "Can the agent write and update docs?",
     bestFor: ["content", "api-building", "full-stack"]
+  },
+  "convention-adherence": {
+    id: "convention-adherence",
+    name: "Convention Adherence",
+    description: "Does the agent follow all project conventions defined in CLAUDE.md?",
+    bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
+  },
+  "workflow-compliance": {
+    id: "workflow-compliance",
+    name: "Workflow Compliance",
+    description: "Does the agent use the project workflow commands and skills?",
+    bestFor: ["feature-development", "full-stack", "tdd", "qa"]
+  },
+  "rule-compliance": {
+    id: "rule-compliance",
+    name: "Rule Compliance",
+    description: "Does the agent follow all project rules without violations?",
+    bestFor: ["feature-development", "backend", "maintenance", "architecture"]
   }
 };
 function selectTemplatesForWorkflow(workflowType) {
   const mapping = {
-    "feature-development": ["add-feature", "test-writing", "documentation"],
-    "api-building": ["add-feature", "fix-bug", "test-writing"],
-    "full-stack": ["add-feature", "fix-bug", "test-writing"],
-    "maintenance": ["fix-bug", "refactor", "test-writing"],
-    "debugging": ["fix-bug", "test-writing"],
-    "qa": ["fix-bug", "test-writing", "add-feature"],
-    "architecture": ["refactor", "test-writing", "config-change"],
-    "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
-    "devops": ["config-change", "fix-bug"],
-    "infrastructure": ["config-change", "refactor"],
-    "tdd": ["test-writing", "add-feature", "fix-bug"],
-    "content": ["documentation", "add-feature"],
-    "research": ["documentation", "add-feature"]
+    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
+    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
+    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
+    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
+    "debugging": ["fix-bug", "test-writing", "rule-compliance"],
+    "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
+    "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
+    "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
+    "devops": ["config-change", "fix-bug", "rule-compliance"],
+    "infrastructure": ["config-change", "refactor", "convention-adherence"],
+    "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
+    "content": ["documentation", "add-feature", "convention-adherence"],
+    "research": ["documentation", "add-feature", "convention-adherence"]
   };
-  return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
+  return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
 }
 var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
 Each task must be realistic and testable against the actual project. Avoid generic placeholders.
+IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
+- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
+- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
+- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
+These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
 Return a JSON object with a "tasks" array. Each task has:
 - id: kebab-case identifier (e.g., "add-health-endpoint")
 - template: which eval template this instantiates
@@ -4190,7 +4223,8 @@ ${msg}`);
       details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
     };
   }
-  const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
+  const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
+  const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
   const passed = !hasErrors;
   return {
     pass: passed,
@@ -4318,24 +4352,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
 // src/evolve/runner.ts
 var execAsync2 = promisify2(exec2);
-async function runTask(task, harnessPath, traceDir, iteration) {
+var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
+/**
+ * Build a throwaway working directory for one task run, with the candidate
+ * harness installed as its `.claude/` directory.
+ *
+ * Strategy:
+ *   1. If `projectRoot` is inside a git work tree, add a detached worktree of
+ *      HEAD under the OS temp dir and replace its `.claude/` with the harness.
+ *   2. If any step of (1) fails, fall back to copying the project into a
+ *      fresh temp dir (via copyProjectDir) and installing the harness there.
+ *
+ * @param {string} projectRoot - Directory the git commands run in and the copy source.
+ * @param {string} harnessPath - Directory copied to `<workDir>/.claude`.
+ * @returns {Promise<{workDir: string, isWorktree: boolean}>} The workspace path and
+ *   which strategy produced it (needed by cleanupIsolatedWorkspace).
+ * @throws Re-throws any error from the copy fallback, after removing the
+ *   partially populated temp dir.
+ */
+async function createIsolatedWorkspace(projectRoot, harnessPath) {
+  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
+  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
+  // Set just before `git worktree add` so a failure at or after that point
+  // (including a timeout that left a half-created worktree) gets cleaned up.
+  let worktreeAttempted = false;
+  try {
+    await execAsync2("git rev-parse --is-inside-work-tree", {
+      cwd: projectRoot,
+      timeout: 5e3
+    });
+    worktreeAttempted = true;
+    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
+      cwd: projectRoot,
+      timeout: 3e4
+    });
+    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
+    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
+    return { workDir: worktreeDir, isWorktree: true };
+  } catch {
+    // Worktree mode is best-effort: any failure falls through to copy mode.
+    // Remove whatever the attempt left behind first, so the fallback does not
+    // leak a registered worktree and its temp directory.
+    if (worktreeAttempted) {
+      await cleanupIsolatedWorkspace(worktreeDir, true, projectRoot);
+    }
+  }
+  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
+  try {
+    await copyProjectDir(projectRoot, tmpDir);
+    await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
+    await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
+  } catch (err) {
+    // The caller only learns about tmpDir from our return value, so on
+    // failure nobody else can remove it; do it here and surface the error.
+    await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
+    });
+    throw err;
+  }
+  return { workDir: tmpDir, isWorktree: false };
+}
+/**
+ * Copy the entries directly under `src` into `dest`, leaving out any entry
+ * whose name is in COPY_SKIP_DIRS.
+ *
+ * `dest` is created first. If `src` cannot be listed the function returns
+ * quietly with `dest` left empty. Directories are delegated to copyDir;
+ * every other entry goes through fs.copyFile. Entries are processed one at
+ * a time, in listing order.
+ *
+ * @param {string} src - Directory to read.
+ * @param {string} dest - Directory to create and fill.
+ * @returns {Promise<void>}
+ */
+async function copyProjectDir(src, dest) {
+  await fs18.mkdir(dest, { recursive: true });
+  let listing;
+  try {
+    listing = await fs18.readdir(src, { withFileTypes: true });
+  } catch {
+    // Unreadable source: nothing to copy.
+    return;
+  }
+  const wanted = listing.filter((item) => !COPY_SKIP_DIRS.has(item.name));
+  for (const item of wanted) {
+    const from = path18.join(src, item.name);
+    const to = path18.join(dest, item.name);
+    if (!item.isDirectory()) {
+      await fs18.copyFile(from, to);
+      continue;
+    }
+    await copyDir(from, to);
+  }
+}
+/**
+ * Dispose of a workspace returned by createIsolatedWorkspace. Never rejects:
+ * every failure along the way is deliberately ignored.
+ *
+ * - Copy-mode workspace: the directory is removed recursively.
+ * - Worktree workspace: `git worktree remove --force` is tried first; if it
+ *   fails, the directory is removed directly and `git worktree prune` is run
+ *   from `projectRoot`.
+ *
+ * @param {string} workDir - Workspace directory to delete.
+ * @param {boolean} isWorktree - Whether `workDir` was created via `git worktree add`.
+ * @param {string} projectRoot - Directory the git commands run in.
+ * @returns {Promise<void>}
+ */
+async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
+  const ignore = () => {
+  };
+  const deleteDir = () => fs18.rm(workDir, { recursive: true, force: true }).catch(ignore);
+  if (!isWorktree) {
+    await deleteDir();
+    return;
+  }
+  try {
+    await execAsync2(`git worktree remove "${workDir}" --force`, {
+      cwd: projectRoot,
+      timeout: 1e4
+    });
+  } catch {
+    // git refused: delete the files ourselves, then prune the worktree list.
+    await deleteDir();
+    await execAsync2("git worktree prune", {
+      cwd: projectRoot,
+      timeout: 5e3
+    }).catch(ignore);
+  }
+}
+async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
   await fs18.mkdir(traceDir, { recursive: true });
   const startedAt = (/* @__PURE__ */ new Date()).toISOString();
   const startMs = Date.now();
-  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
+  const root = projectRoot ?? process.cwd();
+  const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
   try {
-    await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
     let setupStderr = "";
     if (task.setup.trim()) {
       try {
-        await execAsync2(task.setup, { cwd: tmpDir, timeout: 6e4 });
+        await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
       } catch (err) {
         setupStderr = err instanceof Error ? err.message : String(err);
       }
     }
-    const filesBefore = await snapshotFileList(tmpDir);
-    const spawnResult = await spawnClaude(task.description, tmpDir, task.timeout);
-    const filesAfter = await snapshotFileList(tmpDir);
+    const filesBefore = await snapshotFileList(workDir);
+    const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
+    const filesAfter = await snapshotFileList(workDir);
     const filesChanged = diffFileLists(filesBefore, filesAfter);
     const toolCalls = parseToolCalls(spawnResult.stdout);
     const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -4359,8 +4457,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
       traceDir
     };
   } finally {
-    await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
-    });
+    await cleanupIsolatedWorkspace(workDir, isWorktree, root);
   }
 }
 async function spawnClaude(instruction, cwd, timeoutSec) {
@@ -4458,8 +4555,9 @@ function parseToolCalls(stdout) {
     return [];
   }
 }
-async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
+async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
   const results = {};
+  const projectRoot = path18.resolve(workspacePath, "..");
   for (const task of tasks) {
     const traceDir = path18.join(
       workspacePath,
@@ -4467,7 +4565,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
       iteration.toString(),
       task.id
     );
-    const taskResult = await runTask(task, harnessPath, traceDir, iteration);
+    onProgress?.({ type: "task-start", iteration, taskId: task.id });
+    const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
     let score = taskResult.score;
     if (config) {
       const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
@@ -4476,6 +4575,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
       await writeScore(traceDir, score);
     }
     results[task.id] = score;
+    onProgress?.({
+      type: "task-scored",
+      iteration,
+      taskId: task.id,
+      score: score.score ?? (score.pass ? 100 : 0)
+    });
   }
   const scores = Object.values(results);
   const total = scores.reduce(
@@ -4537,7 +4642,8 @@ Return a JSON object:
 - Prefer ADDITIVE changes over replacements when possible.
 Return ONLY valid JSON.`;
-var STDOUT_TRUNCATION_LIMIT = 2e3;
+var STDOUT_TRUNCATION_LIMIT = 1e3;
+var MAX_CONTEXT_CHARS = 1e5;
 async function readHarnessFiles(harnessPath) {
   const result = {};
   async function walk(dir, prefix) {
@@ -4571,26 +4677,25 @@ function truncateStdout(stdout, limit) {
 ${stdout.slice(-limit)}`;
 }
 function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
-  const sections = [];
-  sections.push("## Current Harness Files\n");
+  const harnessSection = ["## Current Harness Files\n"];
   const fileEntries = Object.entries(harnessFiles);
   if (fileEntries.length === 0) {
-    sections.push("(No harness files found)\n");
+    harnessSection.push("(No harness files found)\n");
   } else {
     for (const [filePath, content] of fileEntries) {
-      sections.push(`### ${filePath}
+      harnessSection.push(`### ${filePath}
 \`\`\`
 ${content}
 \`\`\`
 `);
     }
   }
-  sections.push("## Task Definitions\n");
+  const taskSection = ["## Task Definitions\n"];
   if (tasks.length === 0) {
-    sections.push("(No tasks defined)\n");
+    taskSection.push("(No tasks defined)\n");
   } else {
     for (const task of tasks) {
-      sections.push(
+      taskSection.push(
         `### Task: ${task.id}
 - Template: ${task.template}
 - Description: ${task.description}
@@ -4600,15 +4705,27 @@ ${content}
       );
     }
   }
-  sections.push("## Execution Traces\n");
-  if (traces.length === 0) {
-    sections.push("(No traces available)\n");
-  } else {
+  const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
+  const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
+  if (remainingBudget <= 0) {
+    return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
+  }
+  const traceBudget = Math.floor(remainingBudget * 0.7);
+  const historyBudget = remainingBudget - traceBudget;
+  const traceSection = buildTraceSection(traces, traceBudget);
+  const historySection = buildHistorySection(history, historyBudget);
+  return fixedContent + "\n" + traceSection + "\n" + historySection;
+}
+function buildTraceSection(traces, budget) {
+  if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
+  let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
+  for (let attempt = 0; attempt < 4; attempt++) {
+    const parts = ["## Execution Traces\n"];
     for (const trace of traces) {
       const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
-      const truncatedStdout = truncateStdout(trace.stdout, STDOUT_TRUNCATION_LIMIT);
+      const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
       const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => `  - ${f}: ${action}`).join("\n");
-      sections.push(
+      parts.push(
         `### Trace: ${trace.taskId}
 - Pass: ${trace.score.pass}
 - Score: ${scoreNum}
@@ -4616,36 +4733,55 @@ ${content}
 ` : "") + `- Duration: ${trace.timing.durationMs}ms
 - Files changed:
 ${filesChangedList || "  (none)"}
-- Stdout (last ${STDOUT_TRUNCATION_LIMIT} chars):
+- Stdout (last ${stdoutLimit} chars):
 \`\`\`
 ${truncatedStdout}
 \`\`\`
 `
       );
     }
+    const result = parts.join("\n");
+    if (result.length <= budget) return result;
+    stdoutLimit = Math.floor(stdoutLimit / 2);
   }
-  sections.push("## Iteration History\n");
-  if (history.length === 0) {
-    sections.push("(No previous iterations)\n");
-  } else {
-    for (const log of history) {
+  const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
+  for (const trace of traces) {
+    const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
+    summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
+`);
+  }
+  return summary.join("\n");
+}
+function buildHistorySection(history, budget) {
+  if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
+  let entries = [...history];
+  while (entries.length > 0) {
+    const parts = ["## Iteration History\n"];
+    if (entries.length < history.length) {
+      parts.push(`(Showing ${entries.length}/${history.length} most recent iterations)
+`);
+    }
+    for (const log of entries) {
       const taskScores = Object.entries(log.taskResults).map(([id, s]) => `  - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
-      sections.push(
+      parts.push(
         `### Iteration ${log.iteration} \u2014 Score: ${log.score}
 - Task results:
 ${taskScores}
 `
       );
       if (log.proposal) {
-        sections.push(
+        parts.push(
           `- Proposal reasoning: ${log.proposal.reasoning}
 - Mutations: ${log.proposal.mutations.length} change(s)
 `
         );
       }
     }
+    const result = parts.join("\n");
+    if (result.length <= budget) return result;
+    entries = entries.slice(1);
   }
-  return sections.join("\n");
+  return "## Iteration History\n\n(History omitted to fit context budget)\n";
 }
 function parseProposerResponse(raw) {
   let cleaned = raw.trim();
@@ -4657,7 +4793,18 @@ function parseProposerResponse(raw) {
   try {
     parsed = JSON.parse(cleaned);
   } catch {
-    throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
+    const firstBrace = cleaned.indexOf("{");
+    const lastBrace = cleaned.lastIndexOf("}");
+    if (firstBrace !== -1 && lastBrace > firstBrace) {
+      const extracted = cleaned.slice(firstBrace, lastBrace + 1);
+      try {
+        parsed = JSON.parse(extracted);
+      } catch {
+        throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
+      }
+    } else {
+      throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
+    }
   }
   if (typeof parsed !== "object" || parsed === null) {
     throw new Error("Proposer response is not a JSON object");
@@ -4720,7 +4867,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
   const proposerConfig = { ...config, model: proposerModel };
   const response = await callLLM(proposerConfig, userMessage, {
     systemPrompt: PROPOSER_SYSTEM_PROMPT,
-    maxTokens: 8192
+    maxTokens: 8192,
+    jsonMode: true
   });
   return parseProposerResponse(response);
 }
@@ -4857,7 +5005,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
       harnessPath,
       workspacePath,
       iter,
-      kairnConfig
+      kairnConfig,
+      onProgress
     );
     onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
     if (iter === 0) baselineScore = aggregate;
@@ -4935,7 +5084,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
         kairnConfig,
         evolveConfig.proposerModel
       );
-    } catch {
+    } catch (err) {
+      const errMsg = err instanceof Error ? err.message : String(err);
+      onProgress?.({
+        type: "proposer-error",
+        iteration: iter,
+        message: `Proposer failed: ${errMsg}`
+      });
       const nextIterDir2 = path21.join(
         workspacePath,
         "iterations",
@@ -5434,6 +5589,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
           case "perfect-score":
             console.log(chalk14.green("  Perfect score. Stopping."));
             break;
+          case "proposer-error":
+            console.log(chalk14.yellow(`  Warning: ${event.message ?? "Proposer failed"}`));
+            break;
+          case "task-start":
+            console.log(chalk14.dim(`    Running: ${event.taskId ?? "unknown"}...`));
+            break;
+          case "task-scored": {
+            const taskScore = event.score ?? 0;
+            const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
+            console.log(`    ${taskStatus}  ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
+            break;
+          }
           case "complete":
             break;
         }