npm - kairn-cli - Versions diffs - 2.2.7 → 2.2.9 - Mend

kairn-cli 2.2.7 → 2.2.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3) hide show

package/dist/cli.js CHANGED Viewed

@@ -4741,13 +4741,20 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
 the agent lacks a tool it needs, or has tools that add noise without benefit.
 ## Rules
-- MINIMAL changes only. Don't rewrite the entire CLAUDE.md.
+- Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
 - Each mutation must have a clear rationale tied to a specific trace observation.
 - Never remove something that's working for another task.
 - If a previous iteration's change caused a regression, REVERT it.
 - Consider both additions AND removals. Remove sections that add noise without improving task performance.
 - Bloated harnesses hurt performance \u2014 trim what isn't earning its keep.
+## Anti-Gaming (CRITICAL)
+- Mutations must improve GENERAL-PURPOSE development quality, not target specific eval criteria.
+- You do NOT have access to scoring rubrics or expected outcomes. Diagnose problems from traces only.
+- Do NOT add over-specified rules that restate existing conventions with stronger emphasis (e.g., changing "use chalk.green for success" to "MUST use chalk.green, no exceptions"). If a convention already exists, trust it.
+- Do NOT add rules that only apply to a narrow eval scenario (e.g., write permissions for a specific directory just because one task needed it).
+- Ask: "Would this mutation help a developer working on ANY task in this project?" If not, don't propose it.
 Return ONLY valid JSON.`;
 var STDOUT_TRUNCATION_LIMIT = 1e3;
 var MAX_CONTEXT_CHARS = 1e5;
@@ -4806,8 +4813,6 @@ ${content}
         `### Task: ${task.id}
 - Template: ${task.template}
 - Description: ${task.description}
-- Expected outcome: ${Array.isArray(task.expected_outcome) ? task.expected_outcome.join("; ") : task.expected_outcome}
-- Scoring: ${task.scoring}
 `
       );
     }
@@ -4825,10 +4830,15 @@ ${content}
 }
 function buildTraceSection(traces, budget) {
   if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
+  const sortedTraces = [...traces].sort((a, b) => {
+    const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
+    const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
+    return scoreA - scoreB;
+  });
   let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
   for (let attempt = 0; attempt < 4; attempt++) {
-    const parts = ["## Execution Traces\n"];
-    for (const trace of traces) {
+    const parts = ["## Execution Traces (sorted worst-first)\n"];
+    for (const trace of sortedTraces) {
       const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
       const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
       const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => `  - ${f}: ${action}`).join("\n");
@@ -5130,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
     const prevLog = history.length > 0 ? history[history.length - 1] : null;
     let tasksToRun = tasks;
     const carriedScores = {};
+    const threshold = evolveConfig.pruneThreshold;
     if (!isFirstIter && !isLastIter && prevLog) {
       tasksToRun = [];
       for (const task of tasks) {
         const prevScore = prevLog.taskResults[task.id];
         const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
-        if (prevValue >= 100) {
-          carriedScores[task.id] = { pass: true, score: 100 };
+        if (prevValue >= threshold) {
+          carriedScores[task.id] = { pass: true, score: prevValue };
           onProgress?.({
             type: "task-skipped",
             iteration: iter,
             taskId: task.id,
-            message: `Skipped ${task.id} (scored 100% last iteration)`
+            message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
           });
         } else {
           tasksToRun.push(task);
@@ -5167,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
     const aggregate = allScores.length > 0 ? total / allScores.length : 0;
     onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
     if (iter === 0) baselineScore = aggregate;
-    if (iter > 0 && aggregate < bestScore) {
+    let shouldRollback = iter > 0 && aggregate < bestScore;
+    let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
+    const bestLog = history.find((h) => h.iteration === bestIteration);
+    if (iter > 0 && !shouldRollback && bestLog) {
+      for (const [taskId, score] of Object.entries(results)) {
+        const currValue = score.score ?? (score.pass ? 100 : 0);
+        const bestTaskScore = bestLog.taskResults[taskId];
+        const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
+        const drop = bestValue - currValue;
+        if (drop > evolveConfig.maxTaskDrop) {
+          shouldRollback = true;
+          rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
+          onProgress?.({
+            type: "task-regression",
+            iteration: iter,
+            taskId,
+            score: currValue,
+            message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
+          });
+          break;
+        }
+      }
+    }
+    if (shouldRollback) {
       onProgress?.({
         type: "rollback",
         iteration: iter,
         score: aggregate,
-        message: `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.`
+        message: rollbackMessage
       });
       const rollbackLog = {
         iteration: iter,
@@ -5241,6 +5275,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
         kairnConfig,
         evolveConfig.proposerModel
       );
+      if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
+        proposal = {
+          ...proposal,
+          mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
+        };
+      }
     } catch (err) {
       const errMsg = err instanceof Error ? err.message : String(err);
       onProgress?.({
@@ -5615,6 +5655,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
   }
   await fs23.rm(claudeDir, { recursive: true, force: true });
   await copyDir(harnessPath, claudeDir);
+  const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
+  const projectMcpJson = path23.join(projectRoot, ".mcp.json");
+  try {
+    await fs23.access(harnessMcpJson);
+    const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
+    const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
+    if (currentMcp !== targetMcp) {
+      filesChanged.push(".mcp.json");
+    }
+    await fs23.copyFile(harnessMcpJson, projectMcpJson);
+  } catch {
+  }
   return {
     iteration: iter,
     filesChanged,
@@ -5629,7 +5681,10 @@ var DEFAULT_CONFIG = {
   scorer: "pass-fail",
   maxIterations: 5,
   parallelTasks: 1,
-  runsPerTask: 1
+  runsPerTask: 1,
+  maxMutationsPerIteration: 3,
+  pruneThreshold: 95,
+  maxTaskDrop: 20
 };
 async function loadEvolveConfigFromWorkspace(workspacePath) {
   try {
@@ -5641,7 +5696,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
       scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
       maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
       parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
-      runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
+      runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
+      maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
+      pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
+      maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
     };
   } catch {
     return { ...DEFAULT_CONFIG };
@@ -5753,7 +5811,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
     process.exit(1);
   }
 });
-evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
+evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
   try {
     const projectRoot = process.cwd();
     const workspace = path24.join(projectRoot, ".kairn-evolve");
@@ -5834,6 +5892,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
         process.exit(1);
       }
       evolveConfig.parallelTasks = parallel;
+      const maxMutations = parseInt(options.maxMutations ?? "3", 10);
+      if (isNaN(maxMutations) || maxMutations < 1) {
+        console.log(ui.error("--max-mutations must be a positive integer"));
+        process.exit(1);
+      }
+      evolveConfig.maxMutationsPerIteration = maxMutations;
+      const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
+      if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
+        console.log(ui.error("--prune-threshold must be 0-100"));
+        process.exit(1);
+      }
+      evolveConfig.pruneThreshold = pruneThreshold;
+      const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
+      if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
+        console.log(ui.error("--max-task-drop must be a positive integer"));
+        process.exit(1);
+      }
+      evolveConfig.maxTaskDrop = maxTaskDrop;
       try {
         await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
       } catch {
@@ -5872,7 +5948,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
             console.log(chalk14.dim(`      ${event.message ?? ""}`));
             break;
           case "task-skipped":
-            console.log(chalk14.dim(`    SKIP  ${event.taskId ?? "unknown"} (100% last iteration)`));
+            console.log(chalk14.dim(`    SKIP  ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
+            break;
+          case "task-regression":
+            console.log(chalk14.yellow(`    DROP  ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
             break;
           case "task-scored": {
             const taskScore = event.score ?? 0;