kairn-cli 2.2.7 → 2.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4741,13 +4741,20 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
4741
4741
  the agent lacks a tool it needs, or has tools that add noise without benefit.
4742
4742
 
4743
4743
  ## Rules
4744
- - MINIMAL changes only. Don't rewrite the entire CLAUDE.md.
4744
+ - Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
4745
4745
  - Each mutation must have a clear rationale tied to a specific trace observation.
4746
4746
  - Never remove something that's working for another task.
4747
4747
  - If a previous iteration's change caused a regression, REVERT it.
4748
4748
  - Consider both additions AND removals. Remove sections that add noise without improving task performance.
4749
4749
  - Bloated harnesses hurt performance \u2014 trim what isn't earning its keep.
4750
4750
 
4751
+ ## Anti-Gaming (CRITICAL)
4752
+ - Mutations must improve GENERAL-PURPOSE development quality, not target specific eval criteria.
4753
+ - You do NOT have access to scoring rubrics or expected outcomes. Diagnose problems from traces only.
4754
+ - Do NOT add over-specified rules that restate existing conventions with stronger emphasis (e.g., changing "use chalk.green for success" to "MUST use chalk.green, no exceptions"). If a convention already exists, trust it.
4755
+ - Do NOT add rules that only apply to a narrow eval scenario (e.g., write permissions for a specific directory just because one task needed it).
4756
+ - Ask: "Would this mutation help a developer working on ANY task in this project?" If not, don't propose it.
4757
+
4751
4758
  Return ONLY valid JSON.`;
4752
4759
  var STDOUT_TRUNCATION_LIMIT = 1e3;
4753
4760
  var MAX_CONTEXT_CHARS = 1e5;
@@ -4806,8 +4813,6 @@ ${content}
4806
4813
  `### Task: ${task.id}
4807
4814
  - Template: ${task.template}
4808
4815
  - Description: ${task.description}
4809
- - Expected outcome: ${Array.isArray(task.expected_outcome) ? task.expected_outcome.join("; ") : task.expected_outcome}
4810
- - Scoring: ${task.scoring}
4811
4816
  `
4812
4817
  );
4813
4818
  }
@@ -4825,10 +4830,15 @@ ${content}
4825
4830
  }
4826
4831
  function buildTraceSection(traces, budget) {
4827
4832
  if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4833
+ const sortedTraces = [...traces].sort((a, b) => {
4834
+ const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
4835
+ const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
4836
+ return scoreA - scoreB;
4837
+ });
4828
4838
  let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4829
4839
  for (let attempt = 0; attempt < 4; attempt++) {
4830
- const parts = ["## Execution Traces\n"];
4831
- for (const trace of traces) {
4840
+ const parts = ["## Execution Traces (sorted worst-first)\n"];
4841
+ for (const trace of sortedTraces) {
4832
4842
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4833
4843
  const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4834
4844
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
@@ -5130,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5130
5140
  const prevLog = history.length > 0 ? history[history.length - 1] : null;
5131
5141
  let tasksToRun = tasks;
5132
5142
  const carriedScores = {};
5143
+ const threshold = evolveConfig.pruneThreshold;
5133
5144
  if (!isFirstIter && !isLastIter && prevLog) {
5134
5145
  tasksToRun = [];
5135
5146
  for (const task of tasks) {
5136
5147
  const prevScore = prevLog.taskResults[task.id];
5137
5148
  const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
5138
- if (prevValue >= 100) {
5139
- carriedScores[task.id] = { pass: true, score: 100 };
5149
+ if (prevValue >= threshold) {
5150
+ carriedScores[task.id] = { pass: true, score: prevValue };
5140
5151
  onProgress?.({
5141
5152
  type: "task-skipped",
5142
5153
  iteration: iter,
5143
5154
  taskId: task.id,
5144
- message: `Skipped ${task.id} (scored 100% last iteration)`
5155
+ message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
5145
5156
  });
5146
5157
  } else {
5147
5158
  tasksToRun.push(task);
@@ -5167,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5167
5178
  const aggregate = allScores.length > 0 ? total / allScores.length : 0;
5168
5179
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
5169
5180
  if (iter === 0) baselineScore = aggregate;
5170
- if (iter > 0 && aggregate < bestScore) {
5181
+ let shouldRollback = iter > 0 && aggregate < bestScore;
5182
+ let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
5183
+ const bestLog = history.find((h) => h.iteration === bestIteration);
5184
+ if (iter > 0 && !shouldRollback && bestLog) {
5185
+ for (const [taskId, score] of Object.entries(results)) {
5186
+ const currValue = score.score ?? (score.pass ? 100 : 0);
5187
+ const bestTaskScore = bestLog.taskResults[taskId];
5188
+ const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
5189
+ const drop = bestValue - currValue;
5190
+ if (drop > evolveConfig.maxTaskDrop) {
5191
+ shouldRollback = true;
5192
+ rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
5193
+ onProgress?.({
5194
+ type: "task-regression",
5195
+ iteration: iter,
5196
+ taskId,
5197
+ score: currValue,
5198
+ message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
5199
+ });
5200
+ break;
5201
+ }
5202
+ }
5203
+ }
5204
+ if (shouldRollback) {
5171
5205
  onProgress?.({
5172
5206
  type: "rollback",
5173
5207
  iteration: iter,
5174
5208
  score: aggregate,
5175
- message: `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.`
5209
+ message: rollbackMessage
5176
5210
  });
5177
5211
  const rollbackLog = {
5178
5212
  iteration: iter,
@@ -5241,6 +5275,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5241
5275
  kairnConfig,
5242
5276
  evolveConfig.proposerModel
5243
5277
  );
5278
+ if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
5279
+ proposal = {
5280
+ ...proposal,
5281
+ mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
5282
+ };
5283
+ }
5244
5284
  } catch (err) {
5245
5285
  const errMsg = err instanceof Error ? err.message : String(err);
5246
5286
  onProgress?.({
@@ -5615,6 +5655,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
5615
5655
  }
5616
5656
  await fs23.rm(claudeDir, { recursive: true, force: true });
5617
5657
  await copyDir(harnessPath, claudeDir);
5658
+ const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
5659
+ const projectMcpJson = path23.join(projectRoot, ".mcp.json");
5660
+ try {
5661
+ await fs23.access(harnessMcpJson);
5662
+ const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
5663
+ const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
5664
+ if (currentMcp !== targetMcp) {
5665
+ filesChanged.push(".mcp.json");
5666
+ }
5667
+ await fs23.copyFile(harnessMcpJson, projectMcpJson);
5668
+ } catch {
5669
+ }
5618
5670
  return {
5619
5671
  iteration: iter,
5620
5672
  filesChanged,
@@ -5629,7 +5681,10 @@ var DEFAULT_CONFIG = {
5629
5681
  scorer: "pass-fail",
5630
5682
  maxIterations: 5,
5631
5683
  parallelTasks: 1,
5632
- runsPerTask: 1
5684
+ runsPerTask: 1,
5685
+ maxMutationsPerIteration: 3,
5686
+ pruneThreshold: 95,
5687
+ maxTaskDrop: 20
5633
5688
  };
5634
5689
  async function loadEvolveConfigFromWorkspace(workspacePath) {
5635
5690
  try {
@@ -5641,7 +5696,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
5641
5696
  scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
5642
5697
  maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
5643
5698
  parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
5644
- runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
5699
+ runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
5700
+ maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
5701
+ pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
5702
+ maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
5645
5703
  };
5646
5704
  } catch {
5647
5705
  return { ...DEFAULT_CONFIG };
@@ -5753,7 +5811,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
5753
5811
  process.exit(1);
5754
5812
  }
5755
5813
  });
5756
- evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
5814
+ evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
5757
5815
  try {
5758
5816
  const projectRoot = process.cwd();
5759
5817
  const workspace = path24.join(projectRoot, ".kairn-evolve");
@@ -5834,6 +5892,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5834
5892
  process.exit(1);
5835
5893
  }
5836
5894
  evolveConfig.parallelTasks = parallel;
5895
+ const maxMutations = parseInt(options.maxMutations ?? "3", 10);
5896
+ if (isNaN(maxMutations) || maxMutations < 1) {
5897
+ console.log(ui.error("--max-mutations must be a positive integer"));
5898
+ process.exit(1);
5899
+ }
5900
+ evolveConfig.maxMutationsPerIteration = maxMutations;
5901
+ const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
5902
+ if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
5903
+ console.log(ui.error("--prune-threshold must be 0-100"));
5904
+ process.exit(1);
5905
+ }
5906
+ evolveConfig.pruneThreshold = pruneThreshold;
5907
+ const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
5908
+ if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
5909
+ console.log(ui.error("--max-task-drop must be a positive integer"));
5910
+ process.exit(1);
5911
+ }
5912
+ evolveConfig.maxTaskDrop = maxTaskDrop;
5837
5913
  try {
5838
5914
  await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
5839
5915
  } catch {
@@ -5872,7 +5948,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5872
5948
  console.log(chalk14.dim(` ${event.message ?? ""}`));
5873
5949
  break;
5874
5950
  case "task-skipped":
5875
- console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (100% last iteration)`));
5951
+ console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
5952
+ break;
5953
+ case "task-regression":
5954
+ console.log(chalk14.yellow(` DROP ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
5876
5955
  break;
5877
5956
  case "task-scored": {
5878
5957
  const taskScore = event.score ?? 0;