kairn-cli 2.2.8 → 2.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4741,7 +4741,7 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
4741
4741
  the agent lacks a tool it needs, or has tools that add noise without benefit.
4742
4742
 
4743
4743
  ## Rules
4744
- - MINIMAL changes only. Don't rewrite the entire CLAUDE.md.
4744
+ - Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
4745
4745
  - Each mutation must have a clear rationale tied to a specific trace observation.
4746
4746
  - Never remove something that's working for another task.
4747
4747
  - If a previous iteration's change caused a regression, REVERT it.
@@ -4830,10 +4830,15 @@ ${content}
4830
4830
  }
4831
4831
  function buildTraceSection(traces, budget) {
4832
4832
  if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4833
+ const sortedTraces = [...traces].sort((a, b) => {
4834
+ const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
4835
+ const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
4836
+ return scoreA - scoreB;
4837
+ });
4833
4838
  let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4834
4839
  for (let attempt = 0; attempt < 4; attempt++) {
4835
- const parts = ["## Execution Traces\n"];
4836
- for (const trace of traces) {
4840
+ const parts = ["## Execution Traces (sorted worst-first)\n"];
4841
+ for (const trace of sortedTraces) {
4837
4842
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4838
4843
  const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4839
4844
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
@@ -5135,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5135
5140
  const prevLog = history.length > 0 ? history[history.length - 1] : null;
5136
5141
  let tasksToRun = tasks;
5137
5142
  const carriedScores = {};
5143
+ const threshold = evolveConfig.pruneThreshold;
5138
5144
  if (!isFirstIter && !isLastIter && prevLog) {
5139
5145
  tasksToRun = [];
5140
5146
  for (const task of tasks) {
5141
5147
  const prevScore = prevLog.taskResults[task.id];
5142
5148
  const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
5143
- if (prevValue >= 100) {
5144
- carriedScores[task.id] = { pass: true, score: 100 };
5149
+ if (prevValue >= threshold) {
5150
+ carriedScores[task.id] = { pass: true, score: prevValue };
5145
5151
  onProgress?.({
5146
5152
  type: "task-skipped",
5147
5153
  iteration: iter,
5148
5154
  taskId: task.id,
5149
- message: `Skipped ${task.id} (scored 100% last iteration)`
5155
+ message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
5150
5156
  });
5151
5157
  } else {
5152
5158
  tasksToRun.push(task);
@@ -5172,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5172
5178
  const aggregate = allScores.length > 0 ? total / allScores.length : 0;
5173
5179
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
5174
5180
  if (iter === 0) baselineScore = aggregate;
5175
- if (iter > 0 && aggregate < bestScore) {
5181
+ let shouldRollback = iter > 0 && aggregate < bestScore;
5182
+ let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
5183
+ const bestLog = history.find((h) => h.iteration === bestIteration);
5184
+ if (iter > 0 && !shouldRollback && bestLog) {
5185
+ for (const [taskId, score] of Object.entries(results)) {
5186
+ const currValue = score.score ?? (score.pass ? 100 : 0);
5187
+ const bestTaskScore = bestLog.taskResults[taskId];
5188
+ const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
5189
+ const drop = bestValue - currValue;
5190
+ if (drop > evolveConfig.maxTaskDrop) {
5191
+ shouldRollback = true;
5192
+ rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
5193
+ onProgress?.({
5194
+ type: "task-regression",
5195
+ iteration: iter,
5196
+ taskId,
5197
+ score: currValue,
5198
+ message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
5199
+ });
5200
+ break;
5201
+ }
5202
+ }
5203
+ }
5204
+ if (shouldRollback) {
5176
5205
  onProgress?.({
5177
5206
  type: "rollback",
5178
5207
  iteration: iter,
5179
5208
  score: aggregate,
5180
- message: `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.`
5209
+ message: rollbackMessage
5181
5210
  });
5182
5211
  const rollbackLog = {
5183
5212
  iteration: iter,
@@ -5246,6 +5275,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5246
5275
  kairnConfig,
5247
5276
  evolveConfig.proposerModel
5248
5277
  );
5278
+ if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
5279
+ proposal = {
5280
+ ...proposal,
5281
+ mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
5282
+ };
5283
+ }
5249
5284
  } catch (err) {
5250
5285
  const errMsg = err instanceof Error ? err.message : String(err);
5251
5286
  onProgress?.({
@@ -5620,6 +5655,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
5620
5655
  }
5621
5656
  await fs23.rm(claudeDir, { recursive: true, force: true });
5622
5657
  await copyDir(harnessPath, claudeDir);
5658
+ const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
5659
+ const projectMcpJson = path23.join(projectRoot, ".mcp.json");
5660
+ try {
5661
+ await fs23.access(harnessMcpJson);
5662
+ const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
5663
+ const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
5664
+ if (currentMcp !== targetMcp) {
5665
+ filesChanged.push(".mcp.json");
5666
+ }
5667
+ await fs23.copyFile(harnessMcpJson, projectMcpJson);
5668
+ } catch {
5669
+ }
5623
5670
  return {
5624
5671
  iteration: iter,
5625
5672
  filesChanged,
@@ -5634,7 +5681,10 @@ var DEFAULT_CONFIG = {
5634
5681
  scorer: "pass-fail",
5635
5682
  maxIterations: 5,
5636
5683
  parallelTasks: 1,
5637
- runsPerTask: 1
5684
+ runsPerTask: 1,
5685
+ maxMutationsPerIteration: 3,
5686
+ pruneThreshold: 95,
5687
+ maxTaskDrop: 20
5638
5688
  };
5639
5689
  async function loadEvolveConfigFromWorkspace(workspacePath) {
5640
5690
  try {
@@ -5646,7 +5696,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
5646
5696
  scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
5647
5697
  maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
5648
5698
  parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
5649
- runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
5699
+ runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
5700
+ maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
5701
+ pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
5702
+ maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
5650
5703
  };
5651
5704
  } catch {
5652
5705
  return { ...DEFAULT_CONFIG };
@@ -5758,7 +5811,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
5758
5811
  process.exit(1);
5759
5812
  }
5760
5813
  });
5761
- evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
5814
+ evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
5762
5815
  try {
5763
5816
  const projectRoot = process.cwd();
5764
5817
  const workspace = path24.join(projectRoot, ".kairn-evolve");
@@ -5839,6 +5892,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5839
5892
  process.exit(1);
5840
5893
  }
5841
5894
  evolveConfig.parallelTasks = parallel;
5895
+ const maxMutations = parseInt(options.maxMutations ?? "3", 10);
5896
+ if (isNaN(maxMutations) || maxMutations < 1) {
5897
+ console.log(ui.error("--max-mutations must be a positive integer"));
5898
+ process.exit(1);
5899
+ }
5900
+ evolveConfig.maxMutationsPerIteration = maxMutations;
5901
+ const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
5902
+ if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
5903
+ console.log(ui.error("--prune-threshold must be 0-100"));
5904
+ process.exit(1);
5905
+ }
5906
+ evolveConfig.pruneThreshold = pruneThreshold;
5907
+ const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
5908
+ if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
5909
+ console.log(ui.error("--max-task-drop must be a positive integer"));
5910
+ process.exit(1);
5911
+ }
5912
+ evolveConfig.maxTaskDrop = maxTaskDrop;
5842
5913
  try {
5843
5914
  await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
5844
5915
  } catch {
@@ -5877,7 +5948,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5877
5948
  console.log(chalk14.dim(` ${event.message ?? ""}`));
5878
5949
  break;
5879
5950
  case "task-skipped":
5880
- console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (100% last iteration)`));
5951
+ console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
5952
+ break;
5953
+ case "task-regression":
5954
+ console.log(chalk14.yellow(` DROP ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
5881
5955
  break;
5882
5956
  case "task-scored": {
5883
5957
  const taskScore = event.score ?? 0;