kairn-cli 2.2.8 → 2.2.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4741,7 +4741,7 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
4741
4741
  the agent lacks a tool it needs, or has tools that add noise without benefit.
4742
4742
 
4743
4743
  ## Rules
4744
- - MINIMAL changes only. Don't rewrite the entire CLAUDE.md.
4744
+ - Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
4745
4745
  - Each mutation must have a clear rationale tied to a specific trace observation.
4746
4746
  - Never remove something that's working for another task.
4747
4747
  - If a previous iteration's change caused a regression, REVERT it.
@@ -4830,10 +4830,15 @@ ${content}
4830
4830
  }
4831
4831
  function buildTraceSection(traces, budget) {
4832
4832
  if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4833
+ const sortedTraces = [...traces].sort((a, b) => {
4834
+ const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
4835
+ const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
4836
+ return scoreA - scoreB;
4837
+ });
4833
4838
  let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4834
4839
  for (let attempt = 0; attempt < 4; attempt++) {
4835
- const parts = ["## Execution Traces\n"];
4836
- for (const trace of traces) {
4840
+ const parts = ["## Execution Traces (sorted worst-first)\n"];
4841
+ for (const trace of sortedTraces) {
4837
4842
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4838
4843
  const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4839
4844
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
@@ -5135,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5135
5140
  const prevLog = history.length > 0 ? history[history.length - 1] : null;
5136
5141
  let tasksToRun = tasks;
5137
5142
  const carriedScores = {};
5143
+ const threshold = evolveConfig.pruneThreshold;
5138
5144
  if (!isFirstIter && !isLastIter && prevLog) {
5139
5145
  tasksToRun = [];
5140
5146
  for (const task of tasks) {
5141
5147
  const prevScore = prevLog.taskResults[task.id];
5142
5148
  const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
5143
- if (prevValue >= 100) {
5144
- carriedScores[task.id] = { pass: true, score: 100 };
5149
+ if (prevValue >= threshold) {
5150
+ carriedScores[task.id] = { pass: true, score: prevValue };
5145
5151
  onProgress?.({
5146
5152
  type: "task-skipped",
5147
5153
  iteration: iter,
5148
5154
  taskId: task.id,
5149
- message: `Skipped ${task.id} (scored 100% last iteration)`
5155
+ message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
5150
5156
  });
5151
5157
  } else {
5152
5158
  tasksToRun.push(task);
@@ -5172,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5172
5178
  const aggregate = allScores.length > 0 ? total / allScores.length : 0;
5173
5179
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
5174
5180
  if (iter === 0) baselineScore = aggregate;
5175
- if (iter > 0 && aggregate < bestScore) {
5181
+ let shouldRollback = iter > 0 && aggregate < bestScore;
5182
+ let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
5183
+ const bestLog = history.find((h) => h.iteration === bestIteration);
5184
+ if (iter > 0 && !shouldRollback && bestLog) {
5185
+ for (const [taskId, score] of Object.entries(results)) {
5186
+ const currValue = score.score ?? (score.pass ? 100 : 0);
5187
+ const bestTaskScore = bestLog.taskResults[taskId];
5188
+ const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
5189
+ const drop = bestValue - currValue;
5190
+ if (drop > evolveConfig.maxTaskDrop) {
5191
+ shouldRollback = true;
5192
+ rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
5193
+ onProgress?.({
5194
+ type: "task-regression",
5195
+ iteration: iter,
5196
+ taskId,
5197
+ score: currValue,
5198
+ message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
5199
+ });
5200
+ break;
5201
+ }
5202
+ }
5203
+ }
5204
+ if (shouldRollback) {
5176
5205
  onProgress?.({
5177
5206
  type: "rollback",
5178
5207
  iteration: iter,
5179
5208
  score: aggregate,
5180
- message: `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.`
5209
+ message: rollbackMessage
5181
5210
  });
5182
5211
  const rollbackLog = {
5183
5212
  iteration: iter,
@@ -5189,19 +5218,41 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5189
5218
  };
5190
5219
  await writeIterationLog(workspacePath, rollbackLog);
5191
5220
  history.push(rollbackLog);
5221
+ const bestHarnessPath = path21.join(
5222
+ workspacePath,
5223
+ "iterations",
5224
+ bestIteration.toString(),
5225
+ "harness"
5226
+ );
5192
5227
  if (iter + 1 < evolveConfig.maxIterations) {
5193
- const nextIterDir2 = path21.join(
5194
- workspacePath,
5195
- "iterations",
5196
- (iter + 1).toString()
5197
- );
5198
- const bestHarnessPath = path21.join(
5199
- workspacePath,
5200
- "iterations",
5201
- bestIteration.toString(),
5202
- "harness"
5203
- );
5204
- await copyDir(bestHarnessPath, path21.join(nextIterDir2, "harness"));
5228
+ onProgress?.({ type: "proposing", iteration: iter, message: "Proposing new mutations after rollback" });
5229
+ try {
5230
+ let rollbackProposal = await propose(
5231
+ iter,
5232
+ workspacePath,
5233
+ bestHarnessPath,
5234
+ history,
5235
+ tasks,
5236
+ kairnConfig,
5237
+ evolveConfig.proposerModel
5238
+ );
5239
+ if (rollbackProposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
5240
+ rollbackProposal = {
5241
+ ...rollbackProposal,
5242
+ mutations: rollbackProposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
5243
+ };
5244
+ }
5245
+ const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
5246
+ await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
5247
+ onProgress?.({
5248
+ type: "mutations-applied",
5249
+ iteration: iter,
5250
+ mutationCount: rollbackProposal.mutations.length
5251
+ });
5252
+ } catch {
5253
+ const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
5254
+ await copyDir(bestHarnessPath, path21.join(nextIterDir2, "harness"));
5255
+ }
5205
5256
  }
5206
5257
  continue;
5207
5258
  }
@@ -5246,6 +5297,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
5246
5297
  kairnConfig,
5247
5298
  evolveConfig.proposerModel
5248
5299
  );
5300
+ if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
5301
+ proposal = {
5302
+ ...proposal,
5303
+ mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
5304
+ };
5305
+ }
5249
5306
  } catch (err) {
5250
5307
  const errMsg = err instanceof Error ? err.message : String(err);
5251
5308
  onProgress?.({
@@ -5620,6 +5677,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
5620
5677
  }
5621
5678
  await fs23.rm(claudeDir, { recursive: true, force: true });
5622
5679
  await copyDir(harnessPath, claudeDir);
5680
+ const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
5681
+ const projectMcpJson = path23.join(projectRoot, ".mcp.json");
5682
+ try {
5683
+ await fs23.access(harnessMcpJson);
5684
+ const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
5685
+ const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
5686
+ if (currentMcp !== targetMcp) {
5687
+ filesChanged.push(".mcp.json");
5688
+ }
5689
+ await fs23.copyFile(harnessMcpJson, projectMcpJson);
5690
+ } catch {
5691
+ }
5623
5692
  return {
5624
5693
  iteration: iter,
5625
5694
  filesChanged,
@@ -5634,7 +5703,10 @@ var DEFAULT_CONFIG = {
5634
5703
  scorer: "pass-fail",
5635
5704
  maxIterations: 5,
5636
5705
  parallelTasks: 1,
5637
- runsPerTask: 1
5706
+ runsPerTask: 1,
5707
+ maxMutationsPerIteration: 3,
5708
+ pruneThreshold: 95,
5709
+ maxTaskDrop: 20
5638
5710
  };
5639
5711
  async function loadEvolveConfigFromWorkspace(workspacePath) {
5640
5712
  try {
@@ -5646,7 +5718,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
5646
5718
  scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
5647
5719
  maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
5648
5720
  parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
5649
- runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
5721
+ runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
5722
+ maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
5723
+ pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
5724
+ maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
5650
5725
  };
5651
5726
  } catch {
5652
5727
  return { ...DEFAULT_CONFIG };
@@ -5758,7 +5833,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
5758
5833
  process.exit(1);
5759
5834
  }
5760
5835
  });
5761
- evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
5836
+ evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
5762
5837
  try {
5763
5838
  const projectRoot = process.cwd();
5764
5839
  const workspace = path24.join(projectRoot, ".kairn-evolve");
@@ -5839,6 +5914,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5839
5914
  process.exit(1);
5840
5915
  }
5841
5916
  evolveConfig.parallelTasks = parallel;
5917
+ const maxMutations = parseInt(options.maxMutations ?? "3", 10);
5918
+ if (isNaN(maxMutations) || maxMutations < 1) {
5919
+ console.log(ui.error("--max-mutations must be a positive integer"));
5920
+ process.exit(1);
5921
+ }
5922
+ evolveConfig.maxMutationsPerIteration = maxMutations;
5923
+ const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
5924
+ if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
5925
+ console.log(ui.error("--prune-threshold must be 0-100"));
5926
+ process.exit(1);
5927
+ }
5928
+ evolveConfig.pruneThreshold = pruneThreshold;
5929
+ const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
5930
+ if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
5931
+ console.log(ui.error("--max-task-drop must be a positive integer"));
5932
+ process.exit(1);
5933
+ }
5934
+ evolveConfig.maxTaskDrop = maxTaskDrop;
5842
5935
  try {
5843
5936
  await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
5844
5937
  } catch {
@@ -5877,7 +5970,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5877
5970
  console.log(chalk14.dim(` ${event.message ?? ""}`));
5878
5971
  break;
5879
5972
  case "task-skipped":
5880
- console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (100% last iteration)`));
5973
+ console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
5974
+ break;
5975
+ case "task-regression":
5976
+ console.log(chalk14.yellow(` DROP ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
5881
5977
  break;
5882
5978
  case "task-scored": {
5883
5979
  const taskScore = event.score ?? 0;