kairn-cli 2.2.8 → 2.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +120 -24
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -4741,7 +4741,7 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
|
|
|
4741
4741
|
the agent lacks a tool it needs, or has tools that add noise without benefit.
|
|
4742
4742
|
|
|
4743
4743
|
## Rules
|
|
4744
|
-
-
|
|
4744
|
+
- Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
|
|
4745
4745
|
- Each mutation must have a clear rationale tied to a specific trace observation.
|
|
4746
4746
|
- Never remove something that's working for another task.
|
|
4747
4747
|
- If a previous iteration's change caused a regression, REVERT it.
|
|
@@ -4830,10 +4830,15 @@ ${content}
|
|
|
4830
4830
|
}
|
|
4831
4831
|
function buildTraceSection(traces, budget) {
|
|
4832
4832
|
if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
|
|
4833
|
+
const sortedTraces = [...traces].sort((a, b) => {
|
|
4834
|
+
const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
|
|
4835
|
+
const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
|
|
4836
|
+
return scoreA - scoreB;
|
|
4837
|
+
});
|
|
4833
4838
|
let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
|
|
4834
4839
|
for (let attempt = 0; attempt < 4; attempt++) {
|
|
4835
|
-
const parts = ["## Execution Traces\n"];
|
|
4836
|
-
for (const trace of
|
|
4840
|
+
const parts = ["## Execution Traces (sorted worst-first)\n"];
|
|
4841
|
+
for (const trace of sortedTraces) {
|
|
4837
4842
|
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4838
4843
|
const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
|
|
4839
4844
|
const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
|
|
@@ -5135,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5135
5140
|
const prevLog = history.length > 0 ? history[history.length - 1] : null;
|
|
5136
5141
|
let tasksToRun = tasks;
|
|
5137
5142
|
const carriedScores = {};
|
|
5143
|
+
const threshold = evolveConfig.pruneThreshold;
|
|
5138
5144
|
if (!isFirstIter && !isLastIter && prevLog) {
|
|
5139
5145
|
tasksToRun = [];
|
|
5140
5146
|
for (const task of tasks) {
|
|
5141
5147
|
const prevScore = prevLog.taskResults[task.id];
|
|
5142
5148
|
const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
|
|
5143
|
-
if (prevValue >=
|
|
5144
|
-
carriedScores[task.id] = { pass: true, score:
|
|
5149
|
+
if (prevValue >= threshold) {
|
|
5150
|
+
carriedScores[task.id] = { pass: true, score: prevValue };
|
|
5145
5151
|
onProgress?.({
|
|
5146
5152
|
type: "task-skipped",
|
|
5147
5153
|
iteration: iter,
|
|
5148
5154
|
taskId: task.id,
|
|
5149
|
-
message: `Skipped ${task.id} (scored
|
|
5155
|
+
message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
|
|
5150
5156
|
});
|
|
5151
5157
|
} else {
|
|
5152
5158
|
tasksToRun.push(task);
|
|
@@ -5172,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5172
5178
|
const aggregate = allScores.length > 0 ? total / allScores.length : 0;
|
|
5173
5179
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
5174
5180
|
if (iter === 0) baselineScore = aggregate;
|
|
5175
|
-
|
|
5181
|
+
let shouldRollback = iter > 0 && aggregate < bestScore;
|
|
5182
|
+
let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
|
|
5183
|
+
const bestLog = history.find((h) => h.iteration === bestIteration);
|
|
5184
|
+
if (iter > 0 && !shouldRollback && bestLog) {
|
|
5185
|
+
for (const [taskId, score] of Object.entries(results)) {
|
|
5186
|
+
const currValue = score.score ?? (score.pass ? 100 : 0);
|
|
5187
|
+
const bestTaskScore = bestLog.taskResults[taskId];
|
|
5188
|
+
const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
|
|
5189
|
+
const drop = bestValue - currValue;
|
|
5190
|
+
if (drop > evolveConfig.maxTaskDrop) {
|
|
5191
|
+
shouldRollback = true;
|
|
5192
|
+
rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
|
|
5193
|
+
onProgress?.({
|
|
5194
|
+
type: "task-regression",
|
|
5195
|
+
iteration: iter,
|
|
5196
|
+
taskId,
|
|
5197
|
+
score: currValue,
|
|
5198
|
+
message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
|
|
5199
|
+
});
|
|
5200
|
+
break;
|
|
5201
|
+
}
|
|
5202
|
+
}
|
|
5203
|
+
}
|
|
5204
|
+
if (shouldRollback) {
|
|
5176
5205
|
onProgress?.({
|
|
5177
5206
|
type: "rollback",
|
|
5178
5207
|
iteration: iter,
|
|
5179
5208
|
score: aggregate,
|
|
5180
|
-
message:
|
|
5209
|
+
message: rollbackMessage
|
|
5181
5210
|
});
|
|
5182
5211
|
const rollbackLog = {
|
|
5183
5212
|
iteration: iter,
|
|
@@ -5189,19 +5218,41 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5189
5218
|
};
|
|
5190
5219
|
await writeIterationLog(workspacePath, rollbackLog);
|
|
5191
5220
|
history.push(rollbackLog);
|
|
5221
|
+
const bestHarnessPath = path21.join(
|
|
5222
|
+
workspacePath,
|
|
5223
|
+
"iterations",
|
|
5224
|
+
bestIteration.toString(),
|
|
5225
|
+
"harness"
|
|
5226
|
+
);
|
|
5192
5227
|
if (iter + 1 < evolveConfig.maxIterations) {
|
|
5193
|
-
|
|
5194
|
-
|
|
5195
|
-
|
|
5196
|
-
|
|
5197
|
-
|
|
5198
|
-
|
|
5199
|
-
|
|
5200
|
-
|
|
5201
|
-
|
|
5202
|
-
|
|
5203
|
-
|
|
5204
|
-
|
|
5228
|
+
onProgress?.({ type: "proposing", iteration: iter, message: "Proposing new mutations after rollback" });
|
|
5229
|
+
try {
|
|
5230
|
+
let rollbackProposal = await propose(
|
|
5231
|
+
iter,
|
|
5232
|
+
workspacePath,
|
|
5233
|
+
bestHarnessPath,
|
|
5234
|
+
history,
|
|
5235
|
+
tasks,
|
|
5236
|
+
kairnConfig,
|
|
5237
|
+
evolveConfig.proposerModel
|
|
5238
|
+
);
|
|
5239
|
+
if (rollbackProposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
|
|
5240
|
+
rollbackProposal = {
|
|
5241
|
+
...rollbackProposal,
|
|
5242
|
+
mutations: rollbackProposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
|
|
5243
|
+
};
|
|
5244
|
+
}
|
|
5245
|
+
const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
|
|
5246
|
+
await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
|
|
5247
|
+
onProgress?.({
|
|
5248
|
+
type: "mutations-applied",
|
|
5249
|
+
iteration: iter,
|
|
5250
|
+
mutationCount: rollbackProposal.mutations.length
|
|
5251
|
+
});
|
|
5252
|
+
} catch {
|
|
5253
|
+
const nextIterDir2 = path21.join(workspacePath, "iterations", (iter + 1).toString());
|
|
5254
|
+
await copyDir(bestHarnessPath, path21.join(nextIterDir2, "harness"));
|
|
5255
|
+
}
|
|
5205
5256
|
}
|
|
5206
5257
|
continue;
|
|
5207
5258
|
}
|
|
@@ -5246,6 +5297,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5246
5297
|
kairnConfig,
|
|
5247
5298
|
evolveConfig.proposerModel
|
|
5248
5299
|
);
|
|
5300
|
+
if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
|
|
5301
|
+
proposal = {
|
|
5302
|
+
...proposal,
|
|
5303
|
+
mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
|
|
5304
|
+
};
|
|
5305
|
+
}
|
|
5249
5306
|
} catch (err) {
|
|
5250
5307
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5251
5308
|
onProgress?.({
|
|
@@ -5620,6 +5677,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
|
|
|
5620
5677
|
}
|
|
5621
5678
|
await fs23.rm(claudeDir, { recursive: true, force: true });
|
|
5622
5679
|
await copyDir(harnessPath, claudeDir);
|
|
5680
|
+
const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
|
|
5681
|
+
const projectMcpJson = path23.join(projectRoot, ".mcp.json");
|
|
5682
|
+
try {
|
|
5683
|
+
await fs23.access(harnessMcpJson);
|
|
5684
|
+
const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
|
|
5685
|
+
const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
|
|
5686
|
+
if (currentMcp !== targetMcp) {
|
|
5687
|
+
filesChanged.push(".mcp.json");
|
|
5688
|
+
}
|
|
5689
|
+
await fs23.copyFile(harnessMcpJson, projectMcpJson);
|
|
5690
|
+
} catch {
|
|
5691
|
+
}
|
|
5623
5692
|
return {
|
|
5624
5693
|
iteration: iter,
|
|
5625
5694
|
filesChanged,
|
|
@@ -5634,7 +5703,10 @@ var DEFAULT_CONFIG = {
|
|
|
5634
5703
|
scorer: "pass-fail",
|
|
5635
5704
|
maxIterations: 5,
|
|
5636
5705
|
parallelTasks: 1,
|
|
5637
|
-
runsPerTask: 1
|
|
5706
|
+
runsPerTask: 1,
|
|
5707
|
+
maxMutationsPerIteration: 3,
|
|
5708
|
+
pruneThreshold: 95,
|
|
5709
|
+
maxTaskDrop: 20
|
|
5638
5710
|
};
|
|
5639
5711
|
async function loadEvolveConfigFromWorkspace(workspacePath) {
|
|
5640
5712
|
try {
|
|
@@ -5646,7 +5718,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
|
|
|
5646
5718
|
scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
|
|
5647
5719
|
maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
|
|
5648
5720
|
parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
|
|
5649
|
-
runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
|
|
5721
|
+
runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
|
|
5722
|
+
maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
|
|
5723
|
+
pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
|
|
5724
|
+
maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
|
|
5650
5725
|
};
|
|
5651
5726
|
} catch {
|
|
5652
5727
|
return { ...DEFAULT_CONFIG };
|
|
@@ -5758,7 +5833,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
|
|
|
5758
5833
|
process.exit(1);
|
|
5759
5834
|
}
|
|
5760
5835
|
});
|
|
5761
|
-
evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
|
|
5836
|
+
evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
|
|
5762
5837
|
try {
|
|
5763
5838
|
const projectRoot = process.cwd();
|
|
5764
5839
|
const workspace = path24.join(projectRoot, ".kairn-evolve");
|
|
@@ -5839,6 +5914,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5839
5914
|
process.exit(1);
|
|
5840
5915
|
}
|
|
5841
5916
|
evolveConfig.parallelTasks = parallel;
|
|
5917
|
+
const maxMutations = parseInt(options.maxMutations ?? "3", 10);
|
|
5918
|
+
if (isNaN(maxMutations) || maxMutations < 1) {
|
|
5919
|
+
console.log(ui.error("--max-mutations must be a positive integer"));
|
|
5920
|
+
process.exit(1);
|
|
5921
|
+
}
|
|
5922
|
+
evolveConfig.maxMutationsPerIteration = maxMutations;
|
|
5923
|
+
const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
|
|
5924
|
+
if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
|
|
5925
|
+
console.log(ui.error("--prune-threshold must be 0-100"));
|
|
5926
|
+
process.exit(1);
|
|
5927
|
+
}
|
|
5928
|
+
evolveConfig.pruneThreshold = pruneThreshold;
|
|
5929
|
+
const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
|
|
5930
|
+
if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
|
|
5931
|
+
console.log(ui.error("--max-task-drop must be a positive integer"));
|
|
5932
|
+
process.exit(1);
|
|
5933
|
+
}
|
|
5934
|
+
evolveConfig.maxTaskDrop = maxTaskDrop;
|
|
5842
5935
|
try {
|
|
5843
5936
|
await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
|
|
5844
5937
|
} catch {
|
|
@@ -5877,7 +5970,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5877
5970
|
console.log(chalk14.dim(` ${event.message ?? ""}`));
|
|
5878
5971
|
break;
|
|
5879
5972
|
case "task-skipped":
|
|
5880
|
-
console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (
|
|
5973
|
+
console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
|
|
5974
|
+
break;
|
|
5975
|
+
case "task-regression":
|
|
5976
|
+
console.log(chalk14.yellow(` DROP ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
|
|
5881
5977
|
break;
|
|
5882
5978
|
case "task-scored": {
|
|
5883
5979
|
const taskScore = event.score ?? 0;
|