kairn-cli 2.2.7 → 2.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +93 -14
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -4741,13 +4741,20 @@ Treat .mcp.json like any other harness file \u2014 propose changes when traces s
|
|
|
4741
4741
|
the agent lacks a tool it needs, or has tools that add noise without benefit.
|
|
4742
4742
|
|
|
4743
4743
|
## Rules
|
|
4744
|
-
-
|
|
4744
|
+
- Propose AT MOST 3 mutations per iteration. Fewer, targeted mutations are more stable than many broad ones.
|
|
4745
4745
|
- Each mutation must have a clear rationale tied to a specific trace observation.
|
|
4746
4746
|
- Never remove something that's working for another task.
|
|
4747
4747
|
- If a previous iteration's change caused a regression, REVERT it.
|
|
4748
4748
|
- Consider both additions AND removals. Remove sections that add noise without improving task performance.
|
|
4749
4749
|
- Bloated harnesses hurt performance \u2014 trim what isn't earning its keep.
|
|
4750
4750
|
|
|
4751
|
+
## Anti-Gaming (CRITICAL)
|
|
4752
|
+
- Mutations must improve GENERAL-PURPOSE development quality, not target specific eval criteria.
|
|
4753
|
+
- You do NOT have access to scoring rubrics or expected outcomes. Diagnose problems from traces only.
|
|
4754
|
+
- Do NOT add over-specified rules that restate existing conventions with stronger emphasis (e.g., changing "use chalk.green for success" to "MUST use chalk.green, no exceptions"). If a convention already exists, trust it.
|
|
4755
|
+
- Do NOT add rules that only apply to a narrow eval scenario (e.g., write permissions for a specific directory just because one task needed it).
|
|
4756
|
+
- Ask: "Would this mutation help a developer working on ANY task in this project?" If not, don't propose it.
|
|
4757
|
+
|
|
4751
4758
|
Return ONLY valid JSON.`;
|
|
4752
4759
|
var STDOUT_TRUNCATION_LIMIT = 1e3;
|
|
4753
4760
|
var MAX_CONTEXT_CHARS = 1e5;
|
|
@@ -4806,8 +4813,6 @@ ${content}
|
|
|
4806
4813
|
`### Task: ${task.id}
|
|
4807
4814
|
- Template: ${task.template}
|
|
4808
4815
|
- Description: ${task.description}
|
|
4809
|
-
- Expected outcome: ${Array.isArray(task.expected_outcome) ? task.expected_outcome.join("; ") : task.expected_outcome}
|
|
4810
|
-
- Scoring: ${task.scoring}
|
|
4811
4816
|
`
|
|
4812
4817
|
);
|
|
4813
4818
|
}
|
|
@@ -4825,10 +4830,15 @@ ${content}
|
|
|
4825
4830
|
}
|
|
4826
4831
|
function buildTraceSection(traces, budget) {
|
|
4827
4832
|
if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
|
|
4833
|
+
const sortedTraces = [...traces].sort((a, b) => {
|
|
4834
|
+
const scoreA = a.score.score ?? (a.score.pass ? 100 : 0);
|
|
4835
|
+
const scoreB = b.score.score ?? (b.score.pass ? 100 : 0);
|
|
4836
|
+
return scoreA - scoreB;
|
|
4837
|
+
});
|
|
4828
4838
|
let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
|
|
4829
4839
|
for (let attempt = 0; attempt < 4; attempt++) {
|
|
4830
|
-
const parts = ["## Execution Traces\n"];
|
|
4831
|
-
for (const trace of traces) {
|
|
4840
|
+
const parts = ["## Execution Traces (sorted worst-first)\n"];
|
|
4841
|
+
for (const trace of sortedTraces) {
|
|
4832
4842
|
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4833
4843
|
const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
|
|
4834
4844
|
const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
|
|
@@ -5130,18 +5140,19 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5130
5140
|
const prevLog = history.length > 0 ? history[history.length - 1] : null;
|
|
5131
5141
|
let tasksToRun = tasks;
|
|
5132
5142
|
const carriedScores = {};
|
|
5143
|
+
const threshold = evolveConfig.pruneThreshold;
|
|
5133
5144
|
if (!isFirstIter && !isLastIter && prevLog) {
|
|
5134
5145
|
tasksToRun = [];
|
|
5135
5146
|
for (const task of tasks) {
|
|
5136
5147
|
const prevScore = prevLog.taskResults[task.id];
|
|
5137
5148
|
const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
|
|
5138
|
-
if (prevValue >=
|
|
5139
|
-
carriedScores[task.id] = { pass: true, score:
|
|
5149
|
+
if (prevValue >= threshold) {
|
|
5150
|
+
carriedScores[task.id] = { pass: true, score: prevValue };
|
|
5140
5151
|
onProgress?.({
|
|
5141
5152
|
type: "task-skipped",
|
|
5142
5153
|
iteration: iter,
|
|
5143
5154
|
taskId: task.id,
|
|
5144
|
-
message: `Skipped ${task.id} (scored
|
|
5155
|
+
message: `Skipped ${task.id} (scored ${prevValue.toFixed(0)}% >= ${threshold}% threshold)`
|
|
5145
5156
|
});
|
|
5146
5157
|
} else {
|
|
5147
5158
|
tasksToRun.push(task);
|
|
@@ -5167,12 +5178,35 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5167
5178
|
const aggregate = allScores.length > 0 ? total / allScores.length : 0;
|
|
5168
5179
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
5169
5180
|
if (iter === 0) baselineScore = aggregate;
|
|
5170
|
-
|
|
5181
|
+
let shouldRollback = iter > 0 && aggregate < bestScore;
|
|
5182
|
+
let rollbackMessage = shouldRollback ? `Regression: ${aggregate.toFixed(1)}% < ${bestScore.toFixed(1)}%. Rolling back.` : "";
|
|
5183
|
+
const bestLog = history.find((h) => h.iteration === bestIteration);
|
|
5184
|
+
if (iter > 0 && !shouldRollback && bestLog) {
|
|
5185
|
+
for (const [taskId, score] of Object.entries(results)) {
|
|
5186
|
+
const currValue = score.score ?? (score.pass ? 100 : 0);
|
|
5187
|
+
const bestTaskScore = bestLog.taskResults[taskId];
|
|
5188
|
+
const bestValue = bestTaskScore ? bestTaskScore.score ?? (bestTaskScore.pass ? 100 : 0) : currValue;
|
|
5189
|
+
const drop = bestValue - currValue;
|
|
5190
|
+
if (drop > evolveConfig.maxTaskDrop) {
|
|
5191
|
+
shouldRollback = true;
|
|
5192
|
+
rollbackMessage = `Task ${taskId} dropped ${drop.toFixed(0)} points (${bestValue.toFixed(0)}% \u2192 ${currValue.toFixed(0)}%). Rolling back.`;
|
|
5193
|
+
onProgress?.({
|
|
5194
|
+
type: "task-regression",
|
|
5195
|
+
iteration: iter,
|
|
5196
|
+
taskId,
|
|
5197
|
+
score: currValue,
|
|
5198
|
+
message: `dropped ${drop.toFixed(0)} points (limit: ${evolveConfig.maxTaskDrop})`
|
|
5199
|
+
});
|
|
5200
|
+
break;
|
|
5201
|
+
}
|
|
5202
|
+
}
|
|
5203
|
+
}
|
|
5204
|
+
if (shouldRollback) {
|
|
5171
5205
|
onProgress?.({
|
|
5172
5206
|
type: "rollback",
|
|
5173
5207
|
iteration: iter,
|
|
5174
5208
|
score: aggregate,
|
|
5175
|
-
message:
|
|
5209
|
+
message: rollbackMessage
|
|
5176
5210
|
});
|
|
5177
5211
|
const rollbackLog = {
|
|
5178
5212
|
iteration: iter,
|
|
@@ -5241,6 +5275,12 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5241
5275
|
kairnConfig,
|
|
5242
5276
|
evolveConfig.proposerModel
|
|
5243
5277
|
);
|
|
5278
|
+
if (proposal.mutations.length > evolveConfig.maxMutationsPerIteration) {
|
|
5279
|
+
proposal = {
|
|
5280
|
+
...proposal,
|
|
5281
|
+
mutations: proposal.mutations.slice(0, evolveConfig.maxMutationsPerIteration)
|
|
5282
|
+
};
|
|
5283
|
+
}
|
|
5244
5284
|
} catch (err) {
|
|
5245
5285
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5246
5286
|
onProgress?.({
|
|
@@ -5615,6 +5655,18 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration) {
|
|
|
5615
5655
|
}
|
|
5616
5656
|
await fs23.rm(claudeDir, { recursive: true, force: true });
|
|
5617
5657
|
await copyDir(harnessPath, claudeDir);
|
|
5658
|
+
const harnessMcpJson = path23.join(harnessPath, ".mcp.json");
|
|
5659
|
+
const projectMcpJson = path23.join(projectRoot, ".mcp.json");
|
|
5660
|
+
try {
|
|
5661
|
+
await fs23.access(harnessMcpJson);
|
|
5662
|
+
const currentMcp = await fs23.readFile(projectMcpJson, "utf-8").catch(() => null);
|
|
5663
|
+
const targetMcp = await fs23.readFile(harnessMcpJson, "utf-8").catch(() => null);
|
|
5664
|
+
if (currentMcp !== targetMcp) {
|
|
5665
|
+
filesChanged.push(".mcp.json");
|
|
5666
|
+
}
|
|
5667
|
+
await fs23.copyFile(harnessMcpJson, projectMcpJson);
|
|
5668
|
+
} catch {
|
|
5669
|
+
}
|
|
5618
5670
|
return {
|
|
5619
5671
|
iteration: iter,
|
|
5620
5672
|
filesChanged,
|
|
@@ -5629,7 +5681,10 @@ var DEFAULT_CONFIG = {
|
|
|
5629
5681
|
scorer: "pass-fail",
|
|
5630
5682
|
maxIterations: 5,
|
|
5631
5683
|
parallelTasks: 1,
|
|
5632
|
-
runsPerTask: 1
|
|
5684
|
+
runsPerTask: 1,
|
|
5685
|
+
maxMutationsPerIteration: 3,
|
|
5686
|
+
pruneThreshold: 95,
|
|
5687
|
+
maxTaskDrop: 20
|
|
5633
5688
|
};
|
|
5634
5689
|
async function loadEvolveConfigFromWorkspace(workspacePath) {
|
|
5635
5690
|
try {
|
|
@@ -5641,7 +5696,10 @@ async function loadEvolveConfigFromWorkspace(workspacePath) {
|
|
|
5641
5696
|
scorer: parsed.scorer ?? DEFAULT_CONFIG.scorer,
|
|
5642
5697
|
maxIterations: parsed.max_iterations ?? DEFAULT_CONFIG.maxIterations,
|
|
5643
5698
|
parallelTasks: parsed.parallel_tasks ?? DEFAULT_CONFIG.parallelTasks,
|
|
5644
|
-
runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask
|
|
5699
|
+
runsPerTask: parsed.runs_per_task ?? DEFAULT_CONFIG.runsPerTask,
|
|
5700
|
+
maxMutationsPerIteration: parsed.max_mutations_per_iteration ?? DEFAULT_CONFIG.maxMutationsPerIteration,
|
|
5701
|
+
pruneThreshold: parsed.prune_threshold ?? DEFAULT_CONFIG.pruneThreshold,
|
|
5702
|
+
maxTaskDrop: parsed.max_task_drop ?? DEFAULT_CONFIG.maxTaskDrop
|
|
5645
5703
|
};
|
|
5646
5704
|
} catch {
|
|
5647
5705
|
return { ...DEFAULT_CONFIG };
|
|
@@ -5753,7 +5811,7 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
|
|
|
5753
5811
|
process.exit(1);
|
|
5754
5812
|
}
|
|
5755
5813
|
});
|
|
5756
|
-
evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").action(async (options) => {
|
|
5814
|
+
evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").option("--runs <n>", "Run each task N times for variance measurement", "1").option("--parallel <n>", "Run up to N tasks concurrently", "1").option("--max-mutations <n>", "Max mutations per iteration", "3").option("--prune-threshold <n>", "Skip tasks scoring above this on middle iterations", "95").option("--max-task-drop <n>", "Roll back if any task drops more than N points", "20").action(async (options) => {
|
|
5757
5815
|
try {
|
|
5758
5816
|
const projectRoot = process.cwd();
|
|
5759
5817
|
const workspace = path24.join(projectRoot, ".kairn-evolve");
|
|
@@ -5834,6 +5892,24 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5834
5892
|
process.exit(1);
|
|
5835
5893
|
}
|
|
5836
5894
|
evolveConfig.parallelTasks = parallel;
|
|
5895
|
+
const maxMutations = parseInt(options.maxMutations ?? "3", 10);
|
|
5896
|
+
if (isNaN(maxMutations) || maxMutations < 1) {
|
|
5897
|
+
console.log(ui.error("--max-mutations must be a positive integer"));
|
|
5898
|
+
process.exit(1);
|
|
5899
|
+
}
|
|
5900
|
+
evolveConfig.maxMutationsPerIteration = maxMutations;
|
|
5901
|
+
const pruneThreshold = parseInt(options.pruneThreshold ?? "95", 10);
|
|
5902
|
+
if (isNaN(pruneThreshold) || pruneThreshold < 0 || pruneThreshold > 100) {
|
|
5903
|
+
console.log(ui.error("--prune-threshold must be 0-100"));
|
|
5904
|
+
process.exit(1);
|
|
5905
|
+
}
|
|
5906
|
+
evolveConfig.pruneThreshold = pruneThreshold;
|
|
5907
|
+
const maxTaskDrop = parseInt(options.maxTaskDrop ?? "20", 10);
|
|
5908
|
+
if (isNaN(maxTaskDrop) || maxTaskDrop < 1) {
|
|
5909
|
+
console.log(ui.error("--max-task-drop must be a positive integer"));
|
|
5910
|
+
process.exit(1);
|
|
5911
|
+
}
|
|
5912
|
+
evolveConfig.maxTaskDrop = maxTaskDrop;
|
|
5837
5913
|
try {
|
|
5838
5914
|
await fs24.access(path24.join(workspace, "iterations", "0", "harness"));
|
|
5839
5915
|
} catch {
|
|
@@ -5872,7 +5948,10 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5872
5948
|
console.log(chalk14.dim(` ${event.message ?? ""}`));
|
|
5873
5949
|
break;
|
|
5874
5950
|
case "task-skipped":
|
|
5875
|
-
console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (
|
|
5951
|
+
console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (above prune threshold last iteration)`));
|
|
5952
|
+
break;
|
|
5953
|
+
case "task-regression":
|
|
5954
|
+
console.log(chalk14.yellow(` DROP ${event.taskId ?? "unknown"} ${event.message ?? ""}`));
|
|
5876
5955
|
break;
|
|
5877
5956
|
case "task-scored": {
|
|
5878
5957
|
const taskScore = event.score ?? 0;
|