kairn-cli 2.2.6 → 2.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +42 -4
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -4748,6 +4748,13 @@ the agent lacks a tool it needs, or has tools that add noise without benefit.
|
|
|
4748
4748
|
- Consider both additions AND removals. Remove sections that add noise without improving task performance.
|
|
4749
4749
|
- Bloated harnesses hurt performance \u2014 trim what isn't earning its keep.
|
|
4750
4750
|
|
|
4751
|
+
## Anti-Gaming (CRITICAL)
|
|
4752
|
+
- Mutations must improve GENERAL-PURPOSE development quality, not target specific eval criteria.
|
|
4753
|
+
- You do NOT have access to scoring rubrics or expected outcomes. Diagnose problems from traces only.
|
|
4754
|
+
- Do NOT add over-specified rules that restate existing conventions with stronger emphasis (e.g., changing "use chalk.green for success" to "MUST use chalk.green, no exceptions"). If a convention already exists, trust it.
|
|
4755
|
+
- Do NOT add rules that only apply to a narrow eval scenario (e.g., write permissions for a specific directory just because one task needed it).
|
|
4756
|
+
- Ask: "Would this mutation help a developer working on ANY task in this project?" If not, don't propose it.
|
|
4757
|
+
|
|
4751
4758
|
Return ONLY valid JSON.`;
|
|
4752
4759
|
var STDOUT_TRUNCATION_LIMIT = 1e3;
|
|
4753
4760
|
var MAX_CONTEXT_CHARS = 1e5;
|
|
@@ -4806,8 +4813,6 @@ ${content}
|
|
|
4806
4813
|
`### Task: ${task.id}
|
|
4807
4814
|
- Template: ${task.template}
|
|
4808
4815
|
- Description: ${task.description}
|
|
4809
|
-
- Expected outcome: ${Array.isArray(task.expected_outcome) ? task.expected_outcome.join("; ") : task.expected_outcome}
|
|
4810
|
-
- Scoring: ${task.scoring}
|
|
4811
4816
|
`
|
|
4812
4817
|
);
|
|
4813
4818
|
}
|
|
@@ -5125,8 +5130,31 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5125
5130
|
break;
|
|
5126
5131
|
}
|
|
5127
5132
|
onProgress?.({ type: "iteration-start", iteration: iter });
|
|
5128
|
-
const { results, aggregate } = await evaluateAll(
|
|
5129
|
-
tasks,
|
|
5133
|
+
const isFirstIter = iter === 0;
|
|
5134
|
+
const isLastIter = iter === evolveConfig.maxIterations - 1;
|
|
5135
|
+
const prevLog = history.length > 0 ? history[history.length - 1] : null;
|
|
5136
|
+
let tasksToRun = tasks;
|
|
5137
|
+
const carriedScores = {};
|
|
5138
|
+
if (!isFirstIter && !isLastIter && prevLog) {
|
|
5139
|
+
tasksToRun = [];
|
|
5140
|
+
for (const task of tasks) {
|
|
5141
|
+
const prevScore = prevLog.taskResults[task.id];
|
|
5142
|
+
const prevValue = prevScore ? prevScore.score ?? (prevScore.pass ? 100 : 0) : 0;
|
|
5143
|
+
if (prevValue >= 100) {
|
|
5144
|
+
carriedScores[task.id] = { pass: true, score: 100 };
|
|
5145
|
+
onProgress?.({
|
|
5146
|
+
type: "task-skipped",
|
|
5147
|
+
iteration: iter,
|
|
5148
|
+
taskId: task.id,
|
|
5149
|
+
message: `Skipped ${task.id} (scored 100% last iteration)`
|
|
5150
|
+
});
|
|
5151
|
+
} else {
|
|
5152
|
+
tasksToRun.push(task);
|
|
5153
|
+
}
|
|
5154
|
+
}
|
|
5155
|
+
}
|
|
5156
|
+
const { results: evalResults, aggregate: evalAggregate } = await evaluateAll(
|
|
5157
|
+
tasksToRun,
|
|
5130
5158
|
harnessPath,
|
|
5131
5159
|
workspacePath,
|
|
5132
5160
|
iter,
|
|
@@ -5135,6 +5163,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
5135
5163
|
evolveConfig.runsPerTask,
|
|
5136
5164
|
evolveConfig.parallelTasks
|
|
5137
5165
|
);
|
|
5166
|
+
const results = { ...carriedScores, ...evalResults };
|
|
5167
|
+
const allScores = Object.values(results);
|
|
5168
|
+
const total = allScores.reduce(
|
|
5169
|
+
(sum, s) => sum + (s.score ?? (s.pass ? 100 : 0)),
|
|
5170
|
+
0
|
|
5171
|
+
);
|
|
5172
|
+
const aggregate = allScores.length > 0 ? total / allScores.length : 0;
|
|
5138
5173
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
5139
5174
|
if (iter === 0) baselineScore = aggregate;
|
|
5140
5175
|
if (iter > 0 && aggregate < bestScore) {
|
|
@@ -5841,6 +5876,9 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5841
5876
|
case "task-run":
|
|
5842
5877
|
console.log(chalk14.dim(` ${event.message ?? ""}`));
|
|
5843
5878
|
break;
|
|
5879
|
+
case "task-skipped":
|
|
5880
|
+
console.log(chalk14.dim(` SKIP ${event.taskId ?? "unknown"} (100% last iteration)`));
|
|
5881
|
+
break;
|
|
5844
5882
|
case "task-scored": {
|
|
5845
5883
|
const taskScore = event.score ?? 0;
|
|
5846
5884
|
const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
|