kairn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -66
- package/dist/cli.js +546 -75
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -221,7 +221,7 @@ var ui = {
|
|
|
221
221
|
// Key-value pairs
|
|
222
222
|
kv: (key, value) => ` ${chalk.cyan(key.padEnd(14))} ${value}`,
|
|
223
223
|
// File list
|
|
224
|
-
file: (
|
|
224
|
+
file: (path24) => chalk.dim(` ${path24}`),
|
|
225
225
|
// Tool display
|
|
226
226
|
tool: (name, reason) => ` ${warmStone("\u25CF")} ${chalk.bold(name)}
|
|
227
227
|
${chalk.dim(reason)}`,
|
|
@@ -3694,9 +3694,9 @@ var keysCommand = new Command10("keys").description("Add or update API keys for
|
|
|
3694
3694
|
import { Command as Command11 } from "commander";
|
|
3695
3695
|
import chalk14 from "chalk";
|
|
3696
3696
|
import ora2 from "ora";
|
|
3697
|
-
import
|
|
3698
|
-
import
|
|
3699
|
-
import { parse as
|
|
3697
|
+
import fs23 from "fs/promises";
|
|
3698
|
+
import path23 from "path";
|
|
3699
|
+
import { parse as yamlParse2 } from "yaml";
|
|
3700
3700
|
import { confirm as confirm3, select as select4 } from "@inquirer/prompts";
|
|
3701
3701
|
|
|
3702
3702
|
// src/evolve/init.ts
|
|
@@ -3741,30 +3741,55 @@ var EVAL_TEMPLATES = {
|
|
|
3741
3741
|
name: "Documentation",
|
|
3742
3742
|
description: "Can the agent write and update docs?",
|
|
3743
3743
|
bestFor: ["content", "api-building", "full-stack"]
|
|
3744
|
+
},
|
|
3745
|
+
"convention-adherence": {
|
|
3746
|
+
id: "convention-adherence",
|
|
3747
|
+
name: "Convention Adherence",
|
|
3748
|
+
description: "Does the agent follow all project conventions defined in CLAUDE.md?",
|
|
3749
|
+
bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
|
|
3750
|
+
},
|
|
3751
|
+
"workflow-compliance": {
|
|
3752
|
+
id: "workflow-compliance",
|
|
3753
|
+
name: "Workflow Compliance",
|
|
3754
|
+
description: "Does the agent use the project workflow commands and skills?",
|
|
3755
|
+
bestFor: ["feature-development", "full-stack", "tdd", "qa"]
|
|
3756
|
+
},
|
|
3757
|
+
"rule-compliance": {
|
|
3758
|
+
id: "rule-compliance",
|
|
3759
|
+
name: "Rule Compliance",
|
|
3760
|
+
description: "Does the agent follow all project rules without violations?",
|
|
3761
|
+
bestFor: ["feature-development", "backend", "maintenance", "architecture"]
|
|
3744
3762
|
}
|
|
3745
3763
|
};
|
|
3746
3764
|
/**
 * Pick the eval template ids to instantiate for a given workflow type.
 *
 * @param {string} workflowType - Workflow identifier such as "feature-development" or "tdd".
 * @returns {string[]} Template ids for that workflow. Workflow types that are
 *   not listed below get the general-purpose default list. A fresh array is
 *   returned on every call.
 */
function selectTemplatesForWorkflow(workflowType) {
  const mapping = {
    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
    "debugging": ["fix-bug", "test-writing", "rule-compliance"],
    "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
    "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
    "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
    "devops": ["config-change", "fix-bug", "rule-compliance"],
    "infrastructure": ["config-change", "refactor", "convention-adherence"],
    "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
    "content": ["documentation", "add-feature", "convention-adherence"],
    "research": ["documentation", "add-feature", "convention-adherence"]
  };
  // Only own keys count: a bare `mapping[workflowType]` lookup would resolve
  // names such as "constructor" or "toString" through Object.prototype and
  // hand the caller a function instead of a template list.
  if (Object.prototype.hasOwnProperty.call(mapping, workflowType)) {
    return mapping[workflowType];
  }
  return ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
}
|
|
3764
3782
|
var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
|
|
3765
3783
|
|
|
3766
3784
|
Each task must be realistic and testable against the actual project. Avoid generic placeholders.
|
|
3767
3785
|
|
|
3786
|
+
IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
|
|
3787
|
+
- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
|
|
3788
|
+
- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
|
|
3789
|
+
- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
|
|
3790
|
+
|
|
3791
|
+
These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
|
|
3792
|
+
|
|
3768
3793
|
Return a JSON object with a "tasks" array. Each task has:
|
|
3769
3794
|
- id: kebab-case identifier (e.g., "add-health-endpoint")
|
|
3770
3795
|
- template: which eval template this instantiates
|
|
@@ -4117,6 +4142,27 @@ async function writeIterationLog(workspacePath, log) {
|
|
|
4117
4142
|
"utf-8"
|
|
4118
4143
|
);
|
|
4119
4144
|
}
|
|
4145
|
+
/**
 * Rebuild an iteration log from the files stored under
 * `<workspacePath>/iterations/<iteration>/`.
 *
 * Every artifact is optional: a missing `scores.json`,
 * `proposer_reasoning.md` or `mutation_diff.patch` falls back to an empty
 * value, and a `scores.json` that is not a JSON object is treated the same
 * way as a missing one instead of aborting the load.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @param {number} iteration - Iteration number (used as the directory name).
 * @returns {Promise<object|null>} `null` when the iteration directory cannot
 *   be accessed; otherwise `{ iteration, score, taskResults, proposal,
 *   diffPatch, timestamp }`. `proposal` is reconstructed from the reasoning
 *   file only, so its `mutations` list is always empty; `timestamp` is
 *   always "".
 */
async function loadIterationLog(workspacePath, iteration) {
  const iterDir = path17.join(workspacePath, "iterations", iteration.toString());
  try {
    await fs17.access(iterDir);
  } catch {
    return null;
  }
  const readOptional = (fileName, fallback) =>
    fs17.readFile(path17.join(iterDir, fileName), "utf-8").catch(() => fallback);
  // The three files are independent, so read them concurrently.
  const [scoresStr, reasoning, diffPatch] = await Promise.all([
    readOptional("scores.json", "{}"),
    readOptional("proposer_reasoning.md", ""),
    readOptional("mutation_diff.patch", "")
  ]);
  let scoresData = {};
  try {
    const parsed = JSON.parse(scoresStr);
    if (parsed !== null && typeof parsed === "object") {
      scoresData = parsed;
    }
  } catch {
    // Corrupt or truncated scores.json: keep the same defaults as a missing file.
  }
  const proposal = reasoning ? { reasoning, mutations: [], expectedImpact: {} } : null;
  return {
    iteration,
    score: scoresData.score ?? 0,
    taskResults: scoresData.taskResults ?? {},
    proposal,
    diffPatch: diffPatch || null,
    timestamp: ""
  };
}
|
|
4120
4166
|
|
|
4121
4167
|
// src/evolve/exec.ts
|
|
4122
4168
|
import { exec } from "child_process";
|
|
@@ -4169,7 +4215,8 @@ ${msg}`);
|
|
|
4169
4215
|
details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
|
|
4170
4216
|
};
|
|
4171
4217
|
}
|
|
4172
|
-
const
|
|
4218
|
+
const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
|
|
4219
|
+
const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
|
|
4173
4220
|
const passed = !hasErrors;
|
|
4174
4221
|
return {
|
|
4175
4222
|
pass: passed,
|
|
@@ -4297,24 +4344,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
|
|
|
4297
4344
|
|
|
4298
4345
|
// src/evolve/runner.ts
|
|
4299
4346
|
var execAsync2 = promisify2(exec2);
|
|
4300
|
-
|
|
4347
|
+
var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
|
|
4348
|
+
/**
 * Create a throwaway working directory for one task run, with the harness
 * installed as its `.claude/` directory.
 *
 * First choice is a detached `git worktree` of HEAD in the OS temp dir. If
 * any step of that path fails (not a git work tree, git error, harness copy
 * error), the function falls back to copying the project with
 * `copyProjectDir` into a fresh temp directory.
 *
 * @param {string} projectRoot - Project directory to isolate.
 * @param {string} harnessPath - Directory copied into `<workDir>/.claude`.
 * @returns {Promise<{workDir: string, isWorktree: boolean}>} The workspace
 *   path and whether it is a git worktree (needed for cleanup).
 */
async function createIsolatedWorkspace(projectRoot, harnessPath) {
  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
  let worktreeAttempted = false;
  try {
    await execAsync2("git rev-parse --is-inside-work-tree", {
      cwd: projectRoot,
      timeout: 5e3
    });
    worktreeAttempted = true;
    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
      cwd: projectRoot,
      timeout: 3e4
    });
    // Replace whatever .claude/ HEAD contains with the harness under test.
    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
    return { workDir: worktreeDir, isWorktree: true };
  } catch {
    // Falling back to a plain copy is intentional, but a worktree that was
    // (partly) created must not stay registered in the repository or on disk.
    if (worktreeAttempted) {
      await cleanupIsolatedWorkspace(worktreeDir, true, projectRoot);
    }
  }
  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
  await copyProjectDir(projectRoot, tmpDir);
  await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
  await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
  return { workDir: tmpDir, isWorktree: false };
}
|
|
4371
|
+
/**
 * Copy the top level of a project into `dest`, leaving out every entry whose
 * name is in COPY_SKIP_DIRS. Directories are delegated to `copyDir`; all
 * other entries are copied with `fs.copyFile`.
 *
 * `dest` is created first. If `src` cannot be listed, the function returns
 * with only the empty `dest` in place.
 *
 * @param {string} src - Project directory to copy from.
 * @param {string} dest - Target directory (created if missing).
 * @returns {Promise<void>}
 */
async function copyProjectDir(src, dest) {
  await fs18.mkdir(dest, { recursive: true });
  let children;
  try {
    children = await fs18.readdir(src, { withFileTypes: true });
  } catch {
    // Unreadable source: nothing to copy.
    return;
  }
  for (const child of children) {
    if (!COPY_SKIP_DIRS.has(child.name)) {
      const from = path18.join(src, child.name);
      const to = path18.join(dest, child.name);
      await (child.isDirectory() ? copyDir(from, to) : fs18.copyFile(from, to));
    }
  }
}
|
|
4390
|
+
/**
 * Best-effort removal of a workspace made by `createIsolatedWorkspace`.
 *
 * Plain copies are deleted recursively. Worktrees are removed with
 * `git worktree remove --force`; if that fails, the directory is deleted
 * directly and `git worktree prune` is run. Failures of the deletion and of
 * the prune are ignored.
 *
 * @param {string} workDir - Workspace directory to remove.
 * @param {boolean} isWorktree - Whether `workDir` is a git worktree.
 * @param {string} projectRoot - Directory the git commands run in.
 * @returns {Promise<void>}
 */
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
  const ignoreFailure = () => {
  };
  const deleteDir = () => fs18.rm(workDir, { recursive: true, force: true }).catch(ignoreFailure);
  if (!isWorktree) {
    await deleteDir();
    return;
  }
  try {
    await execAsync2(`git worktree remove "${workDir}" --force`, {
      cwd: projectRoot,
      timeout: 1e4
    });
  } catch {
    // git could not remove it: delete the directory, then prune the stale entry.
    await deleteDir();
    await execAsync2("git worktree prune", {
      cwd: projectRoot,
      timeout: 5e3
    }).catch(ignoreFailure);
  }
}
|
|
4411
|
+
async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
|
|
4301
4412
|
await fs18.mkdir(traceDir, { recursive: true });
|
|
4302
4413
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4303
4414
|
const startMs = Date.now();
|
|
4304
|
-
const
|
|
4415
|
+
const root = projectRoot ?? process.cwd();
|
|
4416
|
+
const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
|
|
4305
4417
|
try {
|
|
4306
|
-
await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
|
|
4307
4418
|
let setupStderr = "";
|
|
4308
4419
|
if (task.setup.trim()) {
|
|
4309
4420
|
try {
|
|
4310
|
-
await execAsync2(task.setup, { cwd:
|
|
4421
|
+
await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
|
|
4311
4422
|
} catch (err) {
|
|
4312
4423
|
setupStderr = err instanceof Error ? err.message : String(err);
|
|
4313
4424
|
}
|
|
4314
4425
|
}
|
|
4315
|
-
const filesBefore = await snapshotFileList(
|
|
4316
|
-
const spawnResult = await spawnClaude(task.description,
|
|
4317
|
-
const filesAfter = await snapshotFileList(
|
|
4426
|
+
const filesBefore = await snapshotFileList(workDir);
|
|
4427
|
+
const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
|
|
4428
|
+
const filesAfter = await snapshotFileList(workDir);
|
|
4318
4429
|
const filesChanged = diffFileLists(filesBefore, filesAfter);
|
|
4319
4430
|
const toolCalls = parseToolCalls(spawnResult.stdout);
|
|
4320
4431
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -4338,8 +4449,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
|
|
|
4338
4449
|
traceDir
|
|
4339
4450
|
};
|
|
4340
4451
|
} finally {
|
|
4341
|
-
await
|
|
4342
|
-
});
|
|
4452
|
+
await cleanupIsolatedWorkspace(workDir, isWorktree, root);
|
|
4343
4453
|
}
|
|
4344
4454
|
}
|
|
4345
4455
|
async function spawnClaude(instruction, cwd, timeoutSec) {
|
|
@@ -4437,8 +4547,9 @@ function parseToolCalls(stdout) {
|
|
|
4437
4547
|
return [];
|
|
4438
4548
|
}
|
|
4439
4549
|
}
|
|
4440
|
-
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
|
|
4550
|
+
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
|
|
4441
4551
|
const results = {};
|
|
4552
|
+
const projectRoot = path18.resolve(workspacePath, "..");
|
|
4442
4553
|
for (const task of tasks) {
|
|
4443
4554
|
const traceDir = path18.join(
|
|
4444
4555
|
workspacePath,
|
|
@@ -4446,7 +4557,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4446
4557
|
iteration.toString(),
|
|
4447
4558
|
task.id
|
|
4448
4559
|
);
|
|
4449
|
-
|
|
4560
|
+
onProgress?.({ type: "task-start", iteration, taskId: task.id });
|
|
4561
|
+
const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
|
|
4450
4562
|
let score = taskResult.score;
|
|
4451
4563
|
if (config) {
|
|
4452
4564
|
const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
|
|
@@ -4455,6 +4567,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4455
4567
|
await writeScore(traceDir, score);
|
|
4456
4568
|
}
|
|
4457
4569
|
results[task.id] = score;
|
|
4570
|
+
onProgress?.({
|
|
4571
|
+
type: "task-scored",
|
|
4572
|
+
iteration,
|
|
4573
|
+
taskId: task.id,
|
|
4574
|
+
score: score.score ?? (score.pass ? 100 : 0)
|
|
4575
|
+
});
|
|
4458
4576
|
}
|
|
4459
4577
|
const scores = Object.values(results);
|
|
4460
4578
|
const total = scores.reduce(
|
|
@@ -4516,7 +4634,8 @@ Return a JSON object:
|
|
|
4516
4634
|
- Prefer ADDITIVE changes over replacements when possible.
|
|
4517
4635
|
|
|
4518
4636
|
Return ONLY valid JSON.`;
|
|
4519
|
-
var STDOUT_TRUNCATION_LIMIT =
|
|
4637
|
+
var STDOUT_TRUNCATION_LIMIT = 1e3;
|
|
4638
|
+
var MAX_CONTEXT_CHARS = 1e5;
|
|
4520
4639
|
async function readHarnessFiles(harnessPath) {
|
|
4521
4640
|
const result = {};
|
|
4522
4641
|
async function walk(dir, prefix) {
|
|
@@ -4550,26 +4669,25 @@ function truncateStdout(stdout, limit) {
|
|
|
4550
4669
|
${stdout.slice(-limit)}`;
|
|
4551
4670
|
}
|
|
4552
4671
|
function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
|
|
4553
|
-
const
|
|
4554
|
-
sections.push("## Current Harness Files\n");
|
|
4672
|
+
const harnessSection = ["## Current Harness Files\n"];
|
|
4555
4673
|
const fileEntries = Object.entries(harnessFiles);
|
|
4556
4674
|
if (fileEntries.length === 0) {
|
|
4557
|
-
|
|
4675
|
+
harnessSection.push("(No harness files found)\n");
|
|
4558
4676
|
} else {
|
|
4559
4677
|
for (const [filePath, content] of fileEntries) {
|
|
4560
|
-
|
|
4678
|
+
harnessSection.push(`### ${filePath}
|
|
4561
4679
|
\`\`\`
|
|
4562
4680
|
${content}
|
|
4563
4681
|
\`\`\`
|
|
4564
4682
|
`);
|
|
4565
4683
|
}
|
|
4566
4684
|
}
|
|
4567
|
-
|
|
4685
|
+
const taskSection = ["## Task Definitions\n"];
|
|
4568
4686
|
if (tasks.length === 0) {
|
|
4569
|
-
|
|
4687
|
+
taskSection.push("(No tasks defined)\n");
|
|
4570
4688
|
} else {
|
|
4571
4689
|
for (const task of tasks) {
|
|
4572
|
-
|
|
4690
|
+
taskSection.push(
|
|
4573
4691
|
`### Task: ${task.id}
|
|
4574
4692
|
- Template: ${task.template}
|
|
4575
4693
|
- Description: ${task.description}
|
|
@@ -4579,15 +4697,27 @@ ${content}
|
|
|
4579
4697
|
);
|
|
4580
4698
|
}
|
|
4581
4699
|
}
|
|
4582
|
-
|
|
4583
|
-
|
|
4584
|
-
|
|
4585
|
-
|
|
4700
|
+
const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
|
|
4701
|
+
const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
|
|
4702
|
+
if (remainingBudget <= 0) {
|
|
4703
|
+
return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
|
|
4704
|
+
}
|
|
4705
|
+
const traceBudget = Math.floor(remainingBudget * 0.7);
|
|
4706
|
+
const historyBudget = remainingBudget - traceBudget;
|
|
4707
|
+
const traceSection = buildTraceSection(traces, traceBudget);
|
|
4708
|
+
const historySection = buildHistorySection(history, historyBudget);
|
|
4709
|
+
return fixedContent + "\n" + traceSection + "\n" + historySection;
|
|
4710
|
+
}
|
|
4711
|
+
function buildTraceSection(traces, budget) {
|
|
4712
|
+
if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
|
|
4713
|
+
let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
|
|
4714
|
+
for (let attempt = 0; attempt < 4; attempt++) {
|
|
4715
|
+
const parts = ["## Execution Traces\n"];
|
|
4586
4716
|
for (const trace of traces) {
|
|
4587
4717
|
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4588
|
-
const truncatedStdout = truncateStdout(trace.stdout,
|
|
4718
|
+
const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
|
|
4589
4719
|
const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
|
|
4590
|
-
|
|
4720
|
+
parts.push(
|
|
4591
4721
|
`### Trace: ${trace.taskId}
|
|
4592
4722
|
- Pass: ${trace.score.pass}
|
|
4593
4723
|
- Score: ${scoreNum}
|
|
@@ -4595,36 +4725,55 @@ ${content}
|
|
|
4595
4725
|
` : "") + `- Duration: ${trace.timing.durationMs}ms
|
|
4596
4726
|
- Files changed:
|
|
4597
4727
|
${filesChangedList || " (none)"}
|
|
4598
|
-
- Stdout (last ${
|
|
4728
|
+
- Stdout (last ${stdoutLimit} chars):
|
|
4599
4729
|
\`\`\`
|
|
4600
4730
|
${truncatedStdout}
|
|
4601
4731
|
\`\`\`
|
|
4602
4732
|
`
|
|
4603
4733
|
);
|
|
4604
4734
|
}
|
|
4735
|
+
const result = parts.join("\n");
|
|
4736
|
+
if (result.length <= budget) return result;
|
|
4737
|
+
stdoutLimit = Math.floor(stdoutLimit / 2);
|
|
4605
4738
|
}
|
|
4606
|
-
|
|
4607
|
-
|
|
4608
|
-
|
|
4609
|
-
|
|
4610
|
-
|
|
4739
|
+
const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
|
|
4740
|
+
for (const trace of traces) {
|
|
4741
|
+
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4742
|
+
summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
|
|
4743
|
+
`);
|
|
4744
|
+
}
|
|
4745
|
+
return summary.join("\n");
|
|
4746
|
+
}
|
|
4747
|
+
/**
 * Render the "Iteration History" section of the proposer prompt so that it
 * fits within `budget` characters.
 *
 * The full history is tried first; while the rendering is too long, the
 * first remaining entry is dropped and a "(Showing x/y most recent
 * iterations)" note is added. If not even a single entry fits, a fixed
 * placeholder is returned.
 *
 * @param {Array<object>} history - Iteration logs with `iteration`, `score`,
 *   `taskResults` and an optional `proposal`.
 * @param {number} budget - Maximum length of the returned string.
 * @returns {string} Markdown text for the section.
 */
function buildHistorySection(history, budget) {
  if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
  const total = history.length;
  // One iteration becomes one chunk, plus a second chunk when it has a proposal.
  const describe = (log) => {
    const taskScores = Object.entries(log.taskResults).map(([id, s]) => ` - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
    const chunks = [`### Iteration ${log.iteration} \u2014 Score: ${log.score}\n- Task results:\n${taskScores}\n`];
    if (log.proposal) {
      chunks.push(`- Proposal reasoning: ${log.proposal.reasoning}\n- Mutations: ${log.proposal.mutations.length} change(s)\n`);
    }
    return chunks;
  };
  for (let dropped = 0; dropped < total; dropped++) {
    const kept = history.slice(dropped);
    const parts = ["## Iteration History\n"];
    if (dropped > 0) {
      parts.push(`(Showing ${kept.length}/${total} most recent iterations)\n`);
    }
    for (const log of kept) {
      parts.push(...describe(log));
    }
    const rendered = parts.join("\n");
    if (rendered.length <= budget) return rendered;
  }
  return "## Iteration History\n\n(History omitted to fit context budget)\n";
}
|
|
4629
4778
|
function parseProposerResponse(raw) {
|
|
4630
4779
|
let cleaned = raw.trim();
|
|
@@ -4836,7 +4985,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4836
4985
|
harnessPath,
|
|
4837
4986
|
workspacePath,
|
|
4838
4987
|
iter,
|
|
4839
|
-
kairnConfig
|
|
4988
|
+
kairnConfig,
|
|
4989
|
+
onProgress
|
|
4840
4990
|
);
|
|
4841
4991
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
4842
4992
|
if (iter === 0) baselineScore = aggregate;
|
|
@@ -4914,7 +5064,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4914
5064
|
kairnConfig,
|
|
4915
5065
|
evolveConfig.proposerModel
|
|
4916
5066
|
);
|
|
4917
|
-
} catch {
|
|
5067
|
+
} catch (err) {
|
|
5068
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5069
|
+
onProgress?.({
|
|
5070
|
+
type: "proposer-error",
|
|
5071
|
+
iteration: iter,
|
|
5072
|
+
message: `Proposer failed: ${errMsg}`
|
|
5073
|
+
});
|
|
4918
5074
|
const nextIterDir2 = path21.join(
|
|
4919
5075
|
workspacePath,
|
|
4920
5076
|
"iterations",
|
|
@@ -4978,6 +5134,215 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4978
5134
|
};
|
|
4979
5135
|
}
|
|
4980
5136
|
|
|
5137
|
+
// src/evolve/report.ts
|
|
5138
|
+
import fs22 from "fs/promises";
|
|
5139
|
+
import path22 from "path";
|
|
5140
|
+
|
|
5141
|
+
// src/evolve/diagnosis.ts
|
|
5142
|
+
/**
 * Reduce a task score record to a number.
 *
 * @param {{score?: number|null, pass?: boolean}} s - Score record.
 * @returns {number} `s.score` when it is neither null nor undefined;
 *   otherwise 100 for a truthy `pass` and 0 for a falsy one.
 */
function numericScore(s) {
  const explicit = s.score;
  if (explicit != null) {
    return explicit;
  }
  return s.pass ? 100 : 0;
}
|
|
5145
|
+
/**
 * Attribute per-task score changes between consecutive iterations to the
 * mutations proposed in the earlier one.
 *
 * For each adjacent pair (prev, curr), an entry is produced only when `prev`
 * carries a proposal with at least one mutation. A task missing from one
 * side of the pair counts as 0 on that side.
 *
 * @param {Array<object>} iterations - Iteration logs in ascending order, each
 *   with `taskResults` and an optional `proposal`.
 * @param {Array<object>} _tasks - Unused; kept for interface compatibility.
 * @returns {{entries: Array<{iteration: number, mutationSummary: string,
 *   helpedTasks: Array<{taskId: string, delta: number}>,
 *   hurtTasks: Array<{taskId: string, delta: number}>,
 *   netScoreDelta: number}>}} One entry per diagnosed pair. `netScoreDelta`
 *   is the sum of the per-task deltas.
 */
function diagnoseCounterfactuals(iterations, _tasks) {
  const scoreIn = (results, taskId) => results[taskId] ? numericScore(results[taskId]) : 0;
  const entries = [];
  for (let i = 1; i < iterations.length; i++) {
    const prev = iterations[i - 1];
    const curr = iterations[i];
    const proposal = prev.proposal;
    if (!proposal || proposal.mutations.length === 0) continue;
    const mutationSummary = proposal.mutations.map((m) => `${m.action} in ${m.file}: ${m.rationale}`).join("; ");
    const helpedTasks = [];
    const hurtTasks = [];
    const allTaskIds = /* @__PURE__ */ new Set([
      ...Object.keys(prev.taskResults),
      ...Object.keys(curr.taskResults)
    ]);
    let netDelta = 0;
    for (const taskId of allTaskIds) {
      const delta = scoreIn(curr.taskResults, taskId) - scoreIn(prev.taskResults, taskId);
      if (delta > 0) {
        helpedTasks.push({ taskId, delta });
      } else if (delta < 0) {
        hurtTasks.push({ taskId, delta });
      }
      netDelta += delta;
    }
    entries.push({
      // Label with the iteration's own number: the array index only matches
      // it when iterations are contiguous and start at 0.
      iteration: curr.iteration ?? i,
      mutationSummary,
      helpedTasks,
      hurtTasks,
      netScoreDelta: netDelta
    });
  }
  return { entries };
}
|
|
5182
|
+
|
|
5183
|
+
// src/evolve/report.ts
|
|
5184
|
+
import { parse as yamlParse } from "yaml";
|
|
5185
|
+
/**
 * Convert a task score record into a plain number for the report.
 *
 * @param {{score?: number|null, pass?: boolean}} s - Score record.
 * @returns {number} The explicit `score` unless it is null/undefined, in
 *   which case a truthy `pass` yields 100 and anything else yields 0.
 */
function numericScore2(s) {
  if (s.score === null || s.score === void 0) {
    return s.pass ? 100 : 0;
  }
  return s.score;
}
|
|
5188
|
+
/**
 * Load every iteration log stored under `<workspacePath>/iterations/`.
 *
 * Only entries whose name consists solely of digits are treated as
 * iterations, so stray names such as "1-backup" are not mistaken for
 * iteration 1. Names that denote the same number (e.g. "1" and "01") are
 * loaded once.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<Array<object>>} Iteration logs in ascending iteration
 *   order; empty when the iterations directory cannot be read. Iterations
 *   for which `loadIterationLog` returns a falsy value are left out.
 */
async function loadAllIterations(workspacePath) {
  const iterDir = path22.join(workspacePath, "iterations");
  let entries;
  try {
    entries = await fs22.readdir(iterDir);
  } catch {
    return [];
  }
  const numbered = entries.filter((name) => /^\d+$/.test(name)).map((name) => Number.parseInt(name, 10));
  const iterNums = [...new Set(numbered)].sort((a, b) => a - b);
  const iterations = [];
  for (const n of iterNums) {
    const log = await loadIterationLog(workspacePath, n);
    if (log) iterations.push(log);
  }
  return iterations;
}
|
|
5204
|
+
/**
 * Read the task list from `<workspacePath>/tasks.yaml`.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<Array<object>>} The `tasks` array of the parsed file.
 *   An empty array is returned when the file cannot be read or parsed, or
 *   when `tasks` is absent or not an array — callers iterate the result
 *   with array methods.
 */
async function loadTasks(workspacePath) {
  try {
    const content = await fs22.readFile(path22.join(workspacePath, "tasks.yaml"), "utf-8");
    const parsed = yamlParse(content);
    const tasks = parsed?.tasks;
    return Array.isArray(tasks) ? tasks : [];
  } catch {
    return [];
  }
}
|
|
5213
|
+
/**
 * Build one leaderboard row per task, with the task's score in every
 * iteration that has a result for it.
 *
 * @param {Array<object>} iterations - Iteration logs (`iteration`, `taskResults`).
 * @param {Array<{id: string}>} tasks - Task definitions; row order follows this list.
 * @returns {Array<{taskId: string, scores: Object<number, number>,
 *   bestIteration: number, bestScore: number}>} Rows where `scores` maps
 *   iteration number to score. `bestScore` stays -1 and `bestIteration` 0
 *   when no score above -1 was recorded; on ties the earliest iteration wins.
 */
function buildLeaderboard(iterations, tasks) {
  const rows = [];
  for (const taskId of tasks.map((task) => task.id)) {
    const row = { taskId, scores: {}, bestIteration: 0, bestScore: -1 };
    for (const iter of iterations) {
      const outcome = iter.taskResults[taskId];
      if (!outcome) continue;
      const value = numericScore2(outcome);
      row.scores[iter.iteration] = value;
      // Strict comparison keeps the first iteration that reached the maximum.
      if (value > row.bestScore) {
        row.bestScore = value;
        row.bestIteration = iter.iteration;
      }
    }
    rows.push(row);
  }
  return rows;
}
|
|
5233
|
+
/**
 * Classify an iteration for the report tables.
 *
 * The checks run in order and the first match wins:
 * iteration 0 is "baseline"; no proposal and no diff patch is "rollback";
 * a score of 100 or more is "perfect"; the best-scoring iteration is
 * "best"; anything else is "evaluated".
 *
 * @param {object} iter - Iteration log (`iteration`, `proposal`, `diffPatch`, `score`).
 * @param {number} bestIteration - Iteration number of the best-scoring run.
 * @returns {"baseline"|"rollback"|"perfect"|"best"|"evaluated"} Status label.
 */
function iterationStatus(iter, bestIteration) {
  const rules = [
    ["baseline", () => iter.iteration === 0],
    ["rollback", () => !iter.proposal && !iter.diffPatch],
    ["perfect", () => iter.score >= 100],
    ["best", () => iter.iteration === bestIteration]
  ];
  const matched = rules.find(([, applies]) => applies());
  return matched ? matched[0] : "evaluated";
}
|
|
5240
|
+
/**
 * Render the evolution run stored in a workspace as a Markdown report.
 *
 * Sections, in order: Overview (totals, baseline and best score), Iterations
 * (score, mutation count and status per iteration), Leaderboard (per-task
 * score in each iteration, only when tasks exist) and Counterfactual
 * Diagnosis (only when there are diagnosis entries).
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<string>} The Markdown document, or a short "no
 *   iterations" notice when the workspace holds no iteration logs.
 */
async function generateMarkdownReport(workspacePath) {
  const iterations = await loadAllIterations(workspacePath);
  const tasks = await loadTasks(workspacePath);
  if (iterations.length === 0) {
    return "# Evolution Report\n\nNo iterations found. Run `kairn evolve run` first.\n";
  }
  const baselineScore = iterations[0].score;
  // Strict comparison: on ties the earliest iteration is reported as best.
  const bestIter = iterations.reduce((best, curr) => curr.score > best.score ? curr : best, iterations[0]);
  const improvement = bestIter.score - baselineScore;
  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
  const leaderboard = buildLeaderboard(iterations, tasks);
  const lines = [];
  lines.push("# Evolution Report");
  lines.push("");
  lines.push("## Overview");
  lines.push("");
  lines.push(`| Metric | Value |`);
  lines.push(`|--------|-------|`);
  lines.push(`| Total iterations | ${iterations.length} |`);
  lines.push(`| Baseline score | ${baselineScore.toFixed(1)}% |`);
  lines.push(`| Best score | ${bestIter.score.toFixed(1)}% |`);
  lines.push(`| Best iteration | ${bestIter.iteration} |`);
  lines.push(`| Improvement | ${improvement >= 0 ? "+" : ""}${improvement.toFixed(1)} points |`);
  lines.push("");
  lines.push("## Iterations");
  lines.push("");
  lines.push("| Iter | Score | Mutations | Status |");
  lines.push("|------|-------|-----------|--------|");
  for (const iter of iterations) {
    const mutations = iter.proposal?.mutations.length ?? 0;
    const mutStr = mutations > 0 ? mutations.toString() : "-";
    const status = iterationStatus(iter, bestIter.iteration);
    lines.push(`| ${iter.iteration} | ${iter.score.toFixed(1)}% | ${mutStr} | ${status} |`);
  }
  lines.push("");
  if (leaderboard.length > 0) {
    lines.push("## Leaderboard");
    lines.push("");
    const iterNums = iterations.map((i) => i.iteration);
    const headerCols = ["Task", ...iterNums.map((n) => `Iter ${n}`), "Best"];
    lines.push(`| ${headerCols.join(" | ")} |`);
    lines.push(`| ${headerCols.map(() => "---").join(" | ")} |`);
    for (const entry of leaderboard) {
      const scoreCols = iterNums.map((n) => {
        const s = entry.scores[n];
        return s !== void 0 ? `${s.toFixed(0)}%` : "-";
      });
      // A task that was never scored still carries the leaderboard's -1
      // placeholder; show "-" instead of a misleading "-1% (iter 0)".
      const hasScores = Object.keys(entry.scores).length > 0;
      const bestCol = hasScores ? `${entry.bestScore.toFixed(0)}% (iter ${entry.bestIteration})` : "-";
      lines.push(`| ${entry.taskId} | ${scoreCols.join(" | ")} | ${bestCol} |`);
    }
    lines.push("");
  }
  if (counterfactuals.entries.length > 0) {
    lines.push("## Counterfactual Diagnosis");
    lines.push("");
    for (const entry of counterfactuals.entries) {
      const sign = entry.netScoreDelta >= 0 ? "+" : "";
      lines.push(`### Iteration ${entry.iteration} (net ${sign}${entry.netScoreDelta.toFixed(1)} points)`);
      lines.push("");
      lines.push(`**Mutations:** ${entry.mutationSummary}`);
      lines.push("");
      if (entry.helpedTasks.length > 0) {
        lines.push("**Helped:**");
        for (const t of entry.helpedTasks) {
          lines.push(`- ${t.taskId}: +${t.delta.toFixed(1)}`);
        }
        lines.push("");
      }
      if (entry.hurtTasks.length > 0) {
        lines.push("**Hurt:**");
        for (const t of entry.hurtTasks) {
          // Negative deltas already carry their sign.
          lines.push(`- ${t.taskId}: ${t.delta.toFixed(1)}`);
        }
        lines.push("");
      }
    }
  }
  return lines.join("\n");
}
|
|
5318
|
+
/**
 * Produce the evolution report for a workspace as a plain object suitable
 * for JSON serialization.
 *
 * @param {string} workspacePath - Evolve workspace directory.
 * @returns {Promise<{overview: object, iterations: Array<object>,
 *   leaderboard: Array<object>, counterfactuals: object}>} Report data. With
 *   no iterations, the overview reports zero scores and best iteration 0.
 */
async function generateJsonReport(workspacePath) {
  const iterations = await loadAllIterations(workspacePath);
  const tasks = await loadTasks(workspacePath);
  let baselineScore = 0;
  let bestIter = { score: 0, iteration: 0 };
  if (iterations.length > 0) {
    baselineScore = iterations[0].score;
    bestIter = iterations[0];
    // Only a strictly higher score replaces the current best.
    for (const candidate of iterations) {
      if (candidate.score > bestIter.score) {
        bestIter = candidate;
      }
    }
  }
  const improvement = bestIter.score - baselineScore;
  const counterfactuals = diagnoseCounterfactuals(iterations, tasks);
  const leaderboard = buildLeaderboard(iterations, tasks);
  const overview = {
    title: "Evolution Report",
    totalIterations: iterations.length,
    baselineScore,
    bestScore: bestIter.score,
    bestIteration: bestIter.iteration,
    improvement
  };
  const iterationRows = iterations.map((iter) => {
    return {
      iteration: iter.iteration,
      score: iter.score,
      mutationCount: iter.proposal?.mutations.length ?? 0,
      status: iterationStatus(iter, bestIter.iteration)
    };
  });
  return {
    overview,
    iterations: iterationRows,
    leaderboard,
    counterfactuals
  };
}
|
|
5345
|
+
|
|
4981
5346
|
// src/commands/evolve.ts
|
|
4982
5347
|
var DEFAULT_CONFIG = {
|
|
4983
5348
|
model: "claude-sonnet-4-6",
|
|
@@ -4988,8 +5353,8 @@ var DEFAULT_CONFIG = {
|
|
|
4988
5353
|
};
|
|
4989
5354
|
async function loadEvolveConfigFromWorkspace(workspacePath) {
|
|
4990
5355
|
try {
|
|
4991
|
-
const configStr = await
|
|
4992
|
-
const parsed =
|
|
5356
|
+
const configStr = await fs23.readFile(path23.join(workspacePath, "config.yaml"), "utf-8");
|
|
5357
|
+
const parsed = yamlParse2(configStr);
|
|
4993
5358
|
return {
|
|
4994
5359
|
model: parsed.model ?? DEFAULT_CONFIG.model,
|
|
4995
5360
|
proposerModel: parsed.proposer_model ?? DEFAULT_CONFIG.proposerModel,
|
|
@@ -5006,9 +5371,9 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
|
|
|
5006
5371
|
try {
|
|
5007
5372
|
const projectRoot = process.cwd();
|
|
5008
5373
|
console.log(ui.section("Evolve Init"));
|
|
5009
|
-
const claudeDir =
|
|
5374
|
+
const claudeDir = path23.join(projectRoot, ".claude");
|
|
5010
5375
|
try {
|
|
5011
|
-
await
|
|
5376
|
+
await fs23.access(claudeDir);
|
|
5012
5377
|
} catch {
|
|
5013
5378
|
console.log(ui.error("No .claude/ directory found. Run kairn describe first."));
|
|
5014
5379
|
process.exit(1);
|
|
@@ -5058,7 +5423,7 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
|
|
|
5058
5423
|
if (config) {
|
|
5059
5424
|
let claudeMd = "";
|
|
5060
5425
|
try {
|
|
5061
|
-
claudeMd = await
|
|
5426
|
+
claudeMd = await fs23.readFile(path23.join(claudeDir, "CLAUDE.md"), "utf-8");
|
|
5062
5427
|
} catch {
|
|
5063
5428
|
}
|
|
5064
5429
|
const profile = await buildProjectProfile(projectRoot);
|
|
@@ -5089,16 +5454,16 @@ evolveCommand.command("init").description("Initialize an evolution workspace wit
|
|
|
5089
5454
|
evolveCommand.command("baseline").description("Snapshot current .claude/ directory as baseline").action(async () => {
|
|
5090
5455
|
try {
|
|
5091
5456
|
const projectRoot = process.cwd();
|
|
5092
|
-
const workspace =
|
|
5457
|
+
const workspace = path23.join(projectRoot, ".kairn-evolve");
|
|
5093
5458
|
console.log(ui.section("Evolve Baseline"));
|
|
5094
5459
|
try {
|
|
5095
|
-
await
|
|
5460
|
+
await fs23.access(workspace);
|
|
5096
5461
|
} catch {
|
|
5097
5462
|
console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
|
|
5098
5463
|
process.exit(1);
|
|
5099
5464
|
}
|
|
5100
5465
|
await snapshotBaseline(projectRoot, workspace);
|
|
5101
|
-
const baselineDir =
|
|
5466
|
+
const baselineDir = path23.join(workspace, "baseline");
|
|
5102
5467
|
const fileCount = await countFiles(baselineDir);
|
|
5103
5468
|
console.log(ui.success(`Baseline snapshot created (${fileCount} files)`));
|
|
5104
5469
|
} catch (err) {
|
|
@@ -5110,23 +5475,23 @@ evolveCommand.command("baseline").description("Snapshot current .claude/ directo
|
|
|
5110
5475
|
evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").option("--iterations <n>", "Number of evolution iterations", "5").action(async (options) => {
|
|
5111
5476
|
try {
|
|
5112
5477
|
const projectRoot = process.cwd();
|
|
5113
|
-
const workspace =
|
|
5478
|
+
const workspace = path23.join(projectRoot, ".kairn-evolve");
|
|
5114
5479
|
console.log(ui.section("Evolve Run"));
|
|
5115
5480
|
try {
|
|
5116
|
-
await
|
|
5481
|
+
await fs23.access(workspace);
|
|
5117
5482
|
} catch {
|
|
5118
5483
|
console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
|
|
5119
5484
|
process.exit(1);
|
|
5120
5485
|
}
|
|
5121
|
-
const tasksPath =
|
|
5486
|
+
const tasksPath = path23.join(workspace, "tasks.yaml");
|
|
5122
5487
|
let tasksContent;
|
|
5123
5488
|
try {
|
|
5124
|
-
tasksContent = await
|
|
5489
|
+
tasksContent = await fs23.readFile(tasksPath, "utf-8");
|
|
5125
5490
|
} catch {
|
|
5126
5491
|
console.log(ui.error("No tasks.yaml found. Run kairn evolve init first."));
|
|
5127
5492
|
process.exit(1);
|
|
5128
5493
|
}
|
|
5129
|
-
const parsed =
|
|
5494
|
+
const parsed = yamlParse2(tasksContent);
|
|
5130
5495
|
if (!parsed?.tasks || parsed.tasks.length === 0) {
|
|
5131
5496
|
console.log(ui.error("No tasks found in tasks.yaml"));
|
|
5132
5497
|
process.exit(1);
|
|
@@ -5140,15 +5505,15 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5140
5505
|
console.log(ui.info(`Running ${tasksToRun.length} task(s)...`));
|
|
5141
5506
|
console.log("");
|
|
5142
5507
|
const config = await loadConfig();
|
|
5143
|
-
const harnessPath =
|
|
5508
|
+
const harnessPath = path23.join(projectRoot, ".claude");
|
|
5144
5509
|
const results = [];
|
|
5145
5510
|
for (const task of tasksToRun) {
|
|
5146
|
-
const traceDir =
|
|
5511
|
+
const traceDir = path23.join(workspace, "traces", "0", task.id);
|
|
5147
5512
|
const spinner = ora2(`Running: ${task.id}`).start();
|
|
5148
5513
|
const result = await runTask(task, harnessPath, traceDir, 0);
|
|
5149
5514
|
if (config) {
|
|
5150
|
-
const stdout = await
|
|
5151
|
-
const stderr = await
|
|
5515
|
+
const stdout = await fs23.readFile(path23.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
|
|
5516
|
+
const stderr = await fs23.readFile(path23.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
|
|
5152
5517
|
const score = await scoreTask(task, traceDir, stdout, stderr, config);
|
|
5153
5518
|
result.score = score;
|
|
5154
5519
|
await writeScore(traceDir, score);
|
|
@@ -5177,7 +5542,7 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5177
5542
|
}
|
|
5178
5543
|
evolveConfig.maxIterations = iterations;
|
|
5179
5544
|
try {
|
|
5180
|
-
await
|
|
5545
|
+
await fs23.access(path23.join(workspace, "iterations", "0", "harness"));
|
|
5181
5546
|
} catch {
|
|
5182
5547
|
console.log(ui.error("No baseline harness found. Run kairn evolve baseline first."));
|
|
5183
5548
|
process.exit(1);
|
|
@@ -5204,6 +5569,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5204
5569
|
case "perfect-score":
|
|
5205
5570
|
console.log(chalk14.green(" Perfect score. Stopping."));
|
|
5206
5571
|
break;
|
|
5572
|
+
case "proposer-error":
|
|
5573
|
+
console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
|
|
5574
|
+
break;
|
|
5575
|
+
case "task-start":
|
|
5576
|
+
console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
|
|
5577
|
+
break;
|
|
5578
|
+
case "task-scored": {
|
|
5579
|
+
const taskScore = event.score ?? 0;
|
|
5580
|
+
const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
|
|
5581
|
+
console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
|
|
5582
|
+
break;
|
|
5583
|
+
}
|
|
5207
5584
|
case "complete":
|
|
5208
5585
|
break;
|
|
5209
5586
|
}
|
|
@@ -5238,13 +5615,107 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5238
5615
|
process.exit(1);
|
|
5239
5616
|
}
|
|
5240
5617
|
});
|
|
5618
|
+
evolveCommand.command("report").description("Generate a summary report of the evolution run").option("--json", "Output machine-readable JSON instead of Markdown").action(async (options) => {
|
|
5619
|
+
try {
|
|
5620
|
+
const projectRoot = process.cwd();
|
|
5621
|
+
const workspace = path23.join(projectRoot, ".kairn-evolve");
|
|
5622
|
+
try {
|
|
5623
|
+
await fs23.access(workspace);
|
|
5624
|
+
} catch {
|
|
5625
|
+
console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
|
|
5626
|
+
process.exit(1);
|
|
5627
|
+
}
|
|
5628
|
+
if (options.json) {
|
|
5629
|
+
const report = await generateJsonReport(workspace);
|
|
5630
|
+
console.log(JSON.stringify(report, null, 2));
|
|
5631
|
+
} else {
|
|
5632
|
+
const markdown = await generateMarkdownReport(workspace);
|
|
5633
|
+
console.log(markdown);
|
|
5634
|
+
}
|
|
5635
|
+
} catch (err) {
|
|
5636
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
5637
|
+
console.log(ui.error(msg));
|
|
5638
|
+
process.exit(1);
|
|
5639
|
+
}
|
|
5640
|
+
});
|
|
5641
|
+
evolveCommand.command("diff <iter1> <iter2>").description("Show harness changes between two iterations").action(async (iter1Str, iter2Str) => {
|
|
5642
|
+
try {
|
|
5643
|
+
const projectRoot = process.cwd();
|
|
5644
|
+
const workspace = path23.join(projectRoot, ".kairn-evolve");
|
|
5645
|
+
const iter1 = parseInt(iter1Str, 10);
|
|
5646
|
+
const iter2 = parseInt(iter2Str, 10);
|
|
5647
|
+
if (isNaN(iter1) || isNaN(iter2)) {
|
|
5648
|
+
console.log(ui.error("Both arguments must be integers (iteration numbers)"));
|
|
5649
|
+
process.exit(1);
|
|
5650
|
+
}
|
|
5651
|
+
const harness1 = path23.join(workspace, "iterations", iter1.toString(), "harness");
|
|
5652
|
+
const harness2 = path23.join(workspace, "iterations", iter2.toString(), "harness");
|
|
5653
|
+
try {
|
|
5654
|
+
await fs23.access(harness1);
|
|
5655
|
+
} catch {
|
|
5656
|
+
console.log(ui.error(`Iteration ${iter1} harness not found at ${harness1}`));
|
|
5657
|
+
process.exit(1);
|
|
5658
|
+
}
|
|
5659
|
+
try {
|
|
5660
|
+
await fs23.access(harness2);
|
|
5661
|
+
} catch {
|
|
5662
|
+
console.log(ui.error(`Iteration ${iter2} harness not found at ${harness2}`));
|
|
5663
|
+
process.exit(1);
|
|
5664
|
+
}
|
|
5665
|
+
console.log(ui.section(`Diff: Iteration ${iter1} \u2192 ${iter2}`));
|
|
5666
|
+
const diffPatch = await generateDiff2(harness1, harness2);
|
|
5667
|
+
if (!diffPatch) {
|
|
5668
|
+
console.log(chalk14.dim(" No harness changes between these iterations."));
|
|
5669
|
+
} else {
|
|
5670
|
+
for (const line of diffPatch.split("\n")) {
|
|
5671
|
+
if (line.startsWith("---") || line.startsWith("+++")) {
|
|
5672
|
+
console.log(chalk14.bold(line));
|
|
5673
|
+
} else if (line.startsWith("+")) {
|
|
5674
|
+
console.log(chalk14.green(line));
|
|
5675
|
+
} else if (line.startsWith("-")) {
|
|
5676
|
+
console.log(chalk14.red(line));
|
|
5677
|
+
} else {
|
|
5678
|
+
console.log(line);
|
|
5679
|
+
}
|
|
5680
|
+
}
|
|
5681
|
+
}
|
|
5682
|
+
const [log1, log2] = await Promise.all([
|
|
5683
|
+
loadIterationLog(workspace, iter1),
|
|
5684
|
+
loadIterationLog(workspace, iter2)
|
|
5685
|
+
]);
|
|
5686
|
+
if (log1 && log2) {
|
|
5687
|
+
console.log("");
|
|
5688
|
+
console.log(ui.section("Score Comparison"));
|
|
5689
|
+
console.log("");
|
|
5690
|
+
console.log(" Task Iter " + iter1 + " Iter " + iter2 + " Delta");
|
|
5691
|
+
const allTaskIds = /* @__PURE__ */ new Set([
|
|
5692
|
+
...Object.keys(log1.taskResults),
|
|
5693
|
+
...Object.keys(log2.taskResults)
|
|
5694
|
+
]);
|
|
5695
|
+
for (const taskId of [...allTaskIds].sort()) {
|
|
5696
|
+
const s1 = log1.taskResults[taskId];
|
|
5697
|
+
const s2 = log2.taskResults[taskId];
|
|
5698
|
+
const score1 = s1 ? s1.score ?? (s1.pass ? 100 : 0) : 0;
|
|
5699
|
+
const score2 = s2 ? s2.score ?? (s2.pass ? 100 : 0) : 0;
|
|
5700
|
+
const delta = score2 - score1;
|
|
5701
|
+
const deltaStr = delta > 0 ? chalk14.green(`+${delta.toFixed(0)}`) : delta < 0 ? chalk14.red(delta.toFixed(0).toString()) : chalk14.dim("0");
|
|
5702
|
+
const name = taskId.padEnd(30);
|
|
5703
|
+
console.log(` ${name} ${score1.toFixed(0).padStart(5)}% ${score2.toFixed(0).padStart(5)}% ${deltaStr}`);
|
|
5704
|
+
}
|
|
5705
|
+
}
|
|
5706
|
+
} catch (err) {
|
|
5707
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
5708
|
+
console.log(ui.error(msg));
|
|
5709
|
+
process.exit(1);
|
|
5710
|
+
}
|
|
5711
|
+
});
|
|
5241
5712
|
async function countFiles(dir) {
|
|
5242
5713
|
let count = 0;
|
|
5243
5714
|
try {
|
|
5244
|
-
const entries = await
|
|
5715
|
+
const entries = await fs23.readdir(dir, { withFileTypes: true });
|
|
5245
5716
|
for (const entry of entries) {
|
|
5246
5717
|
if (entry.isDirectory()) {
|
|
5247
|
-
count += await countFiles(
|
|
5718
|
+
count += await countFiles(path23.join(dir, entry.name));
|
|
5248
5719
|
} else {
|
|
5249
5720
|
count++;
|
|
5250
5721
|
}
|