kairn-cli 2.2.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +198 -51
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -3741,30 +3741,55 @@ var EVAL_TEMPLATES = {
|
|
|
3741
3741
|
name: "Documentation",
|
|
3742
3742
|
description: "Can the agent write and update docs?",
|
|
3743
3743
|
bestFor: ["content", "api-building", "full-stack"]
|
|
3744
|
+
},
|
|
3745
|
+
"convention-adherence": {
|
|
3746
|
+
id: "convention-adherence",
|
|
3747
|
+
name: "Convention Adherence",
|
|
3748
|
+
description: "Does the agent follow all project conventions defined in CLAUDE.md?",
|
|
3749
|
+
bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
|
|
3750
|
+
},
|
|
3751
|
+
"workflow-compliance": {
|
|
3752
|
+
id: "workflow-compliance",
|
|
3753
|
+
name: "Workflow Compliance",
|
|
3754
|
+
description: "Does the agent use the project workflow commands and skills?",
|
|
3755
|
+
bestFor: ["feature-development", "full-stack", "tdd", "qa"]
|
|
3756
|
+
},
|
|
3757
|
+
"rule-compliance": {
|
|
3758
|
+
id: "rule-compliance",
|
|
3759
|
+
name: "Rule Compliance",
|
|
3760
|
+
description: "Does the agent follow all project rules without violations?",
|
|
3761
|
+
bestFor: ["feature-development", "backend", "maintenance", "architecture"]
|
|
3744
3762
|
}
|
|
3745
3763
|
};
|
|
3746
3764
|
/**
 * Pick the eval template ids to instantiate for a workflow type.
 *
 * Every workflow gets its functional templates plus one harness-aware template
 * (convention-adherence, workflow-compliance or rule-compliance).
 *
 * @param {string} workflowType - Workflow identifier, e.g. "tdd" or "backend".
 * @returns {string[]} Template ids for the workflow; unrecognized (or missing)
 *   workflow types receive the default set.
 */
function selectTemplatesForWorkflow(workflowType) {
  const mapping = {
    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
    "debugging": ["fix-bug", "test-writing", "rule-compliance"],
    "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
    "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
    "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
    "devops": ["config-change", "fix-bug", "rule-compliance"],
    "infrastructure": ["config-change", "refactor", "convention-adherence"],
    "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
    "content": ["documentation", "add-feature", "convention-adherence"],
    "research": ["documentation", "add-feature", "convention-adherence"]
  };
  const fallback = ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
  // Own-property check: a plain `mapping[workflowType] || fallback` would hand
  // back inherited Object.prototype members for keys such as "toString",
  // "constructor" or "__proto__" instead of an array of template ids.
  const isKnown = Object.prototype.hasOwnProperty.call(mapping, workflowType);
  return isKnown ? mapping[workflowType] : fallback;
}
|
|
3764
3782
|
var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
|
|
3765
3783
|
|
|
3766
3784
|
Each task must be realistic and testable against the actual project. Avoid generic placeholders.
|
|
3767
3785
|
|
|
3786
|
+
IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
|
|
3787
|
+
- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
|
|
3788
|
+
- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
|
|
3789
|
+
- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
|
|
3790
|
+
|
|
3791
|
+
These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
|
|
3792
|
+
|
|
3768
3793
|
Return a JSON object with a "tasks" array. Each task has:
|
|
3769
3794
|
- id: kebab-case identifier (e.g., "add-health-endpoint")
|
|
3770
3795
|
- template: which eval template this instantiates
|
|
@@ -4190,7 +4215,8 @@ ${msg}`);
|
|
|
4190
4215
|
details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
|
|
4191
4216
|
};
|
|
4192
4217
|
}
|
|
4193
|
-
const
|
|
4218
|
+
const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
|
|
4219
|
+
const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
|
|
4194
4220
|
const passed = !hasErrors;
|
|
4195
4221
|
return {
|
|
4196
4222
|
pass: passed,
|
|
@@ -4318,24 +4344,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
|
|
|
4318
4344
|
|
|
4319
4345
|
// src/evolve/runner.ts
|
|
4320
4346
|
var execAsync2 = promisify2(exec2);
|
|
4321
|
-
|
|
4347
|
+
var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
|
|
4348
|
+
/**
 * Create a throwaway working directory for one eval task, with the harness
 * installed as its `.claude` directory.
 *
 * Preferred strategy: a detached `git worktree` of HEAD under the OS temp dir.
 * If the project is not a git work tree, or any step of the worktree setup
 * fails, fall back to copying the project (minus COPY_SKIP_DIRS) into a fresh
 * temp directory.
 *
 * @param {string} projectRoot - Directory of the project under evaluation.
 * @param {string} harnessPath - Directory whose contents become `<workDir>/.claude`.
 * @returns {Promise<{workDir: string, isWorktree: boolean}>} The workspace path and
 *   which strategy produced it (pass both to cleanupIsolatedWorkspace).
 * @throws Propagates errors from the copy fallback, after removing the partial copy.
 */
async function createIsolatedWorkspace(projectRoot, harnessPath) {
  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
  let worktreeAttempted = false;
  try {
    await execAsync2("git rev-parse --is-inside-work-tree", {
      cwd: projectRoot,
      timeout: 5e3
    });
    worktreeAttempted = true;
    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
      cwd: projectRoot,
      timeout: 3e4
    });
    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
    return { workDir: worktreeDir, isWorktree: true };
  } catch {
    // Falling back to a plain copy is intentional, but a worktree that was
    // (partially) created must not be left registered in the user's repo.
    if (worktreeAttempted) {
      await cleanupIsolatedWorkspace(worktreeDir, true, projectRoot);
    }
  }
  const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
  try {
    await copyProjectDir(projectRoot, tmpDir);
    await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
  } catch (err) {
    // The caller never receives tmpDir on failure, so remove it here.
    await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
    });
    throw err;
  }
  return { workDir: tmpDir, isWorktree: false };
}
|
|
4371
|
+
/**
 * Copy the top-level contents of a project into `dest`, leaving out any entry
 * whose name is listed in COPY_SKIP_DIRS.
 *
 * `dest` is created first. If `src` cannot be listed the function returns
 * quietly, leaving `dest` empty. Directories are delegated to copyDir; every
 * other entry is copied with copyFile.
 *
 * @param {string} src - Project directory to copy from.
 * @param {string} dest - Directory to copy into (created if missing).
 * @returns {Promise<void>}
 */
async function copyProjectDir(src, dest) {
  await fs18.mkdir(dest, { recursive: true });
  let children;
  try {
    children = await fs18.readdir(src, { withFileTypes: true });
  } catch {
    return;
  }
  const wanted = children.filter((child) => !COPY_SKIP_DIRS.has(child.name));
  for (const child of wanted) {
    const from = path18.join(src, child.name);
    const to = path18.join(dest, child.name);
    if (!child.isDirectory()) {
      await fs18.copyFile(from, to);
      continue;
    }
    await copyDir(from, to);
  }
}
|
|
4390
|
+
/**
 * Best-effort removal of a workspace made by createIsolatedWorkspace.
 *
 * Copied workspaces are simply deleted. Worktrees are removed through
 * `git worktree remove --force`; if that command fails the directory is
 * deleted directly and `git worktree prune` is run. Failures of the direct
 * delete and of the prune are ignored.
 *
 * @param {string} workDir - Workspace directory to remove.
 * @param {boolean} isWorktree - True when workDir was created as a git worktree.
 * @param {string} projectRoot - Directory the git commands run in.
 * @returns {Promise<void>}
 */
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
  const ignore = () => {
  };
  const deleteDir = () => fs18.rm(workDir, { recursive: true, force: true }).catch(ignore);
  if (!isWorktree) {
    await deleteDir();
    return;
  }
  try {
    const removeOptions = { cwd: projectRoot, timeout: 1e4 };
    await execAsync2(`git worktree remove "${workDir}" --force`, removeOptions);
  } catch {
    await deleteDir();
    const pruneOptions = { cwd: projectRoot, timeout: 5e3 };
    await execAsync2("git worktree prune", pruneOptions).catch(ignore);
  }
}
|
|
4411
|
+
async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
|
|
4322
4412
|
await fs18.mkdir(traceDir, { recursive: true });
|
|
4323
4413
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4324
4414
|
const startMs = Date.now();
|
|
4325
|
-
const
|
|
4415
|
+
const root = projectRoot ?? process.cwd();
|
|
4416
|
+
const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
|
|
4326
4417
|
try {
|
|
4327
|
-
await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
|
|
4328
4418
|
let setupStderr = "";
|
|
4329
4419
|
if (task.setup.trim()) {
|
|
4330
4420
|
try {
|
|
4331
|
-
await execAsync2(task.setup, { cwd:
|
|
4421
|
+
await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
|
|
4332
4422
|
} catch (err) {
|
|
4333
4423
|
setupStderr = err instanceof Error ? err.message : String(err);
|
|
4334
4424
|
}
|
|
4335
4425
|
}
|
|
4336
|
-
const filesBefore = await snapshotFileList(
|
|
4337
|
-
const spawnResult = await spawnClaude(task.description,
|
|
4338
|
-
const filesAfter = await snapshotFileList(
|
|
4426
|
+
const filesBefore = await snapshotFileList(workDir);
|
|
4427
|
+
const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
|
|
4428
|
+
const filesAfter = await snapshotFileList(workDir);
|
|
4339
4429
|
const filesChanged = diffFileLists(filesBefore, filesAfter);
|
|
4340
4430
|
const toolCalls = parseToolCalls(spawnResult.stdout);
|
|
4341
4431
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -4359,8 +4449,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
|
|
|
4359
4449
|
traceDir
|
|
4360
4450
|
};
|
|
4361
4451
|
} finally {
|
|
4362
|
-
await
|
|
4363
|
-
});
|
|
4452
|
+
await cleanupIsolatedWorkspace(workDir, isWorktree, root);
|
|
4364
4453
|
}
|
|
4365
4454
|
}
|
|
4366
4455
|
async function spawnClaude(instruction, cwd, timeoutSec) {
|
|
@@ -4458,8 +4547,9 @@ function parseToolCalls(stdout) {
|
|
|
4458
4547
|
return [];
|
|
4459
4548
|
}
|
|
4460
4549
|
}
|
|
4461
|
-
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
|
|
4550
|
+
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
|
|
4462
4551
|
const results = {};
|
|
4552
|
+
const projectRoot = path18.resolve(workspacePath, "..");
|
|
4463
4553
|
for (const task of tasks) {
|
|
4464
4554
|
const traceDir = path18.join(
|
|
4465
4555
|
workspacePath,
|
|
@@ -4467,7 +4557,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4467
4557
|
iteration.toString(),
|
|
4468
4558
|
task.id
|
|
4469
4559
|
);
|
|
4470
|
-
|
|
4560
|
+
onProgress?.({ type: "task-start", iteration, taskId: task.id });
|
|
4561
|
+
const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
|
|
4471
4562
|
let score = taskResult.score;
|
|
4472
4563
|
if (config) {
|
|
4473
4564
|
const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
|
|
@@ -4476,6 +4567,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4476
4567
|
await writeScore(traceDir, score);
|
|
4477
4568
|
}
|
|
4478
4569
|
results[task.id] = score;
|
|
4570
|
+
onProgress?.({
|
|
4571
|
+
type: "task-scored",
|
|
4572
|
+
iteration,
|
|
4573
|
+
taskId: task.id,
|
|
4574
|
+
score: score.score ?? (score.pass ? 100 : 0)
|
|
4575
|
+
});
|
|
4479
4576
|
}
|
|
4480
4577
|
const scores = Object.values(results);
|
|
4481
4578
|
const total = scores.reduce(
|
|
@@ -4537,7 +4634,8 @@ Return a JSON object:
|
|
|
4537
4634
|
- Prefer ADDITIVE changes over replacements when possible.
|
|
4538
4635
|
|
|
4539
4636
|
Return ONLY valid JSON.`;
|
|
4540
|
-
var STDOUT_TRUNCATION_LIMIT =
|
|
4637
|
+
var STDOUT_TRUNCATION_LIMIT = 1e3;
|
|
4638
|
+
var MAX_CONTEXT_CHARS = 1e5;
|
|
4541
4639
|
async function readHarnessFiles(harnessPath) {
|
|
4542
4640
|
const result = {};
|
|
4543
4641
|
async function walk(dir, prefix) {
|
|
@@ -4571,26 +4669,25 @@ function truncateStdout(stdout, limit) {
|
|
|
4571
4669
|
${stdout.slice(-limit)}`;
|
|
4572
4670
|
}
|
|
4573
4671
|
function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
|
|
4574
|
-
const
|
|
4575
|
-
sections.push("## Current Harness Files\n");
|
|
4672
|
+
const harnessSection = ["## Current Harness Files\n"];
|
|
4576
4673
|
const fileEntries = Object.entries(harnessFiles);
|
|
4577
4674
|
if (fileEntries.length === 0) {
|
|
4578
|
-
|
|
4675
|
+
harnessSection.push("(No harness files found)\n");
|
|
4579
4676
|
} else {
|
|
4580
4677
|
for (const [filePath, content] of fileEntries) {
|
|
4581
|
-
|
|
4678
|
+
harnessSection.push(`### ${filePath}
|
|
4582
4679
|
\`\`\`
|
|
4583
4680
|
${content}
|
|
4584
4681
|
\`\`\`
|
|
4585
4682
|
`);
|
|
4586
4683
|
}
|
|
4587
4684
|
}
|
|
4588
|
-
|
|
4685
|
+
const taskSection = ["## Task Definitions\n"];
|
|
4589
4686
|
if (tasks.length === 0) {
|
|
4590
|
-
|
|
4687
|
+
taskSection.push("(No tasks defined)\n");
|
|
4591
4688
|
} else {
|
|
4592
4689
|
for (const task of tasks) {
|
|
4593
|
-
|
|
4690
|
+
taskSection.push(
|
|
4594
4691
|
`### Task: ${task.id}
|
|
4595
4692
|
- Template: ${task.template}
|
|
4596
4693
|
- Description: ${task.description}
|
|
@@ -4600,15 +4697,27 @@ ${content}
|
|
|
4600
4697
|
);
|
|
4601
4698
|
}
|
|
4602
4699
|
}
|
|
4603
|
-
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4700
|
+
const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
|
|
4701
|
+
const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
|
|
4702
|
+
if (remainingBudget <= 0) {
|
|
4703
|
+
return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
|
|
4704
|
+
}
|
|
4705
|
+
const traceBudget = Math.floor(remainingBudget * 0.7);
|
|
4706
|
+
const historyBudget = remainingBudget - traceBudget;
|
|
4707
|
+
const traceSection = buildTraceSection(traces, traceBudget);
|
|
4708
|
+
const historySection = buildHistorySection(history, historyBudget);
|
|
4709
|
+
return fixedContent + "\n" + traceSection + "\n" + historySection;
|
|
4710
|
+
}
|
|
4711
|
+
function buildTraceSection(traces, budget) {
|
|
4712
|
+
if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
|
|
4713
|
+
let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
|
|
4714
|
+
for (let attempt = 0; attempt < 4; attempt++) {
|
|
4715
|
+
const parts = ["## Execution Traces\n"];
|
|
4607
4716
|
for (const trace of traces) {
|
|
4608
4717
|
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4609
|
-
const truncatedStdout = truncateStdout(trace.stdout,
|
|
4718
|
+
const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
|
|
4610
4719
|
const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
|
|
4611
|
-
|
|
4720
|
+
parts.push(
|
|
4612
4721
|
`### Trace: ${trace.taskId}
|
|
4613
4722
|
- Pass: ${trace.score.pass}
|
|
4614
4723
|
- Score: ${scoreNum}
|
|
@@ -4616,36 +4725,55 @@ ${content}
|
|
|
4616
4725
|
` : "") + `- Duration: ${trace.timing.durationMs}ms
|
|
4617
4726
|
- Files changed:
|
|
4618
4727
|
${filesChangedList || " (none)"}
|
|
4619
|
-
- Stdout (last ${
|
|
4728
|
+
- Stdout (last ${stdoutLimit} chars):
|
|
4620
4729
|
\`\`\`
|
|
4621
4730
|
${truncatedStdout}
|
|
4622
4731
|
\`\`\`
|
|
4623
4732
|
`
|
|
4624
4733
|
);
|
|
4625
4734
|
}
|
|
4735
|
+
const result = parts.join("\n");
|
|
4736
|
+
if (result.length <= budget) return result;
|
|
4737
|
+
stdoutLimit = Math.floor(stdoutLimit / 2);
|
|
4626
4738
|
}
|
|
4627
|
-
|
|
4628
|
-
|
|
4629
|
-
|
|
4630
|
-
|
|
4631
|
-
|
|
4739
|
+
const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
|
|
4740
|
+
for (const trace of traces) {
|
|
4741
|
+
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4742
|
+
summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
|
|
4743
|
+
`);
|
|
4744
|
+
}
|
|
4745
|
+
return summary.join("\n");
|
|
4746
|
+
}
|
|
4747
|
+
/**
 * Render the "Iteration History" markdown section within a character budget.
 *
 * All iterations are tried first; while the rendered text is longer than
 * `budget`, the first (oldest-positioned) entry is dropped and a
 * "(Showing X/Y most recent iterations)" notice is added. If not even a single
 * iteration fits, a fixed placeholder is returned.
 *
 * @param {Array<{iteration: *, score: *, taskResults: Object, proposal?: {reasoning: *, mutations: Array}}>} history
 *   Iteration logs, in the order they should appear.
 * @param {number} budget - Maximum length, in characters, of the rendered section.
 * @returns {string} The rendered section, or a placeholder when history is
 *   empty or nothing fits.
 */
function buildHistorySection(history, budget) {
  if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
  // Each iteration's text does not depend on how many entries are shown, so
  // render it once here instead of on every shrink attempt below.
  const renderedLogs = history.map((log) => {
    const taskScores = Object.entries(log.taskResults).map(([id, s]) => ` - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
    const chunks = [
      `### Iteration ${log.iteration} \u2014 Score: ${log.score}\n- Task results:\n${taskScores}\n`
    ];
    if (log.proposal) {
      chunks.push(
        `- Proposal reasoning: ${log.proposal.reasoning}\n- Mutations: ${log.proposal.mutations.length} change(s)\n`
      );
    }
    return chunks;
  });
  for (let start = 0; start < history.length; start++) {
    const parts = ["## Iteration History\n"];
    if (start > 0) {
      parts.push(`(Showing ${history.length - start}/${history.length} most recent iterations)\n`);
    }
    for (const chunks of renderedLogs.slice(start)) {
      parts.push(...chunks);
    }
    const result = parts.join("\n");
    if (result.length <= budget) return result;
  }
  return "## Iteration History\n\n(History omitted to fit context budget)\n";
}
|
|
4650
4778
|
function parseProposerResponse(raw) {
|
|
4651
4779
|
let cleaned = raw.trim();
|
|
@@ -4857,7 +4985,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4857
4985
|
harnessPath,
|
|
4858
4986
|
workspacePath,
|
|
4859
4987
|
iter,
|
|
4860
|
-
kairnConfig
|
|
4988
|
+
kairnConfig,
|
|
4989
|
+
onProgress
|
|
4861
4990
|
);
|
|
4862
4991
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
4863
4992
|
if (iter === 0) baselineScore = aggregate;
|
|
@@ -4935,7 +5064,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4935
5064
|
kairnConfig,
|
|
4936
5065
|
evolveConfig.proposerModel
|
|
4937
5066
|
);
|
|
4938
|
-
} catch {
|
|
5067
|
+
} catch (err) {
|
|
5068
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5069
|
+
onProgress?.({
|
|
5070
|
+
type: "proposer-error",
|
|
5071
|
+
iteration: iter,
|
|
5072
|
+
message: `Proposer failed: ${errMsg}`
|
|
5073
|
+
});
|
|
4939
5074
|
const nextIterDir2 = path21.join(
|
|
4940
5075
|
workspacePath,
|
|
4941
5076
|
"iterations",
|
|
@@ -5434,6 +5569,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5434
5569
|
case "perfect-score":
|
|
5435
5570
|
console.log(chalk14.green(" Perfect score. Stopping."));
|
|
5436
5571
|
break;
|
|
5572
|
+
case "proposer-error":
|
|
5573
|
+
console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
|
|
5574
|
+
break;
|
|
5575
|
+
case "task-start":
|
|
5576
|
+
console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
|
|
5577
|
+
break;
|
|
5578
|
+
case "task-scored": {
|
|
5579
|
+
const taskScore = event.score ?? 0;
|
|
5580
|
+
const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
|
|
5581
|
+
console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
|
|
5582
|
+
break;
|
|
5583
|
+
}
|
|
5437
5584
|
case "complete":
|
|
5438
5585
|
break;
|
|
5439
5586
|
}
|