kairn-cli 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +224 -57
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1237,22 +1237,29 @@ function classifyError(err, provider) {
|
|
|
1237
1237
|
}
|
|
1238
1238
|
async function callLLM(config, userMessage, options) {
|
|
1239
1239
|
const maxTokens = options.maxTokens ?? 8192;
|
|
1240
|
-
const systemPrompt = options
|
|
1240
|
+
const { systemPrompt } = options;
|
|
1241
|
+
const jsonMode = options.jsonMode ?? false;
|
|
1241
1242
|
const providerName = getProviderName(config.provider);
|
|
1242
1243
|
if (config.provider === "anthropic") {
|
|
1243
1244
|
const client2 = new Anthropic2({ apiKey: config.api_key });
|
|
1245
|
+
const messages = [
|
|
1246
|
+
{ role: "user", content: userMessage }
|
|
1247
|
+
];
|
|
1248
|
+
if (jsonMode) {
|
|
1249
|
+
messages.push({ role: "assistant", content: "{" });
|
|
1250
|
+
}
|
|
1244
1251
|
try {
|
|
1245
1252
|
const response = await client2.messages.create({
|
|
1246
1253
|
model: config.model,
|
|
1247
1254
|
max_tokens: maxTokens,
|
|
1248
1255
|
system: systemPrompt,
|
|
1249
|
-
messages
|
|
1256
|
+
messages
|
|
1250
1257
|
});
|
|
1251
1258
|
const textBlock = response.content.find((block) => block.type === "text");
|
|
1252
1259
|
if (!textBlock || textBlock.type !== "text") {
|
|
1253
1260
|
throw new Error("No text response from compiler LLM");
|
|
1254
1261
|
}
|
|
1255
|
-
return textBlock.text;
|
|
1262
|
+
return jsonMode ? `{${textBlock.text}` : textBlock.text;
|
|
1256
1263
|
} catch (err) {
|
|
1257
1264
|
throw new Error(classifyError(err, providerName));
|
|
1258
1265
|
}
|
|
@@ -1268,7 +1275,8 @@ async function callLLM(config, userMessage, options) {
|
|
|
1268
1275
|
messages: [
|
|
1269
1276
|
{ role: "system", content: systemPrompt },
|
|
1270
1277
|
{ role: "user", content: userMessage }
|
|
1271
|
-
]
|
|
1278
|
+
],
|
|
1279
|
+
...jsonMode ? { response_format: { type: "json_object" } } : {}
|
|
1272
1280
|
});
|
|
1273
1281
|
const text = response.choices[0]?.message?.content;
|
|
1274
1282
|
if (!text) {
|
|
@@ -3741,30 +3749,55 @@ var EVAL_TEMPLATES = {
|
|
|
3741
3749
|
name: "Documentation",
|
|
3742
3750
|
description: "Can the agent write and update docs?",
|
|
3743
3751
|
bestFor: ["content", "api-building", "full-stack"]
|
|
3752
|
+
},
|
|
3753
|
+
"convention-adherence": {
|
|
3754
|
+
id: "convention-adherence",
|
|
3755
|
+
name: "Convention Adherence",
|
|
3756
|
+
description: "Does the agent follow all project conventions defined in CLAUDE.md?",
|
|
3757
|
+
bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
|
|
3758
|
+
},
|
|
3759
|
+
"workflow-compliance": {
|
|
3760
|
+
id: "workflow-compliance",
|
|
3761
|
+
name: "Workflow Compliance",
|
|
3762
|
+
description: "Does the agent use the project workflow commands and skills?",
|
|
3763
|
+
bestFor: ["feature-development", "full-stack", "tdd", "qa"]
|
|
3764
|
+
},
|
|
3765
|
+
"rule-compliance": {
|
|
3766
|
+
id: "rule-compliance",
|
|
3767
|
+
name: "Rule Compliance",
|
|
3768
|
+
description: "Does the agent follow all project rules without violations?",
|
|
3769
|
+
bestFor: ["feature-development", "backend", "maintenance", "architecture"]
|
|
3744
3770
|
}
|
|
3745
3771
|
};
|
|
3746
3772
|
function selectTemplatesForWorkflow(workflowType) {
|
|
3747
3773
|
const mapping = {
|
|
3748
|
-
"feature-development": ["add-feature", "test-writing", "
|
|
3749
|
-
"api-building": ["add-feature", "fix-bug", "test-writing"],
|
|
3750
|
-
"full-stack": ["add-feature", "fix-bug", "test-writing"],
|
|
3751
|
-
"maintenance": ["fix-bug", "refactor", "test-writing"],
|
|
3752
|
-
"debugging": ["fix-bug", "test-writing"],
|
|
3753
|
-
"qa": ["fix-bug", "test-writing", "add-feature"],
|
|
3754
|
-
"architecture": ["refactor", "test-writing", "config-change"],
|
|
3755
|
-
"backend": ["fix-bug", "refactor", "config-change", "
|
|
3756
|
-
"devops": ["config-change", "fix-bug"],
|
|
3757
|
-
"infrastructure": ["config-change", "refactor"],
|
|
3758
|
-
"tdd": ["test-writing", "add-feature", "fix-bug"],
|
|
3759
|
-
"content": ["documentation", "add-feature"],
|
|
3760
|
-
"research": ["documentation", "add-feature"]
|
|
3774
|
+
"feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
|
|
3775
|
+
"api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
|
|
3776
|
+
"full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
|
|
3777
|
+
"maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
|
|
3778
|
+
"debugging": ["fix-bug", "test-writing", "rule-compliance"],
|
|
3779
|
+
"qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
|
|
3780
|
+
"architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
|
|
3781
|
+
"backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
|
|
3782
|
+
"devops": ["config-change", "fix-bug", "rule-compliance"],
|
|
3783
|
+
"infrastructure": ["config-change", "refactor", "convention-adherence"],
|
|
3784
|
+
"tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
|
|
3785
|
+
"content": ["documentation", "add-feature", "convention-adherence"],
|
|
3786
|
+
"research": ["documentation", "add-feature", "convention-adherence"]
|
|
3761
3787
|
};
|
|
3762
|
-
return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
|
|
3788
|
+
return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
|
|
3763
3789
|
}
|
|
3764
3790
|
var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
|
|
3765
3791
|
|
|
3766
3792
|
Each task must be realistic and testable against the actual project. Avoid generic placeholders.
|
|
3767
3793
|
|
|
3794
|
+
IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
|
|
3795
|
+
- convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
|
|
3796
|
+
- workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
|
|
3797
|
+
- rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
|
|
3798
|
+
|
|
3799
|
+
These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
|
|
3800
|
+
|
|
3768
3801
|
Return a JSON object with a "tasks" array. Each task has:
|
|
3769
3802
|
- id: kebab-case identifier (e.g., "add-health-endpoint")
|
|
3770
3803
|
- template: which eval template this instantiates
|
|
@@ -4190,7 +4223,8 @@ ${msg}`);
|
|
|
4190
4223
|
details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
|
|
4191
4224
|
};
|
|
4192
4225
|
}
|
|
4193
|
-
const
|
|
4226
|
+
const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
|
|
4227
|
+
const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
|
|
4194
4228
|
const passed = !hasErrors;
|
|
4195
4229
|
return {
|
|
4196
4230
|
pass: passed,
|
|
@@ -4318,24 +4352,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
|
|
|
4318
4352
|
|
|
4319
4353
|
// src/evolve/runner.ts
|
|
4320
4354
|
var execAsync2 = promisify2(exec2);
|
|
4321
|
-
|
|
4355
|
+
var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
|
|
4356
|
+
async function createIsolatedWorkspace(projectRoot, harnessPath) {
|
|
4357
|
+
const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
|
|
4358
|
+
try {
|
|
4359
|
+
await execAsync2("git rev-parse --is-inside-work-tree", {
|
|
4360
|
+
cwd: projectRoot,
|
|
4361
|
+
timeout: 5e3
|
|
4362
|
+
});
|
|
4363
|
+
const tmpDir2 = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
|
|
4364
|
+
await execAsync2(`git worktree add --detach "${tmpDir2}" HEAD`, {
|
|
4365
|
+
cwd: projectRoot,
|
|
4366
|
+
timeout: 3e4
|
|
4367
|
+
});
|
|
4368
|
+
await fs18.rm(path18.join(tmpDir2, ".claude"), { recursive: true, force: true });
|
|
4369
|
+
await copyDir(harnessPath, path18.join(tmpDir2, ".claude"));
|
|
4370
|
+
return { workDir: tmpDir2, isWorktree: true };
|
|
4371
|
+
} catch {
|
|
4372
|
+
}
|
|
4373
|
+
const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
|
|
4374
|
+
await copyProjectDir(projectRoot, tmpDir);
|
|
4375
|
+
await fs18.rm(path18.join(tmpDir, ".claude"), { recursive: true, force: true });
|
|
4376
|
+
await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
|
|
4377
|
+
return { workDir: tmpDir, isWorktree: false };
|
|
4378
|
+
}
|
|
4379
|
+
async function copyProjectDir(src, dest) {
|
|
4380
|
+
await fs18.mkdir(dest, { recursive: true });
|
|
4381
|
+
let entries;
|
|
4382
|
+
try {
|
|
4383
|
+
entries = await fs18.readdir(src, { withFileTypes: true });
|
|
4384
|
+
} catch {
|
|
4385
|
+
return;
|
|
4386
|
+
}
|
|
4387
|
+
for (const entry of entries) {
|
|
4388
|
+
if (COPY_SKIP_DIRS.has(entry.name)) continue;
|
|
4389
|
+
const srcPath = path18.join(src, entry.name);
|
|
4390
|
+
const destPath = path18.join(dest, entry.name);
|
|
4391
|
+
if (entry.isDirectory()) {
|
|
4392
|
+
await copyDir(srcPath, destPath);
|
|
4393
|
+
} else {
|
|
4394
|
+
await fs18.copyFile(srcPath, destPath);
|
|
4395
|
+
}
|
|
4396
|
+
}
|
|
4397
|
+
}
|
|
4398
|
+
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
|
|
4399
|
+
if (isWorktree) {
|
|
4400
|
+
try {
|
|
4401
|
+
await execAsync2(`git worktree remove "${workDir}" --force`, {
|
|
4402
|
+
cwd: projectRoot,
|
|
4403
|
+
timeout: 1e4
|
|
4404
|
+
});
|
|
4405
|
+
} catch {
|
|
4406
|
+
await fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
|
|
4407
|
+
});
|
|
4408
|
+
await execAsync2("git worktree prune", {
|
|
4409
|
+
cwd: projectRoot,
|
|
4410
|
+
timeout: 5e3
|
|
4411
|
+
}).catch(() => {
|
|
4412
|
+
});
|
|
4413
|
+
}
|
|
4414
|
+
} else {
|
|
4415
|
+
await fs18.rm(workDir, { recursive: true, force: true }).catch(() => {
|
|
4416
|
+
});
|
|
4417
|
+
}
|
|
4418
|
+
}
|
|
4419
|
+
async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
|
|
4322
4420
|
await fs18.mkdir(traceDir, { recursive: true });
|
|
4323
4421
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4324
4422
|
const startMs = Date.now();
|
|
4325
|
-
const
|
|
4423
|
+
const root = projectRoot ?? process.cwd();
|
|
4424
|
+
const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
|
|
4326
4425
|
try {
|
|
4327
|
-
await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
|
|
4328
4426
|
let setupStderr = "";
|
|
4329
4427
|
if (task.setup.trim()) {
|
|
4330
4428
|
try {
|
|
4331
|
-
await execAsync2(task.setup, { cwd:
|
|
4429
|
+
await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
|
|
4332
4430
|
} catch (err) {
|
|
4333
4431
|
setupStderr = err instanceof Error ? err.message : String(err);
|
|
4334
4432
|
}
|
|
4335
4433
|
}
|
|
4336
|
-
const filesBefore = await snapshotFileList(
|
|
4337
|
-
const spawnResult = await spawnClaude(task.description,
|
|
4338
|
-
const filesAfter = await snapshotFileList(
|
|
4434
|
+
const filesBefore = await snapshotFileList(workDir);
|
|
4435
|
+
const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
|
|
4436
|
+
const filesAfter = await snapshotFileList(workDir);
|
|
4339
4437
|
const filesChanged = diffFileLists(filesBefore, filesAfter);
|
|
4340
4438
|
const toolCalls = parseToolCalls(spawnResult.stdout);
|
|
4341
4439
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -4359,8 +4457,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
|
|
|
4359
4457
|
traceDir
|
|
4360
4458
|
};
|
|
4361
4459
|
} finally {
|
|
4362
|
-
await
|
|
4363
|
-
});
|
|
4460
|
+
await cleanupIsolatedWorkspace(workDir, isWorktree, root);
|
|
4364
4461
|
}
|
|
4365
4462
|
}
|
|
4366
4463
|
async function spawnClaude(instruction, cwd, timeoutSec) {
|
|
@@ -4458,8 +4555,9 @@ function parseToolCalls(stdout) {
|
|
|
4458
4555
|
return [];
|
|
4459
4556
|
}
|
|
4460
4557
|
}
|
|
4461
|
-
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
|
|
4558
|
+
async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
|
|
4462
4559
|
const results = {};
|
|
4560
|
+
const projectRoot = path18.resolve(workspacePath, "..");
|
|
4463
4561
|
for (const task of tasks) {
|
|
4464
4562
|
const traceDir = path18.join(
|
|
4465
4563
|
workspacePath,
|
|
@@ -4467,7 +4565,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4467
4565
|
iteration.toString(),
|
|
4468
4566
|
task.id
|
|
4469
4567
|
);
|
|
4470
|
-
|
|
4568
|
+
onProgress?.({ type: "task-start", iteration, taskId: task.id });
|
|
4569
|
+
const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
|
|
4471
4570
|
let score = taskResult.score;
|
|
4472
4571
|
if (config) {
|
|
4473
4572
|
const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
|
|
@@ -4476,6 +4575,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
|
|
|
4476
4575
|
await writeScore(traceDir, score);
|
|
4477
4576
|
}
|
|
4478
4577
|
results[task.id] = score;
|
|
4578
|
+
onProgress?.({
|
|
4579
|
+
type: "task-scored",
|
|
4580
|
+
iteration,
|
|
4581
|
+
taskId: task.id,
|
|
4582
|
+
score: score.score ?? (score.pass ? 100 : 0)
|
|
4583
|
+
});
|
|
4479
4584
|
}
|
|
4480
4585
|
const scores = Object.values(results);
|
|
4481
4586
|
const total = scores.reduce(
|
|
@@ -4537,7 +4642,8 @@ Return a JSON object:
|
|
|
4537
4642
|
- Prefer ADDITIVE changes over replacements when possible.
|
|
4538
4643
|
|
|
4539
4644
|
Return ONLY valid JSON.`;
|
|
4540
|
-
var STDOUT_TRUNCATION_LIMIT =
|
|
4645
|
+
var STDOUT_TRUNCATION_LIMIT = 1e3;
|
|
4646
|
+
var MAX_CONTEXT_CHARS = 1e5;
|
|
4541
4647
|
async function readHarnessFiles(harnessPath) {
|
|
4542
4648
|
const result = {};
|
|
4543
4649
|
async function walk(dir, prefix) {
|
|
@@ -4571,26 +4677,25 @@ function truncateStdout(stdout, limit) {
|
|
|
4571
4677
|
${stdout.slice(-limit)}`;
|
|
4572
4678
|
}
|
|
4573
4679
|
function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
|
|
4574
|
-
const
|
|
4575
|
-
sections.push("## Current Harness Files\n");
|
|
4680
|
+
const harnessSection = ["## Current Harness Files\n"];
|
|
4576
4681
|
const fileEntries = Object.entries(harnessFiles);
|
|
4577
4682
|
if (fileEntries.length === 0) {
|
|
4578
|
-
|
|
4683
|
+
harnessSection.push("(No harness files found)\n");
|
|
4579
4684
|
} else {
|
|
4580
4685
|
for (const [filePath, content] of fileEntries) {
|
|
4581
|
-
|
|
4686
|
+
harnessSection.push(`### ${filePath}
|
|
4582
4687
|
\`\`\`
|
|
4583
4688
|
${content}
|
|
4584
4689
|
\`\`\`
|
|
4585
4690
|
`);
|
|
4586
4691
|
}
|
|
4587
4692
|
}
|
|
4588
|
-
|
|
4693
|
+
const taskSection = ["## Task Definitions\n"];
|
|
4589
4694
|
if (tasks.length === 0) {
|
|
4590
|
-
|
|
4695
|
+
taskSection.push("(No tasks defined)\n");
|
|
4591
4696
|
} else {
|
|
4592
4697
|
for (const task of tasks) {
|
|
4593
|
-
|
|
4698
|
+
taskSection.push(
|
|
4594
4699
|
`### Task: ${task.id}
|
|
4595
4700
|
- Template: ${task.template}
|
|
4596
4701
|
- Description: ${task.description}
|
|
@@ -4600,15 +4705,27 @@ ${content}
|
|
|
4600
4705
|
);
|
|
4601
4706
|
}
|
|
4602
4707
|
}
|
|
4603
|
-
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4708
|
+
const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
|
|
4709
|
+
const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
|
|
4710
|
+
if (remainingBudget <= 0) {
|
|
4711
|
+
return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
|
|
4712
|
+
}
|
|
4713
|
+
const traceBudget = Math.floor(remainingBudget * 0.7);
|
|
4714
|
+
const historyBudget = remainingBudget - traceBudget;
|
|
4715
|
+
const traceSection = buildTraceSection(traces, traceBudget);
|
|
4716
|
+
const historySection = buildHistorySection(history, historyBudget);
|
|
4717
|
+
return fixedContent + "\n" + traceSection + "\n" + historySection;
|
|
4718
|
+
}
|
|
4719
|
+
function buildTraceSection(traces, budget) {
|
|
4720
|
+
if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
|
|
4721
|
+
let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
|
|
4722
|
+
for (let attempt = 0; attempt < 4; attempt++) {
|
|
4723
|
+
const parts = ["## Execution Traces\n"];
|
|
4607
4724
|
for (const trace of traces) {
|
|
4608
4725
|
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4609
|
-
const truncatedStdout = truncateStdout(trace.stdout,
|
|
4726
|
+
const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
|
|
4610
4727
|
const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
|
|
4611
|
-
|
|
4728
|
+
parts.push(
|
|
4612
4729
|
`### Trace: ${trace.taskId}
|
|
4613
4730
|
- Pass: ${trace.score.pass}
|
|
4614
4731
|
- Score: ${scoreNum}
|
|
@@ -4616,36 +4733,55 @@ ${content}
|
|
|
4616
4733
|
` : "") + `- Duration: ${trace.timing.durationMs}ms
|
|
4617
4734
|
- Files changed:
|
|
4618
4735
|
${filesChangedList || " (none)"}
|
|
4619
|
-
- Stdout (last ${
|
|
4736
|
+
- Stdout (last ${stdoutLimit} chars):
|
|
4620
4737
|
\`\`\`
|
|
4621
4738
|
${truncatedStdout}
|
|
4622
4739
|
\`\`\`
|
|
4623
4740
|
`
|
|
4624
4741
|
);
|
|
4625
4742
|
}
|
|
4743
|
+
const result = parts.join("\n");
|
|
4744
|
+
if (result.length <= budget) return result;
|
|
4745
|
+
stdoutLimit = Math.floor(stdoutLimit / 2);
|
|
4626
4746
|
}
|
|
4627
|
-
|
|
4628
|
-
|
|
4629
|
-
|
|
4630
|
-
|
|
4631
|
-
|
|
4747
|
+
const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
|
|
4748
|
+
for (const trace of traces) {
|
|
4749
|
+
const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
|
|
4750
|
+
summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
|
|
4751
|
+
`);
|
|
4752
|
+
}
|
|
4753
|
+
return summary.join("\n");
|
|
4754
|
+
}
|
|
4755
|
+
function buildHistorySection(history, budget) {
|
|
4756
|
+
if (history.length === 0) return "## Iteration History\n\n(No previous iterations)\n";
|
|
4757
|
+
let entries = [...history];
|
|
4758
|
+
while (entries.length > 0) {
|
|
4759
|
+
const parts = ["## Iteration History\n"];
|
|
4760
|
+
if (entries.length < history.length) {
|
|
4761
|
+
parts.push(`(Showing ${entries.length}/${history.length} most recent iterations)
|
|
4762
|
+
`);
|
|
4763
|
+
}
|
|
4764
|
+
for (const log of entries) {
|
|
4632
4765
|
const taskScores = Object.entries(log.taskResults).map(([id, s]) => ` - ${id}: ${s.score !== void 0 ? s.score : s.pass ? 100 : 0} (pass=${s.pass})`).join("\n");
|
|
4633
|
-
|
|
4766
|
+
parts.push(
|
|
4634
4767
|
`### Iteration ${log.iteration} \u2014 Score: ${log.score}
|
|
4635
4768
|
- Task results:
|
|
4636
4769
|
${taskScores}
|
|
4637
4770
|
`
|
|
4638
4771
|
);
|
|
4639
4772
|
if (log.proposal) {
|
|
4640
|
-
|
|
4773
|
+
parts.push(
|
|
4641
4774
|
`- Proposal reasoning: ${log.proposal.reasoning}
|
|
4642
4775
|
- Mutations: ${log.proposal.mutations.length} change(s)
|
|
4643
4776
|
`
|
|
4644
4777
|
);
|
|
4645
4778
|
}
|
|
4646
4779
|
}
|
|
4780
|
+
const result = parts.join("\n");
|
|
4781
|
+
if (result.length <= budget) return result;
|
|
4782
|
+
entries = entries.slice(1);
|
|
4647
4783
|
}
|
|
4648
|
-
return
|
|
4784
|
+
return "## Iteration History\n\n(History omitted to fit context budget)\n";
|
|
4649
4785
|
}
|
|
4650
4786
|
function parseProposerResponse(raw) {
|
|
4651
4787
|
let cleaned = raw.trim();
|
|
@@ -4657,7 +4793,18 @@ function parseProposerResponse(raw) {
|
|
|
4657
4793
|
try {
|
|
4658
4794
|
parsed = JSON.parse(cleaned);
|
|
4659
4795
|
} catch {
|
|
4660
|
-
|
|
4796
|
+
const firstBrace = cleaned.indexOf("{");
|
|
4797
|
+
const lastBrace = cleaned.lastIndexOf("}");
|
|
4798
|
+
if (firstBrace !== -1 && lastBrace > firstBrace) {
|
|
4799
|
+
const extracted = cleaned.slice(firstBrace, lastBrace + 1);
|
|
4800
|
+
try {
|
|
4801
|
+
parsed = JSON.parse(extracted);
|
|
4802
|
+
} catch {
|
|
4803
|
+
throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
|
|
4804
|
+
}
|
|
4805
|
+
} else {
|
|
4806
|
+
throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
|
|
4807
|
+
}
|
|
4661
4808
|
}
|
|
4662
4809
|
if (typeof parsed !== "object" || parsed === null) {
|
|
4663
4810
|
throw new Error("Proposer response is not a JSON object");
|
|
@@ -4720,7 +4867,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
|
|
|
4720
4867
|
const proposerConfig = { ...config, model: proposerModel };
|
|
4721
4868
|
const response = await callLLM(proposerConfig, userMessage, {
|
|
4722
4869
|
systemPrompt: PROPOSER_SYSTEM_PROMPT,
|
|
4723
|
-
maxTokens: 8192
|
|
4870
|
+
maxTokens: 8192,
|
|
4871
|
+
jsonMode: true
|
|
4724
4872
|
});
|
|
4725
4873
|
return parseProposerResponse(response);
|
|
4726
4874
|
}
|
|
@@ -4857,7 +5005,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4857
5005
|
harnessPath,
|
|
4858
5006
|
workspacePath,
|
|
4859
5007
|
iter,
|
|
4860
|
-
kairnConfig
|
|
5008
|
+
kairnConfig,
|
|
5009
|
+
onProgress
|
|
4861
5010
|
);
|
|
4862
5011
|
onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
|
|
4863
5012
|
if (iter === 0) baselineScore = aggregate;
|
|
@@ -4935,7 +5084,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
|
|
|
4935
5084
|
kairnConfig,
|
|
4936
5085
|
evolveConfig.proposerModel
|
|
4937
5086
|
);
|
|
4938
|
-
} catch {
|
|
5087
|
+
} catch (err) {
|
|
5088
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5089
|
+
onProgress?.({
|
|
5090
|
+
type: "proposer-error",
|
|
5091
|
+
iteration: iter,
|
|
5092
|
+
message: `Proposer failed: ${errMsg}`
|
|
5093
|
+
});
|
|
4939
5094
|
const nextIterDir2 = path21.join(
|
|
4940
5095
|
workspacePath,
|
|
4941
5096
|
"iterations",
|
|
@@ -5434,6 +5589,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
|
|
|
5434
5589
|
case "perfect-score":
|
|
5435
5590
|
console.log(chalk14.green(" Perfect score. Stopping."));
|
|
5436
5591
|
break;
|
|
5592
|
+
case "proposer-error":
|
|
5593
|
+
console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
|
|
5594
|
+
break;
|
|
5595
|
+
case "task-start":
|
|
5596
|
+
console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
|
|
5597
|
+
break;
|
|
5598
|
+
case "task-scored": {
|
|
5599
|
+
const taskScore = event.score ?? 0;
|
|
5600
|
+
const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
|
|
5601
|
+
console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
|
|
5602
|
+
break;
|
|
5603
|
+
}
|
|
5437
5604
|
case "complete":
|
|
5438
5605
|
break;
|
|
5439
5606
|
}
|