kairn-cli 2.2.0 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1237,22 +1237,29 @@ function classifyError(err, provider) {
1237
1237
  }
1238
1238
  async function callLLM(config, userMessage, options) {
1239
1239
  const maxTokens = options.maxTokens ?? 8192;
1240
- const systemPrompt = options.systemPrompt;
1240
+ const { systemPrompt } = options;
1241
+ const jsonMode = options.jsonMode ?? false;
1241
1242
  const providerName = getProviderName(config.provider);
1242
1243
  if (config.provider === "anthropic") {
1243
1244
  const client2 = new Anthropic2({ apiKey: config.api_key });
1245
+ const messages = [
1246
+ { role: "user", content: userMessage }
1247
+ ];
1248
+ if (jsonMode) {
1249
+ messages.push({ role: "assistant", content: "{" });
1250
+ }
1244
1251
  try {
1245
1252
  const response = await client2.messages.create({
1246
1253
  model: config.model,
1247
1254
  max_tokens: maxTokens,
1248
1255
  system: systemPrompt,
1249
- messages: [{ role: "user", content: userMessage }]
1256
+ messages
1250
1257
  });
1251
1258
  const textBlock = response.content.find((block) => block.type === "text");
1252
1259
  if (!textBlock || textBlock.type !== "text") {
1253
1260
  throw new Error("No text response from compiler LLM");
1254
1261
  }
1255
- return textBlock.text;
1262
+ return jsonMode ? `{${textBlock.text}` : textBlock.text;
1256
1263
  } catch (err) {
1257
1264
  throw new Error(classifyError(err, providerName));
1258
1265
  }
@@ -1268,7 +1275,8 @@ async function callLLM(config, userMessage, options) {
1268
1275
  messages: [
1269
1276
  { role: "system", content: systemPrompt },
1270
1277
  { role: "user", content: userMessage }
1271
- ]
1278
+ ],
1279
+ ...jsonMode ? { response_format: { type: "json_object" } } : {}
1272
1280
  });
1273
1281
  const text = response.choices[0]?.message?.content;
1274
1282
  if (!text) {
@@ -3741,30 +3749,55 @@ var EVAL_TEMPLATES = {
3741
3749
  name: "Documentation",
3742
3750
  description: "Can the agent write and update docs?",
3743
3751
  bestFor: ["content", "api-building", "full-stack"]
3752
+ },
3753
+ "convention-adherence": {
3754
+ id: "convention-adherence",
3755
+ name: "Convention Adherence",
3756
+ description: "Does the agent follow all project conventions defined in CLAUDE.md?",
3757
+ bestFor: ["feature-development", "full-stack", "backend", "maintenance"]
3758
+ },
3759
+ "workflow-compliance": {
3760
+ id: "workflow-compliance",
3761
+ name: "Workflow Compliance",
3762
+ description: "Does the agent use the project workflow commands and skills?",
3763
+ bestFor: ["feature-development", "full-stack", "tdd", "qa"]
3764
+ },
3765
+ "rule-compliance": {
3766
+ id: "rule-compliance",
3767
+ name: "Rule Compliance",
3768
+ description: "Does the agent follow all project rules without violations?",
3769
+ bestFor: ["feature-development", "backend", "maintenance", "architecture"]
3744
3770
  }
3745
3771
  };
3746
3772
  function selectTemplatesForWorkflow(workflowType) {
3747
3773
  const mapping = {
3748
- "feature-development": ["add-feature", "test-writing", "documentation"],
3749
- "api-building": ["add-feature", "fix-bug", "test-writing"],
3750
- "full-stack": ["add-feature", "fix-bug", "test-writing"],
3751
- "maintenance": ["fix-bug", "refactor", "test-writing"],
3752
- "debugging": ["fix-bug", "test-writing"],
3753
- "qa": ["fix-bug", "test-writing", "add-feature"],
3754
- "architecture": ["refactor", "test-writing", "config-change"],
3755
- "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
3756
- "devops": ["config-change", "fix-bug"],
3757
- "infrastructure": ["config-change", "refactor"],
3758
- "tdd": ["test-writing", "add-feature", "fix-bug"],
3759
- "content": ["documentation", "add-feature"],
3760
- "research": ["documentation", "add-feature"]
3774
+ "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance"],
3775
+ "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3776
+ "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
3777
+ "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
3778
+ "debugging": ["fix-bug", "test-writing", "rule-compliance"],
3779
+ "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
3780
+ "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
3781
+ "backend": ["fix-bug", "refactor", "config-change", "rule-compliance"],
3782
+ "devops": ["config-change", "fix-bug", "rule-compliance"],
3783
+ "infrastructure": ["config-change", "refactor", "convention-adherence"],
3784
+ "tdd": ["test-writing", "add-feature", "fix-bug", "workflow-compliance"],
3785
+ "content": ["documentation", "add-feature", "convention-adherence"],
3786
+ "research": ["documentation", "add-feature", "convention-adherence"]
3761
3787
  };
3762
- return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing"];
3788
+ return mapping[workflowType] || ["add-feature", "fix-bug", "test-writing", "convention-adherence"];
3763
3789
  }
3764
3790
  var TASK_GENERATION_PROMPT = `You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.
3765
3791
 
3766
3792
  Each task must be realistic and testable against the actual project. Avoid generic placeholders.
3767
3793
 
3794
+ IMPORTANT: For harness-aware templates (convention-adherence, workflow-compliance, rule-compliance), generate tasks where success DEPENDS on the agent reading and following the .claude/ harness content:
3795
+ - convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
3796
+ - workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
3797
+ - rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
3798
+
3799
+ These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
3800
+
3768
3801
  Return a JSON object with a "tasks" array. Each task has:
3769
3802
  - id: kebab-case identifier (e.g., "add-health-endpoint")
3770
3803
  - template: which eval template this instantiates
@@ -4190,7 +4223,8 @@ ${msg}`);
4190
4223
  details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
4191
4224
  };
4192
4225
  }
4193
- const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
4226
+ const filteredStderr = stderr.split("\n").filter((line) => !line.startsWith("[setup]")).join("\n");
4227
+ const hasErrors = filteredStderr.toLowerCase().includes("error") || filteredStderr.toLowerCase().includes("failed") || filteredStderr.toLowerCase().includes("exception");
4194
4228
  const passed = !hasErrors;
4195
4229
  return {
4196
4230
  pass: passed,
@@ -4318,24 +4352,88 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
4318
4352
 
4319
4353
  // src/evolve/runner.ts
4320
4354
  var execAsync2 = promisify2(exec2);
4321
- async function runTask(task, harnessPath, traceDir, iteration) {
4355
+ var COPY_SKIP_DIRS = /* @__PURE__ */ new Set([".git", "node_modules", ".kairn-evolve", ".claude"]);
4356
/**
 * Create a disposable copy of the project with the harness installed as `.claude/`.
 *
 * Two strategies are tried in order:
 *  1. When `projectRoot` is inside a git work tree, add a detached `git worktree`
 *     of HEAD under the OS temp directory (so it reflects the committed state of HEAD).
 *  2. Otherwise, or when any step of (1) fails, copy the project files into a
 *     fresh temp directory with `copyProjectDir`.
 *
 * In both cases any `.claude` directory already present in the workspace is
 * removed and replaced by a copy of `harnessPath`.
 *
 * @param {string} projectRoot - Directory of the project to isolate.
 * @param {string} harnessPath - Harness directory copied in as `.claude`.
 * @returns {Promise<{ workDir: string, isWorktree: boolean }>} The workspace
 *   path and whether it was created through `git worktree`.
 * @throws Rethrows any error from the copy fallback, after removing the
 *   partially populated temp directory.
 */
async function createIsolatedWorkspace(projectRoot, harnessPath) {
  const suffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const worktreeDir = path18.join(os3.tmpdir(), `kairn-evolve-wt-${suffix}`);
  let worktreeAttempted = false;
  try {
    await execAsync2("git rev-parse --is-inside-work-tree", {
      cwd: projectRoot,
      timeout: 5e3
    });
    worktreeAttempted = true;
    await execAsync2(`git worktree add --detach "${worktreeDir}" HEAD`, {
      cwd: projectRoot,
      timeout: 3e4
    });
    await fs18.rm(path18.join(worktreeDir, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(worktreeDir, ".claude"));
    return { workDir: worktreeDir, isWorktree: true };
  } catch {
    // Any git failure falls through to the plain copy below. If the worktree
    // was (partly) created before the failure, drop its directory and stale
    // registration so nothing is leaked; both steps are best-effort.
    if (worktreeAttempted) {
      await fs18.rm(worktreeDir, { recursive: true, force: true }).catch(() => {
      });
      await execAsync2("git worktree prune", {
        cwd: projectRoot,
        timeout: 5e3
      }).catch(() => {
      });
    }
  }
  const copyRoot = await fs18.mkdtemp(path18.join(os3.tmpdir(), `kairn-evolve-cp-`));
  try {
    await copyProjectDir(projectRoot, copyRoot);
    await fs18.rm(path18.join(copyRoot, ".claude"), { recursive: true, force: true });
    await copyDir(harnessPath, path18.join(copyRoot, ".claude"));
  } catch (err) {
    // The caller only cleans up a workspace it actually received, so remove
    // the half-built copy here before reporting the failure.
    await fs18.rm(copyRoot, { recursive: true, force: true }).catch(() => {
    });
    throw err;
  }
  return { workDir: copyRoot, isWorktree: false };
}
4379
/**
 * Copy the top level of a project directory into `dest`, skipping the entries
 * named in COPY_SKIP_DIRS. Sub-directories are delegated to `copyDir`.
 *
 * Entry handling:
 *  - directory: copied with `copyDir`
 *  - regular file: copied with `fs.copyFile`
 *  - symlink resolving to a regular file: contents copied
 *  - any other symlink (to a directory, or dangling): link recreated as-is,
 *    because `copyFile` would throw and abort the whole copy
 *  - sockets, FIFOs, devices: skipped
 *
 * If `src` cannot be listed, the function returns with `dest` created but empty.
 *
 * @param {string} src - Project directory to copy from.
 * @param {string} dest - Destination directory (created if missing).
 * @returns {Promise<void>}
 */
async function copyProjectDir(src, dest) {
  await fs18.mkdir(dest, { recursive: true });
  let entries;
  try {
    entries = await fs18.readdir(src, { withFileTypes: true });
  } catch {
    // Best-effort: an unreadable source yields an empty workspace.
    return;
  }
  for (const entry of entries) {
    if (COPY_SKIP_DIRS.has(entry.name)) continue;
    const srcPath = path18.join(src, entry.name);
    const destPath = path18.join(dest, entry.name);
    if (entry.isDirectory()) {
      await copyDir(srcPath, destPath);
    } else if (entry.isFile()) {
      await fs18.copyFile(srcPath, destPath);
    } else if (entry.isSymbolicLink()) {
      // stat() follows the link; null means the target is missing.
      const target = await fs18.stat(srcPath).catch(() => null);
      if (target !== null && target.isFile()) {
        await fs18.copyFile(srcPath, destPath);
      } else {
        await fs18.symlink(await fs18.readlink(srcPath), destPath);
      }
    }
    // Remaining entry kinds (sockets, FIFOs, devices) are intentionally skipped.
  }
}
4398
/**
 * Dispose of a workspace produced by createIsolatedWorkspace. Never rejects
 * because a removal step failed: every failure path is best-effort.
 *
 * @param {string} workDir - Workspace directory to remove.
 * @param {boolean} isWorktree - True when `workDir` is a git worktree.
 * @param {string} projectRoot - Repository directory in which git commands run.
 * @returns {Promise<void>}
 */
async function cleanupIsolatedWorkspace(workDir, isWorktree, projectRoot) {
  const ignore = () => {
  };
  const removeDirectory = () => fs18.rm(workDir, { recursive: true, force: true }).catch(ignore);

  // Plain copies only need their directory deleted.
  if (!isWorktree) {
    await removeDirectory();
    return;
  }

  try {
    await execAsync2(`git worktree remove "${workDir}" --force`, {
      cwd: projectRoot,
      timeout: 10000
    });
    return;
  } catch {
    // git could not remove the worktree; fall back to manual removal below.
  }

  // Delete the directory ourselves, then let git forget the stale registration.
  await removeDirectory();
  await execAsync2("git worktree prune", {
    cwd: projectRoot,
    timeout: 5000
  }).catch(ignore);
}
4419
+ async function runTask(task, harnessPath, traceDir, iteration, projectRoot) {
4322
4420
  await fs18.mkdir(traceDir, { recursive: true });
4323
4421
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4324
4422
  const startMs = Date.now();
4325
- const tmpDir = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
4423
+ const root = projectRoot ?? process.cwd();
4424
+ const { workDir, isWorktree } = await createIsolatedWorkspace(root, harnessPath);
4326
4425
  try {
4327
- await copyDir(harnessPath, path18.join(tmpDir, ".claude"));
4328
4426
  let setupStderr = "";
4329
4427
  if (task.setup.trim()) {
4330
4428
  try {
4331
- await execAsync2(task.setup, { cwd: tmpDir, timeout: 6e4 });
4429
+ await execAsync2(task.setup, { cwd: workDir, timeout: 6e4 });
4332
4430
  } catch (err) {
4333
4431
  setupStderr = err instanceof Error ? err.message : String(err);
4334
4432
  }
4335
4433
  }
4336
- const filesBefore = await snapshotFileList(tmpDir);
4337
- const spawnResult = await spawnClaude(task.description, tmpDir, task.timeout);
4338
- const filesAfter = await snapshotFileList(tmpDir);
4434
+ const filesBefore = await snapshotFileList(workDir);
4435
+ const spawnResult = await spawnClaude(task.description, workDir, task.timeout);
4436
+ const filesAfter = await snapshotFileList(workDir);
4339
4437
  const filesChanged = diffFileLists(filesBefore, filesAfter);
4340
4438
  const toolCalls = parseToolCalls(spawnResult.stdout);
4341
4439
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
@@ -4359,8 +4457,7 @@ ${spawnResult.stderr}` : spawnResult.stderr;
4359
4457
  traceDir
4360
4458
  };
4361
4459
  } finally {
4362
- await fs18.rm(tmpDir, { recursive: true, force: true }).catch(() => {
4363
- });
4460
+ await cleanupIsolatedWorkspace(workDir, isWorktree, root);
4364
4461
  }
4365
4462
  }
4366
4463
  async function spawnClaude(instruction, cwd, timeoutSec) {
@@ -4458,8 +4555,9 @@ function parseToolCalls(stdout) {
4458
4555
  return [];
4459
4556
  }
4460
4557
  }
4461
- async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config) {
4558
+ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config, onProgress) {
4462
4559
  const results = {};
4560
+ const projectRoot = path18.resolve(workspacePath, "..");
4463
4561
  for (const task of tasks) {
4464
4562
  const traceDir = path18.join(
4465
4563
  workspacePath,
@@ -4467,7 +4565,8 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4467
4565
  iteration.toString(),
4468
4566
  task.id
4469
4567
  );
4470
- const taskResult = await runTask(task, harnessPath, traceDir, iteration);
4568
+ onProgress?.({ type: "task-start", iteration, taskId: task.id });
4569
+ const taskResult = await runTask(task, harnessPath, traceDir, iteration, projectRoot);
4471
4570
  let score = taskResult.score;
4472
4571
  if (config) {
4473
4572
  const stdout = await fs18.readFile(path18.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
@@ -4476,6 +4575,12 @@ async function evaluateAll(tasks, harnessPath, workspacePath, iteration, config)
4476
4575
  await writeScore(traceDir, score);
4477
4576
  }
4478
4577
  results[task.id] = score;
4578
+ onProgress?.({
4579
+ type: "task-scored",
4580
+ iteration,
4581
+ taskId: task.id,
4582
+ score: score.score ?? (score.pass ? 100 : 0)
4583
+ });
4479
4584
  }
4480
4585
  const scores = Object.values(results);
4481
4586
  const total = scores.reduce(
@@ -4537,7 +4642,8 @@ Return a JSON object:
4537
4642
  - Prefer ADDITIVE changes over replacements when possible.
4538
4643
 
4539
4644
  Return ONLY valid JSON.`;
4540
- var STDOUT_TRUNCATION_LIMIT = 2e3;
4645
+ var STDOUT_TRUNCATION_LIMIT = 1e3;
4646
+ var MAX_CONTEXT_CHARS = 1e5;
4541
4647
  async function readHarnessFiles(harnessPath) {
4542
4648
  const result = {};
4543
4649
  async function walk(dir, prefix) {
@@ -4571,26 +4677,25 @@ function truncateStdout(stdout, limit) {
4571
4677
  ${stdout.slice(-limit)}`;
4572
4678
  }
4573
4679
  function buildProposerUserMessage(harnessFiles, traces, tasks, history) {
4574
- const sections = [];
4575
- sections.push("## Current Harness Files\n");
4680
+ const harnessSection = ["## Current Harness Files\n"];
4576
4681
  const fileEntries = Object.entries(harnessFiles);
4577
4682
  if (fileEntries.length === 0) {
4578
- sections.push("(No harness files found)\n");
4683
+ harnessSection.push("(No harness files found)\n");
4579
4684
  } else {
4580
4685
  for (const [filePath, content] of fileEntries) {
4581
- sections.push(`### ${filePath}
4686
+ harnessSection.push(`### ${filePath}
4582
4687
  \`\`\`
4583
4688
  ${content}
4584
4689
  \`\`\`
4585
4690
  `);
4586
4691
  }
4587
4692
  }
4588
- sections.push("## Task Definitions\n");
4693
+ const taskSection = ["## Task Definitions\n"];
4589
4694
  if (tasks.length === 0) {
4590
- sections.push("(No tasks defined)\n");
4695
+ taskSection.push("(No tasks defined)\n");
4591
4696
  } else {
4592
4697
  for (const task of tasks) {
4593
- sections.push(
4698
+ taskSection.push(
4594
4699
  `### Task: ${task.id}
4595
4700
  - Template: ${task.template}
4596
4701
  - Description: ${task.description}
@@ -4600,15 +4705,27 @@ ${content}
4600
4705
  );
4601
4706
  }
4602
4707
  }
4603
- sections.push("## Execution Traces\n");
4604
- if (traces.length === 0) {
4605
- sections.push("(No traces available)\n");
4606
- } else {
4708
+ const fixedContent = harnessSection.join("\n") + "\n" + taskSection.join("\n");
4709
+ const remainingBudget = MAX_CONTEXT_CHARS - fixedContent.length;
4710
+ if (remainingBudget <= 0) {
4711
+ return fixedContent + "\n\n[...traces and history omitted \u2014 harness + tasks fill context budget...]";
4712
+ }
4713
+ const traceBudget = Math.floor(remainingBudget * 0.7);
4714
+ const historyBudget = remainingBudget - traceBudget;
4715
+ const traceSection = buildTraceSection(traces, traceBudget);
4716
+ const historySection = buildHistorySection(history, historyBudget);
4717
+ return fixedContent + "\n" + traceSection + "\n" + historySection;
4718
+ }
4719
+ function buildTraceSection(traces, budget) {
4720
+ if (traces.length === 0) return "## Execution Traces\n\n(No traces available)\n";
4721
+ let stdoutLimit = STDOUT_TRUNCATION_LIMIT;
4722
+ for (let attempt = 0; attempt < 4; attempt++) {
4723
+ const parts = ["## Execution Traces\n"];
4607
4724
  for (const trace of traces) {
4608
4725
  const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4609
- const truncatedStdout = truncateStdout(trace.stdout, STDOUT_TRUNCATION_LIMIT);
4726
+ const truncatedStdout = truncateStdout(trace.stdout, stdoutLimit);
4610
4727
  const filesChangedList = Object.entries(trace.filesChanged).map(([f, action]) => ` - ${f}: ${action}`).join("\n");
4611
- sections.push(
4728
+ parts.push(
4612
4729
  `### Trace: ${trace.taskId}
4613
4730
  - Pass: ${trace.score.pass}
4614
4731
  - Score: ${scoreNum}
@@ -4616,36 +4733,55 @@ ${content}
4616
4733
  ` : "") + `- Duration: ${trace.timing.durationMs}ms
4617
4734
  - Files changed:
4618
4735
  ${filesChangedList || " (none)"}
4619
- - Stdout (last ${STDOUT_TRUNCATION_LIMIT} chars):
4736
+ - Stdout (last ${stdoutLimit} chars):
4620
4737
  \`\`\`
4621
4738
  ${truncatedStdout}
4622
4739
  \`\`\`
4623
4740
  `
4624
4741
  );
4625
4742
  }
4743
+ const result = parts.join("\n");
4744
+ if (result.length <= budget) return result;
4745
+ stdoutLimit = Math.floor(stdoutLimit / 2);
4626
4746
  }
4627
- sections.push("## Iteration History\n");
4628
- if (history.length === 0) {
4629
- sections.push("(No previous iterations)\n");
4630
- } else {
4631
- for (const log of history) {
4747
+ const summary = ["## Execution Traces (summary \u2014 stdout omitted to fit budget)\n"];
4748
+ for (const trace of traces) {
4749
+ const scoreNum = trace.score.score !== void 0 ? trace.score.score : trace.score.pass ? 100 : 0;
4750
+ summary.push(`- ${trace.taskId}: ${scoreNum} (pass=${trace.score.pass})
4751
+ `);
4752
+ }
4753
+ return summary.join("\n");
4754
+ }
4755
/**
 * Render the "Iteration History" section of the proposer message within a
 * character budget.
 *
 * The full history is tried first; while the rendered text is longer than
 * `budget`, the oldest remaining iteration is dropped and a "Showing N/M" note
 * is added. If not even a single iteration fits, a fixed placeholder is returned.
 *
 * @param {Array<object>} history - Iteration logs; each is read for `iteration`,
 *   `score`, `taskResults` and an optional `proposal` (`reasoning`, `mutations`).
 * @param {number} budget - Maximum length, in characters, of the returned text.
 * @returns {string} The rendered section.
 */
function buildHistorySection(history, budget) {
  const totalCount = history.length;
  if (totalCount === 0) {
    return "## Iteration History\n\n(No previous iterations)\n";
  }

  const allLogs = [...history];
  // A missing numeric score is derived from the pass flag.
  const numericScore = (s) => {
    if (s.score !== void 0) return s.score;
    return s.pass ? 100 : 0;
  };

  for (let dropped = 0; dropped < totalCount; dropped += 1) {
    const shown = allLogs.slice(dropped);
    const chunks = ["## Iteration History\n"];
    if (dropped > 0) {
      chunks.push(`(Showing ${shown.length}/${totalCount} most recent iterations)\n`);
    }
    shown.forEach((log) => {
      const taskLines = Object.entries(log.taskResults).map(([id, s]) => ` - ${id}: ${numericScore(s)} (pass=${s.pass})`);
      chunks.push(
        [
          `### Iteration ${log.iteration} \u2014 Score: ${log.score}`,
          "- Task results:",
          taskLines.join("\n"),
          ""
        ].join("\n")
      );
      if (log.proposal) {
        chunks.push(
          [
            `- Proposal reasoning: ${log.proposal.reasoning}`,
            `- Mutations: ${log.proposal.mutations.length} change(s)`,
            ""
          ].join("\n")
        );
      }
    });
    const rendered = chunks.join("\n");
    if (rendered.length <= budget) {
      return rendered;
    }
  }
  return "## Iteration History\n\n(History omitted to fit context budget)\n";
}
4650
4786
  function parseProposerResponse(raw) {
4651
4787
  let cleaned = raw.trim();
@@ -4657,7 +4793,18 @@ function parseProposerResponse(raw) {
4657
4793
  try {
4658
4794
  parsed = JSON.parse(cleaned);
4659
4795
  } catch {
4660
- throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
4796
+ const firstBrace = cleaned.indexOf("{");
4797
+ const lastBrace = cleaned.lastIndexOf("}");
4798
+ if (firstBrace !== -1 && lastBrace > firstBrace) {
4799
+ const extracted = cleaned.slice(firstBrace, lastBrace + 1);
4800
+ try {
4801
+ parsed = JSON.parse(extracted);
4802
+ } catch {
4803
+ throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
4804
+ }
4805
+ } else {
4806
+ throw new Error(`Proposer returned invalid JSON: ${cleaned.slice(0, 200)}`);
4807
+ }
4661
4808
  }
4662
4809
  if (typeof parsed !== "object" || parsed === null) {
4663
4810
  throw new Error("Proposer response is not a JSON object");
@@ -4720,7 +4867,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
4720
4867
  const proposerConfig = { ...config, model: proposerModel };
4721
4868
  const response = await callLLM(proposerConfig, userMessage, {
4722
4869
  systemPrompt: PROPOSER_SYSTEM_PROMPT,
4723
- maxTokens: 8192
4870
+ maxTokens: 8192,
4871
+ jsonMode: true
4724
4872
  });
4725
4873
  return parseProposerResponse(response);
4726
4874
  }
@@ -4857,7 +5005,8 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4857
5005
  harnessPath,
4858
5006
  workspacePath,
4859
5007
  iter,
4860
- kairnConfig
5008
+ kairnConfig,
5009
+ onProgress
4861
5010
  );
4862
5011
  onProgress?.({ type: "iteration-scored", iteration: iter, score: aggregate });
4863
5012
  if (iter === 0) baselineScore = aggregate;
@@ -4935,7 +5084,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
4935
5084
  kairnConfig,
4936
5085
  evolveConfig.proposerModel
4937
5086
  );
4938
- } catch {
5087
+ } catch (err) {
5088
+ const errMsg = err instanceof Error ? err.message : String(err);
5089
+ onProgress?.({
5090
+ type: "proposer-error",
5091
+ iteration: iter,
5092
+ message: `Proposer failed: ${errMsg}`
5093
+ });
4939
5094
  const nextIterDir2 = path21.join(
4940
5095
  workspacePath,
4941
5096
  "iterations",
@@ -5434,6 +5589,18 @@ evolveCommand.command("run").description("Run tasks against the current harness"
5434
5589
  case "perfect-score":
5435
5590
  console.log(chalk14.green(" Perfect score. Stopping."));
5436
5591
  break;
5592
+ case "proposer-error":
5593
+ console.log(chalk14.yellow(` Warning: ${event.message ?? "Proposer failed"}`));
5594
+ break;
5595
+ case "task-start":
5596
+ console.log(chalk14.dim(` Running: ${event.taskId ?? "unknown"}...`));
5597
+ break;
5598
+ case "task-scored": {
5599
+ const taskScore = event.score ?? 0;
5600
+ const taskStatus = taskScore >= 100 ? chalk14.green("PASS") : taskScore >= 60 ? chalk14.yellow("PARTIAL") : chalk14.red("FAIL");
5601
+ console.log(` ${taskStatus} ${event.taskId ?? "unknown"} ${chalk14.dim(`(${taskScore.toFixed(0)}%)`)}`);
5602
+ break;
5603
+ }
5437
5604
  case "complete":
5438
5605
  break;
5439
5606
  }