kairn-cli 1.13.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,6 +1,6 @@
1
1
  // src/cli.ts
2
- import { Command as Command11 } from "commander";
3
- import chalk14 from "chalk";
2
+ import { Command as Command12 } from "commander";
3
+ import chalk15 from "chalk";
4
4
 
5
5
  // src/commands/init.ts
6
6
  import { Command } from "commander";
@@ -221,7 +221,7 @@ var ui = {
221
221
  // Key-value pairs
222
222
  kv: (key, value) => ` ${chalk.cyan(key.padEnd(14))} ${value}`,
223
223
  // File list
224
- file: (path15) => chalk.dim(` ${path15}`),
224
+ file: (path20) => chalk.dim(` ${path20}`),
225
225
  // Tool display
226
226
  tool: (name, reason) => ` ${warmStone("\u25CF")} ${chalk.bold(name)}
227
227
  ${chalk.dim(reason)}`,
@@ -562,8 +562,6 @@ import chalk5 from "chalk";
562
562
  import fs4 from "fs/promises";
563
563
  import path4 from "path";
564
564
  import crypto from "crypto";
565
- import Anthropic2 from "@anthropic-ai/sdk";
566
- import OpenAI2 from "openai";
567
565
 
568
566
  // src/compiler/prompt.ts
569
567
  var SKELETON_PROMPT = `You are the Kairn skeleton compiler. Your job is to select tools and outline the project structure from a user's natural language description.
@@ -716,6 +714,13 @@ At the start of every session, before doing ANY work:
716
714
  4. Summarize what you see in 2-3 lines, then proceed
717
715
 
718
716
  This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
717
+
718
+ ## Completion Standards
719
+
720
+ Never mark a task "done" without running the Completion Verification checklist.
721
+ Tests passing is necessary but not sufficient \u2014 also verify requirements coverage,
722
+ state cleanliness, and review changes from the perspective of a test engineer,
723
+ code reviewer, and the requesting user.
719
724
  \`\`\`
720
725
 
721
726
  Do not add generic filler. Every line must be specific to the user's workflow.
@@ -738,6 +743,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
738
743
  14. A "Git Workflow" section in CLAUDE.md (3 rules: small commits, conventional format, <200 lines PR)
739
744
  15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
740
745
  16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
746
+ 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
741
747
 
742
748
  ## Shell-Integrated Commands
743
749
 
@@ -855,7 +861,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
855
861
  - \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
856
862
  - \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
857
863
  - \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
858
- - \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases)
864
+ - \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
859
865
  - A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
860
866
  - RED: Write failing test only. Verify it FAILS.
861
867
  - GREEN: Write MINIMUM code to pass. Nothing extra.
@@ -1023,6 +1029,13 @@ At the start of every session, before doing ANY work:
1023
1029
  4. Summarize what you see in 2-3 lines, then proceed
1024
1030
 
1025
1031
  This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
1032
+
1033
+ ## Completion Standards
1034
+
1035
+ Never mark a task "done" without running the Completion Verification checklist.
1036
+ Tests passing is necessary but not sufficient \u2014 also verify requirements coverage,
1037
+ state cleanliness, and review changes from the perspective of a test engineer,
1038
+ code reviewer, and the requesting user.
1026
1039
  \`\`\`
1027
1040
 
1028
1041
  Do not add generic filler. Every line must be specific to the user's workflow.
@@ -1045,6 +1058,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
1045
1058
  14. A "Git Workflow" section in CLAUDE.md (3 rules: small commits, conventional format, <200 lines PR)
1046
1059
  15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
1047
1060
  16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
1061
+ 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
1048
1062
 
1049
1063
  ## Tool Selection Rules
1050
1064
 
@@ -1188,76 +1202,9 @@ async function loadRegistry() {
1188
1202
  return Array.from(merged.values());
1189
1203
  }
1190
1204
 
1191
- // src/compiler/compile.ts
1192
- function buildSkeletonMessage(intent, registry) {
1193
- const registrySummary = registry.map(
1194
- (t) => `- ${t.id} (${t.type}, tier ${t.tier}, auth: ${t.auth}): ${t.description} [best_for: ${t.best_for.join(", ")}]`
1195
- ).join("\n");
1196
- return `## User Intent
1197
-
1198
- ${intent}
1199
-
1200
- ## Available Tool Registry
1201
-
1202
- ${registrySummary}
1203
-
1204
- Generate the skeleton JSON now.`;
1205
- }
1206
- function buildHarnessMessage(intent, skeleton, concise) {
1207
- const skeletonJson = JSON.stringify(skeleton, null, 2);
1208
- const conciseNote = concise ? "\n\nIMPORTANT: Be concise. Maximum 80 lines for claude_md. Maximum 5 commands. Keep all content brief." : "";
1209
- return `## User Intent
1210
-
1211
- ${intent}
1212
-
1213
- ## Project Skeleton
1214
-
1215
- ${skeletonJson}
1216
-
1217
- Generate the harness content JSON now.${conciseNote}`;
1218
- }
1219
- function parseSkeletonResponse(text) {
1220
- let cleaned = text.trim();
1221
- if (cleaned.startsWith("```")) {
1222
- cleaned = cleaned.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
1223
- }
1224
- const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
1225
- if (!jsonMatch) {
1226
- throw new Error("Pass 1 (skeleton) did not return valid JSON.");
1227
- }
1228
- try {
1229
- const parsed = JSON.parse(jsonMatch[0]);
1230
- if (!parsed.name || !parsed.tools || !Array.isArray(parsed.tools)) {
1231
- throw new Error("Skeleton missing required fields: name, tools");
1232
- }
1233
- return parsed;
1234
- } catch (err) {
1235
- throw new Error(
1236
- `Failed to parse skeleton JSON: ${err instanceof Error ? err.message : String(err)}`
1237
- );
1238
- }
1239
- }
1240
- function parseHarnessResponse(text) {
1241
- let cleaned = text.trim();
1242
- if (cleaned.startsWith("```")) {
1243
- cleaned = cleaned.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
1244
- }
1245
- const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
1246
- if (!jsonMatch) {
1247
- throw new Error("Pass 2 (harness) did not return valid JSON.");
1248
- }
1249
- try {
1250
- const parsed = JSON.parse(jsonMatch[0]);
1251
- if (!parsed.claude_md || !parsed.commands) {
1252
- throw new Error("Harness missing required fields: claude_md, commands");
1253
- }
1254
- return parsed;
1255
- } catch (err) {
1256
- throw new Error(
1257
- `Failed to parse harness JSON: ${err instanceof Error ? err.message : String(err)}`
1258
- );
1259
- }
1260
- }
1205
+ // src/llm.ts
1206
+ import Anthropic2 from "@anthropic-ai/sdk";
1207
+ import OpenAI2 from "openai";
1261
1208
  function classifyError(err, provider) {
1262
1209
  const msg = err instanceof Error ? err.message : String(err);
1263
1210
  const status = err?.status;
@@ -1289,8 +1236,8 @@ function classifyError(err, provider) {
1289
1236
  return `${provider} API error: ${msg}`;
1290
1237
  }
1291
1238
  async function callLLM(config, userMessage, options) {
1292
- const maxTokens = options?.maxTokens ?? 8192;
1293
- const systemPrompt = options?.systemPrompt ?? SYSTEM_PROMPT;
1239
+ const maxTokens = options.maxTokens ?? 8192;
1240
+ const systemPrompt = options.systemPrompt;
1294
1241
  const providerName = getProviderName(config.provider);
1295
1242
  if (config.provider === "anthropic") {
1296
1243
  const client2 = new Anthropic2({ apiKey: config.api_key });
@@ -1332,6 +1279,77 @@ async function callLLM(config, userMessage, options) {
1332
1279
  throw new Error(classifyError(err, providerName));
1333
1280
  }
1334
1281
  }
1282
+
1283
+ // src/compiler/compile.ts
1284
/**
 * Builds the user message for compiler pass 1 (skeleton generation).
 *
 * @param intent   The user's natural-language description.
 * @param registry Tool registry entries; each one is rendered as a single
 *                 bullet using its id, type, tier, auth, description and
 *                 best_for list.
 * @returns A markdown message with the intent, the registry summary and the
 *          closing instruction.
 */
function buildSkeletonMessage(intent, registry) {
  const toolBullets = [];
  for (const tool of registry) {
    const bestFor = tool.best_for.join(", ");
    toolBullets.push(
      `- ${tool.id} (${tool.type}, tier ${tool.tier}, auth: ${tool.auth}): ${tool.description} [best_for: ${bestFor}]`
    );
  }
  const sections = [
    "## User Intent",
    "",
    `${intent}`,
    "",
    "## Available Tool Registry",
    "",
    toolBullets.join("\n"),
    "",
    "Generate the skeleton JSON now."
  ];
  return sections.join("\n");
}
1298
/**
 * Builds the user message for compiler pass 2 (harness content generation).
 *
 * @param intent   The user's natural-language description.
 * @param skeleton Skeleton object from pass 1; embedded as 2-space-indented JSON.
 * @param concise  When truthy, a brevity instruction is appended after the
 *                 closing line.
 * @returns The markdown message sent to the model.
 */
function buildHarnessMessage(intent, skeleton, concise) {
  let brevityNote = "";
  if (concise) {
    brevityNote = "\n\nIMPORTANT: Be concise. Maximum 80 lines for claude_md. Maximum 5 commands. Keep all content brief.";
  }
  const sections = [
    "## User Intent",
    "",
    `${intent}`,
    "",
    "## Project Skeleton",
    "",
    `${JSON.stringify(skeleton, null, 2)}`,
    "",
    "Generate the harness content JSON now."
  ];
  return sections.join("\n") + brevityNote;
}
1311
/**
 * Extracts and validates the skeleton JSON returned by compiler pass 1.
 *
 * A leading/trailing markdown code fence is stripped, then the span from the
 * first "{" to the last "}" is parsed.
 *
 * @param text Raw model output.
 * @returns The parsed skeleton (must have a truthy `name` and an array `tools`).
 * @throws Error when no brace-delimited span exists, when the span is not
 *         valid JSON, or when required fields are missing. The last two are
 *         both reported with the "Failed to parse skeleton JSON" prefix.
 */
function parseSkeletonResponse(text) {
  const trimmed = text.trim();
  const unfenced = trimmed.startsWith("```")
    ? trimmed.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "")
    : trimmed;
  const found = /\{[\s\S]*\}/.exec(unfenced);
  if (found === null) {
    throw new Error("Pass 1 (skeleton) did not return valid JSON.");
  }
  try {
    const skeleton = JSON.parse(found[0]);
    if (skeleton.name && Array.isArray(skeleton.tools)) {
      return skeleton;
    }
    // Thrown inside the try on purpose: it is re-wrapped by the catch below.
    throw new Error("Skeleton missing required fields: name, tools");
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new Error(`Failed to parse skeleton JSON: ${reason}`);
  }
}
1332
/**
 * Extracts and validates the harness JSON returned by compiler pass 2.
 *
 * A leading/trailing markdown code fence is stripped, then the span from the
 * first "{" to the last "}" is parsed.
 *
 * @param text Raw model output.
 * @returns The parsed harness (must have truthy `claude_md` and `commands`).
 * @throws Error when no brace-delimited span exists, when the span is not
 *         valid JSON, or when required fields are missing. The last two are
 *         both reported with the "Failed to parse harness JSON" prefix.
 */
function parseHarnessResponse(text) {
  const trimmed = text.trim();
  const unfenced = trimmed.startsWith("```")
    ? trimmed.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "")
    : trimmed;
  const found = /\{[\s\S]*\}/.exec(unfenced);
  if (found === null) {
    throw new Error("Pass 2 (harness) did not return valid JSON.");
  }
  try {
    const harness = JSON.parse(found[0]);
    if (harness.claude_md && harness.commands) {
      return harness;
    }
    // Thrown inside the try on purpose: it is re-wrapped by the catch below.
    throw new Error("Harness missing required fields: claude_md, commands");
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new Error(`Failed to parse harness JSON: ${reason}`);
  }
}
1335
1353
  function buildSettings(skeleton, registry) {
1336
1354
  const selectedTools = skeleton.tools.map((t) => registry.find((r) => r.id === t.tool_id)).filter(Boolean);
1337
1355
  const allow = ["Read", "Write", "Edit", "Bash(npm run *)", "Bash(npx *)"];
@@ -1501,7 +1519,9 @@ async function generateClarifications(intent, onProgress) {
1501
1519
  onProgress?.("Analyzing your request...");
1502
1520
  const clarificationConfig = { ...config };
1503
1521
  clarificationConfig.model = getCheapModel(config.provider, config.model);
1504
- const response = await callLLM(clarificationConfig, CLARIFICATION_PROMPT + "\n\nUser description: " + intent);
1522
+ const response = await callLLM(clarificationConfig, CLARIFICATION_PROMPT + "\n\nUser description: " + intent, {
1523
+ systemPrompt: SYSTEM_PROMPT
1524
+ });
1505
1525
  try {
1506
1526
  let cleaned = response.trim();
1507
1527
  if (cleaned.startsWith("```")) {
@@ -1682,7 +1702,29 @@ If confidence is LOW or MEDIUM, fix issues and re-verify.
1682
1702
  Run /project:grill for adversarial review.
1683
1703
  Fix any BLOCKERs.
1684
1704
 
1685
- ## Phase 6: SHIP
1705
+ ## Phase 6: COMPLETION GATE
1706
+
1707
+ Before shipping, run the Completion Verification checklist:
1708
+
1709
+ ### Requirements Check
1710
+ - [ ] Re-read the ORIGINAL task description (not your interpretation)
1711
+ - [ ] Each explicit requirement is met with evidence (test output, diff)
1712
+ - [ ] Each implicit requirement (error handling, types, tests) is addressed
1713
+
1714
+ ### State Check
1715
+ - [ ] Test suite passes
1716
+ - [ ] Lint/typecheck passes
1717
+ - [ ] \`git diff --stat\` \u2014 every changed file is intentional
1718
+ - [ ] No debug artifacts (console.log, TODO, commented-out code, temp files)
1719
+
1720
+ ### Perspective Check (1 sentence each)
1721
+ - **Test engineer:** Most likely production failure mode?
1722
+ - **Code reviewer:** What would I flag in review?
1723
+ - **Requesting user:** Does this solve my actual problem?
1724
+
1725
+ ALL pass \u2192 proceed to ship. ANY fail \u2192 fix first, then re-verify.
1726
+
1727
+ ## Phase 7: SHIP
1686
1728
  Run /project:commit.
1687
1729
  Report what was built and what's next from docs/SPRINT.md.
1688
1730
 
@@ -1762,11 +1804,26 @@ Run verification:
1762
1804
  - Run functional tests
1763
1805
  - If NEEDS FIXES: fix and re-verify
1764
1806
 
1765
- ## Phase 5: PR
1807
+ ## Phase 5: COMPLETION GATE
1808
+
1809
+ Before creating a PR, run the Completion Verification checklist:
1810
+ - [ ] Re-read the ORIGINAL spec from docs/SPRINT.md
1811
+ - [ ] Each requirement is met with evidence (test output, diff)
1812
+ - [ ] Test suite + lint/typecheck pass
1813
+ - [ ] \`git diff --stat\` \u2014 every changed file is intentional, no debug artifacts
1814
+ - **Test engineer:** Most likely production failure mode?
1815
+ - **Code reviewer:** What would I flag in review?
1816
+ - **Requesting user:** Does this solve my actual problem?
1817
+
1818
+ ALL pass \u2192 proceed to PR. ANY fail \u2192 fix first, then re-verify.
1819
+
1820
+ Include the checklist results in the PR description.
1821
+
1822
+ ## Phase 6: PR
1766
1823
  Create a pull request:
1767
- gh pr create --title "feat: {name}" --body "{spec + QA report}"
1824
+ gh pr create --title "feat: {name}" --body "{spec + QA report + verification checklist}"
1768
1825
 
1769
- ## Phase 6: NEXT
1826
+ ## Phase 7: NEXT
1770
1827
  Report:
1771
1828
  "PR #{N} ready for review: {link}
1772
1829
  Next priority from SPRINT.md: {next task}
@@ -1790,13 +1847,19 @@ Repeat until max features reached or stopped:
1790
1847
  2. Create worktree + branch
1791
1848
  3. Implement the feature
1792
1849
  4. Run verification (build, test, lint)
1793
- 5. Open PR via gh
1794
- 6. Report status
1795
- 7. Move to next feature
1850
+ 5. Run Completion Verification checklist:
1851
+ - Requirements met with evidence
1852
+ - Tests + lint/typecheck pass
1853
+ - No debug artifacts or unexpected file changes
1854
+ - 3-perspective check (test engineer, reviewer, user)
1855
+ 6. Open PR via gh (include verification results in PR body)
1856
+ 7. Report status
1857
+ 8. Move to next feature
1796
1858
 
1797
1859
  ## Stop Conditions
1798
1860
  - Max 5 features per autopilot session
1799
1861
  - Any BLOCKER from verification
1862
+ - Completion Verification checklist fails after 2 fix attempts
1800
1863
  - Build failure that can't be resolved in 3 attempts
1801
1864
  - User presses Escape`;
1802
1865
  var AUTOPILOT_WARNING = `
@@ -3627,8 +3690,882 @@ var keysCommand = new Command10("keys").description("Add or update API keys for
3627
3690
  console.log("");
3628
3691
  });
3629
3692
 
3693
+ // src/commands/evolve.ts
3694
+ import { Command as Command11 } from "commander";
3695
+ import chalk14 from "chalk";
3696
+ import ora2 from "ora";
3697
+ import fs19 from "fs/promises";
3698
+ import path19 from "path";
3699
+ import { parse as yamlParse } from "yaml";
3700
+ import { confirm as confirm3, select as select4 } from "@inquirer/prompts";
3701
+
3702
+ // src/evolve/init.ts
3703
+ import fs15 from "fs/promises";
3704
+ import path15 from "path";
3705
+ import { stringify as yamlStringify } from "yaml";
3706
+
3707
+ // src/evolve/templates.ts
3708
/**
 * Catalogue of eval templates, keyed by template id. Each entry repeats its
 * id and carries a display name, a one-line description and the workflow
 * tags it is best suited for.
 */
var EVAL_TEMPLATES = Object.fromEntries(
  [
    {
      id: "add-feature",
      name: "Add Feature",
      description: "Can the agent add a new capability?",
      bestFor: ["feature-development", "api-building", "full-stack"]
    },
    {
      id: "fix-bug",
      name: "Fix Bug",
      description: "Can the agent diagnose and fix a problem?",
      bestFor: ["maintenance", "debugging", "qa"]
    },
    {
      id: "refactor",
      name: "Refactor",
      description: "Can the agent restructure code?",
      bestFor: ["maintenance", "architecture", "backend"]
    },
    {
      id: "test-writing",
      name: "Test Writing",
      description: "Can the agent write tests?",
      bestFor: ["tdd", "qa", "backend"]
    },
    {
      id: "config-change",
      name: "Config Change",
      description: "Can the agent update configuration?",
      bestFor: ["devops", "infrastructure", "backend"]
    },
    {
      id: "documentation",
      name: "Documentation",
      description: "Can the agent write and update docs?",
      bestFor: ["content", "api-building", "full-stack"]
    }
  ].map((template) => [template.id, template])
);
3746
/**
 * Chooses which eval templates to instantiate for a workflow type.
 *
 * @param workflowType Workflow tag such as "backend" or "devops".
 * @returns The template ids mapped to that workflow, or the default trio
 *          ("add-feature", "fix-bug", "test-writing") for any unknown value.
 */
function selectTemplatesForWorkflow(workflowType) {
  const mapping = {
    "feature-development": ["add-feature", "test-writing", "documentation"],
    "api-building": ["add-feature", "fix-bug", "test-writing"],
    "full-stack": ["add-feature", "fix-bug", "test-writing"],
    "maintenance": ["fix-bug", "refactor", "test-writing"],
    "debugging": ["fix-bug", "test-writing"],
    "qa": ["fix-bug", "test-writing", "add-feature"],
    "architecture": ["refactor", "test-writing", "config-change"],
    "backend": ["fix-bug", "refactor", "config-change", "test-writing"],
    "devops": ["config-change", "fix-bug"],
    "infrastructure": ["config-change", "refactor"],
    "tdd": ["test-writing", "add-feature", "fix-bug"],
    "content": ["documentation", "add-feature"],
    "research": ["documentation", "add-feature"]
  };
  // Own-property check: a plain `mapping[workflowType] || default` lookup
  // resolves inherited keys ("constructor", "toString", ...) to
  // Object.prototype members, which are truthy and are not template lists.
  if (Object.prototype.hasOwnProperty.call(mapping, workflowType)) {
    return mapping[workflowType];
  }
  return ["add-feature", "fix-bug", "test-writing"];
}
3764
/**
 * System prompt for eval-task generation. It asks the model for a JSON
 * object with a "tasks" array and lists the fields each task must carry.
 */
var TASK_GENERATION_PROMPT = [
  "You are an eval task generator for Claude Code agent environments. Given a project's CLAUDE.md, project structure, and selected eval templates, generate concrete, project-specific tasks.",
  "",
  "Each task must be realistic and testable against the actual project. Avoid generic placeholders.",
  "",
  "Return a JSON object with a \"tasks\" array. Each task has:",
  "- id: kebab-case identifier (e.g., \"add-health-endpoint\")",
  "- template: which eval template this instantiates",
  "- description: concrete task description the agent will receive",
  "- setup: shell commands to prepare the workspace (e.g., \"npm install\")",
  "- expected_outcome: multi-line string describing what success looks like",
  "- scoring: \"pass-fail\", \"llm-judge\", or \"rubric\"",
  "- timeout: seconds (300 for features/bugs, 600 for refactors, 180 for config/docs/tests)",
  "",
  "Return ONLY valid JSON, no markdown fences."
].join("\n");
3778
/**
 * Extracts a JSON object or array from raw LLM output.
 *
 * Candidates are tried in this order and the first one that parses wins:
 *   1. the whole (fence-stripped) text, accepted only if it is an object or
 *      array — this is what makes a top-level array such as
 *      `[{"a":1},{"b":2}]` work; the brace-span regex alone would grab
 *      `{"a":1},{"b":2}` and fail, or return only the inner object;
 *   2. the span from the first "{" to the last "}";
 *   3. the span from the first "[" to the last "]".
 *
 * @param raw Raw model output, optionally wrapped in a markdown code fence.
 * @returns The parsed JSON value.
 * @throws Error if the text contains no brace/bracket span at all, or if no
 *         candidate span is valid JSON.
 */
function parseJsonResponse(raw) {
  let cleaned = raw.trim();
  if (cleaned.startsWith("```")) {
    cleaned = cleaned.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  }
  try {
    const whole = JSON.parse(cleaned);
    if (typeof whole === "object" && whole !== null) {
      return whole;
    }
  } catch {
    // Not pure JSON (surrounding prose, etc.) — fall through to span extraction.
  }
  const spans = [];
  const objectSpan = cleaned.match(/\{[\s\S]*\}/);
  if (objectSpan) spans.push(objectSpan[0]);
  const arraySpan = cleaned.match(/\[[\s\S]*\]/);
  if (arraySpan) spans.push(arraySpan[0]);
  if (spans.length === 0) {
    throw new Error(
      "LLM response did not contain valid JSON. Try again or use a different model."
    );
  }
  let firstFailure;
  for (const span of spans) {
    try {
      return JSON.parse(span);
    } catch (err) {
      // Report the failure of the preferred (first) span if none succeed.
      if (firstFailure === undefined) firstFailure = err;
    }
  }
  throw new Error(
    `Failed to parse LLM response as JSON: ${firstFailure instanceof Error ? firstFailure.message : String(firstFailure)}`
  );
}
3797
/** Fields every generated task must carry (non-null, non-undefined). */
var REQUIRED_TASK_FIELDS = [
  "id",
  "template",
  "description",
  "setup",
  "expected_outcome",
  "scoring",
  "timeout"
];

/**
 * Validates one task object produced by the task-generation model.
 *
 * Beyond presence of the required fields, this checks the types the task
 * runner relies on: `setup` is trimmed and executed as a command string,
 * `description` is written to the agent's stdin, and `timeout` is multiplied
 * into milliseconds. Failing here gives a clear message instead of a
 * TypeError deep inside the runner.
 *
 * @param obj   Candidate task (untrusted JSON).
 * @param index Position in the tasks array; used in error messages only.
 * @returns The same object, unchanged, when valid.
 * @throws Error describing the first problem found.
 */
function validateTask(obj, index) {
  // Arrays are typeof "object" but are never a valid task.
  if (typeof obj !== "object" || obj === null || Array.isArray(obj)) {
    throw new Error(`Task at index ${index} is not an object`);
  }
  const record = obj;
  for (const field of REQUIRED_TASK_FIELDS) {
    if (!(field in record) || record[field] === void 0 || record[field] === null) {
      throw new Error(`Task at index ${index} is missing required field: ${field}`);
    }
  }
  for (const field of ["description", "setup"]) {
    if (typeof record[field] !== "string") {
      throw new Error(`Task at index ${index} has invalid field: ${field} (expected a string)`);
    }
  }
  // Numeric strings are tolerated because the runner coerces via multiplication.
  const timeout = record.timeout;
  const timeoutSeconds =
    typeof timeout === "number" || typeof timeout === "string" ? Number(timeout) : NaN;
  if (!Number.isFinite(timeoutSeconds) || timeoutSeconds <= 0) {
    throw new Error(
      `Task at index ${index} has invalid field: timeout (expected a positive number of seconds)`
    );
  }
  return record;
}
3818
/**
 * Builds the user message for eval-task generation.
 *
 * @param claudeMd       Contents of the project's CLAUDE.md (may be empty).
 * @param projectProfile Object with `language`, `framework`, `scripts`
 *                       (name → command) and `keyFiles` (array of names).
 * @param templates      Template ids; each must be a key of EVAL_TEMPLATES,
 *                       whose description is rendered next to the id.
 * @returns Markdown with CLAUDE.md, a four-line profile, the template list
 *          and the closing instruction.
 */
function buildTaskGenerationMessage(claudeMd, projectProfile, templates) {
  const { language, framework, scripts, keyFiles } = projectProfile;

  const scriptPairs = [];
  for (const [scriptName, command] of Object.entries(scripts)) {
    scriptPairs.push(`${scriptName}=${command}`);
  }
  const scriptSummary = scriptPairs.join(", ") || "none";
  const keyFileSummary = keyFiles.join(", ") || "none";

  const templateBullets = [];
  for (const templateId of templates) {
    templateBullets.push(`- ${templateId}: ${EVAL_TEMPLATES[templateId].description}`);
  }

  return [
    "## CLAUDE.md",
    "",
    claudeMd,
    "",
    "## Project Profile",
    "",
    `Language: ${language ?? "unknown"}`,
    `Framework: ${framework ?? "none"}`,
    `Scripts: ${scriptSummary}`,
    `Key files: ${keyFileSummary}`,
    "",
    "## Selected Eval Templates",
    "",
    templateBullets.join("\n"),
    "",
    "Generate concrete, project-specific tasks for each template above."
  ].join("\n");
}
3845
/**
 * Asks the LLM for project-specific eval tasks and validates the reply.
 *
 * @param claudeMd       Contents of the project's CLAUDE.md.
 * @param projectProfile Project profile passed through to the message builder.
 * @param templates      Eval template ids to instantiate.
 * @param config         LLM configuration forwarded to callLLM.
 * @returns The validated task objects, in the order the model returned them.
 * @throws Error when the reply is not a JSON object, lacks a `tasks` array,
 *         or any task fails validateTask.
 */
async function generateTasksFromTemplates(claudeMd, projectProfile, templates, config) {
  const rawResponse = await callLLM(
    config,
    buildTaskGenerationMessage(claudeMd, projectProfile, templates),
    { systemPrompt: TASK_GENERATION_PROMPT, maxTokens: 4096 }
  );
  const payload = parseJsonResponse(rawResponse);
  if (payload === null || typeof payload !== "object") {
    throw new Error("LLM response is not a JSON object");
  }
  const candidates = payload.tasks;
  if (!Array.isArray(candidates)) {
    throw new Error("LLM response does not contain a 'tasks' array");
  }
  return candidates.map((candidate, position) => validateTask(candidate, position));
}
3865
+
3866
+ // src/evolve/init.ts
3867
/**
 * Creates the `.kairn-evolve` workspace under a project and writes its
 * config.yaml.
 *
 * @param projectRoot Project directory that will contain the workspace.
 * @param config      Object with `model`, `proposerModel`, `scorer`,
 *                    `maxIterations` and `parallelTasks`; written to YAML
 *                    under snake_case keys.
 * @returns The workspace path.
 */
async function createEvolveWorkspace(projectRoot, config) {
  const workspace = path15.join(projectRoot, ".kairn-evolve");
  for (const subdir of ["baseline", "traces", "iterations"]) {
    await fs15.mkdir(path15.join(workspace, subdir), { recursive: true });
  }
  const { model, proposerModel, scorer, maxIterations, parallelTasks } = config;
  const configPath = path15.join(workspace, "config.yaml");
  const serialized = yamlStringify({
    model,
    proposer_model: proposerModel,
    scorer,
    max_iterations: maxIterations,
    parallel_tasks: parallelTasks
  });
  await fs15.writeFile(configPath, serialized, "utf-8");
  return workspace;
}
3886
/**
 * Writes the generated tasks to `<workspacePath>/tasks.yaml`, preceded by a
 * two-line comment header.
 *
 * @param workspacePath The `.kairn-evolve` workspace directory.
 * @param tasks         Task objects; only the known task fields are written,
 *                      and `rubric` is included only when truthy.
 */
async function writeTasksFile(workspacePath, tasks) {
  const serializableTasks = tasks.map((task) => {
    const { id, template, description, setup, expected_outcome, scoring, rubric, timeout } = task;
    const leading = { id, template, description, setup, expected_outcome, scoring };
    // Key order matters for the emitted YAML: rubric sits just before timeout.
    return rubric ? { ...leading, rubric, timeout } : { ...leading, timeout };
  });
  const header = "# .kairn-evolve/tasks.yaml\n# Auto-generated by kairn evolve init \u2014 edit freely\n";
  const target = path15.join(workspacePath, "tasks.yaml");
  await fs15.writeFile(target, header + yamlStringify({ tasks: serializableTasks }), "utf-8");
}
3906
+ async function buildProjectProfile(projectRoot) {
3907
+ const profile = {
3908
+ language: null,
3909
+ framework: null,
3910
+ scripts: {},
3911
+ keyFiles: []
3912
+ };
3913
+ try {
3914
+ const pkgStr = await fs15.readFile(
3915
+ path15.join(projectRoot, "package.json"),
3916
+ "utf-8"
3917
+ );
3918
+ const pkg = JSON.parse(pkgStr);
3919
+ profile.language = "typescript";
3920
+ if (pkg.scripts && typeof pkg.scripts === "object") {
3921
+ profile.scripts = pkg.scripts;
3922
+ }
3923
+ const deps = {
3924
+ ...pkg.dependencies ?? {},
3925
+ ...pkg.devDependencies ?? {}
3926
+ };
3927
+ if (deps.next) {
3928
+ profile.framework = "Next.js";
3929
+ } else if (deps.express) {
3930
+ profile.framework = "Express";
3931
+ } else if (deps.react) {
3932
+ profile.framework = "React";
3933
+ } else if (deps.vue) {
3934
+ profile.framework = "Vue";
3935
+ } else if (deps.commander) {
3936
+ profile.framework = "CLI (Commander.js)";
3937
+ }
3938
+ } catch {
3939
+ }
3940
+ if (!profile.language) {
3941
+ try {
3942
+ await fs15.access(path15.join(projectRoot, "pyproject.toml"));
3943
+ profile.language = "python";
3944
+ } catch {
3945
+ try {
3946
+ await fs15.access(path15.join(projectRoot, "requirements.txt"));
3947
+ profile.language = "python";
3948
+ } catch {
3949
+ }
3950
+ }
3951
+ }
3952
+ try {
3953
+ const entries = await fs15.readdir(projectRoot);
3954
+ const keyPatterns = [
3955
+ "README.md",
3956
+ "package.json",
3957
+ "tsconfig.json",
3958
+ "pyproject.toml",
3959
+ "Cargo.toml",
3960
+ "go.mod",
3961
+ "Makefile",
3962
+ "Dockerfile"
3963
+ ];
3964
+ profile.keyFiles = entries.filter((e) => keyPatterns.includes(e));
3965
+ } catch {
3966
+ }
3967
+ return profile;
3968
+ }
3969
/**
 * Generates eval tasks for a project from its CLAUDE.md, its detected
 * profile and the templates matching the given workflow type.
 *
 * @param projectRoot  Project directory; CLAUDE.md is read from
 *                     `<projectRoot>/.claude/CLAUDE.md`.
 * @param workflowType Workflow tag used to pick eval templates.
 * @returns The validated tasks from generateTasksFromTemplates.
 * @throws Error when loadConfig returns nothing.
 */
async function autoGenerateTasks(projectRoot, workflowType) {
  const config = await loadConfig();
  if (!config) {
    throw new Error("No config found. Run `kairn init` first.");
  }
  const readClaudeMd = async () => {
    try {
      return await fs15.readFile(path15.join(projectRoot, ".claude", "CLAUDE.md"), "utf-8");
    } catch {
      // A missing or unreadable CLAUDE.md is not fatal; generate from the profile alone.
      return "";
    }
  };
  const claudeMd = await readClaudeMd();
  const profile = await buildProjectProfile(projectRoot);
  const templates = selectTemplatesForWorkflow(workflowType);
  return generateTasksFromTemplates(claudeMd, profile, templates, config);
}
3986
+
3987
+ // src/evolve/baseline.ts
3988
+ import fs16 from "fs/promises";
3989
+ import path16 from "path";
3990
/**
 * Copies the project's `.claude` directory into the evolve workspace twice:
 * once as `baseline` and once as the harness of iteration 0.
 *
 * @param projectRoot   Project directory containing `.claude`.
 * @param workspacePath The `.kairn-evolve` workspace directory.
 * @throws Error when `<projectRoot>/.claude` is not accessible.
 */
async function snapshotBaseline(projectRoot, workspacePath) {
  const claudeDir = path16.join(projectRoot, ".claude");
  const destinations = [
    path16.join(workspacePath, "baseline"),
    path16.join(workspacePath, "iterations", "0", "harness")
  ];
  const sourceAccessible = await fs16.access(claudeDir).then(
    () => true,
    () => false
  );
  if (!sourceAccessible) {
    throw new Error(`.claude/ directory not found in ${projectRoot}`);
  }
  for (const destination of destinations) {
    await copyDir(claudeDir, destination);
  }
}
4002
/**
 * Recursively copies a directory tree, one entry at a time.
 *
 * @param src  Existing source directory.
 * @param dest Destination directory; created (with parents) if missing.
 */
async function copyDir(src, dest) {
  await fs16.mkdir(dest, { recursive: true });
  for (const child of await fs16.readdir(src, { withFileTypes: true })) {
    const from = path16.join(src, child.name);
    const to = path16.join(dest, child.name);
    // Directories recurse; everything else is copied as a file.
    await (child.isDirectory() ? copyDir(from, to) : fs16.copyFile(from, to));
  }
}
4015
+
4016
+ // src/evolve/runner.ts
4017
+ import { exec, spawn } from "child_process";
4018
+ import { promisify } from "util";
4019
+ import fs18 from "fs/promises";
4020
+ import os3 from "os";
4021
+ import path18 from "path";
4022
+
4023
+ // src/evolve/trace.ts
4024
+ import fs17 from "fs/promises";
4025
+ import path17 from "path";
4026
+ async function writeTrace(traceDir, trace) {
4027
+ await fs17.mkdir(traceDir, { recursive: true });
4028
+ await fs17.writeFile(path17.join(traceDir, "stdout.log"), trace.stdout, "utf-8");
4029
+ await fs17.writeFile(path17.join(traceDir, "stderr.log"), trace.stderr, "utf-8");
4030
+ const toolCallsLines = trace.toolCalls.map((tc) => JSON.stringify(tc)).join("\n");
4031
+ await fs17.writeFile(path17.join(traceDir, "tool_calls.jsonl"), toolCallsLines, "utf-8");
4032
+ await fs17.writeFile(
4033
+ path17.join(traceDir, "files_changed.json"),
4034
+ JSON.stringify(trace.filesChanged, null, 2),
4035
+ "utf-8"
4036
+ );
4037
+ await fs17.writeFile(
4038
+ path17.join(traceDir, "timing.json"),
4039
+ JSON.stringify(trace.timing, null, 2),
4040
+ "utf-8"
4041
+ );
4042
+ await fs17.writeFile(
4043
+ path17.join(traceDir, "score.json"),
4044
+ JSON.stringify(trace.score, null, 2),
4045
+ "utf-8"
4046
+ );
4047
+ }
4048
/**
 * Writes (or overwrites) `score.json` in an existing trace directory.
 *
 * @param traceDir Trace directory; it is not created here.
 * @param score    Score value, serialised as 2-space-indented JSON.
 */
async function writeScore(traceDir, score) {
  const target = path17.join(traceDir, "score.json");
  const serialized = JSON.stringify(score, null, 2);
  await fs17.writeFile(target, serialized, "utf-8");
}
4055
+
4056
+ // src/evolve/runner.ts
4057
/** Promise-returning form of child_process.exec, used for task setup commands. */
var execAsync = promisify(exec);

/**
 * Runs one eval task in a throwaway temp directory and records its trace.
 *
 * Steps: copy the harness into `<tmp>/.claude`, run the task's setup command
 * (via exec, 60 s limit; a failure is recorded in stderr, not fatal), run the
 * agent on the task description, diff the file listing taken before and
 * after, then write the trace. The temp directory is always removed.
 *
 * @param task        Task with `id`, `setup`, `description` and `timeout` (seconds).
 * @param harnessPath Directory copied to `.claude` inside the sandbox.
 * @param traceDir    Where the trace files are written; created if missing.
 * @param iteration   Iteration number stored in the trace.
 * @returns `{ taskId, score, traceDir }`; the score is a placeholder
 *          (`pass: false`, "Pending scoring").
 */
async function runTask(task, harnessPath, traceDir, iteration) {
  await fs18.mkdir(traceDir, { recursive: true });
  const startedAt = new Date().toISOString();
  const startMs = Date.now();
  const sandbox = await fs18.mkdtemp(path18.join(os3.tmpdir(), "kairn-evolve-"));
  try {
    await copyDir(harnessPath, path18.join(sandbox, ".claude"));

    let setupFailure = "";
    const hasSetup = Boolean(task.setup.trim());
    if (hasSetup) {
      try {
        await execAsync(task.setup, { cwd: sandbox, timeout: 6e4 });
      } catch (err) {
        // Keep going: the failure is surfaced through the trace's stderr.
        setupFailure = err instanceof Error ? err.message : String(err);
      }
    }

    const filesBefore = await snapshotFileList(sandbox);
    const agentRun = await spawnClaude(task.description, sandbox, task.timeout);
    const filesAfter = await snapshotFileList(sandbox);

    const filesChanged = diffFileLists(filesBefore, filesAfter);
    const toolCalls = parseToolCalls(agentRun.stdout);
    const completedAt = new Date().toISOString();
    const durationMs = Date.now() - startMs;

    let stderr = agentRun.stderr;
    if (setupFailure) {
      stderr = `[setup] ${setupFailure}\n${agentRun.stderr}`;
    }

    const score = { pass: false, details: "Pending scoring" };
    await writeTrace(traceDir, {
      taskId: task.id,
      iteration,
      stdout: agentRun.stdout,
      stderr,
      toolCalls,
      filesChanged,
      score,
      timing: { startedAt, completedAt, durationMs }
    });
    return { taskId: task.id, score, traceDir };
  } finally {
    // Best-effort cleanup; a failed removal must not mask the task result.
    await fs18.rm(sandbox, { recursive: true, force: true }).catch(() => {
    });
  }
}
4103
+ async function spawnClaude(instruction, cwd, timeoutSec) {
4104
+ return new Promise((resolve) => {
4105
+ const args = ["--print", "--output-format", "text", "--max-turns", "50"];
4106
+ const child = spawn("claude", args, {
4107
+ cwd,
4108
+ stdio: ["pipe", "pipe", "pipe"],
4109
+ timeout: timeoutSec * 1e3,
4110
+ env: { ...process.env }
4111
+ });
4112
+ let stdout = "";
4113
+ let stderr = "";
4114
+ child.stdout.on("data", (data) => {
4115
+ stdout += data.toString();
4116
+ });
4117
+ child.stderr.on("data", (data) => {
4118
+ stderr += data.toString();
4119
+ });
4120
+ child.stdin.write(instruction);
4121
+ child.stdin.end();
4122
+ child.on("close", (code) => {
4123
+ resolve({ stdout, stderr, exitCode: code ?? 1 });
4124
+ });
4125
+ child.on("error", (err) => {
4126
+ resolve({
4127
+ stdout,
4128
+ stderr: stderr + `
4129
+ Spawn error: ${err.message}`,
4130
+ exitCode: 1
4131
+ });
4132
+ });
4133
+ });
4134
+ }
4135
+ async function snapshotFileList(dir) {
4136
+ const result = {};
4137
+ async function walk(current) {
4138
+ let entries;
4139
+ try {
4140
+ entries = await fs18.readdir(current, { withFileTypes: true });
4141
+ } catch {
4142
+ return;
4143
+ }
4144
+ for (const entry of entries) {
4145
+ const fullPath = path18.join(current, entry.name);
4146
+ const relativePath = path18.relative(dir, fullPath);
4147
+ if (relativePath.startsWith(".claude")) continue;
4148
+ if (relativePath.startsWith("node_modules")) continue;
4149
+ if (relativePath.startsWith(".git")) continue;
4150
+ if (entry.isDirectory()) {
4151
+ await walk(fullPath);
4152
+ } else {
4153
+ try {
4154
+ const stat = await fs18.stat(fullPath);
4155
+ result[relativePath] = stat.mtimeMs;
4156
+ } catch {
4157
+ }
4158
+ }
4159
+ }
4160
+ }
4161
+ await walk(dir);
4162
+ return result;
4163
+ }
4164
+ function diffFileLists(before, after) {
4165
+ const changes = {};
4166
+ for (const [file, mtime] of Object.entries(after)) {
4167
+ if (!(file in before)) {
4168
+ changes[file] = "created";
4169
+ } else if (before[file] !== mtime) {
4170
+ changes[file] = "modified";
4171
+ }
4172
+ }
4173
+ for (const file of Object.keys(before)) {
4174
+ if (!(file in after)) {
4175
+ changes[file] = "deleted";
4176
+ }
4177
+ }
4178
+ return changes;
4179
+ }
4180
+ function parseToolCalls(stdout) {
4181
+ try {
4182
+ const lines = stdout.split("\n").filter((l) => l.trim());
4183
+ const toolCalls = [];
4184
+ for (const line of lines) {
4185
+ try {
4186
+ const obj = JSON.parse(line);
4187
+ if (obj.type === "tool_use" || obj.tool_name) {
4188
+ toolCalls.push(obj);
4189
+ }
4190
+ } catch {
4191
+ }
4192
+ }
4193
+ return toolCalls;
4194
+ } catch {
4195
+ return [];
4196
+ }
4197
+ }
4198
+
4199
+ // src/evolve/exec.ts
4200
+ import { exec as exec2 } from "child_process";
4201
+ import { promisify as promisify2 } from "util";
4202
+ var execAsync2 = promisify2(exec2);
4203
+ async function execCommand(cmd, cwd, timeoutMs = 3e4) {
4204
+ return execAsync2(cmd, { cwd, timeout: timeoutMs });
4205
+ }
4206
+
4207
+ // src/evolve/scorers.ts
4208
+ var COMMAND_PATTERN = /^(npm |npx |node |python |make |cargo |go |git |test |ls |cat |grep |curl )/;
4209
+ var SHELL_METACHAR_PATTERN = /[;|&`$()<>]/;
4210
+ var JUDGE_SYSTEM_PROMPT = `You are an eval judge for Claude Code agent tasks. Given a task description, expected outcome, and actual execution results, determine if the task was completed successfully.
4211
+
4212
+ Return ONLY valid JSON with this structure:
4213
+ {
4214
+ "pass": true/false,
4215
+ "score": 0-100,
4216
+ "reasoning": "Brief explanation of your judgment"
4217
+ }`;
4218
+ var RUBRIC_SYSTEM_PROMPT = `You are an eval judge scoring a specific criterion. Given the task, the criterion to evaluate, and the execution results, score the criterion.
4219
+
4220
+ Return ONLY valid JSON:
4221
+ {
4222
+ "score": 0.0-1.0,
4223
+ "reasoning": "Brief explanation"
4224
+ }`;
4225
+ async function passFailScorer(task, workspacePath, stdout, stderr) {
4226
+ const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
4227
+ const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
4228
+ if (commands.length > 0) {
4229
+ const failures = [];
4230
+ for (const cmd of commands) {
4231
+ if (SHELL_METACHAR_PATTERN.test(cmd)) {
4232
+ failures.push(`Rejected unsafe command (shell metacharacters): ${cmd}`);
4233
+ continue;
4234
+ }
4235
+ try {
4236
+ await execCommand(cmd, workspacePath);
4237
+ } catch (err) {
4238
+ const msg = err instanceof Error ? err.message : String(err);
4239
+ failures.push(`Command failed: ${cmd}
4240
+ ${msg}`);
4241
+ }
4242
+ }
4243
+ const passed2 = failures.length === 0;
4244
+ return {
4245
+ pass: passed2,
4246
+ score: passed2 ? 100 : 0,
4247
+ details: passed2 ? `All ${commands.length} verification commands passed` : failures.join("\n")
4248
+ };
4249
+ }
4250
+ const hasErrors = stderr.toLowerCase().includes("error") || stderr.toLowerCase().includes("failed") || stderr.toLowerCase().includes("exception");
4251
+ const passed = !hasErrors;
4252
+ return {
4253
+ pass: passed,
4254
+ score: passed ? 100 : 0,
4255
+ details: passed ? "No errors detected in output" : "Errors found in stderr"
4256
+ };
4257
+ }
4258
+ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
4259
+ const expectedOutcome = Array.isArray(task.expected_outcome) ? task.expected_outcome.join("\n") : task.expected_outcome;
4260
+ const userMessage = [
4261
+ "## Task",
4262
+ task.description,
4263
+ "",
4264
+ "## Expected Outcome",
4265
+ expectedOutcome,
4266
+ "",
4267
+ "## Actual stdout (last 2000 chars)",
4268
+ stdout.slice(-2e3),
4269
+ "",
4270
+ "## Actual stderr (last 1000 chars)",
4271
+ stderr.slice(-1e3)
4272
+ ].join("\n");
4273
+ try {
4274
+ const response = await callLLM(config, userMessage, {
4275
+ systemPrompt: JUDGE_SYSTEM_PROMPT,
4276
+ maxTokens: 1024
4277
+ });
4278
+ let cleaned = response.trim();
4279
+ if (cleaned.startsWith("```")) {
4280
+ cleaned = cleaned.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
4281
+ }
4282
+ const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
4283
+ if (!jsonMatch) {
4284
+ return { pass: false, score: 0, reasoning: "Judge returned invalid JSON" };
4285
+ }
4286
+ const result = JSON.parse(jsonMatch[0]);
4287
+ return {
4288
+ pass: result.pass,
4289
+ score: result.score,
4290
+ reasoning: result.reasoning
4291
+ };
4292
+ } catch (err) {
4293
+ return {
4294
+ pass: false,
4295
+ score: 0,
4296
+ reasoning: `LLM judge error: ${err instanceof Error ? err.message : String(err)}`
4297
+ };
4298
+ }
4299
+ }
4300
+ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
4301
+ if (!task.rubric || task.rubric.length === 0) {
4302
+ return passFailScorer(task, workspacePath, stdout, stderr);
4303
+ }
4304
+ const breakdown = [];
4305
+ let weightedSum = 0;
4306
+ for (const criterion of task.rubric) {
4307
+ const userMessage = [
4308
+ "## Task",
4309
+ task.description,
4310
+ "",
4311
+ "## Criterion to Evaluate",
4312
+ `"${criterion.criterion}" (weight: ${criterion.weight})`,
4313
+ "",
4314
+ "## Actual stdout (last 2000 chars)",
4315
+ stdout.slice(-2e3),
4316
+ "",
4317
+ "## Actual stderr (last 500 chars)",
4318
+ stderr.slice(-500)
4319
+ ].join("\n");
4320
+ try {
4321
+ const response = await callLLM(config, userMessage, {
4322
+ systemPrompt: RUBRIC_SYSTEM_PROMPT,
4323
+ maxTokens: 512
4324
+ });
4325
+ let cleaned = response.trim();
4326
+ if (cleaned.startsWith("```")) {
4327
+ cleaned = cleaned.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
4328
+ }
4329
+ const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
4330
+ if (jsonMatch) {
4331
+ const result = JSON.parse(jsonMatch[0]);
4332
+ const clampedScore = Math.max(0, Math.min(1, result.score));
4333
+ breakdown.push({
4334
+ criterion: criterion.criterion,
4335
+ score: clampedScore,
4336
+ weight: criterion.weight
4337
+ });
4338
+ weightedSum += clampedScore * criterion.weight;
4339
+ } else {
4340
+ breakdown.push({
4341
+ criterion: criterion.criterion,
4342
+ score: 0,
4343
+ weight: criterion.weight
4344
+ });
4345
+ }
4346
+ } catch {
4347
+ breakdown.push({
4348
+ criterion: criterion.criterion,
4349
+ score: 0,
4350
+ weight: criterion.weight
4351
+ });
4352
+ }
4353
+ }
4354
+ const totalWeight = task.rubric.reduce((sum, c) => sum + c.weight, 0);
4355
+ const totalScore = totalWeight > 0 ? Math.round(weightedSum / totalWeight * 100) : 0;
4356
+ return {
4357
+ pass: totalScore >= 60,
4358
+ score: totalScore,
4359
+ reasoning: `Rubric score: ${totalScore}%`,
4360
+ breakdown
4361
+ };
4362
+ }
4363
+ async function scoreTask(task, workspacePath, stdout, stderr, config) {
4364
+ if (task.scoring === "pass-fail") {
4365
+ return passFailScorer(task, workspacePath, stdout, stderr);
4366
+ }
4367
+ if (task.scoring === "llm-judge" && config) {
4368
+ return llmJudgeScorer(task, workspacePath, stdout, stderr, config);
4369
+ }
4370
+ if (task.scoring === "rubric" && config) {
4371
+ return rubricScorer(task, workspacePath, stdout, stderr, config);
4372
+ }
4373
+ return passFailScorer(task, workspacePath, stdout, stderr);
4374
+ }
4375
+
4376
+ // src/commands/evolve.ts
4377
+ var DEFAULT_CONFIG = {
4378
+ model: "claude-sonnet-4-6",
4379
+ proposerModel: "claude-opus-4-6",
4380
+ scorer: "pass-fail",
4381
+ maxIterations: 5,
4382
+ parallelTasks: 1
4383
+ };
4384
+ var evolveCommand = new Command11("evolve").description("Evolve your agent environment through automated optimization");
4385
+ evolveCommand.command("init").description("Initialize an evolution workspace with auto-generated tasks").option("--workflow <type>", "Workflow type for template selection", "feature-development").action(async (options) => {
4386
+ try {
4387
+ const projectRoot = process.cwd();
4388
+ console.log(ui.section("Evolve Init"));
4389
+ const claudeDir = path19.join(projectRoot, ".claude");
4390
+ try {
4391
+ await fs19.access(claudeDir);
4392
+ } catch {
4393
+ console.log(ui.error("No .claude/ directory found. Run kairn describe first."));
4394
+ process.exit(1);
4395
+ }
4396
+ const workspace = await createEvolveWorkspace(projectRoot, DEFAULT_CONFIG);
4397
+ console.log(ui.success("Created .kairn-evolve/ workspace"));
4398
+ const spinner = ora2("Generating project-specific eval tasks...").start();
4399
+ let tasks;
4400
+ try {
4401
+ tasks = await autoGenerateTasks(projectRoot, options.workflow);
4402
+ spinner.succeed(`Generated ${tasks.length} eval tasks`);
4403
+ } catch {
4404
+ spinner.fail("LLM task generation failed");
4405
+ const templateIds = selectTemplatesForWorkflow(options.workflow);
4406
+ tasks = templateIds.map((templateId, index) => ({
4407
+ id: `${templateId}-${index + 1}`,
4408
+ template: templateId,
4409
+ description: `${EVAL_TEMPLATES[templateId].description} (project-specific task \u2014 edit in tasks.yaml)`,
4410
+ setup: "npm install",
4411
+ expected_outcome: "Task completed successfully",
4412
+ scoring: "pass-fail",
4413
+ timeout: 300
4414
+ }));
4415
+ console.log(ui.info(`Fell back to ${tasks.length} template placeholders`));
4416
+ }
4417
+ for (const task of tasks) {
4418
+ console.log(chalk14.cyan(` ${task.id}`) + chalk14.dim(` (${task.template}) \u2014 ${task.description.slice(0, 80)}`));
4419
+ }
4420
+ let addMore = true;
4421
+ while (addMore) {
4422
+ try {
4423
+ addMore = await confirm3({ message: "Add another eval task?", default: false });
4424
+ } catch {
4425
+ addMore = false;
4426
+ }
4427
+ if (addMore) {
4428
+ const templateId = await select4({
4429
+ message: "Select eval template:",
4430
+ choices: Object.values(EVAL_TEMPLATES).map((t) => ({
4431
+ name: `${t.name} \u2014 ${t.description}`,
4432
+ value: t.id
4433
+ }))
4434
+ });
4435
+ const addSpinner = ora2("Generating task...").start();
4436
+ try {
4437
+ const config = await loadConfig();
4438
+ if (config) {
4439
+ let claudeMd = "";
4440
+ try {
4441
+ claudeMd = await fs19.readFile(path19.join(claudeDir, "CLAUDE.md"), "utf-8");
4442
+ } catch {
4443
+ }
4444
+ const profile = await buildProjectProfile(projectRoot);
4445
+ const newTasks = await generateTasksFromTemplates(claudeMd, profile, [templateId], config);
4446
+ tasks.push(...newTasks);
4447
+ addSpinner.succeed(`Added ${newTasks.length} task(s)`);
4448
+ } else {
4449
+ addSpinner.fail("No config found");
4450
+ }
4451
+ } catch {
4452
+ addSpinner.fail("Failed to generate task");
4453
+ }
4454
+ }
4455
+ }
4456
+ await writeTasksFile(workspace, tasks);
4457
+ console.log(ui.success(`Wrote ${tasks.length} tasks to tasks.yaml`));
4458
+ console.log("");
4459
+ console.log(chalk14.dim(" Next steps:"));
4460
+ console.log(chalk14.dim(" 1. Review .kairn-evolve/tasks.yaml"));
4461
+ console.log(chalk14.dim(" 2. Run: kairn evolve baseline"));
4462
+ console.log(chalk14.dim(" 3. Run: kairn evolve run"));
4463
+ } catch (err) {
4464
+ const msg = err instanceof Error ? err.message : String(err);
4465
+ console.log(ui.error(msg));
4466
+ process.exit(1);
4467
+ }
4468
+ });
4469
+ evolveCommand.command("baseline").description("Snapshot current .claude/ directory as baseline").action(async () => {
4470
+ try {
4471
+ const projectRoot = process.cwd();
4472
+ const workspace = path19.join(projectRoot, ".kairn-evolve");
4473
+ console.log(ui.section("Evolve Baseline"));
4474
+ try {
4475
+ await fs19.access(workspace);
4476
+ } catch {
4477
+ console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
4478
+ process.exit(1);
4479
+ }
4480
+ await snapshotBaseline(projectRoot, workspace);
4481
+ const baselineDir = path19.join(workspace, "baseline");
4482
+ const fileCount = await countFiles(baselineDir);
4483
+ console.log(ui.success(`Baseline snapshot created (${fileCount} files)`));
4484
+ } catch (err) {
4485
+ const msg = err instanceof Error ? err.message : String(err);
4486
+ console.log(ui.error(msg));
4487
+ process.exit(1);
4488
+ }
4489
+ });
4490
+ evolveCommand.command("run").description("Run tasks against the current harness").option("--task <id>", "Run a specific task by ID").action(async (options) => {
4491
+ try {
4492
+ const projectRoot = process.cwd();
4493
+ const workspace = path19.join(projectRoot, ".kairn-evolve");
4494
+ console.log(ui.section("Evolve Run"));
4495
+ try {
4496
+ await fs19.access(workspace);
4497
+ } catch {
4498
+ console.log(ui.error("No .kairn-evolve/ directory found. Run kairn evolve init first."));
4499
+ process.exit(1);
4500
+ }
4501
+ const tasksPath = path19.join(workspace, "tasks.yaml");
4502
+ let tasksContent;
4503
+ try {
4504
+ tasksContent = await fs19.readFile(tasksPath, "utf-8");
4505
+ } catch {
4506
+ console.log(ui.error("No tasks.yaml found. Run kairn evolve init first."));
4507
+ process.exit(1);
4508
+ }
4509
+ const parsed = yamlParse(tasksContent);
4510
+ if (!parsed?.tasks || parsed.tasks.length === 0) {
4511
+ console.log(ui.error("No tasks found in tasks.yaml"));
4512
+ process.exit(1);
4513
+ }
4514
+ const tasksToRun = options.task ? parsed.tasks.filter((t) => t.id === options.task) : parsed.tasks;
4515
+ if (tasksToRun.length === 0) {
4516
+ console.log(ui.error(`Task "${options.task}" not found in tasks.yaml`));
4517
+ process.exit(1);
4518
+ }
4519
+ console.log(ui.info(`Running ${tasksToRun.length} task(s)...`));
4520
+ console.log("");
4521
+ const config = await loadConfig();
4522
+ const harnessPath = path19.join(projectRoot, ".claude");
4523
+ const results = [];
4524
+ for (const task of tasksToRun) {
4525
+ const traceDir = path19.join(workspace, "traces", "0", task.id);
4526
+ const spinner = ora2(`Running: ${task.id}`).start();
4527
+ const result = await runTask(task, harnessPath, traceDir, 0);
4528
+ if (config) {
4529
+ const stdout = await fs19.readFile(path19.join(traceDir, "stdout.log"), "utf-8").catch(() => "");
4530
+ const stderr = await fs19.readFile(path19.join(traceDir, "stderr.log"), "utf-8").catch(() => "");
4531
+ const score = await scoreTask(task, traceDir, stdout, stderr, config);
4532
+ result.score = score;
4533
+ await writeScore(traceDir, score);
4534
+ }
4535
+ results.push(result);
4536
+ const status = result.score.pass ? chalk14.green("PASS") : chalk14.red("FAIL");
4537
+ const scoreStr = result.score.score !== void 0 ? chalk14.dim(` (${result.score.score}%)`) : "";
4538
+ spinner.stop();
4539
+ console.log(` ${status} ${task.id}${scoreStr}${result.score.details ? chalk14.dim(` \u2014 ${result.score.details}`) : ""}`);
4540
+ }
4541
+ const passed = results.filter((r) => r.score.pass).length;
4542
+ console.log("");
4543
+ console.log(ui.info(`Results: ${passed}/${results.length} passed`));
4544
+ console.log(ui.info("Traces written to .kairn-evolve/traces/0/"));
4545
+ } catch (err) {
4546
+ const msg = err instanceof Error ? err.message : String(err);
4547
+ console.log(ui.error(msg));
4548
+ process.exit(1);
4549
+ }
4550
+ });
4551
+ async function countFiles(dir) {
4552
+ let count = 0;
4553
+ try {
4554
+ const entries = await fs19.readdir(dir, { withFileTypes: true });
4555
+ for (const entry of entries) {
4556
+ if (entry.isDirectory()) {
4557
+ count += await countFiles(path19.join(dir, entry.name));
4558
+ } else {
4559
+ count++;
4560
+ }
4561
+ }
4562
+ } catch {
4563
+ }
4564
+ return count;
4565
+ }
4566
+
3630
4567
  // src/cli.ts
3631
- var program = new Command11();
4568
+ var program = new Command12();
3632
4569
  program.name("kairn").description(
3633
4570
  "Compile natural language intent into optimized Claude Code environments"
3634
4571
  ).version("1.9.0").option("--no-color", "Disable colored output");
@@ -3642,8 +4579,9 @@ program.addCommand(doctorCommand);
3642
4579
  program.addCommand(registryCommand);
3643
4580
  program.addCommand(templatesCommand);
3644
4581
  program.addCommand(keysCommand);
4582
+ program.addCommand(evolveCommand);
3645
4583
  if (process.argv.includes("--no-color") || process.env.NO_COLOR) {
3646
- chalk14.level = 0;
4584
+ chalk15.level = 0;
3647
4585
  }
3648
4586
  program.parse();
3649
4587
  //# sourceMappingURL=cli.js.map