kairn-cli 2.7.2 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -218,6 +218,7 @@ async function callLLM(config, userMessage, options) {
218
218
  const maxTokens = options.maxTokens ?? 8192;
219
219
  const { systemPrompt } = options;
220
220
  const jsonMode = options.jsonMode ?? false;
221
+ const cacheControl = options.cacheControl ?? false;
221
222
  const providerName = getProviderName(config.provider);
222
223
  let apiKey = config.api_key;
223
224
  if (config.auth_type === "claude-code-oauth") {
@@ -238,7 +239,7 @@ async function callLLM(config, userMessage, options) {
238
239
  const response = await client2.messages.create({
239
240
  model: config.model,
240
241
  max_tokens: maxTokens,
241
- system: systemPrompt,
242
+ system: cacheControl ? [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }] : systemPrompt,
242
243
  messages
243
244
  });
244
245
  const textBlock = response.content.find((block) => block.type === "text");
@@ -461,6 +462,96 @@ var init_exec = __esm({
461
462
  });
462
463
 
463
464
  // src/evolve/scorers.ts
465
/**
 * Scores a rubric criterion from captured process output alone.
 *
 * Only criteria whose text starts with "Ran", "Zero"/"No", "Use"/"Uses" or
 * "Call"/"Calls" are handled. Each prefix is checked against its own lookup
 * table (RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS,
 * CALL_PATTERNS), and the first table entry mentioned by the criterion decides
 * the score. Every comparison is a case-insensitive substring check.
 *
 * NOTE(review): because matching is by substring, a short pattern also matches
 * inside longer words (the "sync" absence pattern matches "async") — confirm
 * that this is intended before relying on those scores.
 *
 * @param {string} criterionText - Rubric criterion text.
 * @param {string} [stdout] - Captured stdout; null/undefined is treated as empty.
 * @param {string} [stderr] - Captured stderr; null/undefined is treated as empty.
 * @returns {{score: number, reasoning: string} | null} A 0/1 score with its
 *   reasoning, or null when the criterion cannot be decided here (the caller
 *   then falls back to another scorer).
 */
function scoreCriterionDeterministic(criterionText, stdout, stderr) {
  // A malformed rubric entry must not crash scoring; let the caller fall back.
  if (typeof criterionText !== "string") {
    return null;
  }
  const criterion = criterionText.trim();
  const criterionLower = criterion.toLowerCase();
  // Coerce missing streams to "" so the literal words "undefined"/"null"
  // never become part of the searched text.
  const combined = `${stdout ?? ""}\n${stderr ?? ""}`.toLowerCase();

  const criterionMentions = (keyword) => criterionLower.includes(keyword.toLowerCase());
  // Returns the first needle present in the output, or undefined if none is.
  const firstHit = (needles) => needles.find((needle) => combined.includes(needle.toLowerCase()));

  if (/^ran\b/i.test(criterion)) {
    const entry = RAN_COMMAND_EVIDENCE.find((e) => e.keywords.some((kw) => criterionMentions(kw)));
    if (entry === undefined) {
      return null;
    }
    const evidence = firstHit(entry.evidence);
    if (evidence !== undefined) {
      return {
        score: 1,
        reasoning: `Deterministic: found evidence of '${evidence}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: no evidence of '${entry.keywords[0]}' found`
    };
  }

  if (/^(zero|no)\b/i.test(criterion)) {
    const entry = ABSENCE_PATTERNS.find((e) => e.keywords.some((kw) => criterionMentions(kw)));
    if (entry === undefined) {
      return null;
    }
    const prohibited = firstHit(entry.search);
    if (prohibited !== undefined) {
      return {
        score: 0,
        reasoning: `Deterministic: found '${prohibited}' which should be absent`
      };
    }
    return {
      score: 1,
      reasoning: `Deterministic: no prohibited pattern found in output`
    };
  }

  if (/^uses?\b/i.test(criterion)) {
    const entry = PRESENCE_PATTERNS.find((e) => criterionMentions(e.keyword));
    if (entry === undefined) {
      return null;
    }
    if (firstHit(entry.search) !== undefined) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${entry.keyword}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${entry.keyword}' not found in output`
    };
  }

  if (/^calls?\b/i.test(criterion)) {
    // CALL_PATTERNS is ordered; the first pattern named by the criterion wins.
    const pattern = CALL_PATTERNS.find((p) => criterionMentions(p));
    if (pattern === undefined) {
      return null;
    }
    if (combined.includes(pattern.toLowerCase())) {
      return {
        score: 1,
        reasoning: `Deterministic: found '${pattern}' in output`
      };
    }
    return {
      score: 0,
      reasoning: `Deterministic: '${pattern}' not found in output`
    };
  }

  return null;
}
464
555
  async function passFailScorer(task, workspacePath, stdout, stderr) {
465
556
  const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
466
557
  const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
@@ -513,7 +604,8 @@ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
513
604
  try {
514
605
  const response = await callLLM(config, userMessage, {
515
606
  systemPrompt: JUDGE_SYSTEM_PROMPT,
516
- maxTokens: 1024
607
+ maxTokens: 1024,
608
+ cacheControl: true
517
609
  });
518
610
  let cleaned = response.trim();
519
611
  if (cleaned.startsWith("```")) {
@@ -544,6 +636,20 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
544
636
  const breakdown = [];
545
637
  let weightedSum = 0;
546
638
  for (const criterion of task.rubric) {
639
+ const deterministicResult = scoreCriterionDeterministic(
640
+ criterion.criterion,
641
+ stdout,
642
+ stderr
643
+ );
644
+ if (deterministicResult !== null) {
645
+ breakdown.push({
646
+ criterion: criterion.criterion,
647
+ score: deterministicResult.score,
648
+ weight: criterion.weight
649
+ });
650
+ weightedSum += deterministicResult.score * criterion.weight;
651
+ continue;
652
+ }
547
653
  const userMessage = [
548
654
  "## Task",
549
655
  task.description,
@@ -560,7 +666,8 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
560
666
  try {
561
667
  const response = await callLLM(config, userMessage, {
562
668
  systemPrompt: RUBRIC_SYSTEM_PROMPT,
563
- maxTokens: 512
669
+ maxTokens: 512,
670
+ cacheControl: true
564
671
  });
565
672
  let cleaned = response.trim();
566
673
  if (cleaned.startsWith("```")) {
@@ -638,7 +745,7 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
638
745
  }
639
746
  return score;
640
747
  }
641
- var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT;
748
+ var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT, RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS, CALL_PATTERNS;
642
749
  var init_scorers = __esm({
643
750
  "src/evolve/scorers.ts"() {
644
751
  "use strict";
@@ -661,6 +768,31 @@ Return ONLY valid JSON:
661
768
  "score": 0.0-1.0,
662
769
  "reasoning": "Brief explanation"
663
770
  }`;
771
+ RAN_COMMAND_EVIDENCE = [
772
+ { keywords: ["npm run build", "build", "tsup"], evidence: ["build success", "tsup", "built in", "build completed"] },
773
+ { keywords: ["tsc", "typecheck"], evidence: ["tsc", "typecheck"] },
774
+ { keywords: ["npm run lint", "eslint", "lint"], evidence: ["lint", "eslint"] },
775
+ { keywords: ["npm test", "vitest", "test"], evidence: ["vitest", "test files", "tests passed", "passed (", "tests "] }
776
+ ];
777
+ ABSENCE_PATTERNS = [
778
+ { keywords: [".then()", ".catch()"], search: [".then(", ".catch("] },
779
+ { keywords: ["readfilesync", "writefilesync"], search: ["readfilesync", "writefilesync"] },
780
+ { keywords: ["sync"], search: ["sync"] }
781
+ ];
782
+ PRESENCE_PATTERNS = [
783
+ { keyword: "chalk.green", search: ["chalk.green"] },
784
+ { keyword: "chalk.yellow", search: ["chalk.yellow"] },
785
+ { keyword: "chalk.red", search: ["chalk.red"] },
786
+ { keyword: "chalk.cyan", search: ["chalk.cyan"] },
787
+ { keyword: "fs.promises", search: ["fs.promises", "fs/promises"] },
788
+ { keyword: "fs/promises", search: ["fs.promises", "fs/promises"] },
789
+ { keyword: "async/await", search: ["async ", "await "] },
790
+ { keyword: "@inquirer/prompts", search: ["@inquirer/prompts"] }
791
+ ];
792
+ CALL_PATTERNS = [
793
+ "process.exit(1)",
794
+ "process.exit"
795
+ ];
664
796
  }
665
797
  });
666
798
 
@@ -1329,7 +1461,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
1329
1461
  const response = await callLLM(proposerConfig, userMessage, {
1330
1462
  systemPrompt: PROPOSER_SYSTEM_PROMPT,
1331
1463
  maxTokens: 8192,
1332
- jsonMode: true
1464
+ jsonMode: true,
1465
+ cacheControl: true
1333
1466
  });
1334
1467
  return parseProposerResponse(response);
1335
1468
  }
@@ -1831,7 +1964,22 @@ async function parseAgents(harnessPath) {
1831
1964
  if (Array.isArray(disallowedTools)) {
1832
1965
  node.disallowedTools = disallowedTools;
1833
1966
  }
1834
- const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools"]);
1967
+ const modelRouting = frontmatter["modelRouting"];
1968
+ if (typeof modelRouting === "object" && modelRouting !== null) {
1969
+ const mr = modelRouting;
1970
+ if (typeof mr["default"] === "string") {
1971
+ node.modelRouting = {
1972
+ default: mr["default"]
1973
+ };
1974
+ if (typeof mr["escalateTo"] === "string") {
1975
+ node.modelRouting.escalateTo = mr["escalateTo"];
1976
+ }
1977
+ if (typeof mr["escalateWhen"] === "string") {
1978
+ node.modelRouting.escalateWhen = mr["escalateWhen"];
1979
+ }
1980
+ }
1981
+ }
1982
+ const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools", "modelRouting"]);
1835
1983
  const extra = {};
1836
1984
  for (const [key, value] of Object.entries(frontmatter)) {
1837
1985
  if (!knownKeys.has(key)) {
@@ -2457,8 +2605,9 @@ function renderRuleWithFrontmatter(rule) {
2457
2605
  function renderAgentWithFrontmatter(agent) {
2458
2606
  const hasModel = agent.model !== void 0;
2459
2607
  const hasDisallowed = agent.disallowedTools !== void 0 && agent.disallowedTools.length > 0;
2608
+ const hasRouting = agent.modelRouting !== void 0;
2460
2609
  const hasExtra = agent.extraFrontmatter !== void 0 && Object.keys(agent.extraFrontmatter).length > 0;
2461
- if (!hasModel && !hasDisallowed && !hasExtra) {
2610
+ if (!hasModel && !hasDisallowed && !hasRouting && !hasExtra) {
2462
2611
  return agent.content;
2463
2612
  }
2464
2613
  const yamlLines = ["---"];
@@ -2471,6 +2620,16 @@ function renderAgentWithFrontmatter(agent) {
2471
2620
  yamlLines.push(` - ${tool}`);
2472
2621
  }
2473
2622
  }
2623
+ if (hasRouting) {
2624
+ yamlLines.push("modelRouting:");
2625
+ yamlLines.push(` default: ${agent.modelRouting.default}`);
2626
+ if (agent.modelRouting.escalateTo) {
2627
+ yamlLines.push(` escalateTo: ${agent.modelRouting.escalateTo}`);
2628
+ }
2629
+ if (agent.modelRouting.escalateWhen) {
2630
+ yamlLines.push(` escalateWhen: ${agent.modelRouting.escalateWhen}`);
2631
+ }
2632
+ }
2474
2633
  if (hasExtra) {
2475
2634
  for (const [key, value] of Object.entries(agent.extraFrontmatter)) {
2476
2635
  if (Array.isArray(value)) {
@@ -3350,6 +3509,92 @@ var init_regularization = __esm({
3350
3509
  }
3351
3510
  });
3352
3511
 
3512
+ // src/evolve/targeting.ts
3513
/**
 * Maps a CLAUDE.md section id to the aspect it belongs to.
 * Ids without a dedicated aspect (including undefined) map to "general".
 */
function sectionIdToAspect(id) {
  switch (id) {
    case "conventions":
    case "gotchas":
    case "debugging":
    case "git":
      return "conventions";
    case "commands":
    case "custom-key-commands":
      return "commands";
    case "verification":
      return "verification";
    case "architecture":
      return "architecture";
    default:
      return "general";
  }
}

/**
 * Classifies a mutation by the aspect it touches.
 *
 * @param {{type: string, sectionId?: string, section?: {id?: string}}} mutation
 * @returns {string} One of "conventions", "commands", "verification",
 *   "architecture", "rules", "agents", "mcp", "settings" or "general".
 */
function mutationToAspect(mutation) {
  switch (mutation?.type) {
    case "update_section":
      return sectionIdToAspect(mutation.sectionId);
    case "add_section":
      // A malformed add_section without a section object falls back to "general".
      return sectionIdToAspect(mutation.section?.id);
    case "add_command":
    case "update_command":
    case "remove_command":
      return "commands";
    case "add_rule":
    case "update_rule":
    case "remove_rule":
      return "rules";
    case "add_agent":
    case "update_agent":
    case "remove_agent":
      return "agents";
    case "add_mcp_server":
    case "remove_mcp_server":
      return "mcp";
    case "update_settings":
      return "settings";
    case "remove_section":
    case "reorder_section":
    case "raw_text":
    default:
      // Unknown or unclassifiable mutations are "general" rather than
      // undefined, so shouldReEvaluate re-runs every task instead of
      // skipping some.
      return "general";
  }
}
3555
/**
 * Collects the distinct aspects touched by a list of mutations.
 *
 * @param {Iterable<object>} mutations - Mutations to classify.
 * @returns {Set<string>} The aspect of every mutation, de-duplicated.
 */
function mutationsToAspects(mutations) {
  const classified = [...mutations].map((mutation) => mutationToAspect(mutation));
  return /* @__PURE__ */ new Set(classified);
}
3562
/**
 * Returns the aspects a task's result depends on, looked up by its template.
 *
 * Only own keys of TEMPLATE_ASPECTS count: a template named like an inherited
 * Object.prototype member ("constructor", "toString", ...) would otherwise
 * resolve to a function and make `new Set(...)` throw.
 *
 * @param {{template?: string}} task - Task whose template selects the aspects.
 * @returns {Set<string>} The mapped aspects, or a set containing only
 *   "general" when the template is missing or unmapped.
 */
function taskDependsOnAspects(task) {
  const template = task?.template;
  const isMapped = Object.prototype.hasOwnProperty.call(TEMPLATE_ASPECTS, template);
  const aspects = isMapped ? TEMPLATE_ASPECTS[template] : void 0;
  return new Set(aspects ?? ["general"]);
}
3566
/**
 * Decides whether a task has to be run again after a set of aspects changed.
 *
 * @param {object} task - Task passed on to taskDependsOnAspects.
 * @param {Set<string>} changedAspects - Aspects touched by the last mutations.
 * @returns {boolean} true when a "general" change occurred, when the task
 *   depends on "general", or when any of its aspects changed; false when
 *   nothing changed or none of its aspects are affected.
 */
function shouldReEvaluate(task, changedAspects) {
  if (changedAspects.has("general")) {
    return true;
  }
  if (changedAspects.size === 0) {
    return false;
  }
  const needed = taskDependsOnAspects(task);
  return needed.has("general") || [...needed].some((aspect) => changedAspects.has(aspect));
}
3576
/**
 * Keeps only the tasks that shouldReEvaluate selects for the changed aspects.
 *
 * @param {object[]} tasks - Candidate tasks.
 * @param {Set<string>} changedAspects - Aspects touched by the last mutations.
 * @returns {object[]} A new array with the selected tasks, in original order.
 */
function filterTasksByAspects(tasks, changedAspects) {
  return tasks.reduce((selected, task) => {
    if (shouldReEvaluate(task, changedAspects)) {
      selected.push(task);
    }
    return selected;
  }, []);
}
3579
// Lookup from a task template name to the aspects that task depends on.
// Assigned inside the __esm initializer callback below.
var TEMPLATE_ASPECTS;
var init_targeting = __esm({
  "src/evolve/targeting.ts"() {
    "use strict";
    TEMPLATE_ASPECTS = Object.fromEntries([
      ["convention-adherence", ["conventions", "rules"]],
      ["workflow-compliance", ["commands", "verification"]],
      ["rule-compliance", ["rules"]],
      ["intent-routing", ["settings"]],
      ["add-feature", ["general"]],
      ["fix-bug", ["general"]],
      ["refactor", ["architecture", "conventions"]],
      ["test-writing", ["verification", "commands"]],
      ["config-change", ["settings", "mcp"]],
      ["documentation", ["general"]]
    ]);
  }
});
3597
+
3353
3598
  // src/evolve/loop.ts
3354
3599
  import fs25 from "fs/promises";
3355
3600
  import path25 from "path";
@@ -3382,6 +3627,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
3382
3627
  }
3383
3628
  }
3384
3629
  }
3630
+ let lastChangedAspects = null;
3385
3631
  let rngState = evolveConfig.rngSeed ?? 42;
3386
3632
  const rng = () => {
3387
3633
  rngState = rngState * 1664525 + 1013904223 & 4294967295;
@@ -3428,6 +3674,22 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
3428
3674
  tasksToRun.push(task);
3429
3675
  }
3430
3676
  }
3677
+ if (lastChangedAspects !== null) {
3678
+ const targetedTasks = filterTasksByAspects(tasksToRun, lastChangedAspects);
3679
+ const skippedByTargeting = tasksToRun.filter((t) => !targetedTasks.includes(t));
3680
+ for (const task of skippedByTargeting) {
3681
+ const prev = prevLog.taskResults[task.id];
3682
+ const prevVal = prev ? prev.score ?? (prev.pass ? 100 : 0) : 0;
3683
+ carriedScores[task.id] = { pass: prevVal >= 50, score: prevVal };
3684
+ onProgress?.({
3685
+ type: "task-skipped",
3686
+ iteration: iter,
3687
+ taskId: task.id,
3688
+ message: `Skipped ${task.id} (unaffected by mutations)`
3689
+ });
3690
+ }
3691
+ tasksToRun = targetedTasks;
3692
+ }
3431
3693
  const sampleSize = evolveConfig.evalSampleSize;
3432
3694
  if (sampleSize > 0 && sampleSize < tasksToRun.length) {
3433
3695
  let sampled;
@@ -3583,6 +3845,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
3583
3845
  }
3584
3846
  const nextIterDir2 = path25.join(workspacePath, "iterations", (iter + 1).toString());
3585
3847
  await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
3848
+ try {
3849
+ const rollbackIR = await parseHarness(bestHarnessPath);
3850
+ const irMuts = translateMutations(rollbackProposal.mutations, rollbackIR);
3851
+ lastChangedAspects = mutationsToAspects(irMuts);
3852
+ } catch {
3853
+ lastChangedAspects = null;
3854
+ }
3586
3855
  onProgress?.({
3587
3856
  type: "mutations-applied",
3588
3857
  iteration: iter,
@@ -3687,8 +3956,16 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
3687
3956
  proposal.mutations
3688
3957
  );
3689
3958
  diffPatch = mutationResult.diffPatch;
3959
+ try {
3960
+ const currentIR = await parseHarness(harnessPath);
3961
+ const irMuts = translateMutations(proposal.mutations, currentIR);
3962
+ lastChangedAspects = mutationsToAspects(irMuts);
3963
+ } catch {
3964
+ lastChangedAspects = null;
3965
+ }
3690
3966
  } catch {
3691
3967
  await copyDir(harnessPath, path25.join(nextIterDir, "harness"));
3968
+ lastChangedAspects = null;
3692
3969
  }
3693
3970
  onProgress?.({
3694
3971
  type: "mutations-applied",
@@ -3787,6 +4064,8 @@ var init_loop = __esm({
3787
4064
  init_sampling();
3788
4065
  init_regularization();
3789
4066
  init_parser();
4067
+ init_translate();
4068
+ init_targeting();
3790
4069
  }
3791
4070
  });
3792
4071
 
@@ -3917,7 +4196,8 @@ ${userMessage}`;
3917
4196
  const response = await callLLM(proposerConfig, fullMessage, {
3918
4197
  systemPrompt,
3919
4198
  maxTokens: 8192,
3920
- jsonMode: true
4199
+ jsonMode: true,
4200
+ cacheControl: true
3921
4201
  });
3922
4202
  const proposal = parseProposerResponse(response);
3923
4203
  return {
@@ -4752,6 +5032,12 @@ At the start of every session, before doing ANY work:
4752
5032
 
4753
5033
  This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
4754
5034
 
5035
+ ## Sprint Contract
5036
+
5037
+ Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
5038
+ Each criterion must be numbered, testable, and independently verifiable.
5039
+ After implementing, verify EACH criterion individually. Do not mark done until all pass.
5040
+
4755
5041
  ## Completion Standards
4756
5042
 
4757
5043
  Never mark a task "done" without running the Completion Verification checklist.
@@ -4781,6 +5067,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
4781
5067
  15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
4782
5068
  16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
4783
5069
  17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
5070
+ 18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
4784
5071
 
4785
5072
  ## Shell-Integrated Commands
4786
5073
 
@@ -4837,17 +5124,19 @@ Only generate scoped rules when the workflow involves multiple code domains.
4837
5124
 
4838
5125
  Generate hooks in settings.json based on project type:
4839
5126
 
4840
- **All code projects** \u2014 block destructive commands:
5127
+ **All code projects** \u2014 block destructive commands, credential leaks, injection, and network exfiltration:
4841
5128
  \`\`\`json
4842
5129
  {
4843
5130
  "hooks": {
4844
- "PreToolUse": [{
4845
- "matcher": "Bash",
4846
- "hooks": [{
4847
- "type": "command",
4848
- "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+TABLE|curl.*\\\\|\\\\s*sh' && echo 'Blocked destructive command' >&2 && exit 2 || true"
4849
- }]
4850
- }]
5131
+ "PreToolUse": [
5132
+ {
5133
+ "matcher": "Bash",
5134
+ "hooks": [{
5135
+ "type": "command",
5136
+ "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+(TABLE|DATABASE)|curl.*\\\\|\\\\s*sh|:(){ :|:& };:|git\\\\s+push.*--force(?!-with-lease)|ch(mod|own).*-R\\\\s+/|npm\\\\s+publish(?!.*--dry-run)|(api[_-]?key|secret|token|password)\\\\s*[:=]|AKIA[0-9A-Z]{16}|BEGIN.*PRIVATE\\\\s+KEY|;\\\\s*(DROP|DELETE|ALTER|TRUNCATE)\\\\s+|\\\\.\\\\./\\\\.\\\\./\\\\.\\\\./|nc\\\\s+.*-e|/dev/tcp/|bash\\\\s+-i|curl.*-d.*@|wget.*--post-file' && echo 'Blocked dangerous command' >&2 && exit 2 || true"
5137
+ }]
5138
+ }
5139
+ ]
4851
5140
  }
4852
5141
  }
4853
5142
  \`\`\`
@@ -4889,6 +5178,17 @@ All projects should include a PostCompact hook to restore context after compacti
4889
5178
 
4890
5179
  Merge this into the settings hooks alongside the PreToolUse and PostToolUse hooks.
4891
5180
 
5181
+ For long-running sessions (>2 hours or >3 compactions), prefer "Full Reset" over re-inject:
5182
+ replace the prompt-type PostCompact hook with a command-type hook that pipes CLAUDE.md + SPRINT.md + DECISIONS.md content directly into additionalContext.
5183
+
5184
+ ## Memory Persistence Hooks
5185
+
5186
+ For projects with multi-session workflows, include SessionStart/End hooks that persist context to \`.claude/memory.json\`:
5187
+ - **SessionEnd:** Save recent decisions, sprint status, and known gotchas to \`.claude/memory.json\`
5188
+ - **SessionStart:** Load \`.claude/memory.json\` and inject as additionalContext
5189
+
5190
+ This ensures accumulated project knowledge survives session boundaries.
5191
+
4892
5192
  ## For Code Projects, Additionally Include
4893
5193
 
4894
5194
  - \`/project:plan\` command (plan before coding)
@@ -4898,7 +5198,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
4898
5198
  - \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
4899
5199
  - \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
4900
5200
  - \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
4901
- - \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
5201
+ - \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). Phase 4 (Verify) MUST validate EACH acceptance criterion from docs/SPRINT.md individually, reporting PASS/FAIL per item as a contract scorecard. MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
4902
5202
  - A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
4903
5203
  - RED: Write failing test only. Verify it FAILS.
4904
5204
  - GREEN: Write MINIMUM code to pass. Nothing extra.
@@ -4908,13 +5208,21 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
4908
5208
  - \`@qa-orchestrator\` (sonnet) \u2014 delegates to linter and e2e-tester, compiles QA report
4909
5209
  - \`@linter\` (haiku) \u2014 runs formatters, linters, security scanners
4910
5210
  - \`@e2e-tester\` (sonnet, only when Playwright is in tools) \u2014 browser-based QA via Playwright
4911
- - Development pipeline agents (used by /project:develop):
4912
- - \`@architect\` (opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md
4913
- - \`@planner\` (opus) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
4914
- - \`@implementer\` (sonnet) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
4915
- - \`@fixer\` (sonnet) \u2014 targeted bug fixing from verifier/review feedback
4916
- - \`@doc-updater\` (haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
4917
- - \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md, does NOT start coding until confirmed)
5211
+ - A "Model Selection" section in generated agents:
5212
+ \`\`\`
5213
+ ## Model Selection (all agents)
5214
+ - Haiku: simple file edits, linting, formatting, doc updates (<50 lines changed)
5215
+ - Sonnet: implementation, testing, debugging, code review (50-500 lines)
5216
+ - Opus: architecture decisions, spec writing, complex refactors (>500 lines or cross-cutting)
5217
+ Default: Sonnet. Only escalate to Opus when the task involves multi-file architecture or ambiguous requirements.
5218
+ \`\`\`
5219
+ - Development pipeline agents (used by /project:develop). Each agent should include a modelRouting field in its YAML frontmatter:
5220
+ - \`@architect\` (default: opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md with numbered acceptance criteria. Your spec is a CONTRACT \u2014 the verifier will check every criterion. Vague criteria = guaranteed rework.
5221
+ - \`@planner\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
5222
+ - \`@implementer\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
5223
+ - \`@fixer\` (default: sonnet, use haiku for single-file fixes) \u2014 targeted bug fixing from verifier/review feedback
5224
+ - \`@doc-updater\` (default: haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
5225
+ - \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md with ## Acceptance Criteria containing 3-8 numbered, testable conditions. Each criterion must be independently verifiable. Does NOT start coding until confirmed)
4918
5226
  - \`/project:prove\` command (runs tests, shows git diff vs main, rates confidence HIGH/MEDIUM/LOW with evidence)
4919
5227
  - \`/project:grill\` command (adversarial code review \u2014 challenges each change with "why this approach?", "what if X input?", rates BLOCKER/SHOULD-FIX/NITPICK, blocks until BLOCKERs resolved)
4920
5228
  - \`/project:reset\` command (reads DECISIONS.md and LEARNINGS.md, proposes clean restart, stashes current work, implements elegant solution)
@@ -5067,6 +5375,12 @@ At the start of every session, before doing ANY work:
5067
5375
 
5068
5376
  This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
5069
5377
 
5378
+ ## Sprint Contract
5379
+
5380
+ Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
5381
+ Each criterion must be numbered, testable, and independently verifiable.
5382
+ After implementing, verify EACH criterion individually. Do not mark done until all pass.
5383
+
5070
5384
  ## Completion Standards
5071
5385
 
5072
5386
  Never mark a task "done" without running the Completion Verification checklist.
@@ -5096,6 +5410,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
5096
5410
  15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
5097
5411
  16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
5098
5412
  17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
5413
+ 18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
5099
5414
 
5100
5415
  ## Tool Selection Rules
5101
5416
 
@@ -8860,7 +9175,7 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration, pbt)
8860
9175
  // src/commands/evolve.ts
8861
9176
  var DEFAULT_CONFIG = {
8862
9177
  model: "claude-sonnet-4-6",
8863
- proposerModel: "claude-opus-4-6",
9178
+ proposerModel: "claude-sonnet-4-6",
8864
9179
  scorer: "pass-fail",
8865
9180
  maxIterations: 5,
8866
9181
  parallelTasks: 1,