npm - kairn-cli - Versions diffs - 2.7.2 → 2.9.0 - Mend

kairn-cli 2.7.2 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/cli.js (changed)

@@ -218,6 +218,7 @@ async function callLLM(config, userMessage, options) {
   const maxTokens = options.maxTokens ?? 8192;
   const { systemPrompt } = options;
   const jsonMode = options.jsonMode ?? false;
+  const cacheControl = options.cacheControl ?? false;
   const providerName = getProviderName(config.provider);
   let apiKey = config.api_key;
   if (config.auth_type === "claude-code-oauth") {
@@ -238,7 +239,7 @@ async function callLLM(config, userMessage, options) {
       const response = await client2.messages.create({
         model: config.model,
         max_tokens: maxTokens,
-        system: systemPrompt,
+        system: cacheControl ? [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }] : systemPrompt,
         messages
       });
       const textBlock = response.content.find((block) => block.type === "text");
@@ -461,6 +462,96 @@ var init_exec = __esm({
 });
 // src/evolve/scorers.ts
+function scoreCriterionDeterministic(criterionText, stdout, stderr) {
+  const combined = `${stdout}
+${stderr}`.toLowerCase();
+  const criterionLower = criterionText.toLowerCase().trim();
+  if (/^ran\b/i.test(criterionText.trim())) {
+    for (const entry of RAN_COMMAND_EVIDENCE) {
+      const matchesKeyword = entry.keywords.some(
+        (kw) => criterionLower.includes(kw.toLowerCase())
+      );
+      if (matchesKeyword) {
+        const found = entry.evidence.some((ev) => combined.includes(ev.toLowerCase()));
+        if (found) {
+          const matchedEvidence = entry.evidence.find(
+            (ev) => combined.includes(ev.toLowerCase())
+          );
+          return {
+            score: 1,
+            reasoning: `Deterministic: found evidence of '${matchedEvidence}' in output`
+          };
+        }
+        return {
+          score: 0,
+          reasoning: `Deterministic: no evidence of '${entry.keywords[0]}' found`
+        };
+      }
+    }
+    return null;
+  }
+  if (/^(zero|no)\b/i.test(criterionText.trim())) {
+    for (const entry of ABSENCE_PATTERNS) {
+      const matchesKeyword = entry.keywords.some(
+        (kw) => criterionLower.includes(kw.toLowerCase())
+      );
+      if (matchesKeyword) {
+        const found = entry.search.some((pat) => combined.includes(pat.toLowerCase()));
+        if (found) {
+          const matchedPattern = entry.search.find(
+            (pat) => combined.includes(pat.toLowerCase())
+          );
+          return {
+            score: 0,
+            reasoning: `Deterministic: found '${matchedPattern}' which should be absent`
+          };
+        }
+        return {
+          score: 1,
+          reasoning: `Deterministic: no prohibited pattern found in output`
+        };
+      }
+    }
+    return null;
+  }
+  if (/^uses?\b/i.test(criterionText.trim())) {
+    for (const entry of PRESENCE_PATTERNS) {
+      if (criterionLower.includes(entry.keyword.toLowerCase())) {
+        const found = entry.search.some((s) => combined.includes(s.toLowerCase()));
+        if (found) {
+          return {
+            score: 1,
+            reasoning: `Deterministic: found '${entry.keyword}' in output`
+          };
+        }
+        return {
+          score: 0,
+          reasoning: `Deterministic: '${entry.keyword}' not found in output`
+        };
+      }
+    }
+    return null;
+  }
+  if (/^calls?\b/i.test(criterionText.trim())) {
+    for (const pattern of CALL_PATTERNS) {
+      if (criterionLower.includes(pattern.toLowerCase())) {
+        const found = combined.includes(pattern.toLowerCase());
+        if (found) {
+          return {
+            score: 1,
+            reasoning: `Deterministic: found '${pattern}' in output`
+          };
+        }
+        return {
+          score: 0,
+          reasoning: `Deterministic: '${pattern}' not found in output`
+        };
+      }
+    }
+    return null;
+  }
+  return null;
+}
 async function passFailScorer(task, workspacePath, stdout, stderr) {
   const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
   const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
@@ -513,7 +604,8 @@ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
   try {
     const response = await callLLM(config, userMessage, {
       systemPrompt: JUDGE_SYSTEM_PROMPT,
-      maxTokens: 1024
+      maxTokens: 1024,
+      cacheControl: true
     });
     let cleaned = response.trim();
     if (cleaned.startsWith("```")) {
@@ -544,6 +636,20 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
   const breakdown = [];
   let weightedSum = 0;
   for (const criterion of task.rubric) {
+    const deterministicResult = scoreCriterionDeterministic(
+      criterion.criterion,
+      stdout,
+      stderr
+    );
+    if (deterministicResult !== null) {
+      breakdown.push({
+        criterion: criterion.criterion,
+        score: deterministicResult.score,
+        weight: criterion.weight
+      });
+      weightedSum += deterministicResult.score * criterion.weight;
+      continue;
+    }
     const userMessage = [
       "## Task",
       task.description,
@@ -560,7 +666,8 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
     try {
       const response = await callLLM(config, userMessage, {
         systemPrompt: RUBRIC_SYSTEM_PROMPT,
-        maxTokens: 512
+        maxTokens: 512,
+        cacheControl: true
       });
       let cleaned = response.trim();
       if (cleaned.startsWith("```")) {
@@ -638,7 +745,7 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
   }
   return score;
 }
-var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT;
+var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT, RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS, CALL_PATTERNS;
 var init_scorers = __esm({
   "src/evolve/scorers.ts"() {
     "use strict";
@@ -661,6 +768,31 @@ Return ONLY valid JSON:
   "score": 0.0-1.0,
   "reasoning": "Brief explanation"
 }`;
+    RAN_COMMAND_EVIDENCE = [
+      { keywords: ["npm run build", "build", "tsup"], evidence: ["build success", "tsup", "built in", "build completed"] },
+      { keywords: ["tsc", "typecheck"], evidence: ["tsc", "typecheck"] },
+      { keywords: ["npm run lint", "eslint", "lint"], evidence: ["lint", "eslint"] },
+      { keywords: ["npm test", "vitest", "test"], evidence: ["vitest", "test files", "tests passed", "passed (", "tests "] }
+    ];
+    ABSENCE_PATTERNS = [
+      { keywords: [".then()", ".catch()"], search: [".then(", ".catch("] },
+      { keywords: ["readfilesync", "writefilesync"], search: ["readfilesync", "writefilesync"] },
+      { keywords: ["sync"], search: ["sync"] }
+    ];
+    PRESENCE_PATTERNS = [
+      { keyword: "chalk.green", search: ["chalk.green"] },
+      { keyword: "chalk.yellow", search: ["chalk.yellow"] },
+      { keyword: "chalk.red", search: ["chalk.red"] },
+      { keyword: "chalk.cyan", search: ["chalk.cyan"] },
+      { keyword: "fs.promises", search: ["fs.promises", "fs/promises"] },
+      { keyword: "fs/promises", search: ["fs.promises", "fs/promises"] },
+      { keyword: "async/await", search: ["async ", "await "] },
+      { keyword: "@inquirer/prompts", search: ["@inquirer/prompts"] }
+    ];
+    CALL_PATTERNS = [
+      "process.exit(1)",
+      "process.exit"
+    ];
   }
 });
@@ -1329,7 +1461,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
   const response = await callLLM(proposerConfig, userMessage, {
     systemPrompt: PROPOSER_SYSTEM_PROMPT,
     maxTokens: 8192,
-    jsonMode: true
+    jsonMode: true,
+    cacheControl: true
   });
   return parseProposerResponse(response);
 }
@@ -1831,7 +1964,22 @@ async function parseAgents(harnessPath) {
     if (Array.isArray(disallowedTools)) {
       node.disallowedTools = disallowedTools;
     }
-    const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools"]);
+    const modelRouting = frontmatter["modelRouting"];
+    if (typeof modelRouting === "object" && modelRouting !== null) {
+      const mr = modelRouting;
+      if (typeof mr["default"] === "string") {
+        node.modelRouting = {
+          default: mr["default"]
+        };
+        if (typeof mr["escalateTo"] === "string") {
+          node.modelRouting.escalateTo = mr["escalateTo"];
+        }
+        if (typeof mr["escalateWhen"] === "string") {
+          node.modelRouting.escalateWhen = mr["escalateWhen"];
+        }
+      }
+    }
+    const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools", "modelRouting"]);
     const extra = {};
     for (const [key, value] of Object.entries(frontmatter)) {
       if (!knownKeys.has(key)) {
@@ -2457,8 +2605,9 @@ function renderRuleWithFrontmatter(rule) {
 function renderAgentWithFrontmatter(agent) {
   const hasModel = agent.model !== void 0;
   const hasDisallowed = agent.disallowedTools !== void 0 && agent.disallowedTools.length > 0;
+  const hasRouting = agent.modelRouting !== void 0;
   const hasExtra = agent.extraFrontmatter !== void 0 && Object.keys(agent.extraFrontmatter).length > 0;
-  if (!hasModel && !hasDisallowed && !hasExtra) {
+  if (!hasModel && !hasDisallowed && !hasRouting && !hasExtra) {
     return agent.content;
   }
   const yamlLines = ["---"];
@@ -2471,6 +2620,16 @@ function renderAgentWithFrontmatter(agent) {
       yamlLines.push(`  - ${tool}`);
     }
   }
+  if (hasRouting) {
+    yamlLines.push("modelRouting:");
+    yamlLines.push(`  default: ${agent.modelRouting.default}`);
+    if (agent.modelRouting.escalateTo) {
+      yamlLines.push(`  escalateTo: ${agent.modelRouting.escalateTo}`);
+    }
+    if (agent.modelRouting.escalateWhen) {
+      yamlLines.push(`  escalateWhen: ${agent.modelRouting.escalateWhen}`);
+    }
+  }
   if (hasExtra) {
     for (const [key, value] of Object.entries(agent.extraFrontmatter)) {
       if (Array.isArray(value)) {
@@ -3350,6 +3509,92 @@ var init_regularization = __esm({
   }
 });
+// src/evolve/targeting.ts
+function mutationToAspect(mutation) {
+  switch (mutation.type) {
+    case "update_section": {
+      const id = mutation.sectionId;
+      if (id === "conventions" || id === "gotchas" || id === "debugging" || id === "git") return "conventions";
+      if (id === "commands" || id === "custom-key-commands") return "commands";
+      if (id === "verification") return "verification";
+      if (id === "architecture") return "architecture";
+      return "general";
+    }
+    case "add_section": {
+      const id = mutation.section.id;
+      if (id === "conventions" || id === "gotchas" || id === "debugging" || id === "git") return "conventions";
+      if (id === "commands" || id === "custom-key-commands") return "commands";
+      if (id === "verification") return "verification";
+      if (id === "architecture") return "architecture";
+      return "general";
+    }
+    case "remove_section":
+    case "reorder_section":
+      return "general";
+    case "add_command":
+    case "update_command":
+    case "remove_command":
+      return "commands";
+    case "add_rule":
+    case "update_rule":
+    case "remove_rule":
+      return "rules";
+    case "add_agent":
+    case "update_agent":
+    case "remove_agent":
+      return "agents";
+    case "add_mcp_server":
+    case "remove_mcp_server":
+      return "mcp";
+    case "update_settings":
+      return "settings";
+    case "raw_text":
+      return "general";
+  }
+}
+function mutationsToAspects(mutations) {
+  const aspects = /* @__PURE__ */ new Set();
+  for (const m of mutations) {
+    aspects.add(mutationToAspect(m));
+  }
+  return aspects;
+}
+function taskDependsOnAspects(task) {
+  const aspects = TEMPLATE_ASPECTS[task.template];
+  return new Set(aspects ?? ["general"]);
+}
+function shouldReEvaluate(task, changedAspects) {
+  if (changedAspects.has("general")) return true;
+  if (changedAspects.size === 0) return false;
+  const taskAspects = taskDependsOnAspects(task);
+  if (taskAspects.has("general")) return true;
+  for (const aspect of taskAspects) {
+    if (changedAspects.has(aspect)) return true;
+  }
+  return false;
+}
+function filterTasksByAspects(tasks, changedAspects) {
+  return tasks.filter((t) => shouldReEvaluate(t, changedAspects));
+}
+var TEMPLATE_ASPECTS; // Lookup table: task template name -> harness aspects the task depends on; assigned by the initializer below.
+var init_targeting = __esm({ // NOTE(review): __esm is defined outside this view; presumably the bundler's lazy module initializer — confirm.
+  "src/evolve/targeting.ts"() {
+    "use strict";
+    TEMPLATE_ASPECTS = { // Read by taskDependsOnAspects(); templates missing from this table fall back to ["general"] there.
+      "convention-adherence": ["conventions", "rules"],
+      "workflow-compliance": ["commands", "verification"],
+      "rule-compliance": ["rules"],
+      "intent-routing": ["settings"],
+      "add-feature": ["general"], // "general" makes shouldReEvaluate() re-run the task for any non-empty set of changed aspects.
+      "fix-bug": ["general"],
+      "refactor": ["architecture", "conventions"],
+      "test-writing": ["verification", "commands"],
+      "config-change": ["settings", "mcp"],
+      "documentation": ["general"]
+    };
+  }
+});
 // src/evolve/loop.ts
 import fs25 from "fs/promises";
 import path25 from "path";
@@ -3382,6 +3627,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
       }
     }
   }
+  let lastChangedAspects = null;
   let rngState = evolveConfig.rngSeed ?? 42;
   const rng = () => {
     rngState = rngState * 1664525 + 1013904223 & 4294967295;
@@ -3428,6 +3674,22 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
           tasksToRun.push(task);
         }
       }
+      if (lastChangedAspects !== null) {
+        const targetedTasks = filterTasksByAspects(tasksToRun, lastChangedAspects);
+        const skippedByTargeting = tasksToRun.filter((t) => !targetedTasks.includes(t));
+        for (const task of skippedByTargeting) {
+          const prev = prevLog.taskResults[task.id];
+          const prevVal = prev ? prev.score ?? (prev.pass ? 100 : 0) : 0;
+          carriedScores[task.id] = { pass: prevVal >= 50, score: prevVal };
+          onProgress?.({
+            type: "task-skipped",
+            iteration: iter,
+            taskId: task.id,
+            message: `Skipped ${task.id} (unaffected by mutations)`
+          });
+        }
+        tasksToRun = targetedTasks;
+      }
       const sampleSize = evolveConfig.evalSampleSize;
       if (sampleSize > 0 && sampleSize < tasksToRun.length) {
         let sampled;
@@ -3583,6 +3845,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
           }
           const nextIterDir2 = path25.join(workspacePath, "iterations", (iter + 1).toString());
           await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
+          try {
+            const rollbackIR = await parseHarness(bestHarnessPath);
+            const irMuts = translateMutations(rollbackProposal.mutations, rollbackIR);
+            lastChangedAspects = mutationsToAspects(irMuts);
+          } catch {
+            lastChangedAspects = null;
+          }
           onProgress?.({
             type: "mutations-applied",
             iteration: iter,
@@ -3687,8 +3956,16 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
         proposal.mutations
       );
       diffPatch = mutationResult.diffPatch;
+      try {
+        const currentIR = await parseHarness(harnessPath);
+        const irMuts = translateMutations(proposal.mutations, currentIR);
+        lastChangedAspects = mutationsToAspects(irMuts);
+      } catch {
+        lastChangedAspects = null;
+      }
     } catch {
       await copyDir(harnessPath, path25.join(nextIterDir, "harness"));
+      lastChangedAspects = null;
     }
     onProgress?.({
       type: "mutations-applied",
@@ -3787,6 +4064,8 @@ var init_loop = __esm({
     init_sampling();
     init_regularization();
     init_parser();
+    init_translate();
+    init_targeting();
   }
 });
@@ -3917,7 +4196,8 @@ ${userMessage}`;
   const response = await callLLM(proposerConfig, fullMessage, {
     systemPrompt,
     maxTokens: 8192,
-    jsonMode: true
+    jsonMode: true,
+    cacheControl: true
   });
   const proposal = parseProposerResponse(response);
   return {
@@ -4752,6 +5032,12 @@ At the start of every session, before doing ANY work:
 This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
+## Sprint Contract
+Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
+Each criterion must be numbered, testable, and independently verifiable.
+After implementing, verify EACH criterion individually. Do not mark done until all pass.
 ## Completion Standards
 Never mark a task "done" without running the Completion Verification checklist.
@@ -4781,6 +5067,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
 15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
 16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
+18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
 ## Shell-Integrated Commands
@@ -4837,17 +5124,19 @@ Only generate scoped rules when the workflow involves multiple code domains.
 Generate hooks in settings.json based on project type:
-**All code projects** \u2014 block destructive commands:
+**All code projects** \u2014 block destructive commands, credential leaks, injection, and network exfiltration:
 \`\`\`json
 {
   "hooks": {
-    "PreToolUse": [{
-      "matcher": "Bash",
-      "hooks": [{
-        "type": "command",
-        "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+TABLE|curl.*\\\\|\\\\s*sh' && echo 'Blocked destructive command' >&2 && exit 2 || true"
-      }]
-    }]
+    "PreToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [{
+          "type": "command",
+          "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+(TABLE|DATABASE)|curl.*\\\\|\\\\s*sh|:(){ :|:& };:|git\\\\s+push.*--force(?!-with-lease)|ch(mod|own).*-R\\\\s+/|npm\\\\s+publish(?!.*--dry-run)|(api[_-]?key|secret|token|password)\\\\s*[:=]|AKIA[0-9A-Z]{16}|BEGIN.*PRIVATE\\\\s+KEY|;\\\\s*(DROP|DELETE|ALTER|TRUNCATE)\\\\s+|\\\\.\\\\./\\\\.\\\\./\\\\.\\\\./|nc\\\\s+.*-e|/dev/tcp/|bash\\\\s+-i|curl.*-d.*@|wget.*--post-file' && echo 'Blocked dangerous command' >&2 && exit 2 || true"
+        }]
+      }
+    ]
   }
 }
 \`\`\`
@@ -4889,6 +5178,17 @@ All projects should include a PostCompact hook to restore context after compacti
 Merge this into the settings hooks alongside the PreToolUse and PostToolUse hooks.
+For long-running sessions (>2 hours or >3 compactions), prefer "Full Reset" over re-inject:
+replace the prompt-type PostCompact hook with a command-type hook that pipes CLAUDE.md + SPRINT.md + DECISIONS.md content directly into additionalContext.
+## Memory Persistence Hooks
+For projects with multi-session workflows, include SessionStart/End hooks that persist context to \`.claude/memory.json\`:
+- **SessionEnd:** Save recent decisions, sprint status, and known gotchas to \`.claude/memory.json\`
+- **SessionStart:** Load \`.claude/memory.json\` and inject as additionalContext
+This ensures accumulated project knowledge survives session boundaries.
 ## For Code Projects, Additionally Include
 - \`/project:plan\` command (plan before coding)
@@ -4898,7 +5198,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
 - \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
 - \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
 - \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
-- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
+- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). Phase 4 (Verify) MUST validate EACH acceptance criterion from docs/SPRINT.md individually, reporting PASS/FAIL per item as a contract scorecard. MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
 - A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
   - RED: Write failing test only. Verify it FAILS.
   - GREEN: Write MINIMUM code to pass. Nothing extra.
@@ -4908,13 +5208,21 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
   - \`@qa-orchestrator\` (sonnet) \u2014 delegates to linter and e2e-tester, compiles QA report
   - \`@linter\` (haiku) \u2014 runs formatters, linters, security scanners
   - \`@e2e-tester\` (sonnet, only when Playwright is in tools) \u2014 browser-based QA via Playwright
-- Development pipeline agents (used by /project:develop):
-  - \`@architect\` (opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md
-  - \`@planner\` (opus) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
-  - \`@implementer\` (sonnet) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
-  - \`@fixer\` (sonnet) \u2014 targeted bug fixing from verifier/review feedback
-  - \`@doc-updater\` (haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
-- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md, does NOT start coding until confirmed)
+- A "Model Selection" section in generated agents:
+  \`\`\`
+  ## Model Selection (all agents)
+  - Haiku: simple file edits, linting, formatting, doc updates (<50 lines changed)
+  - Sonnet: implementation, testing, debugging, code review (50-500 lines)
+  - Opus: architecture decisions, spec writing, complex refactors (>500 lines or cross-cutting)
+  Default: Sonnet. Only escalate to Opus when the task involves multi-file architecture or ambiguous requirements.
+  \`\`\`
+- Development pipeline agents (used by /project:develop). Each agent should include a modelRouting field in its YAML frontmatter:
+  - \`@architect\` (default: opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md with numbered acceptance criteria. Your spec is a CONTRACT \u2014 the verifier will check every criterion. Vague criteria = guaranteed rework.
+  - \`@planner\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
+  - \`@implementer\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
+  - \`@fixer\` (default: sonnet, use haiku for single-file fixes) \u2014 targeted bug fixing from verifier/review feedback
+  - \`@doc-updater\` (default: haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
+- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md with ## Acceptance Criteria containing 3-8 numbered, testable conditions. Each criterion must be independently verifiable. Does NOT start coding until confirmed)
 - \`/project:prove\` command (runs tests, shows git diff vs main, rates confidence HIGH/MEDIUM/LOW with evidence)
 - \`/project:grill\` command (adversarial code review \u2014 challenges each change with "why this approach?", "what if X input?", rates BLOCKER/SHOULD-FIX/NITPICK, blocks until BLOCKERs resolved)
 - \`/project:reset\` command (reads DECISIONS.md and LEARNINGS.md, proposes clean restart, stashes current work, implements elegant solution)
@@ -5067,6 +5375,12 @@ At the start of every session, before doing ANY work:
 This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
+## Sprint Contract
+Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
+Each criterion must be numbered, testable, and independently verifiable.
+After implementing, verify EACH criterion individually. Do not mark done until all pass.
 ## Completion Standards
 Never mark a task "done" without running the Completion Verification checklist.
@@ -5096,6 +5410,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
 15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
 16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
+18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
 ## Tool Selection Rules
@@ -8860,7 +9175,7 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration, pbt)
 // src/commands/evolve.ts
 var DEFAULT_CONFIG = {
   model: "claude-sonnet-4-6",
-  proposerModel: "claude-opus-4-6",
+  proposerModel: "claude-sonnet-4-6",
   scorer: "pass-fail",
   maxIterations: 5,
   parallelTasks: 1,