npm - kairn-cli - Versions diffs - 2.7.2 → 2.10.0 - Mend

kairn-cli 2.7.2 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/cli.js CHANGED Viewed

@@ -218,6 +218,7 @@ async function callLLM(config, userMessage, options) {
   const maxTokens = options.maxTokens ?? 8192;
   const { systemPrompt } = options;
   const jsonMode = options.jsonMode ?? false;
+  const cacheControl = options.cacheControl ?? false;
   const providerName = getProviderName(config.provider);
   let apiKey = config.api_key;
   if (config.auth_type === "claude-code-oauth") {
@@ -238,7 +239,7 @@ async function callLLM(config, userMessage, options) {
       const response = await client2.messages.create({
         model: config.model,
         max_tokens: maxTokens,
-        system: systemPrompt,
+        system: cacheControl ? [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }] : systemPrompt,
         messages
       });
       const textBlock = response.content.find((block) => block.type === "text");
@@ -461,6 +462,96 @@ var init_exec = __esm({
 });
 // src/evolve/scorers.ts
+/**
+ * Scores a single rubric criterion from captured output alone, without an LLM call.
+ *
+ * Only criteria whose text starts with "ran", "zero"/"no", "use(s)" or "call(s)"
+ * are handled, and only when the criterion mentions a keyword from the matching
+ * lookup table (RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS,
+ * CALL_PATTERNS). All comparisons are case-insensitive substring checks against
+ * stdout and stderr joined by a newline.
+ *
+ * @param {string} criterionText - Rubric criterion as written by the task author.
+ * @param {string} stdout - Captured standard output of the task run.
+ * @param {string} stderr - Captured standard error of the task run.
+ * @returns {{score: number, reasoning: string} | null} A 0/1 verdict with its
+ *   justification, or null when the criterion cannot be decided here and the
+ *   caller should fall back to another scorer.
+ */
+function scoreCriterionDeterministic(criterionText, stdout, stderr) {
+  // A malformed rubric entry must not abort scoring: defer it to the fallback scorer.
+  if (typeof criterionText !== "string") {
+    return null;
+  }
+  const combined = `${stdout}\n${stderr}`.toLowerCase();
+  const trimmed = criterionText.trim();
+  const criterionLower = trimmed.toLowerCase();
+  const inOutput = (needle) => combined.includes(needle.toLowerCase());
+  const inCriterion = (needle) => criterionLower.includes(needle.toLowerCase());
+
+  if (/^ran\b/i.test(trimmed)) {
+    // The first table entry whose keyword appears in the criterion decides the verdict.
+    const entry = RAN_COMMAND_EVIDENCE.find((candidate) => candidate.keywords.some((kw) => inCriterion(kw)));
+    if (entry === undefined) {
+      return null;
+    }
+    const matchedEvidence = entry.evidence.find((ev) => inOutput(ev));
+    if (matchedEvidence !== undefined) {
+      return {
+        score: 1,
+        reasoning: `Deterministic: found evidence of '${matchedEvidence}' in output`
+      };
+    }
+    return {
+      score: 0,
+      reasoning: `Deterministic: no evidence of '${entry.keywords[0]}' found`
+    };
+  }
+
+  if (/^(zero|no)\b/i.test(trimmed)) {
+    const entry = ABSENCE_PATTERNS.find((candidate) => candidate.keywords.some((kw) => inCriterion(kw)));
+    if (entry === undefined) {
+      return null;
+    }
+    // Absence criteria invert the verdict: any hit in the output is a failure.
+    const matchedPattern = entry.search.find((pat) => inOutput(pat));
+    if (matchedPattern !== undefined) {
+      return {
+        score: 0,
+        reasoning: `Deterministic: found '${matchedPattern}' which should be absent`
+      };
+    }
+    return {
+      score: 1,
+      reasoning: `Deterministic: no prohibited pattern found in output`
+    };
+  }
+
+  if (/^uses?\b/i.test(trimmed)) {
+    const entry = PRESENCE_PATTERNS.find((candidate) => inCriterion(candidate.keyword));
+    if (entry === undefined) {
+      return null;
+    }
+    if (entry.search.some((s) => inOutput(s))) {
+      return {
+        score: 1,
+        reasoning: `Deterministic: found '${entry.keyword}' in output`
+      };
+    }
+    return {
+      score: 0,
+      reasoning: `Deterministic: '${entry.keyword}' not found in output`
+    };
+  }
+
+  if (/^calls?\b/i.test(trimmed)) {
+    const pattern = CALL_PATTERNS.find((candidate) => inCriterion(candidate));
+    if (pattern === undefined) {
+      return null;
+    }
+    if (inOutput(pattern)) {
+      return {
+        score: 1,
+        reasoning: `Deterministic: found '${pattern}' in output`
+      };
+    }
+    return {
+      score: 0,
+      reasoning: `Deterministic: '${pattern}' not found in output`
+    };
+  }
+
+  return null;
+}
 async function passFailScorer(task, workspacePath, stdout, stderr) {
   const outcomes = Array.isArray(task.expected_outcome) ? task.expected_outcome : task.expected_outcome.split("\n");
   const commands = outcomes.map((line) => line.replace(/^-\s*/, "").trim()).filter((line) => COMMAND_PATTERN.test(line));
@@ -513,7 +604,8 @@ async function llmJudgeScorer(task, workspacePath, stdout, stderr, config) {
   try {
     const response = await callLLM(config, userMessage, {
       systemPrompt: JUDGE_SYSTEM_PROMPT,
-      maxTokens: 1024
+      maxTokens: 1024,
+      cacheControl: true
     });
     let cleaned = response.trim();
     if (cleaned.startsWith("```")) {
@@ -544,6 +636,20 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
   const breakdown = [];
   let weightedSum = 0;
   for (const criterion of task.rubric) {
+    const deterministicResult = scoreCriterionDeterministic(
+      criterion.criterion,
+      stdout,
+      stderr
+    );
+    if (deterministicResult !== null) {
+      breakdown.push({
+        criterion: criterion.criterion,
+        score: deterministicResult.score,
+        weight: criterion.weight
+      });
+      weightedSum += deterministicResult.score * criterion.weight;
+      continue;
+    }
     const userMessage = [
       "## Task",
       task.description,
@@ -560,7 +666,8 @@ async function rubricScorer(task, workspacePath, stdout, stderr, config) {
     try {
       const response = await callLLM(config, userMessage, {
         systemPrompt: RUBRIC_SYSTEM_PROMPT,
-        maxTokens: 512
+        maxTokens: 512,
+        cacheControl: true
       });
       let cleaned = response.trim();
       if (cleaned.startsWith("```")) {
@@ -638,7 +745,7 @@ async function scoreTask(task, workspacePath, stdout, stderr, config) {
   }
   return score;
 }
-var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT;
+var COMMAND_PATTERN, SHELL_METACHAR_PATTERN, JUDGE_SYSTEM_PROMPT, RUBRIC_SYSTEM_PROMPT, RAN_COMMAND_EVIDENCE, ABSENCE_PATTERNS, PRESENCE_PATTERNS, CALL_PATTERNS;
 var init_scorers = __esm({
   "src/evolve/scorers.ts"() {
     "use strict";
@@ -661,6 +768,31 @@ Return ONLY valid JSON:
   "score": 0.0-1.0,
   "reasoning": "Brief explanation"
 }`;
+    // Lookup tables consumed by scoreCriterionDeterministic. Every keyword/pattern is
+    // compared as a lowercased substring, and the first matching entry decides the verdict.
+    //
+    // "Ran ..." criteria: `keywords` are looked for in the criterion text, `evidence`
+    // in the combined stdout/stderr. Order matters: a criterion mentioning both
+    // "build" and "test" is decided by the build entry because it comes first.
+    RAN_COMMAND_EVIDENCE = [
+      { keywords: ["npm run build", "build", "tsup"], evidence: ["build success", "tsup", "built in", "build completed"] },
+      { keywords: ["tsc", "typecheck"], evidence: ["tsc", "typecheck"] },
+      { keywords: ["npm run lint", "eslint", "lint"], evidence: ["lint", "eslint"] },
+      { keywords: ["npm test", "vitest", "test"], evidence: ["vitest", "test files", "tests passed", "passed (", "tests "] }
+    ];
+    // "Zero ..."/"No ..." criteria: any `search` hit in the output scores 0.
+    // NOTE(review): "sync" is a substring of "async", so the last entry also matches
+    // criteria that mention "async" and output that contains "async" — confirm intended.
+    ABSENCE_PATTERNS = [
+      { keywords: [".then()", ".catch()"], search: [".then(", ".catch("] },
+      { keywords: ["readfilesync", "writefilesync"], search: ["readfilesync", "writefilesync"] },
+      { keywords: ["sync"], search: ["sync"] }
+    ];
+    // "Use(s) ..." criteria: `keyword` is looked for in the criterion text; a hit on
+    // ANY `search` string in the output scores 1 (e.g. "async " alone satisfies "async/await").
+    PRESENCE_PATTERNS = [
+      { keyword: "chalk.green", search: ["chalk.green"] },
+      { keyword: "chalk.yellow", search: ["chalk.yellow"] },
+      { keyword: "chalk.red", search: ["chalk.red"] },
+      { keyword: "chalk.cyan", search: ["chalk.cyan"] },
+      { keyword: "fs.promises", search: ["fs.promises", "fs/promises"] },
+      { keyword: "fs/promises", search: ["fs.promises", "fs/promises"] },
+      { keyword: "async/await", search: ["async ", "await "] },
+      { keyword: "@inquirer/prompts", search: ["@inquirer/prompts"] }
+    ];
+    // "Call(s) ..." criteria: the same string is looked for in both the criterion and
+    // the output. The more specific "process.exit(1)" is listed first so it wins over
+    // the bare "process.exit" prefix.
+    CALL_PATTERNS = [
+      "process.exit(1)",
+      "process.exit"
+    ];
   }
 });
@@ -1329,7 +1461,8 @@ async function propose(iteration, workspacePath, harnessPath, history, tasks, co
   const response = await callLLM(proposerConfig, userMessage, {
     systemPrompt: PROPOSER_SYSTEM_PROMPT,
     maxTokens: 8192,
-    jsonMode: true
+    jsonMode: true,
+    cacheControl: true
   });
   return parseProposerResponse(response);
 }
@@ -1831,7 +1964,22 @@ async function parseAgents(harnessPath) {
     if (Array.isArray(disallowedTools)) {
       node.disallowedTools = disallowedTools;
     }
-    const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools"]);
+    const modelRouting = frontmatter["modelRouting"];
+    if (typeof modelRouting === "object" && modelRouting !== null) {
+      const mr = modelRouting;
+      if (typeof mr["default"] === "string") {
+        node.modelRouting = {
+          default: mr["default"]
+        };
+        if (typeof mr["escalateTo"] === "string") {
+          node.modelRouting.escalateTo = mr["escalateTo"];
+        }
+        if (typeof mr["escalateWhen"] === "string") {
+          node.modelRouting.escalateWhen = mr["escalateWhen"];
+        }
+      }
+    }
+    const knownKeys = /* @__PURE__ */ new Set(["name", "model", "disallowedTools", "modelRouting"]);
     const extra = {};
     for (const [key, value] of Object.entries(frontmatter)) {
       if (!knownKeys.has(key)) {
@@ -2457,8 +2605,9 @@ function renderRuleWithFrontmatter(rule) {
 function renderAgentWithFrontmatter(agent) {
   const hasModel = agent.model !== void 0;
   const hasDisallowed = agent.disallowedTools !== void 0 && agent.disallowedTools.length > 0;
+  const hasRouting = agent.modelRouting !== void 0;
   const hasExtra = agent.extraFrontmatter !== void 0 && Object.keys(agent.extraFrontmatter).length > 0;
-  if (!hasModel && !hasDisallowed && !hasExtra) {
+  if (!hasModel && !hasDisallowed && !hasRouting && !hasExtra) {
     return agent.content;
   }
   const yamlLines = ["---"];
@@ -2471,6 +2620,16 @@ function renderAgentWithFrontmatter(agent) {
       yamlLines.push(`  - ${tool}`);
     }
   }
+  if (hasRouting) {
+    yamlLines.push("modelRouting:");
+    yamlLines.push(`  default: ${agent.modelRouting.default}`);
+    if (agent.modelRouting.escalateTo) {
+      yamlLines.push(`  escalateTo: ${agent.modelRouting.escalateTo}`);
+    }
+    if (agent.modelRouting.escalateWhen) {
+      yamlLines.push(`  escalateWhen: ${agent.modelRouting.escalateWhen}`);
+    }
+  }
   if (hasExtra) {
     for (const [key, value] of Object.entries(agent.extraFrontmatter)) {
       if (Array.isArray(value)) {
@@ -3350,6 +3509,93 @@ var init_regularization = __esm({
   }
 });
+// src/evolve/targeting.ts
+/**
+ * Maps a section id to the harness aspect that section belongs to.
+ * Ids without a dedicated aspect fall back to "general".
+ *
+ * @param {string | undefined} sectionId - Section id carried by a section mutation.
+ * @returns {string} Aspect name.
+ */
+function sectionIdToAspect(sectionId) {
+  switch (sectionId) {
+    case "conventions":
+    case "gotchas":
+    case "debugging":
+    case "git":
+      return "conventions";
+    case "commands":
+    case "custom-key-commands":
+      return "commands";
+    case "verification":
+      return "verification";
+    case "architecture":
+      return "architecture";
+    default:
+      return "general";
+  }
+}
+/**
+ * Classifies a harness mutation by the aspect it changes, so the evolve loop can
+ * decide which tasks need re-evaluation.
+ *
+ * @param {{type: string, sectionId?: string, section?: {id: string}}} mutation
+ * @returns {string} One of "conventions", "commands", "verification",
+ *   "architecture", "rules", "agents", "mcp", "settings" or "general".
+ */
+function mutationToAspect(mutation) {
+  switch (mutation.type) {
+    case "update_section":
+      return sectionIdToAspect(mutation.sectionId);
+    case "add_section":
+      return sectionIdToAspect(mutation.section?.id);
+    case "add_command":
+    case "update_command":
+    case "remove_command":
+      return "commands";
+    case "add_rule":
+    case "update_rule":
+    case "remove_rule":
+      return "rules";
+    case "add_agent":
+    case "update_agent":
+    case "remove_agent":
+      return "agents";
+    case "add_mcp_server":
+    case "remove_mcp_server":
+      return "mcp";
+    case "update_settings":
+      return "settings";
+    case "remove_section":
+    case "reorder_section":
+    case "raw_text":
+      return "general";
+    default:
+      // Unknown mutation types are treated as "general" so that every task is
+      // re-evaluated; returning undefined here would cause tasks to be skipped.
+      return "general";
+  }
+}
+/**
+ * Collects the distinct aspects touched by a list of mutations.
+ *
+ * @param {Iterable<object>} mutations - Mutations to classify via mutationToAspect.
+ * @returns {Set<string>} Unique aspect names, in first-seen order.
+ */
+function mutationsToAspects(mutations) {
+  return new Set(Array.from(mutations, (mutation) => mutationToAspect(mutation)));
+}
+/**
+ * Returns the aspects a task depends on, looked up by its template name.
+ * Templates that are not listed in TEMPLATE_ASPECTS depend on "general".
+ *
+ * @param {{template: string}} task
+ * @returns {Set<string>} Aspect names for the task's template.
+ */
+function taskDependsOnAspects(task) {
+  // Own-property check: inherited keys such as "constructor" or "toString" must not
+  // be mistaken for template entries (their values are not iterable aspect lists).
+  const isKnownTemplate = Object.prototype.hasOwnProperty.call(TEMPLATE_ASPECTS, task.template);
+  const aspects = isKnownTemplate ? TEMPLATE_ASPECTS[task.template] : void 0;
+  return new Set(aspects ?? ["general"]);
+}
+/**
+ * Decides whether a task must be run again after a set of aspects changed.
+ *
+ * A "general" change, or a task that depends on "general", always forces a re-run.
+ * With no changed aspects nothing is re-run. Otherwise the task is re-run only when
+ * one of its aspects is among the changed ones.
+ *
+ * @param {{template: string}} task
+ * @param {Set<string>} changedAspects
+ * @returns {boolean}
+ */
+function shouldReEvaluate(task, changedAspects) {
+  if (changedAspects.has("general")) {
+    return true;
+  }
+  if (changedAspects.size === 0) {
+    return false;
+  }
+  const dependencies = taskDependsOnAspects(task);
+  return dependencies.has("general") || [...dependencies].some((name) => changedAspects.has(name));
+}
+/**
+ * Keeps only the tasks that shouldReEvaluate reports as affected by the changed aspects.
+ *
+ * @param {Array<object>} tasks
+ * @param {Set<string>} changedAspects
+ * @returns {Array<object>} New array with the affected tasks, original order preserved.
+ */
+function filterTasksByAspects(tasks, changedAspects) {
+  return tasks.reduce((affected, task) => {
+    if (shouldReEvaluate(task, changedAspects)) {
+      affected.push(task);
+    }
+    return affected;
+  }, []);
+}
+// Task template name -> harness aspects that tasks of that template depend on.
+// Read by taskDependsOnAspects; templates missing from the table fall back to ["general"].
+var TEMPLATE_ASPECTS;
+// Module initializer for src/evolve/targeting.ts: populates TEMPLATE_ASPECTS when run.
+var init_targeting = __esm({
+  "src/evolve/targeting.ts"() {
+    "use strict";
+    // A "general" entry means shouldReEvaluate re-runs the task whenever any aspect changed.
+    TEMPLATE_ASPECTS = {
+      "convention-adherence": ["conventions", "rules"],
+      "workflow-compliance": ["commands", "verification"],
+      "rule-compliance": ["rules"],
+      "intent-routing": ["settings"],
+      "add-feature": ["general"],
+      "fix-bug": ["general"],
+      "refactor": ["architecture", "conventions"],
+      "test-writing": ["verification", "commands"],
+      "config-change": ["settings", "mcp"],
+      "documentation": ["general"],
+      "persistence-completion": ["commands", "verification"]
+    };
+  }
+});
 // src/evolve/loop.ts
 import fs25 from "fs/promises";
 import path25 from "path";
@@ -3382,6 +3628,7 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
       }
     }
   }
+  let lastChangedAspects = null;
   let rngState = evolveConfig.rngSeed ?? 42;
   const rng = () => {
     rngState = rngState * 1664525 + 1013904223 & 4294967295;
@@ -3428,6 +3675,22 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
           tasksToRun.push(task);
         }
       }
+      if (lastChangedAspects !== null) {
+        const targetedTasks = filterTasksByAspects(tasksToRun, lastChangedAspects);
+        const skippedByTargeting = tasksToRun.filter((t) => !targetedTasks.includes(t));
+        for (const task of skippedByTargeting) {
+          const prev = prevLog.taskResults[task.id];
+          const prevVal = prev ? prev.score ?? (prev.pass ? 100 : 0) : 0;
+          carriedScores[task.id] = { pass: prevVal >= 50, score: prevVal };
+          onProgress?.({
+            type: "task-skipped",
+            iteration: iter,
+            taskId: task.id,
+            message: `Skipped ${task.id} (unaffected by mutations)`
+          });
+        }
+        tasksToRun = targetedTasks;
+      }
       const sampleSize = evolveConfig.evalSampleSize;
       if (sampleSize > 0 && sampleSize < tasksToRun.length) {
         let sampled;
@@ -3583,6 +3846,13 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
           }
           const nextIterDir2 = path25.join(workspacePath, "iterations", (iter + 1).toString());
           await applyMutations(bestHarnessPath, nextIterDir2, rollbackProposal.mutations);
+          try {
+            const rollbackIR = await parseHarness(bestHarnessPath);
+            const irMuts = translateMutations(rollbackProposal.mutations, rollbackIR);
+            lastChangedAspects = mutationsToAspects(irMuts);
+          } catch {
+            lastChangedAspects = null;
+          }
           onProgress?.({
             type: "mutations-applied",
             iteration: iter,
@@ -3687,8 +3957,16 @@ async function evolve(workspacePath, tasks, kairnConfig, evolveConfig, onProgres
         proposal.mutations
       );
       diffPatch = mutationResult.diffPatch;
+      try {
+        const currentIR = await parseHarness(harnessPath);
+        const irMuts = translateMutations(proposal.mutations, currentIR);
+        lastChangedAspects = mutationsToAspects(irMuts);
+      } catch {
+        lastChangedAspects = null;
+      }
     } catch {
       await copyDir(harnessPath, path25.join(nextIterDir, "harness"));
+      lastChangedAspects = null;
     }
     onProgress?.({
       type: "mutations-applied",
@@ -3787,6 +4065,8 @@ var init_loop = __esm({
     init_sampling();
     init_regularization();
     init_parser();
+    init_translate();
+    init_targeting();
   }
 });
@@ -3917,7 +4197,8 @@ ${userMessage}`;
   const response = await callLLM(proposerConfig, fullMessage, {
     systemPrompt,
     maxTokens: 8192,
-    jsonMode: true
+    jsonMode: true,
+    cacheControl: true
   });
   const proposal = parseProposerResponse(response);
   return {
@@ -4629,7 +4910,7 @@ You must output a JSON object matching the SkeletonSpec schema.
 - MCP servers: maximum 6. Prefer fewer.
 - Skills: maximum 3. Only include directly relevant ones.
 - Agents: maximum 5. Orchestration pipeline (/develop) agents.
-- Hooks: maximum 4 (auto-format, block-destructive, PostCompact, plus one contextual).
+- Hooks: maximum 5 (auto-format, block-destructive, PostCompact, memory-persistence, plus one contextual).
 If the workflow doesn't clearly need a tool, DO NOT include it.
 Each MCP server costs 500-2000 tokens of context window.
@@ -4752,6 +5033,12 @@ At the start of every session, before doing ANY work:
 This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
+## Sprint Contract
+Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
+Each criterion must be numbered, testable, and independently verifiable.
+After implementing, verify EACH criterion individually. Do not mark done until all pass.
 ## Completion Standards
 Never mark a task "done" without running the Completion Verification checklist.
@@ -4781,6 +5068,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
 15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
 16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
+18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
 ## Shell-Integrated Commands
@@ -4837,17 +5125,19 @@ Only generate scoped rules when the workflow involves multiple code domains.
 Generate hooks in settings.json based on project type:
-**All code projects** \u2014 block destructive commands:
+**All code projects** \u2014 block destructive commands, credential leaks, injection, and network exfiltration:
 \`\`\`json
 {
   "hooks": {
-    "PreToolUse": [{
-      "matcher": "Bash",
-      "hooks": [{
-        "type": "command",
-        "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+TABLE|curl.*\\\\|\\\\s*sh' && echo 'Blocked destructive command' >&2 && exit 2 || true"
-      }]
-    }]
+    "PreToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [{
+          "type": "command",
+          "command": "CMD=$(cat | jq -r '.tool_input.command // empty') && echo \\"$CMD\\" | grep -qiE 'rm\\\\s+-rf\\\\s+/|DROP\\\\s+(TABLE|DATABASE)|curl.*\\\\|\\\\s*sh|:(){ :|:& };:|git\\\\s+push.*--force(?!-with-lease)|ch(mod|own).*-R\\\\s+/|npm\\\\s+publish(?!.*--dry-run)|(api[_-]?key|secret|token|password)\\\\s*[:=]|AKIA[0-9A-Z]{16}|BEGIN.*PRIVATE\\\\s+KEY|;\\\\s*(DROP|DELETE|ALTER|TRUNCATE)\\\\s+|\\\\.\\\\./\\\\.\\\\./\\\\.\\\\./|nc\\\\s+.*-e|/dev/tcp/|bash\\\\s+-i|curl.*-d.*@|wget.*--post-file' && echo 'Blocked dangerous command' >&2 && exit 2 || true"
+        }]
+      }
+    ]
   }
 }
 \`\`\`
@@ -4889,6 +5179,23 @@ All projects should include a PostCompact hook to restore context after compacti
 Merge this into the settings hooks alongside the PreToolUse and PostToolUse hooks.
+For long-running sessions (>2 hours or >3 compactions), prefer "Full Reset" over re-inject:
+replace the prompt-type PostCompact hook with a command-type hook that pipes CLAUDE.md + SPRINT.md + DECISIONS.md content directly into additionalContext.
+## Memory Persistence Hooks
+For projects with multi-session workflows, include SessionStart/End hooks that persist context to \`.claude/memory.json\`:
+- **SessionEnd:** Save recent decisions, sprint status, and known gotchas to \`.claude/memory.json\`
+- **SessionStart:** Load \`.claude/memory.json\` and inject as additionalContext
+This ensures accumulated project knowledge survives session boundaries.
+## Persistence Routing Hook
+For code projects with autonomy level 3+, include a \`UserPromptSubmit\` hook that routes complex tasks through \`/project:persist\`.
+Generate \`.claude/hooks/persist-router.mjs\` (the routing logic) and add a \`UserPromptSubmit\` hook entry in settings.json that invokes it.
+Also add \`"persistence_routing": "auto"\` (level 3-4) or \`"persistence_routing": "manual"\` (level 1-2) to settings.json.
 ## For Code Projects, Additionally Include
 - \`/project:plan\` command (plan before coding)
@@ -4898,7 +5205,7 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
 - \`/project:status\` command (live git status, recent commits, SPRINT.md overview using ! prefix)
 - \`/project:fix\` command (takes $ARGUMENTS as issue number, plans fix, implements, tests, commits)
 - \`/project:sprint\` command (define acceptance criteria before coding, writes to docs/SPRINT.md)
-- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
+- \`/project:develop\` command (full development pipeline \u2014 orchestrates @architect \u2192 @planner \u2192 @implementer \u2192 @verifier \u2192 @fixer \u2192 @grill \u2192 @doc-updater through spec, plan, TDD implement, review, and doc update phases). Phase 4 (Verify) MUST validate EACH acceptance criterion from docs/SPRINT.md individually, reporting PASS/FAIL per item as a contract scorecard. MUST include a Phase 7 "Completion Gate" that runs a Completion Verification checklist before marking the feature done: re-read original requirements, confirm each is met with evidence, run test suite + lint/typecheck, review git diff for unexpected changes or debug artifacts, answer 3 perspective questions (test engineer, code reviewer, requesting user). If ANY check fails, loop back to fix before completing.
 - A TDD skill using the 3-phase isolation pattern (RED \u2192 GREEN \u2192 REFACTOR):
   - RED: Write failing test only. Verify it FAILS.
   - GREEN: Write MINIMUM code to pass. Nothing extra.
@@ -4908,16 +5215,31 @@ Merge this into the settings hooks alongside the PreToolUse and PostToolUse hook
   - \`@qa-orchestrator\` (sonnet) \u2014 delegates to linter and e2e-tester, compiles QA report
   - \`@linter\` (haiku) \u2014 runs formatters, linters, security scanners
   - \`@e2e-tester\` (sonnet, only when Playwright is in tools) \u2014 browser-based QA via Playwright
-- Development pipeline agents (used by /project:develop):
-  - \`@architect\` (opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md
-  - \`@planner\` (opus) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
-  - \`@implementer\` (sonnet) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
-  - \`@fixer\` (sonnet) \u2014 targeted bug fixing from verifier/review feedback
-  - \`@doc-updater\` (haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
-- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md, does NOT start coding until confirmed)
+- A "Model Selection" section in generated agents:
+  \`\`\`
+  ## Model Selection (all agents)
+  - Haiku: simple file edits, linting, formatting, doc updates (<50 lines changed)
+  - Sonnet: implementation, testing, debugging, code review (50-500 lines)
+  - Opus: architecture decisions, spec writing, complex refactors (>500 lines or cross-cutting)
+  Default: Sonnet. Only escalate to Opus when the task involves multi-file architecture or ambiguous requirements.
+  \`\`\`
+- Development pipeline agents (used by /project:develop). Each agent should include a modelRouting field in its YAML frontmatter:
+  - \`@architect\` (default: opus) \u2014 conducts spec interview with user, writes confirmed spec to docs/SPRINT.md with numbered acceptance criteria. Your spec is a CONTRACT \u2014 the verifier will check every criterion. Vague criteria = guaranteed rework.
+  - \`@planner\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 reads spec and codebase, creates step-by-step implementation plan in docs/PLAN.md
+  - \`@implementer\` (default: sonnet, escalate to opus for cross-cutting changes) \u2014 TDD-focused implementation, writes failing tests then minimum code to pass
+  - \`@fixer\` (default: sonnet, use haiku for single-file fixes) \u2014 targeted bug fixing from verifier/review feedback
+  - \`@doc-updater\` (default: haiku) \u2014 extracts decisions and learnings from completed work, updates docs/DECISIONS.md and docs/LEARNINGS.md
+- \`/project:spec\` command (interview-based spec creation \u2014 asks 5-8 questions one at a time, writes structured spec to docs/SPRINT.md with ## Acceptance Criteria containing 3-8 numbered, testable conditions. Each criterion must be independently verifiable. Does NOT start coding until confirmed)
 - \`/project:prove\` command (runs tests, shows git diff vs main, rates confidence HIGH/MEDIUM/LOW with evidence)
 - \`/project:grill\` command (adversarial code review \u2014 challenges each change with "why this approach?", "what if X input?", rates BLOCKER/SHOULD-FIX/NITPICK, blocks until BLOCKERs resolved)
 - \`/project:reset\` command (reads DECISIONS.md and LEARNINGS.md, proposes clean restart, stashes current work, implements elegant solution)
+- \`/project:persist\` command (persistent execution loop \u2014 reads acceptance criteria from docs/SPRINT.md, works criterion-by-criterion with structured progress tracking in .claude/progress.json, auto-retries on verification failure up to 3 times per criterion, delegates to @grill for review gate before completion, resumes from progress.json if session was interrupted). The command protocol:
+  1. Load or initialize .claude/progress.json from docs/SPRINT.md numbered acceptance criteria
+  2. For each incomplete criterion: implement, run verification (build/test/typecheck/lint), mark PASSED or retry (max 3 attempts per criterion, mark BLOCKED after 3 failures)
+  3. After all criteria attempted: if any BLOCKED report which and why; if all PASSED proceed to review gate
+  4. Review gate: delegate to @grill for adversarial review; fix blockers if found (max 1 fix cycle)
+  5. Persist state: write final progress.json; include progress summary in memory.json for session resume
+  Resume protocol: when progress.json exists, skip PASSED criteria, resume from first non-PASSED criterion, carry forward failure notes from prior attempts.
 ## For Research Projects, Additionally Include
@@ -4962,7 +5284,7 @@ Return ONLY valid JSON matching this structure:
 \`\`\`json
 {
   "claude_md": "Full CLAUDE.md content (under 150 lines)",
-  "commands": { "help": "...", "develop": "...", "status": "...", "fix": "...", "sprint": "...", "spec": "...", "prove": "...", "grill": "...", "reset": "..." },
+  "commands": { "help": "...", "develop": "...", "status": "...", "fix": "...", "sprint": "...", "spec": "...", "prove": "...", "grill": "...", "reset": "...", "persist": "..." },
   "rules": { "continuity": "...", "security": "..." },
   "agents": { "architect": "...", "planner": "...", "implementer": "...", "fixer": "...", "doc-updater": "...", "qa-orchestrator": "...", "linter": "...", "e2e-tester": "..." },
   "skills": { "skill-name/SKILL": "..." },
@@ -5067,6 +5389,12 @@ At the start of every session, before doing ANY work:
 This saves 2-5 exploratory turns. Never ask "what files are here?" \u2014 look first.
+## Sprint Contract
+Before implementing, confirm acceptance criteria exist in docs/SPRINT.md.
+Each criterion must be numbered, testable, and independently verifiable.
+After implementing, verify EACH criterion individually. Do not mark done until all pass.
 ## Completion Standards
 Never mark a task "done" without running the Completion Verification checklist.
@@ -5096,6 +5424,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
 15. "Engineering Standards", "Tool Usage Policy", and "Code Philosophy" sections in CLAUDE.md
 16. A "First Turn Protocol" section in CLAUDE.md (orient before working: pwd, ls, git status, check relevant runtimes, read task files)
 17. A "Completion Standards" section in CLAUDE.md (never mark done without verifying: requirements met, tests passing, no debug artifacts, reviewed from 3 perspectives)
+18. A "Sprint Contract" section in CLAUDE.md (confirm acceptance criteria exist before implementing, verify each criterion after)
 ## Tool Selection Rules
@@ -5114,7 +5443,7 @@ Do not add generic filler. Every line must be specific to the user's workflow.
 - Skills: maximum 3. Only include directly relevant ones.
 - Agents: maximum 5. Orchestration pipeline (/develop) agents.
 - Commands: no limit (loaded on demand, zero context cost).
-- Hooks: maximum 4 (auto-format, block-destructive, PostCompact, plus one contextual).
+- Hooks: maximum 5 (auto-format, block-destructive, PostCompact, memory-persistence, plus one contextual).
 If the workflow doesn't clearly need a tool, DO NOT include it.
 Each MCP server costs 500-2000 tokens of context window.
@@ -5143,7 +5472,8 @@ Return ONLY valid JSON matching this structure:
     },
     "commands": {
       "help": "markdown content for /project:help",
-      "develop": "markdown content for /project:develop"
+      "develop": "markdown content for /project:develop",
+      "persist": "markdown content for /project:persist"
     },
     "rules": {
       "continuity": "markdown content for continuity rule",
@@ -6220,6 +6550,9 @@ function applyAutonomyLevel(spec) {
   const agents = spec.harness.agents ?? {};
   const docs = spec.harness.docs ?? {};
   const settings = spec.harness.settings ?? {};
+  if (!("persistence_routing" in settings)) {
+    settings.persistence_routing = level >= 3 ? "auto" : "manual";
+  }
   if (level >= 1) {
     if (!("tour" in commands)) {
       commands.tour = TOUR_COMMAND;
@@ -6291,6 +6624,85 @@ var ENV_LOADER_HOOK = {
     command: 'if [ -f .env ] && [ -n "$CLAUDE_ENV_FILE" ]; then grep -v "^#" .env | grep -v "^$" | grep "=" >> "$CLAUDE_ENV_FILE"; fi'
   }]
 };
+var PERSIST_ROUTER_TEMPLATE = `import { readFileSync } from 'fs';
+const input = JSON.parse(readFileSync('/dev/stdin', 'utf8'));
+const prompt = (input.prompt ?? '').trim();
+// Pass-through patterns (fast exit)
+const PASSTHROUGH = /^(what|how|why|where|when|can you|does|is |show me|find |search |list |\\/project:)/i;
+const SINGLE_FILE = /^(edit|fix the typo|update the comment|change the|rename) .{3,60}$/i;
+if (PASSTHROUGH.test(prompt) || SINGLE_FILE.test(prompt) || prompt.length < 20) {
+  process.stdout.write(JSON.stringify({ continue: true }));
+  process.exit(0);
+}
+// Check config for routing mode
+let routingMode = 'auto';
+try {
+  const settings = JSON.parse(readFileSync('.claude/settings.json', 'utf8'));
+  routingMode = settings.persistence_routing ?? 'auto';
+} catch { /* default to auto */ }
+if (routingMode === 'off') {
+  process.stdout.write(JSON.stringify({ continue: true }));
+  process.exit(0);
+}
+// Complexity signals
+const signals = [];
+if (/\\b(then|after that|and also|next|finally|step \\d|first .* then)\\b/i.test(prompt)) {
+  signals.push('multi-step');
+}
+if (/\\b(add|implement|build|create|integrate|set up)\\b.*\\b(feature|auth|api|endpoint|page|component|module|service|database|migration)\\b/i.test(prompt)) {
+  signals.push('feature-scope');
+}
+if (/\\b(migrate|convert|replace|upgrade|refactor|rewrite|restructure)\\b/i.test(prompt)) {
+  signals.push('refactor-scope');
+}
+if (/\\b(when .* happens|steps to reproduce|broken|crash|regression|fails when)\\b/i.test(prompt)) {
+  signals.push('bug-with-repro');
+}
+if (/\\b(persist|keep working|don't stop|until done|until .* pass)\\b/i.test(prompt)) {
+  signals.push('explicit');
+}
+if (prompt.split(/\\s+/).length > 50) {
+  signals.push('long-prompt');
+}
+const shouldRoute = routingMode === 'manual'
+  ? signals.includes('explicit')
+  : signals.length >= 2 || signals.includes('explicit');
+if (shouldRoute) {
+  process.stdout.write(JSON.stringify({
+    continue: true,
+    hookSpecificOutput: {
+      hookEventName: 'UserPromptSubmit',
+      additionalContext: [
+        'PERSISTENCE ROUTING: This task has complexity signals (' + signals.join(', ') + ').',
+        'Execute this using the /project:persist workflow:',
+        '1. Ensure acceptance criteria exist in docs/SPRINT.md (create from this prompt if needed)',
+        '2. Initialize .claude/progress.json',
+        '3. Work criterion-by-criterion until all pass',
+        '4. Run review gate before marking complete',
+      ].join('\\n'),
+    },
+  }));
+} else {
+  process.stdout.write(JSON.stringify({ continue: true }));
+}
+`;
+var PERSIST_ROUTER_HOOK = {
+  matcher: "",
+  hooks: [{
+    type: "command",
+    command: 'node "$CLAUDE_PROJECT_DIR/.claude/hooks/persist-router.mjs"',
+    timeout: 5
+  }]
+};
 function resolveSettings(spec, options) {
   const settings = spec.harness.settings;
   const base = settings && Object.keys(settings).length > 0 ? { ...settings } : {};
@@ -6304,6 +6716,13 @@ function resolveSettings(spec, options) {
     hooks.SessionStart = sessionStart;
     base.hooks = hooks;
   }
+  if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
+    const hooks = base.hooks ?? {};
+    const userPromptSubmit = hooks.UserPromptSubmit ?? [];
+    userPromptSubmit.push(PERSIST_ROUTER_HOOK);
+    hooks.UserPromptSubmit = userPromptSubmit;
+    base.hooks = hooks;
+  }
   const hasIntentHooks = spec.harness.hooks && Object.keys(spec.harness.hooks).length > 0;
   if (hasIntentHooks) {
     const hooks = base.hooks ?? {};
@@ -6395,6 +6814,9 @@ function buildFileMap(spec, options) {
       files.set(".claude/hooks/intent-log.jsonl", "");
     }
   }
+  if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
+    files.set(".claude/hooks/persist-router.mjs", PERSIST_ROUTER_TEMPLATE);
+  }
   return files;
 }
 async function writeEnvironment(spec, targetDir, options) {
@@ -6465,6 +6887,11 @@ async function writeEnvironment(spec, targetDir, options) {
       written.push(".claude/hooks/intent-log.jsonl");
     }
   }
+  if (isCodeProject(spec) && (spec.autonomy_level ?? 1) >= 3) {
+    const p = path5.join(claudeDir, "hooks", "persist-router.mjs");
+    await writeFile(p, PERSIST_ROUTER_TEMPLATE);
+    written.push(".claude/hooks/persist-router.mjs");
+  }
   return written;
 }
 function summarizeSpec(spec, registry) {
@@ -8163,14 +8590,20 @@ var EVAL_TEMPLATES = {
     name: "Intent Routing",
     description: "Test that natural language prompts route to the correct workflow command via intent hooks",
     bestFor: ["feature-development", "full-stack", "api-building"]
+  },
+  "persistence-completion": {
+    id: "persistence-completion",
+    name: "Persistence Completion",
+    description: "Can the agent complete a multi-criterion task using the persistence loop?",
+    bestFor: ["feature-development", "full-stack", "api-building", "maintenance"]
   }
 };
 function selectTemplatesForWorkflow(workflowType) {
   const mapping = {
-    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance", "intent-routing"],
-    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
-    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence"],
-    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance"],
+    "feature-development": ["add-feature", "test-writing", "convention-adherence", "workflow-compliance", "intent-routing", "persistence-completion"],
+    "api-building": ["add-feature", "fix-bug", "test-writing", "convention-adherence", "persistence-completion"],
+    "full-stack": ["add-feature", "fix-bug", "test-writing", "convention-adherence", "persistence-completion"],
+    "maintenance": ["fix-bug", "refactor", "test-writing", "rule-compliance", "persistence-completion"],
     "debugging": ["fix-bug", "test-writing", "rule-compliance"],
     "qa": ["fix-bug", "test-writing", "add-feature", "workflow-compliance"],
     "architecture": ["refactor", "test-writing", "config-change", "convention-adherence"],
@@ -8191,6 +8624,7 @@ IMPORTANT: For harness-aware templates (convention-adherence, workflow-complianc
 - convention-adherence: Task must require following specific conventions from CLAUDE.md (naming, file structure, patterns). Judge by whether output matches the conventions.
 - workflow-compliance: Task must require using project slash commands or workflow steps defined in .claude/commands/. Judge by whether the agent followed the defined workflow.
 - rule-compliance: Task must create a scenario where .claude/rules/ content is relevant. Judge by whether the agent respected all rules.
+- persistence-completion: Task MUST have 3+ acceptance criteria that require sequential implementation. The task description should be a realistic feature request \u2014 the agent must parse it into criteria. Judge by: (a) all criteria met (progress.json status: complete), (b) structured tracking used (progress.json exists with 3+ criteria), (c) tests pass, (d) review gate executed (progress.json review field present).
 These harness-aware tasks are critical \u2014 they test whether the .claude/ environment actually improves agent behavior.
@@ -8860,7 +9294,7 @@ async function applyEvolution(workspacePath, projectRoot, targetIteration, pbt)
 // src/commands/evolve.ts
 var DEFAULT_CONFIG = {
   model: "claude-sonnet-4-6",
-  proposerModel: "claude-opus-4-6",
+  proposerModel: "claude-sonnet-4-6",
   scorer: "pass-fail",
   maxIterations: 5,
   parallelTasks: 1,