@specmarket/cli 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,30 +11,64 @@ import {
   RUN_DEFAULTS,
   EXIT_CODES,
   RUNNER_ID,
+  MODEL_COST_PER_TOKEN,
+  DEFAULT_HARNESS,
 } from '@specmarket/shared';
 import createDebug from 'debug';
+import { generateMetaInstructions, META_INSTRUCTION_FILENAME } from './meta-instructions.js';
+import { detectSpecFormat } from './format-detection.js';
 
 const debug = createDebug('specmarket:runner');
 const execAsync = promisify(exec);
 
 /**
- * Pre-flight check: Verifies that Claude CLI is installed before attempting to run a spec.
- * Throws an error with installation instructions if claude is not found.
+ * Pre-flight check: Verifies that the selected harness CLI is installed.
+ * Throws an error with installation instructions if the binary is not found.
+ *
+ * @param harness - The harness to check. Defaults to 'claude-code'.
  */
-export async function checkClaudeCliInstalled(): Promise<void> {
+export async function checkClaudeCliInstalled(harness?: string): Promise<void> {
+  const h = harness ?? DEFAULT_HARNESS;
+  const binaryName = HARNESS_BINARY[h] ?? 'claude';
   try {
-    // Use 'which claude' to check if claude is in PATH
-    await execAsync('which claude');
+    await execAsync(`which ${binaryName}`);
   } catch {
+    const installHint = HARNESS_INSTALL_HINT[h] ?? `Install ${binaryName} and ensure it is in your PATH.`;
    throw new Error(
-      `Claude CLI is not installed or not in your PATH.\n\n` +
-        `Installation instructions:\n` +
-        `  npm install -g @anthropic-ai/claude-code\n\n` +
-        `Or visit: https://www.anthropic.com/claude-code\n`
+      `Harness "${h}" binary "${binaryName}" is not installed or not in your PATH.\n\n` +
+        `${installHint}\n`
    );
  }
 }
 
+/** CLI binary name for each harness */
+const HARNESS_BINARY: Record<string, string> = {
+  'claude-code': 'claude',
+  'codex': 'codex',
+  'opencode': 'opencode',
+};
+
+/** Install hints for each harness */
+const HARNESS_INSTALL_HINT: Record<string, string> = {
+  'claude-code':
+    'Installation instructions:\n  npm install -g @anthropic-ai/claude-code\n\nOr visit: https://www.anthropic.com/claude-code',
+  'codex':
+    'Installation instructions:\n  npm install -g @openai/codex\n\nOr visit: https://github.com/openai/codex',
+  'opencode':
+    'Installation instructions:\n  npm install -g opencode-ai\n\nOr visit: https://opencode.ai',
+};
+
+/**
+ * A single steering action logged during the run.
+ * Written to `steering-log.json` in the run directory on completion.
+ */
+export interface SteeringEntry {
+  /** ISO-8601 timestamp when the message was injected */
+  timestamp: string;
+  /** User-provided steering content */
+  content: string;
+}
+
 export interface RunOptions {
   maxLoops?: number;
   maxBudgetUsd?: number;
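Review note: a minimal sketch of how the harness-aware pre-flight behaves. Only `checkClaudeCliInstalled`, the `HARNESS_BINARY` lookup, and the `claude` fallback come from this diff; the loop, the import path, and the top-level await are illustrative assumptions.

```ts
// Hypothetical caller — exercises the lookup path only.
import { checkClaudeCliInstalled } from '@specmarket/cli'; // export path assumed

for (const harness of ['claude-code', 'codex', 'opencode']) {
  try {
    // 'claude-code' resolves to `which claude`; unknown names fall back to 'claude'.
    await checkClaudeCliInstalled(harness);
    console.log(`${harness}: binary found`);
  } catch (err) {
    console.error((err as Error).message); // message includes the per-harness install hint
  }
}
```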
@@ -43,6 +77,26 @@ export interface RunOptions {
   resumeRunId?: string;
   outputDir?: string;
   cliVersion: string;
+  /** Spec format override. When omitted, auto-detected from specDir. */
+  specFormat?: string;
+  /**
+   * Agentic harness to use for execution.
+   * One of: 'claude-code' (default), 'codex', 'opencode'.
+   */
+  harness?: string;
+  /**
+   * Existing working directory to run in instead of a fresh sandbox.
+   * When set, spec files are NOT copied — the agent operates directly on this directory.
+   * Enables `environmentType: 'existing'` in the run report.
+   */
+  workdir?: string;
+  /**
+   * Shared queue for steering messages typed by the user during the run.
+   * The caller pushes messages here; the runner drains the queue before each
+   * harness execution and injects the messages into the meta-instructions file.
+   * Each drained message increments `steeringActionCount` in the run report.
+   */
+  steeringQueue?: string[];
 }
 
 export interface RunResult {
@@ -56,15 +110,19 @@ export interface RunResult {
  * The loop:
  * 1. Creates a sandboxed working directory under ~/.specmarket/runs/<run-id>/
  * 2. Copies spec files into the working directory
- * 3. Initializes git for diff tracking
- * 4. Executes: `cat PROMPT.md | claude --print` in a loop
- * 5. After each loop: captures tokens, duration, git diff
- * 6. Checks for completion conditions:
+ * 3. Detects spec format (specmarket | speckit | bmad | ralph | custom) and generates
+ *    `.specmarket-runner.md` format-aware meta-instructions for the AI agent.
+ * 4. Initializes git for diff tracking
+ * 5. Executes: `cat .specmarket-runner.md | claude --print` in a loop.
+ *    The meta-instructions tell the agent which files to read, how to find tasks,
+ *    how to mark completion, and when the run is done — regardless of spec format.
+ * 6. After each loop: captures tokens, duration, git diff
+ * 7. Checks for completion conditions:
  *    - SUCCESS: TASKS.md empty + tests pass + all SUCCESS_CRITERIA.md criteria met
  *    - STALL: 3 consecutive loops with no git diff
  *    - FAILURE: 10 consecutive loops with same failing output
  *    - BUDGET: total tokens > 2x estimated_tokens
- * 7. Writes run-report.json on completion
+ * 8. Writes run-report.json on completion
  *
  * SECURITY: Always prints sandboxing recommendation before starting.
  */
@@ -88,12 +146,20 @@ export async function runSpec(
       ? (opts.maxBudgetUsd / specYaml.estimatedCostUsd) * specYaml.estimatedTokens
       : specYaml.estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER);
 
+  const harness = opts.harness ?? DEFAULT_HARNESS;
   const runId = opts.resumeRunId ?? randomUUID();
   const runsBaseDir = join(homedir(), CONFIG_PATHS.RUNS_DIR);
-  const runDir = opts.outputDir ?? join(runsBaseDir, runId);
 
-  await mkdir(runDir, { recursive: true });
-  debug('Run directory: %s', runDir);
+  // --workdir: run in the caller-provided existing directory (no file copying).
+  // Without --workdir: create a fresh sandbox under ~/.specmarket/runs/<run-id>/.
+  const usingWorkdir = opts.workdir !== undefined;
+  const runDir = opts.workdir ?? opts.outputDir ?? join(runsBaseDir, runId);
+  const environmentType: 'fresh' | 'existing' = usingWorkdir ? 'existing' : 'fresh';
+
+  if (!usingWorkdir) {
+    await mkdir(runDir, { recursive: true });
+  }
+  debug('Run directory: %s (environmentType=%s, harness=%s)', runDir, environmentType, harness);
 
   if (opts.dryRun) {
     debug('Dry run mode — skipping execution');
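The budget derivation at the top of this hunk converts a dollar cap into a token budget using the spec's estimated cost density. A worked example with assumed numbers:

```ts
// Illustrative values only — estimatedTokens/estimatedCostUsd come from spec.yaml.
const estimatedTokens = 500_000;
const estimatedCostUsd = 4.5;
const maxBudgetUsd = 9; // --max-budget-usd flag

// (9 / 4.5) * 500_000 = 1_000_000 tokens allowed for this run.
const tokenBudget = (maxBudgetUsd / estimatedCostUsd) * estimatedTokens;

// Without --max-budget-usd, the default is estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER
// (2x, per the BUDGET completion condition in the doc comment above).
```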
@@ -102,6 +168,11 @@ export interface RunOptions {
       specVersion: specYaml.version,
       model: opts.model ?? specYaml.minModel,
       runner: specYaml.runner,
+      harness,
+      specFormat: opts.specFormat,
+      environmentType,
+      steeringActionCount: 0,
+      isPureRun: false,
       loopCount: 0,
       totalTokens: 0,
       totalCostUsd: 0,
@@ -129,9 +200,19 @@
       totalTokens = existingReport.totalTokens;
       debug('Resuming from iteration %d with %d tokens carried over', startIteration, totalTokens);
     }
+    // Ensure meta-instructions exist in the run dir (may be missing for runs
+    // created before this feature was added).
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
+  } else if (usingWorkdir) {
+    // --workdir: the directory already has the spec files. Just generate/refresh
+    // the meta-instructions so the agent knows what format it is working with.
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
+    // Initialize git if not already a repo (best-effort — may be an existing git repo)
+    await initGit(runDir);
   } else {
-    // Fresh run: copy spec files and initialize git for diff tracking
+    // Fresh run: copy spec files, generate meta-instructions, initialize git.
     await copySpecFiles(specDir, runDir);
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
     await initGit(runDir);
   }
 
@@ -140,6 +221,14 @@
   let consecutiveNoChange = 0;
   let lastOutput = '';
   let consecutiveSameOutput = 0;
+  const steeringLog: SteeringEntry[] = [];
+  let steeringActionCount = 0;
+  /**
+   * Counts how many times the post-task test phase has detected failures after
+   * all TASKS.md items were checked. When this reaches TEST_PHASE_MAX_ITERATIONS,
+   * the run is declared a failure — the agent could not fix the tests.
+   */
+  let testPhaseAttempts = 0;
 
   let finalStatus: RunReport['status'] = 'failure';
   let successCriteriaResults: SuccessCriterionResult[] = [];
@@ -149,11 +238,21 @@
 
     const iterStart = Date.now();
 
-    // Execute: cat PROMPT.md | claude --print
-    const result = await executeClaudeLoop(runDir, opts.model);
+    // Drain steering queue and inject any pending messages before this iteration.
+    // Messages are appended to the meta-instructions file so the harness sees them.
+    const pendingMessages = opts.steeringQueue ? opts.steeringQueue.splice(0) : [];
+    if (pendingMessages.length > 0) {
+      await injectSteeringMessages(runDir, pendingMessages, steeringLog);
+      steeringActionCount += pendingMessages.length;
+      debug('Injected %d steering message(s); total steeringActionCount=%d', pendingMessages.length, steeringActionCount);
+    }
+
+    // Execute via the selected harness
+    const result = await executeHarness(runDir, harness, opts.model);
 
     const iterDuration = Date.now() - iterStart;
-    const tokensThisLoop = parseTokensFromOutput(result.stdout);
+    const activeModel = opts.model ?? specYaml.minModel;
+    const tokensThisLoop = parseTokensFromOutput(result.stdout, activeModel);
     totalTokens += tokensThisLoop;
 
     // Capture git diff
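How a caller is expected to feed `steeringQueue`: only the queue contract (push strings; the runner drains with `splice(0)` before each iteration) comes from this diff. The readline wiring, the import path, and the exact `runSpec` call below are assumptions, since the full signature is outside this hunk.

```ts
// Hypothetical CLI wiring: forward stdin lines into the runner's steering queue.
import * as readline from 'node:readline';
import { runSpec } from '@specmarket/cli'; // export path assumed

const steeringQueue: string[] = [];
readline.createInterface({ input: process.stdin }).on('line', (line) => {
  if (line.trim()) steeringQueue.push(line.trim()); // picked up before the next iteration
});

// specDir/specYaml as loaded elsewhere in the CLI (shapes assumed for illustration).
declare const specDir: string;
declare const specYaml: { version: string; runner: string; minModel: string };

await runSpec(specDir, specYaml, { cliVersion: '0.0.6', steeringQueue });
```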
@@ -215,17 +314,65 @@
       lastOutput = currentOutputHash;
     }
 
-    // SUCCESS check
-    const completionCheck = await checkCompletion(runDir);
-    if (completionCheck.isComplete) {
-      debug('Success criteria met at iteration %d', i);
-      successCriteriaResults = completionCheck.results;
-      finalStatus = 'success';
-      break;
-    }
+    // ---- Post-task test phase ----
+    //
+    // When all TASKS.md items are checked, the runner takes over test execution:
+    //   1. Run the test suite and capture output.
+    //   2. If tests fail: write specific fix tasks to TASKS.md and TEST_FAILURES.md,
+    //      then continue the main loop so the agent can address them.
+    //   3. If tests pass: check SUCCESS_CRITERIA.md — if all met, declare success.
+    //
+    // This creates a test→fix→retest cycle driven by the runner, ensuring the
+    // run is only declared successful when everything is actually green.
+    const tasksComplete = await isFixPlanEmpty(runDir);
+    if (tasksComplete) {
+      const testResult = await runTestsWithOutput(runDir);
+
+      if (!testResult.passed) {
+        testPhaseAttempts++;
+        debug(
+          'Post-task test phase attempt %d/%d: tests failing, writing fix tasks',
+          testPhaseAttempts,
+          RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
+        );
+
+        if (testPhaseAttempts >= RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS) {
+          debug(
+            'Test phase exceeded max iterations (%d), declaring failure',
+            RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
+          );
+          successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+          finalStatus = 'failure';
+          break;
+        }
+
+        // Write actionable fix tasks so the next harness iteration has specific work.
+        await writeTestFixTasks(runDir, testResult.output);
+        await stageAllChanges(runDir);
+        successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+        // Continue main loop — the harness will pick up the new fix tasks.
+      } else {
+        // Tests pass — evaluate SUCCESS_CRITERIA.md for the final gate.
+        const criteriaResults = await evaluateSuccessCriteria(runDir);
+        successCriteriaResults = criteriaResults;
+
+        if (criteriaResults.every((r) => r.passed)) {
+          debug('All tasks done, tests pass, criteria met at iteration %d', i);
+          finalStatus = 'success';
+          break;
+        }
 
-    // Update partial success criteria results for reporting
-    successCriteriaResults = completionCheck.results;
+        // Success criteria not yet all checked — continue loop.
+        // The agent must update SUCCESS_CRITERIA.md as criteria are satisfied.
+        debug(
+          'Tests pass but not all criteria met at iteration %d; continuing',
+          i
+        );
+      }
+    } else {
+      // Tasks still pending — update partial results for reporting.
+      successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+    }
   }
 
   // If we exhausted all loops without a status, mark as failure
@@ -233,15 +380,34 @@
     successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
   }
 
+  // Persist steering log if any steering actions occurred during this run
+  if (steeringLog.length > 0) {
+    await writeFile(
+      join(runDir, 'steering-log.json'),
+      JSON.stringify(steeringLog, null, 2),
+      'utf-8'
+    );
+    debug('Steering log written (%d entries)', steeringLog.length);
+  }
+
   const totalTimeMinutes = (Date.now() - startTime) / 60000;
   const costPerToken = specYaml.estimatedCostUsd / specYaml.estimatedTokens;
   const totalCostUsd = totalTokens * costPerToken;
 
+  // Auto-detect specFormat from the run directory when not provided explicitly
+  const detectedSpecFormat =
+    opts.specFormat ?? (await detectSpecFormat(runDir)).format;
+
   const report: RunReport = {
     runId,
     specVersion: specYaml.version,
     model: opts.model ?? specYaml.minModel,
     runner: specYaml.runner,
+    harness,
+    specFormat: detectedSpecFormat,
+    environmentType,
+    steeringActionCount,
+    isPureRun: finalStatus === 'success' && steeringActionCount === 0,
     loopCount: iterations.length,
     totalTokens,
     totalCostUsd,
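Given the `SteeringEntry` interface and the persistence block above, a run with two steering actions writes a `steering-log.json` shaped like this (values illustrative, import path assumed):

```ts
import type { SteeringEntry } from '@specmarket/cli'; // export path assumed

// What JSON.stringify(steeringLog, null, 2) produces in steering-log.json.
const exampleLog: SteeringEntry[] = [
  { timestamp: '2025-06-01T10:02:11.000Z', content: 'Prefer pnpm over npm' },
  { timestamp: '2025-06-01T10:07:48.000Z', content: 'Skip the flaky e2e suite' },
];
```

Note the report semantics that follow: such a run can still end with status 'success', but `isPureRun` will be false because `steeringActionCount` is non-zero.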
@@ -265,10 +431,83 @@
 
 // ---- Internal helpers ----
 
+/**
+ * Detects the spec format from specDir and writes `.specmarket-runner.md` to runDir.
+ *
+ * Idempotent: if the file already exists in runDir it is overwritten so that
+ * the instructions stay consistent with the detected format.
+ *
+ * @param specDir - Source spec directory (used for format detection + sidecar data)
+ * @param runDir - Sandboxed run directory where the file is written
+ * @param formatOverride - Optional pre-detected format (skips detection when provided)
+ */
+export async function ensureMetaInstructions(
+  specDir: string,
+  runDir: string,
+  formatOverride?: string
+): Promise<void> {
+  const format = formatOverride ?? (await detectSpecFormat(specDir)).format;
+  debug('Generating meta-instructions for format=%s', format);
+  const content = await generateMetaInstructions(specDir, format);
+  await writeFile(join(runDir, META_INSTRUCTION_FILENAME), content, 'utf-8');
+  debug('Meta-instructions written to %s/%s', runDir, META_INSTRUCTION_FILENAME);
+}
+
+/**
+ * Injects pending steering messages into the meta-instructions file for the
+ * current run directory.
+ *
+ * A "## Steering Input" section is appended to `.specmarket-runner.md` so the
+ * agent reads the user's guidance on its next harness invocation. Each call
+ * appends a timestamped section — messages accumulate across iterations so the
+ * agent retains the full steering history.
+ *
+ * Side effects:
+ * - Modifies `.specmarket-runner.md` in runDir (appends steering section)
+ * - Pushes `SteeringEntry` objects into `steeringLog`
+ *
+ * @param runDir - Active run directory containing the meta-instructions file
+ * @param messages - Steering messages to inject (already spliced from the queue)
+ * @param steeringLog - Mutable array collecting all steering entries for this run
+ */
+export async function injectSteeringMessages(
+  runDir: string,
+  messages: string[],
+  steeringLog: SteeringEntry[]
+): Promise<void> {
+  if (messages.length === 0) return;
+
+  const timestamp = new Date().toISOString();
+  const entries: SteeringEntry[] = messages.map((content) => ({ timestamp, content }));
+  steeringLog.push(...entries);
+
+  const steeringSection = [
+    '',
+    `## Steering Input (injected at ${timestamp})`,
+    '',
+    'The user has provided the following steering instructions. Incorporate them into your current work:',
+    '',
+    ...messages.map((m) => `> ${m}`),
+    '',
+  ].join('\n');
+
+  const metaPath = join(runDir, META_INSTRUCTION_FILENAME);
+  try {
+    const existing = await readFile(metaPath, 'utf-8');
+    await writeFile(metaPath, existing + steeringSection, 'utf-8');
+  } catch {
+    // Meta-instructions file missing — create it with just the steering section
+    await writeFile(metaPath, steeringSection, 'utf-8');
+  }
+
+  debug('injectSteeringMessages: appended %d message(s) to %s', messages.length, META_INSTRUCTION_FILENAME);
+}
+
 async function copySpecFiles(srcDir: string, destDir: string): Promise<void> {
   const { cp } = await import('fs/promises');
   await cp(srcDir, join(destDir, 'spec'), { recursive: true });
-  // Also copy directly to destDir so PROMPT.md is at root
+  // Also copy directly to destDir so spec files are accessible at the root
+  // of the run directory alongside the generated meta-instructions.
   await cp(srcDir, destDir, { recursive: true, force: false });
   debug('Spec files copied from %s to %s', srcDir, destDir);
 }
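For reviewers tracing `injectSteeringMessages`: given `messages = ['Use pnpm']`, the section appended to `.specmarket-runner.md` renders as below. This follows directly from the `steeringSection` array join above; the timestamp is illustrative.

```
## Steering Input (injected at 2025-06-01T10:02:11.000Z)

The user has provided the following steering instructions. Incorporate them into your current work:

> Use pnpm
```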
@@ -306,27 +545,62 @@ interface ExecuteResult {
   exitCode: number;
 }
 
-async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteResult> {
-  return new Promise((resolve) => {
-    const args = ['--print', '--output-format', 'json'];
-    if (model) {
-      args.push('--model', model);
+/**
+ * Builds the shell command string for the given harness.
+ *
+ * All harnesses receive the same meta-instructions file via stdin so they
+ * know what format they are working with and what tasks to execute.
+ *
+ * - claude-code: `cat .specmarket-runner.md | claude --print --output-format json [--model <m>]`
+ * - codex:       `cat .specmarket-runner.md | codex`
+ * - opencode:    `cat .specmarket-runner.md | opencode`
+ */
+function buildHarnessCommand(harness: string, model?: string): string {
+  switch (harness) {
+    case 'claude-code': {
+      const args = ['--print', '--output-format', 'json'];
+      if (model) args.push('--model', model);
+      return `cat ${META_INSTRUCTION_FILENAME} | claude ${args.join(' ')}`;
     }
+    case 'codex':
+      // Codex CLI reads from stdin; model selection is via OPENAI_MODEL env or its own flags
+      return `cat ${META_INSTRUCTION_FILENAME} | codex`;
+    case 'opencode':
+      // opencode reads from stdin
+      return `cat ${META_INSTRUCTION_FILENAME} | opencode`;
+    default:
+      // Unknown harness — fall back to claude-code behaviour
+      debug('Unknown harness "%s" — falling back to claude-code', harness);
+      return `cat ${META_INSTRUCTION_FILENAME} | claude --print --output-format json`;
+  }
+}
 
-    // Execute: cat PROMPT.md | claude --print --output-format json
-    // Using --output-format json gives us structured output with token usage metadata.
-    const proc = spawn('sh', ['-c', `cat PROMPT.md | claude ${args.join(' ')}`], {
+/**
+ * Executes a single loop iteration via the specified harness.
+ *
+ * The meta-instructions file (`.specmarket-runner.md`) is piped into the harness
+ * binary as stdin. The harness is expected to read the instructions, perform the
+ * requested work inside the run directory, and exit with code 0 on success.
+ */
+async function executeHarness(dir: string, harness: string, model?: string): Promise<ExecuteResult> {
+  const cmd = buildHarnessCommand(harness, model);
+  debug('executeHarness: %s (harness=%s)', cmd, harness);
+
+  return new Promise((resolve) => {
+    const proc = spawn('sh', ['-c', cmd], {
       cwd: dir,
-      stdio: ['inherit', 'pipe', 'pipe'],
+      // stdin is 'ignore': the harness reads its instructions from the meta-instructions file
+      // via `cat .specmarket-runner.md | <harness>`, not from parent stdin.
+      // Keeping stdin detached from the parent lets the CLI read steering messages
+      // from process.stdin without conflict.
+      stdio: ['ignore', 'pipe', 'pipe'],
     });
 
     let stdout = '';
-    let stderr = '';
     proc.stdout?.on('data', (chunk: Buffer) => {
       stdout += chunk.toString();
     });
     proc.stderr?.on('data', (chunk: Buffer) => {
-      stderr += chunk.toString();
       // Write stderr to process stderr for visibility
       process.stderr.write(chunk);
     });
@@ -336,7 +610,7 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
     });
 
     proc.on('error', (err) => {
-      debug('claude spawn error: %O', err);
+      debug('%s spawn error: %O', harness, err);
       resolve({ stdout: '', exitCode: 1 });
     });
   });
@@ -347,15 +621,32 @@
  *
  * Strategy (in priority order):
  * 1. Parse JSON output format (claude --output-format json) which contains
- *    structured metadata including token counts in the response.
+ *    structured metadata including token counts or cost_usd in the response.
+ *    When only cost_usd is available (typical for Claude Code CLI), token count
+ *    is estimated using model-aware pricing constants from MODEL_COST_PER_TOKEN.
+ *    This estimate may deviate ±30% from the actual count depending on the
+ *    input/output token ratio for that specific run.
  * 2. Match known text patterns from Claude Code's output (total_tokens, etc.)
 * 3. Estimate from output length as a last-resort heuristic (~4 chars per token).
 *
+ * @param output - Raw stdout from the Claude CLI invocation
+ * @param model - Model identifier (e.g. "claude-haiku-4-5", "claude-opus-4-6").
+ *   Used to select the correct pricing tier for cost→token estimation.
+ *   Defaults to Sonnet-tier pricing if omitted or unrecognised.
+ *
 * Returns 0 only if the output is empty (no meaningful work was done).
 */
-function parseTokensFromOutput(output: string): number {
+export function parseTokensFromOutput(output: string, model?: string): number {
   if (!output || output.trim().length === 0) return 0;
 
+  // Resolve cost-per-token for this model (case-insensitive substring match)
+  const modelLower = (model ?? '').toLowerCase();
+  const costPerToken = modelLower.includes('haiku')
+    ? MODEL_COST_PER_TOKEN.haiku
+    : modelLower.includes('opus')
+      ? MODEL_COST_PER_TOKEN.opus
+      : MODEL_COST_PER_TOKEN.default;
+
   // Strategy 1: Parse JSON output format from claude --output-format json
   // Claude Code JSON output may contain token usage info in the response metadata.
   try {
@@ -386,11 +677,16 @@ function parseTokensFromOutput(output: string): number {
       const output_tokens = parsed.usage?.output_tokens ?? parsed.usage?.completion_tokens ?? 0;
       if (input > 0 || output_tokens > 0) return input + output_tokens;
 
-      // Cost-based estimation (if cost is reported but not tokens)
-      // Haiku: ~$0.25/MTok input, $1.25/MTok output → avg ~$0.75/MTok
-      // Sonnet: ~$3/MTok input, $15/MTok output → avg ~$9/MTok
+      // Cost-based estimation: Claude Code CLI typically reports cost_usd but not
+      // raw token counts. Use model-aware pricing for the best estimate.
       if (typeof parsed.cost_usd === 'number' && parsed.cost_usd > 0) {
-        return Math.round(parsed.cost_usd / 0.000009); // Assume Sonnet pricing
+        debug(
+          'parseTokensFromOutput: using cost_usd=%f with model=%s (costPerToken=%e)',
+          parsed.cost_usd,
+          model ?? 'unknown',
+          costPerToken
+        );
+        return Math.round(parsed.cost_usd / costPerToken);
       }
     }
   } catch {
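The cost→token conversion is easy to sanity-check with numbers. Using the Sonnet-tier average from the removed comment (~$9/MTok, i.e. $0.000009 per token; the actual MODEL_COST_PER_TOKEN values live in @specmarket/shared and are not shown in this diff):

```ts
// Illustrative: a loop that reports cost_usd: 0.027 and no token counts.
const costUsd = 0.027;
const assumedCostPerToken = 0.000009; // Sonnet-tier average (assumed)
const estimated = Math.round(costUsd / assumedCostPerToken); // 3000 tokens
```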
@@ -425,8 +721,8 @@
   }
 
   // Strategy 3: Estimate from output length
-  // Rough heuristic: ~4 characters per token for English text
-  // This is imprecise but better than returning 0 (which breaks budget tracking)
+  // Rough heuristic: ~4 characters per token for English text.
+  // This is imprecise but better than returning 0 (which breaks budget tracking).
   const estimatedTokens = Math.ceil(output.length / 4);
   debug(
     'parseTokensFromOutput: no explicit token count found, estimating %d from %d chars',
@@ -441,40 +737,6 @@ function parseIntComma(s: string): number {
   return parseInt(s.replace(/,/g, ''), 10) || 0;
 }
 
-interface CompletionCheck {
-  isComplete: boolean;
-  results: SuccessCriterionResult[];
-}
-
-async function checkCompletion(dir: string): Promise<CompletionCheck> {
-  // Check 1: TASKS.md should be empty or have only checked items
-  const fixPlanEmpty = await isFixPlanEmpty(dir);
-  if (!fixPlanEmpty) {
-    return {
-      isComplete: false,
-      results: await evaluateSuccessCriteria(dir).catch(() => []),
-    };
-  }
-
-  // Check 2: Run test suite if detectable
-  const testsPass = await runTests(dir);
-  if (!testsPass) {
-    return {
-      isComplete: false,
-      results: await evaluateSuccessCriteria(dir).catch(() => []),
-    };
-  }
-
-  // Check 3: Evaluate SUCCESS_CRITERIA.md
-  const criteriaResults = await evaluateSuccessCriteria(dir);
-  const allPassed = criteriaResults.every((r) => r.passed);
-
-  return {
-    isComplete: allPassed,
-    results: criteriaResults,
-  };
-}
-
 
 async function isFixPlanEmpty(dir: string): Promise<boolean> {
   try {
     const content = await readFile(join(dir, 'TASKS.md'), 'utf-8');
@@ -487,13 +749,21 @@ async function isFixPlanEmpty(dir: string): Promise<boolean> {
   }
 }
 
-async function runTests(dir: string): Promise<boolean> {
-  // Try to detect and run tests using known test runner config files.
-  // Exit code is the primary failure signal; output regex is a fallback.
+/**
+ * Runs the test suite in `dir` and captures the raw output.
+ *
+ * Probes for known test runner config files in priority order; skips to the
+ * next runner on spawn or timeout errors. Returns `{ passed: true, output: '' }`
+ * when no test runner is detected (cannot verify — assume passing).
+ *
+ * The raw `output` is used by `writeTestFixTasks` to extract failure details
+ * and write them as actionable fix tasks for the agent.
+ */
+export async function runTestsWithOutput(dir: string): Promise<{ passed: boolean; output: string }> {
   const testRunners = [
     { file: 'package.json', cmd: 'npm test -- --run 2>&1' },
     { file: 'vitest.config.ts', cmd: 'npx vitest run 2>&1' },
-    { file: 'pytest.ini', cmd: 'python -m pytest --tb=no -q 2>&1' },
+    { file: 'pytest.ini', cmd: 'python -m pytest --tb=short -q 2>&1' },
     { file: 'Makefile', cmd: 'make test 2>&1' },
   ];
 
@@ -509,22 +779,129 @@ async function runTests(dir: string): Promise<boolean> {
         cwd: dir,
         timeout: 120000,
       });
-      // Exit code 0 — check output as secondary signal
       const combined = stdout + stderr;
       const hasFailed = /\d+ failed|\d+ error/i.test(combined);
-      return !hasFailed;
+      return { passed: !hasFailed, output: combined };
     } catch (err: unknown) {
-      // Non-zero exit code means tests failed
-      if (err && typeof err === 'object' && 'code' in err && typeof err.code === 'number') {
-        return false;
+      if (err && typeof err === 'object') {
+        const execErr = err as { code?: number; signal?: string; stdout?: string; stderr?: string };
+        if (typeof execErr.code === 'number' && execErr.signal == null) {
+          // Process exited with a non-zero exit code — genuine test failures.
+          const combined = (execErr.stdout ?? '') + (execErr.stderr ?? '');
+          return { passed: false, output: combined };
+        }
       }
-      // Timeout or other execution error — skip to next runner
+      // Timeout or spawn error — skip to next runner
      continue;
    }
  }
 
-  // No test runner found — assume passing
-  return true;
+  // No test runner detected — assume passing
+  return { passed: true, output: '' };
+}
+
+/**
+ * Extract a short list of failing test identifiers from raw test runner output.
+ *
+ * Supports:
+ * - Vitest/Jest: "FAIL src/foo.test.ts" file-level failures
+ * - Vitest/Jest: "× test name" / "✗ test name" individual test failures
+ * - Pytest: "FAILED tests/foo.py::test_name"
+ * - Generic: "N failed" summary line (fallback)
+ *
+ * Returns at most 10 entries. When specific failures cannot be parsed, returns
+ * a single generic entry directing the agent to TEST_FAILURES.md.
+ */
+export function extractTestFailures(output: string): string[] {
+  const failures: string[] = [];
+
+  // Vitest/Jest: "FAIL src/foo.test.ts" (file-level failure)
+  const failFileMatches = output.match(/^FAIL\s+\S+/gm) ?? [];
+  for (const m of failFileMatches) {
+    const name = m.replace(/^FAIL\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Vitest/Jest: individual test "× test name" or "✗ test name" or "✕ test name"
+  const failTestMatches = output.match(/^[\s]*[×✗✕]\s+(.+)/gm) ?? [];
+  for (const m of failTestMatches) {
+    const name = m.replace(/^[\s]*[×✗✕]\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Pytest: "FAILED tests/foo.py::test_bar"
+  const pytestMatches = output.match(/^FAILED\s+\S+/gm) ?? [];
+  for (const m of pytestMatches) {
+    const name = m.replace(/^FAILED\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Generic fallback when specific test names couldn't be parsed
+  if (failures.length === 0) {
+    const summaryMatch = output.match(/(\d+)\s+failed/i);
+    if (summaryMatch) {
+      failures.push(`${summaryMatch[1]} test(s) failed — see TEST_FAILURES.md for details`);
+    }
+  }
+
+  return failures.slice(0, 10);
+}
+
+/**
+ * Write test failures as actionable fix tasks into TASKS.md after the runner
+ * detects that all implementation tasks are done but tests are still failing.
+ *
+ * Side effects:
+ * - Writes `TEST_FAILURES.md` with the full test output for agent reference.
+ * - Appends (or replaces) a "## Test Failures (Auto-Generated)" section in
+ *   `TASKS.md` containing one `- [ ] Fix: <name>` item per failing test.
+ *   Any previous auto-generated section is replaced to avoid duplication.
+ *
+ * The agent will see TASKS.md has unchecked items, read TEST_FAILURES.md for
+ * context, and work to resolve each failure before marking them `[x]`.
+ */
+export async function writeTestFixTasks(dir: string, testOutput: string): Promise<void> {
+  // Always write the full output to TEST_FAILURES.md so the agent has context.
+  await writeFile(
+    join(dir, 'TEST_FAILURES.md'),
+    [
+      '# Test Failures',
+      '',
+      '> Auto-generated by SpecMarket runner. Delete this file when all tests pass.',
+      '',
+      '## Raw Test Output',
+      '',
+      '```',
+      testOutput.slice(0, 8000),
+      '```',
+    ].join('\n'),
+    'utf-8'
+  );
+
+  const failures = extractTestFailures(testOutput);
+  if (failures.length === 0) return;
+
+  const testFixSection = [
+    '',
+    '## Test Failures (Auto-Generated)',
+    '> These tasks were created by the runner after detecting test failures.',
+    '> Fix each failing test, then delete this section and TEST_FAILURES.md.',
+    '',
+    ...failures.map((f) => `- [ ] Fix: ${f}`),
+  ].join('\n');
+
+  try {
+    const existing = await readFile(join(dir, 'TASKS.md'), 'utf-8');
+    // Replace any previous auto-generated section to avoid duplication.
+    const withoutPrevious = existing.replace(
+      /\n## Test Failures \(Auto-Generated\)[\s\S]*/,
+      ''
+    );
+    await writeFile(join(dir, 'TASKS.md'), withoutPrevious + testFixSection, 'utf-8');
+  } catch {
+    // TASKS.md doesn't exist — create it.
+    await writeFile(join(dir, 'TASKS.md'), `# Tasks${testFixSection}`, 'utf-8');
+  }
 }
 
 async function evaluateSuccessCriteria(dir: string): Promise<SuccessCriterionResult[]> {
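A quick check of `extractTestFailures` against representative runner output. The expected results follow from the regexes in the hunk above; the sample output strings and the import path are illustrative.

```ts
import { extractTestFailures } from './runner.js'; // module path assumed

const vitestOutput = [
  'FAIL src/math.test.ts',
  '  × adds two numbers',
  'Tests  2 failed | 10 passed',
].join('\n');

// File-level and individual matches, deduplicated; the "2 failed" summary
// line is only used when nothing more specific was found.
// → ['src/math.test.ts', 'adds two numbers']
console.log(extractTestFailures(vitestOutput));

const pytestOutput = 'FAILED tests/test_api.py::test_create_user';
// → ['tests/test_api.py::test_create_user']
console.log(extractTestFailures(pytestOutput));
```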