@specmarket/cli 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,30 +11,64 @@ import {
   RUN_DEFAULTS,
   EXIT_CODES,
   RUNNER_ID,
+  MODEL_COST_PER_TOKEN,
+  DEFAULT_HARNESS,
 } from '@specmarket/shared';
 import createDebug from 'debug';
+import { generateMetaInstructions, META_INSTRUCTION_FILENAME } from './meta-instructions.js';
+import { detectSpecFormat } from './format-detection.js';
 
 const debug = createDebug('specmarket:runner');
 const execAsync = promisify(exec);
 
 /**
- * Pre-flight check: Verifies that Claude CLI is installed before attempting to run a spec.
- * Throws an error with installation instructions if claude is not found.
+ * Pre-flight check: Verifies that the selected harness CLI is installed.
+ * Throws an error with installation instructions if the binary is not found.
+ *
+ * @param harness - The harness to check. Defaults to 'claude-code'.
  */
-export async function checkClaudeCliInstalled(): Promise<void> {
+export async function checkClaudeCliInstalled(harness?: string): Promise<void> {
+  const h = harness ?? DEFAULT_HARNESS;
+  const binaryName = HARNESS_BINARY[h] ?? 'claude';
   try {
-    // Use 'which claude' to check if claude is in PATH
-    await execAsync('which claude');
+    await execAsync(`which ${binaryName}`);
   } catch {
+    const installHint = HARNESS_INSTALL_HINT[h] ?? `Install ${binaryName} and ensure it is in your PATH.`;
    throw new Error(
-      `Claude CLI is not installed or not in your PATH.\n\n` +
-        `Installation instructions:\n` +
-        `  npm install -g @anthropic-ai/claude-code\n\n` +
-        `Or visit: https://www.anthropic.com/claude-code\n`
+      `Harness "${h}" binary "${binaryName}" is not installed or not in your PATH.\n\n` +
+        `${installHint}\n`
    );
  }
 }
 
+/** CLI binary name for each harness */
+const HARNESS_BINARY: Record<string, string> = {
+  'claude-code': 'claude',
+  'codex': 'codex',
+  'opencode': 'opencode',
+};
+
+/** Install hints for each harness */
+const HARNESS_INSTALL_HINT: Record<string, string> = {
+  'claude-code':
+    'Installation instructions:\n  npm install -g @anthropic-ai/claude-code\n\nOr visit: https://www.anthropic.com/claude-code',
+  'codex':
+    'Installation instructions:\n  npm install -g @openai/codex\n\nOr visit: https://github.com/openai/codex',
+  'opencode':
+    'Installation instructions:\n  npm install -g opencode-ai\n\nOr visit: https://opencode.ai',
+};
+
+/**
+ * A single steering action logged during the run.
+ * Written to `steering-log.json` in the run directory on completion.
+ */
+export interface SteeringEntry {
+  /** ISO-8601 timestamp when the message was injected */
+  timestamp: string;
+  /** User-provided steering content */
+  content: string;
+}
+
 export interface RunOptions {
   maxLoops?: number;
   maxBudgetUsd?: number;
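Review note: a minimal sketch of how the harness-aware pre-flight behaves. Only `checkClaudeCliInstalled`, the `HARNESS_BINARY` lookup, and the `claude` fallback come from this diff; the loop, the import path, and the top-level await are illustrative assumptions.

```ts
// Hypothetical caller — exercises the lookup path only.
import { checkClaudeCliInstalled } from '@specmarket/cli'; // export path assumed

for (const harness of ['claude-code', 'codex', 'opencode']) {
  try {
    // 'claude-code' resolves to `which claude`; unknown names fall back to 'claude'.
    await checkClaudeCliInstalled(harness);
    console.log(`${harness}: binary found`);
  } catch (err) {
    console.error((err as Error).message); // message includes the per-harness install hint
  }
}
```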
@@ -43,6 +77,26 @@ export interface RunOptions {
   resumeRunId?: string;
   outputDir?: string;
   cliVersion: string;
+  /** Spec format override. When omitted, auto-detected from specDir. */
+  specFormat?: string;
+  /**
+   * Agentic harness to use for execution.
+   * One of: 'claude-code' (default), 'codex', 'opencode'.
+   */
+  harness?: string;
+  /**
+   * Existing working directory to run in instead of a fresh sandbox.
+   * When set, spec files are NOT copied — the agent operates directly on this directory.
+   * Enables `environmentType: 'existing'` in the run report.
+   */
+  workdir?: string;
+  /**
+   * Shared queue for steering messages typed by the user during the run.
+   * The caller pushes messages here; the runner drains the queue before each
+   * harness execution and injects the messages into the meta-instructions file.
+   * Each drained message increments `steeringActionCount` in the run report.
+   */
+  steeringQueue?: string[];
 }
 
 export interface RunResult {
@@ -56,15 +110,19 @@ export interface RunResult {
  * The loop:
  * 1. Creates a sandboxed working directory under ~/.specmarket/runs/<run-id>/
  * 2. Copies spec files into the working directory
- * 3. Initializes git for diff tracking
- * 4. Executes: `cat PROMPT.md | claude --print` in a loop
- * 5. After each loop: captures tokens, duration, git diff
- * 6. Checks for completion conditions:
+ * 3. Detects spec format (specmarket | speckit | bmad | ralph | custom) and generates
+ *    `.specmarket-runner.md` format-aware meta-instructions for the AI agent.
+ * 4. Initializes git for diff tracking
+ * 5. Executes: `cat .specmarket-runner.md | claude --print` in a loop.
+ *    The meta-instructions tell the agent which files to read, how to find tasks,
+ *    how to mark completion, and when the run is done — regardless of spec format.
+ * 6. After each loop: captures tokens, duration, git diff
+ * 7. Checks for completion conditions:
  *    - SUCCESS: TASKS.md empty + tests pass + all SUCCESS_CRITERIA.md criteria met
  *    - STALL: 3 consecutive loops with no git diff
  *    - FAILURE: 10 consecutive loops with same failing output
  *    - BUDGET: total tokens > 2x estimated_tokens
- * 7. Writes run-report.json on completion
+ * 8. Writes run-report.json on completion
  *
  * SECURITY: Always prints sandboxing recommendation before starting.
  */
@@ -88,12 +146,20 @@ export async function runSpec(
       ? (opts.maxBudgetUsd / specYaml.estimatedCostUsd) * specYaml.estimatedTokens
       : specYaml.estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER);
 
+  const harness = opts.harness ?? DEFAULT_HARNESS;
   const runId = opts.resumeRunId ?? randomUUID();
   const runsBaseDir = join(homedir(), CONFIG_PATHS.RUNS_DIR);
-  const runDir = opts.outputDir ?? join(runsBaseDir, runId);
 
-  await mkdir(runDir, { recursive: true });
-  debug('Run directory: %s', runDir);
+  // --workdir: run in the caller-provided existing directory (no file copying).
+  // Without --workdir: create a fresh sandbox under ~/.specmarket/runs/<run-id>/.
+  const usingWorkdir = opts.workdir !== undefined;
+  const runDir = opts.workdir ?? opts.outputDir ?? join(runsBaseDir, runId);
+  const environmentType: 'fresh' | 'existing' = usingWorkdir ? 'existing' : 'fresh';
+
+  if (!usingWorkdir) {
+    await mkdir(runDir, { recursive: true });
+  }
+  debug('Run directory: %s (environmentType=%s, harness=%s)', runDir, environmentType, harness);
 
   if (opts.dryRun) {
     debug('Dry run mode — skipping execution');
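The budget derivation at the top of this hunk converts a dollar cap into a token budget using the spec's estimated cost density. A worked example with assumed numbers:

```ts
// Illustrative values only — estimatedTokens/estimatedCostUsd come from spec.yaml.
const estimatedTokens = 500_000;
const estimatedCostUsd = 4.5;
const maxBudgetUsd = 9; // --max-budget-usd flag

// (9 / 4.5) * 500_000 = 1_000_000 tokens allowed for this run.
const tokenBudget = (maxBudgetUsd / estimatedCostUsd) * estimatedTokens;

// Without --max-budget-usd, the default is estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER
// (2x, per the BUDGET completion condition in the doc comment above).
```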
@@ -102,6 +168,11 @@ export interface RunOptions {
       specVersion: specYaml.version,
       model: opts.model ?? specYaml.minModel,
       runner: specYaml.runner,
+      harness,
+      specFormat: opts.specFormat,
+      environmentType,
+      steeringActionCount: 0,
+      isPureRun: false,
       loopCount: 0,
       totalTokens: 0,
       totalCostUsd: 0,
@@ -129,9 +200,19 @@
       totalTokens = existingReport.totalTokens;
       debug('Resuming from iteration %d with %d tokens carried over', startIteration, totalTokens);
     }
+    // Ensure meta-instructions exist in the run dir (may be missing for runs
+    // created before this feature was added).
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
+  } else if (usingWorkdir) {
+    // --workdir: the directory already has the spec files. Just generate/refresh
+    // the meta-instructions so the agent knows what format it is working with.
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
+    // Initialize git if not already a repo (best-effort — may be an existing git repo)
+    await initGit(runDir);
   } else {
-    // Fresh run: copy spec files and initialize git for diff tracking
+    // Fresh run: copy spec files, generate meta-instructions, initialize git.
     await copySpecFiles(specDir, runDir);
+    await ensureMetaInstructions(specDir, runDir, opts.specFormat);
     await initGit(runDir);
   }
 
@@ -140,6 +221,14 @@
   let consecutiveNoChange = 0;
   let lastOutput = '';
   let consecutiveSameOutput = 0;
+  const steeringLog: SteeringEntry[] = [];
+  let steeringActionCount = 0;
+  /**
+   * Counts how many times the post-task test phase has detected failures after
+   * all TASKS.md items were checked. When this reaches TEST_PHASE_MAX_ITERATIONS,
+   * the run is declared a failure — the agent could not fix the tests.
+   */
+  let testPhaseAttempts = 0;
 
   let finalStatus: RunReport['status'] = 'failure';
   let successCriteriaResults: SuccessCriterionResult[] = [];
@@ -149,11 +238,21 @@
 
     const iterStart = Date.now();
 
-    // Execute: cat PROMPT.md | claude --print
-    const result = await executeClaudeLoop(runDir, opts.model);
+    // Drain steering queue and inject any pending messages before this iteration.
+    // Messages are appended to the meta-instructions file so the harness sees them.
+    const pendingMessages = opts.steeringQueue ? opts.steeringQueue.splice(0) : [];
+    if (pendingMessages.length > 0) {
+      await injectSteeringMessages(runDir, pendingMessages, steeringLog);
+      steeringActionCount += pendingMessages.length;
+      debug('Injected %d steering message(s); total steeringActionCount=%d', pendingMessages.length, steeringActionCount);
+    }
+
+    // Execute via the selected harness
+    const result = await executeHarness(runDir, harness, opts.model);
 
     const iterDuration = Date.now() - iterStart;
-    const tokensThisLoop = parseTokensFromOutput(result.stdout);
+    const activeModel = opts.model ?? specYaml.minModel;
+    const tokensThisLoop = parseTokensFromOutput(result.stdout, activeModel);
     totalTokens += tokensThisLoop;
 
     // Capture git diff
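How a caller is expected to feed `steeringQueue`: only the queue contract (push strings; the runner drains with `splice(0)` before each iteration) comes from this diff. The readline wiring, the import path, and the exact `runSpec` call below are assumptions, since the full signature is outside this hunk.

```ts
// Hypothetical CLI wiring: forward stdin lines into the runner's steering queue.
import * as readline from 'node:readline';
import { runSpec } from '@specmarket/cli'; // export path assumed

const steeringQueue: string[] = [];
readline.createInterface({ input: process.stdin }).on('line', (line) => {
  if (line.trim()) steeringQueue.push(line.trim()); // picked up before the next iteration
});

// specDir/specYaml as loaded elsewhere in the CLI (shapes assumed for illustration).
declare const specDir: string;
declare const specYaml: { version: string; runner: string; minModel: string };

await runSpec(specDir, specYaml, { cliVersion: '0.0.6', steeringQueue });
```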
@@ -215,17 +314,65 @@
       lastOutput = currentOutputHash;
     }
 
-    // SUCCESS check
-    const completionCheck = await checkCompletion(runDir);
-    if (completionCheck.isComplete) {
-      debug('Success criteria met at iteration %d', i);
-      successCriteriaResults = completionCheck.results;
-      finalStatus = 'success';
-      break;
-    }
+    // ---- Post-task test phase ----
+    //
+    // When all TASKS.md items are checked, the runner takes over test execution:
+    //   1. Run the test suite and capture output.
+    //   2. If tests fail: write specific fix tasks to TASKS.md and TEST_FAILURES.md,
+    //      then continue the main loop so the agent can address them.
+    //   3. If tests pass: check SUCCESS_CRITERIA.md — if all met, declare success.
+    //
+    // This creates a test→fix→retest cycle driven by the runner, ensuring the
+    // run is only declared successful when everything is actually green.
+    const tasksComplete = await isFixPlanEmpty(runDir);
+    if (tasksComplete) {
+      const testResult = await runTestsWithOutput(runDir);
+
+      if (!testResult.passed) {
+        testPhaseAttempts++;
+        debug(
+          'Post-task test phase attempt %d/%d: tests failing, writing fix tasks',
+          testPhaseAttempts,
+          RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
+        );
+
+        if (testPhaseAttempts >= RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS) {
+          debug(
+            'Test phase exceeded max iterations (%d), declaring failure',
+            RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
+          );
+          successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+          finalStatus = 'failure';
+          break;
+        }
+
+        // Write actionable fix tasks so the next harness iteration has specific work.
+        await writeTestFixTasks(runDir, testResult.output);
+        await stageAllChanges(runDir);
+        successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+        // Continue main loop — the harness will pick up the new fix tasks.
+      } else {
+        // Tests pass — evaluate SUCCESS_CRITERIA.md for the final gate.
+        const criteriaResults = await evaluateSuccessCriteria(runDir);
+        successCriteriaResults = criteriaResults;
+
+        if (criteriaResults.every((r) => r.passed)) {
+          debug('All tasks done, tests pass, criteria met at iteration %d', i);
+          finalStatus = 'success';
+          break;
+        }
 
-    // Update partial success criteria results for reporting
-    successCriteriaResults = completionCheck.results;
+        // Success criteria not yet all checked — continue loop.
+        // The agent must update SUCCESS_CRITERIA.md as criteria are satisfied.
+        debug(
+          'Tests pass but not all criteria met at iteration %d; continuing',
+          i
+        );
+      }
+    } else {
+      // Tasks still pending — update partial results for reporting.
+      successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
+    }
   }
 
   // If we exhausted all loops without a status, mark as failure
@@ -233,15 +380,34 @@
     successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
   }
 
+  // Persist steering log if any steering actions occurred during this run
+  if (steeringLog.length > 0) {
+    await writeFile(
+      join(runDir, 'steering-log.json'),
+      JSON.stringify(steeringLog, null, 2),
+      'utf-8'
+    );
+    debug('Steering log written (%d entries)', steeringLog.length);
+  }
+
   const totalTimeMinutes = (Date.now() - startTime) / 60000;
   const costPerToken = specYaml.estimatedCostUsd / specYaml.estimatedTokens;
   const totalCostUsd = totalTokens * costPerToken;
 
+  // Auto-detect specFormat from the run directory when not provided explicitly
+  const detectedSpecFormat =
+    opts.specFormat ?? (await detectSpecFormat(runDir)).format;
+
   const report: RunReport = {
     runId,
     specVersion: specYaml.version,
     model: opts.model ?? specYaml.minModel,
     runner: specYaml.runner,
+    harness,
+    specFormat: detectedSpecFormat,
+    environmentType,
+    steeringActionCount,
+    isPureRun: finalStatus === 'success' && steeringActionCount === 0,
     loopCount: iterations.length,
     totalTokens,
     totalCostUsd,
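Given the `SteeringEntry` interface and the persistence block above, a run with two steering actions writes a `steering-log.json` shaped like this (values illustrative, import path assumed):

```ts
import type { SteeringEntry } from '@specmarket/cli'; // export path assumed

// What JSON.stringify(steeringLog, null, 2) produces in steering-log.json.
const exampleLog: SteeringEntry[] = [
  { timestamp: '2025-06-01T10:02:11.000Z', content: 'Prefer pnpm over npm' },
  { timestamp: '2025-06-01T10:07:48.000Z', content: 'Skip the flaky e2e suite' },
];
```

Note the report semantics that follow: such a run can still end with status 'success', but `isPureRun` will be false because `steeringActionCount` is non-zero.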
@@ -265,10 +431,83 @@
 
 // ---- Internal helpers ----
 
+/**
+ * Detects the spec format from specDir and writes `.specmarket-runner.md` to runDir.
+ *
+ * Idempotent: if the file already exists in runDir it is overwritten so that
+ * the instructions stay consistent with the detected format.
+ *
+ * @param specDir - Source spec directory (used for format detection + sidecar data)
+ * @param runDir - Sandboxed run directory where the file is written
+ * @param formatOverride - Optional pre-detected format (skips detection when provided)
+ */
+export async function ensureMetaInstructions(
+  specDir: string,
+  runDir: string,
+  formatOverride?: string
+): Promise<void> {
+  const format = formatOverride ?? (await detectSpecFormat(specDir)).format;
+  debug('Generating meta-instructions for format=%s', format);
+  const content = await generateMetaInstructions(specDir, format);
+  await writeFile(join(runDir, META_INSTRUCTION_FILENAME), content, 'utf-8');
+  debug('Meta-instructions written to %s/%s', runDir, META_INSTRUCTION_FILENAME);
+}
+
+/**
+ * Injects pending steering messages into the meta-instructions file for the
+ * current run directory.
+ *
+ * A "## Steering Input" section is appended to `.specmarket-runner.md` so the
+ * agent reads the user's guidance on its next harness invocation. Each call
+ * appends a timestamped section — messages accumulate across iterations so the
+ * agent retains the full steering history.
+ *
+ * Side effects:
+ * - Modifies `.specmarket-runner.md` in runDir (appends steering section)
+ * - Pushes `SteeringEntry` objects into `steeringLog`
+ *
+ * @param runDir - Active run directory containing the meta-instructions file
+ * @param messages - Steering messages to inject (already spliced from the queue)
+ * @param steeringLog - Mutable array collecting all steering entries for this run
+ */
+export async function injectSteeringMessages(
+  runDir: string,
+  messages: string[],
+  steeringLog: SteeringEntry[]
+): Promise<void> {
+  if (messages.length === 0) return;
+
+  const timestamp = new Date().toISOString();
+  const entries: SteeringEntry[] = messages.map((content) => ({ timestamp, content }));
+  steeringLog.push(...entries);
+
+  const steeringSection = [
+    '',
+    `## Steering Input (injected at ${timestamp})`,
+    '',
+    'The user has provided the following steering instructions. Incorporate them into your current work:',
+    '',
+    ...messages.map((m) => `> ${m}`),
+    '',
+  ].join('\n');
+
+  const metaPath = join(runDir, META_INSTRUCTION_FILENAME);
+  try {
+    const existing = await readFile(metaPath, 'utf-8');
+    await writeFile(metaPath, existing + steeringSection, 'utf-8');
+  } catch {
+    // Meta-instructions file missing — create it with just the steering section
+    await writeFile(metaPath, steeringSection, 'utf-8');
+  }
+
+  debug('injectSteeringMessages: appended %d message(s) to %s', messages.length, META_INSTRUCTION_FILENAME);
+}
+
 async function copySpecFiles(srcDir: string, destDir: string): Promise<void> {
   const { cp } = await import('fs/promises');
   await cp(srcDir, join(destDir, 'spec'), { recursive: true });
-  // Also copy directly to destDir so PROMPT.md is at root
+  // Also copy directly to destDir so spec files are accessible at the root
+  // of the run directory alongside the generated meta-instructions.
   await cp(srcDir, destDir, { recursive: true, force: false });
   debug('Spec files copied from %s to %s', srcDir, destDir);
 }
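For reviewers tracing `injectSteeringMessages`: given `messages = ['Use pnpm']`, the section appended to `.specmarket-runner.md` renders as below. This follows directly from the `steeringSection` array join above; the timestamp is illustrative.

```
## Steering Input (injected at 2025-06-01T10:02:11.000Z)

The user has provided the following steering instructions. Incorporate them into your current work:

> Use pnpm
```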
@@ -306,27 +545,62 @@ interface ExecuteResult {
   exitCode: number;
 }
 
-async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteResult> {
-  return new Promise((resolve) => {
-    const args = ['--print', '--output-format', 'json'];
-    if (model) {
-      args.push('--model', model);
+/**
+ * Builds the shell command string for the given harness.
+ *
+ * All harnesses receive the same meta-instructions file via stdin so they
+ * know what format they are working with and what tasks to execute.
+ *
+ * - claude-code: `cat .specmarket-runner.md | claude --print --output-format json [--model <m>]`
+ * - codex:       `cat .specmarket-runner.md | codex`
+ * - opencode:    `cat .specmarket-runner.md | opencode`
+ */
+function buildHarnessCommand(harness: string, model?: string): string {
+  switch (harness) {
+    case 'claude-code': {
+      const args = ['--print', '--output-format', 'json'];
+      if (model) args.push('--model', model);
+      return `cat ${META_INSTRUCTION_FILENAME} | claude ${args.join(' ')}`;
     }
+    case 'codex':
+      // Codex CLI reads from stdin; model selection is via OPENAI_MODEL env or its own flags
+      return `cat ${META_INSTRUCTION_FILENAME} | codex`;
+    case 'opencode':
+      // opencode reads from stdin
+      return `cat ${META_INSTRUCTION_FILENAME} | opencode`;
+    default:
+      // Unknown harness — fall back to claude-code behaviour
+      debug('Unknown harness "%s" — falling back to claude-code', harness);
+      return `cat ${META_INSTRUCTION_FILENAME} | claude --print --output-format json`;
+  }
+}
 
-    // Execute: cat PROMPT.md | claude --print --output-format json
-    // Using --output-format json gives us structured output with token usage metadata.
-    const proc = spawn('sh', ['-c', `cat PROMPT.md | claude ${args.join(' ')}`], {
+/**
+ * Executes a single loop iteration via the specified harness.
+ *
+ * The meta-instructions file (`.specmarket-runner.md`) is piped into the harness
+ * binary as stdin. The harness is expected to read the instructions, perform the
+ * requested work inside the run directory, and exit with code 0 on success.
+ */
+async function executeHarness(dir: string, harness: string, model?: string): Promise<ExecuteResult> {
+  const cmd = buildHarnessCommand(harness, model);
+  debug('executeHarness: %s (harness=%s)', cmd, harness);
+
+  return new Promise((resolve) => {
+    const proc = spawn('sh', ['-c', cmd], {
       cwd: dir,
-      stdio: ['inherit', 'pipe', 'pipe'],
+      // stdin is 'ignore': the harness reads its instructions from the meta-instructions file
+      // via `cat .specmarket-runner.md | <harness>`, not from parent stdin.
+      // Keeping stdin detached from the parent lets the CLI read steering messages
+      // from process.stdin without conflict.
+      stdio: ['ignore', 'pipe', 'pipe'],
     });
 
     let stdout = '';
-    let stderr = '';
     proc.stdout?.on('data', (chunk: Buffer) => {
       stdout += chunk.toString();
     });
     proc.stderr?.on('data', (chunk: Buffer) => {
-      stderr += chunk.toString();
       // Write stderr to process stderr for visibility
       process.stderr.write(chunk);
     });
@@ -336,7 +610,7 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
     });
 
     proc.on('error', (err) => {
-      debug('claude spawn error: %O', err);
+      debug('%s spawn error: %O', harness, err);
       resolve({ stdout: '', exitCode: 1 });
     });
   });
@@ -347,15 +621,32 @@
  *
  * Strategy (in priority order):
  * 1. Parse JSON output format (claude --output-format json) which contains
- *    structured metadata including token counts in the response.
+ *    structured metadata including token counts or cost_usd in the response.
+ *    When only cost_usd is available (typical for Claude Code CLI), token count
+ *    is estimated using model-aware pricing constants from MODEL_COST_PER_TOKEN.
+ *    This estimate may deviate ±30% from the actual count depending on the
+ *    input/output token ratio for that specific run.
  * 2. Match known text patterns from Claude Code's output (total_tokens, etc.)
 * 3. Estimate from output length as a last-resort heuristic (~4 chars per token).
 *
+ * @param output - Raw stdout from the Claude CLI invocation
+ * @param model - Model identifier (e.g. "claude-haiku-4-5", "claude-opus-4-6").
+ *   Used to select the correct pricing tier for cost→token estimation.
+ *   Defaults to Sonnet-tier pricing if omitted or unrecognised.
+ *
 * Returns 0 only if the output is empty (no meaningful work was done).
 */
-function parseTokensFromOutput(output: string): number {
+export function parseTokensFromOutput(output: string, model?: string): number {
   if (!output || output.trim().length === 0) return 0;
 
+  // Resolve cost-per-token for this model (case-insensitive substring match)
+  const modelLower = (model ?? '').toLowerCase();
+  const costPerToken = modelLower.includes('haiku')
+    ? MODEL_COST_PER_TOKEN.haiku
+    : modelLower.includes('opus')
+      ? MODEL_COST_PER_TOKEN.opus
+      : MODEL_COST_PER_TOKEN.default;
+
   // Strategy 1: Parse JSON output format from claude --output-format json
   // Claude Code JSON output may contain token usage info in the response metadata.
   try {
@@ -386,11 +677,16 @@ function parseTokensFromOutput(output: string): number {
       const output_tokens = parsed.usage?.output_tokens ?? parsed.usage?.completion_tokens ?? 0;
       if (input > 0 || output_tokens > 0) return input + output_tokens;
 
-      // Cost-based estimation (if cost is reported but not tokens)
-      // Haiku: ~$0.25/MTok input, $1.25/MTok output → avg ~$0.75/MTok
-      // Sonnet: ~$3/MTok input, $15/MTok output → avg ~$9/MTok
+      // Cost-based estimation: Claude Code CLI typically reports cost_usd but not
+      // raw token counts. Use model-aware pricing for the best estimate.
       if (typeof parsed.cost_usd === 'number' && parsed.cost_usd > 0) {
-        return Math.round(parsed.cost_usd / 0.000009); // Assume Sonnet pricing
+        debug(
+          'parseTokensFromOutput: using cost_usd=%f with model=%s (costPerToken=%e)',
+          parsed.cost_usd,
+          model ?? 'unknown',
+          costPerToken
+        );
+        return Math.round(parsed.cost_usd / costPerToken);
       }
     }
   } catch {
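The cost→token conversion is easy to sanity-check with numbers. Using the Sonnet-tier average from the removed comment (~$9/MTok, i.e. $0.000009 per token; the actual MODEL_COST_PER_TOKEN values live in @specmarket/shared and are not shown in this diff):

```ts
// Illustrative: a loop that reports cost_usd: 0.027 and no token counts.
const costUsd = 0.027;
const assumedCostPerToken = 0.000009; // Sonnet-tier average (assumed)
const estimated = Math.round(costUsd / assumedCostPerToken); // 3000 tokens
```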
@@ -425,8 +721,8 @@
   }
 
   // Strategy 3: Estimate from output length
-  // Rough heuristic: ~4 characters per token for English text
-  // This is imprecise but better than returning 0 (which breaks budget tracking)
+  // Rough heuristic: ~4 characters per token for English text.
+  // This is imprecise but better than returning 0 (which breaks budget tracking).
   const estimatedTokens = Math.ceil(output.length / 4);
   debug(
     'parseTokensFromOutput: no explicit token count found, estimating %d from %d chars',
@@ -441,40 +737,6 @@ function parseIntComma(s: string): number {
   return parseInt(s.replace(/,/g, ''), 10) || 0;
 }
 
-interface CompletionCheck {
-  isComplete: boolean;
-  results: SuccessCriterionResult[];
-}
-
-async function checkCompletion(dir: string): Promise<CompletionCheck> {
-  // Check 1: TASKS.md should be empty or have only checked items
-  const fixPlanEmpty = await isFixPlanEmpty(dir);
-  if (!fixPlanEmpty) {
-    return {
-      isComplete: false,
-      results: await evaluateSuccessCriteria(dir).catch(() => []),
-    };
-  }
-
-  // Check 2: Run test suite if detectable
-  const testsPass = await runTests(dir);
-  if (!testsPass) {
-    return {
-      isComplete: false,
-      results: await evaluateSuccessCriteria(dir).catch(() => []),
-    };
-  }
-
-  // Check 3: Evaluate SUCCESS_CRITERIA.md
-  const criteriaResults = await evaluateSuccessCriteria(dir);
-  const allPassed = criteriaResults.every((r) => r.passed);
-
-  return {
-    isComplete: allPassed,
-    results: criteriaResults,
-  };
-}
-
 
 async function isFixPlanEmpty(dir: string): Promise<boolean> {
   try {
     const content = await readFile(join(dir, 'TASKS.md'), 'utf-8');
@@ -487,13 +749,21 @@ async function isFixPlanEmpty(dir: string): Promise<boolean> {
   }
 }
 
-async function runTests(dir: string): Promise<boolean> {
-  // Try to detect and run tests using known test runner config files.
-  // Exit code is the primary failure signal; output regex is a fallback.
+/**
+ * Runs the test suite in `dir` and captures the raw output.
+ *
+ * Probes for known test runner config files in priority order; skips to the
+ * next runner on spawn or timeout errors. Returns `{ passed: true, output: '' }`
+ * when no test runner is detected (cannot verify — assume passing).
+ *
+ * The raw `output` is used by `writeTestFixTasks` to extract failure details
+ * and write them as actionable fix tasks for the agent.
+ */
+export async function runTestsWithOutput(dir: string): Promise<{ passed: boolean; output: string }> {
   const testRunners = [
     { file: 'package.json', cmd: 'npm test -- --run 2>&1' },
     { file: 'vitest.config.ts', cmd: 'npx vitest run 2>&1' },
-    { file: 'pytest.ini', cmd: 'python -m pytest --tb=no -q 2>&1' },
+    { file: 'pytest.ini', cmd: 'python -m pytest --tb=short -q 2>&1' },
     { file: 'Makefile', cmd: 'make test 2>&1' },
   ];
 
@@ -509,22 +779,129 @@ async function runTests(dir: string): Promise<boolean> {
         cwd: dir,
         timeout: 120000,
       });
-      // Exit code 0 — check output as secondary signal
       const combined = stdout + stderr;
       const hasFailed = /\d+ failed|\d+ error/i.test(combined);
-      return !hasFailed;
+      return { passed: !hasFailed, output: combined };
     } catch (err: unknown) {
-      // Non-zero exit code means tests failed
-      if (err && typeof err === 'object' && 'code' in err && typeof err.code === 'number') {
-        return false;
+      if (err && typeof err === 'object') {
+        const execErr = err as { code?: number; signal?: string; stdout?: string; stderr?: string };
+        if (typeof execErr.code === 'number' && execErr.signal == null) {
+          // Process exited with a non-zero exit code — genuine test failures.
+          const combined = (execErr.stdout ?? '') + (execErr.stderr ?? '');
+          return { passed: false, output: combined };
+        }
       }
-      // Timeout or other execution error — skip to next runner
+      // Timeout or spawn error — skip to next runner
      continue;
    }
  }
 
-  // No test runner found — assume passing
-  return true;
+  // No test runner detected — assume passing
+  return { passed: true, output: '' };
+}
+
+/**
+ * Extract a short list of failing test identifiers from raw test runner output.
+ *
+ * Supports:
+ * - Vitest/Jest: "FAIL src/foo.test.ts" file-level failures
+ * - Vitest/Jest: "× test name" / "✗ test name" individual test failures
+ * - Pytest: "FAILED tests/foo.py::test_name"
+ * - Generic: "N failed" summary line (fallback)
+ *
+ * Returns at most 10 entries. When specific failures cannot be parsed, returns
+ * a single generic entry directing the agent to TEST_FAILURES.md.
+ */
+export function extractTestFailures(output: string): string[] {
+  const failures: string[] = [];
+
+  // Vitest/Jest: "FAIL src/foo.test.ts" (file-level failure)
+  const failFileMatches = output.match(/^FAIL\s+\S+/gm) ?? [];
+  for (const m of failFileMatches) {
+    const name = m.replace(/^FAIL\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Vitest/Jest: individual test "× test name" or "✗ test name" or "✕ test name"
+  const failTestMatches = output.match(/^[\s]*[×✗✕]\s+(.+)/gm) ?? [];
+  for (const m of failTestMatches) {
+    const name = m.replace(/^[\s]*[×✗✕]\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Pytest: "FAILED tests/foo.py::test_bar"
+  const pytestMatches = output.match(/^FAILED\s+\S+/gm) ?? [];
+  for (const m of pytestMatches) {
+    const name = m.replace(/^FAILED\s+/, '').trim();
+    if (name && !failures.includes(name)) failures.push(name);
+  }
+
+  // Generic fallback when specific test names couldn't be parsed
+  if (failures.length === 0) {
+    const summaryMatch = output.match(/(\d+)\s+failed/i);
+    if (summaryMatch) {
+      failures.push(`${summaryMatch[1]} test(s) failed — see TEST_FAILURES.md for details`);
+    }
+  }
+
+  return failures.slice(0, 10);
+}
+
+/**
+ * Write test failures as actionable fix tasks into TASKS.md after the runner
+ * detects that all implementation tasks are done but tests are still failing.
+ *
+ * Side effects:
+ * - Writes `TEST_FAILURES.md` with the full test output for agent reference.
+ * - Appends (or replaces) a "## Test Failures (Auto-Generated)" section in
+ *   `TASKS.md` containing one `- [ ] Fix: <name>` item per failing test.
+ *   Any previous auto-generated section is replaced to avoid duplication.
+ *
+ * The agent will see TASKS.md has unchecked items, read TEST_FAILURES.md for
+ * context, and work to resolve each failure before marking them `[x]`.
+ */
+export async function writeTestFixTasks(dir: string, testOutput: string): Promise<void> {
+  // Always write the full output to TEST_FAILURES.md so the agent has context.
+  await writeFile(
+    join(dir, 'TEST_FAILURES.md'),
+    [
+      '# Test Failures',
+      '',
+      '> Auto-generated by SpecMarket runner. Delete this file when all tests pass.',
+      '',
+      '## Raw Test Output',
+      '',
+      '```',
+      testOutput.slice(0, 8000),
+      '```',
+    ].join('\n'),
+    'utf-8'
+  );
+
+  const failures = extractTestFailures(testOutput);
+  if (failures.length === 0) return;
+
+  const testFixSection = [
+    '',
+    '## Test Failures (Auto-Generated)',
+    '> These tasks were created by the runner after detecting test failures.',
+    '> Fix each failing test, then delete this section and TEST_FAILURES.md.',
+    '',
+    ...failures.map((f) => `- [ ] Fix: ${f}`),
+  ].join('\n');
+
+  try {
+    const existing = await readFile(join(dir, 'TASKS.md'), 'utf-8');
+    // Replace any previous auto-generated section to avoid duplication.
+    const withoutPrevious = existing.replace(
+      /\n## Test Failures \(Auto-Generated\)[\s\S]*/,
+      ''
+    );
+    await writeFile(join(dir, 'TASKS.md'), withoutPrevious + testFixSection, 'utf-8');
+  } catch {
+    // TASKS.md doesn't exist — create it.
+    await writeFile(join(dir, 'TASKS.md'), `# Tasks${testFixSection}`, 'utf-8');
+  }
 }
 
 async function evaluateSuccessCriteria(dir: string): Promise<SuccessCriterionResult[]> {
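A quick check of `extractTestFailures` against representative runner output. The expected results follow from the regexes in the hunk above; the sample output strings and the import path are illustrative.

```ts
import { extractTestFailures } from './runner.js'; // module path assumed

const vitestOutput = [
  'FAIL src/math.test.ts',
  '  × adds two numbers',
  'Tests  2 failed | 10 passed',
].join('\n');

// File-level and individual matches, deduplicated; the "2 failed" summary
// line is only used when nothing more specific was found.
// → ['src/math.test.ts', 'adds two numbers']
console.log(extractTestFailures(vitestOutput));

const pytestOutput = 'FAILED tests/test_api.py::test_create_user';
// → ['tests/test_api.py::test_create_user']
console.log(extractTestFailures(pytestOutput));
```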