@specmarket/cli 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +1 -1
  2. package/dist/{chunk-MS2DYACY.js → chunk-OTXWWFAO.js} +42 -3
  3. package/dist/chunk-OTXWWFAO.js.map +1 -0
  4. package/dist/{config-R5KWZSJP.js → config-5JMI3YAR.js} +2 -2
  5. package/dist/index.js +1945 -252
  6. package/dist/index.js.map +1 -1
  7. package/package.json +1 -1
  8. package/src/commands/comment.test.ts +211 -0
  9. package/src/commands/comment.ts +176 -0
  10. package/src/commands/fork.test.ts +163 -0
  11. package/src/commands/info.test.ts +192 -0
  12. package/src/commands/info.ts +66 -2
  13. package/src/commands/init.test.ts +245 -0
  14. package/src/commands/init.ts +359 -25
  15. package/src/commands/issues.test.ts +382 -0
  16. package/src/commands/issues.ts +436 -0
  17. package/src/commands/login.test.ts +99 -0
  18. package/src/commands/login.ts +2 -6
  19. package/src/commands/logout.test.ts +54 -0
  20. package/src/commands/publish.test.ts +159 -0
  21. package/src/commands/publish.ts +1 -0
  22. package/src/commands/report.test.ts +181 -0
  23. package/src/commands/run.test.ts +419 -0
  24. package/src/commands/run.ts +71 -3
  25. package/src/commands/search.test.ts +147 -0
  26. package/src/commands/validate.test.ts +206 -2
  27. package/src/commands/validate.ts +315 -192
  28. package/src/commands/whoami.test.ts +106 -0
  29. package/src/index.ts +6 -0
  30. package/src/lib/convex-client.ts +6 -2
  31. package/src/lib/format-detection.test.ts +223 -0
  32. package/src/lib/format-detection.ts +172 -0
  33. package/src/lib/meta-instructions.test.ts +340 -0
  34. package/src/lib/meta-instructions.ts +562 -0
  35. package/src/lib/ralph-loop.test.ts +404 -0
  36. package/src/lib/ralph-loop.ts +501 -95
  37. package/src/lib/telemetry.ts +7 -1
  38. package/dist/chunk-MS2DYACY.js.map +0 -1
  39. package/dist/{config-R5KWZSJP.js.map → config-5JMI3YAR.js.map} +0 -0
@@ -3,7 +3,7 @@ import { mkdir, writeFile, readFile, access } from 'fs/promises';
3
3
  import { join, resolve } from 'path';
4
4
  import { homedir } from 'os';
5
5
  import { randomUUID } from 'crypto';
6
- import { execSync, exec } from 'child_process';
6
+ import { exec } from 'child_process';
7
7
  import { promisify } from 'util';
8
8
  import type { RunReport, LoopIteration, SuccessCriterionResult } from '@specmarket/shared';
9
9
  import {
@@ -11,12 +11,64 @@ import {
11
11
  RUN_DEFAULTS,
12
12
  EXIT_CODES,
13
13
  RUNNER_ID,
14
+ MODEL_COST_PER_TOKEN,
15
+ DEFAULT_HARNESS,
14
16
  } from '@specmarket/shared';
15
17
  import createDebug from 'debug';
18
+ import { generateMetaInstructions, META_INSTRUCTION_FILENAME } from './meta-instructions.js';
19
+ import { detectSpecFormat } from './format-detection.js';
16
20
 
17
21
  const debug = createDebug('specmarket:runner');
18
22
  const execAsync = promisify(exec);
19
23
 
24
+ /**
25
+ * Pre-flight check: Verifies that the selected harness CLI is installed.
26
+ * Throws an error with installation instructions if the binary is not found.
27
+ *
28
+ * @param harness - The harness to check. Defaults to 'claude-code'.
29
+ */
30
+ export async function checkClaudeCliInstalled(harness?: string): Promise<void> {
31
+ const h = harness ?? DEFAULT_HARNESS;
32
+ const binaryName = HARNESS_BINARY[h] ?? 'claude';
33
+ try {
34
+ await execAsync(`which ${binaryName}`);
35
+ } catch {
36
+ const installHint = HARNESS_INSTALL_HINT[h] ?? `Install ${binaryName} and ensure it is in your PATH.`;
37
+ throw new Error(
38
+ `Harness "${h}" binary "${binaryName}" is not installed or not in your PATH.\n\n` +
39
+ `${installHint}\n`
40
+ );
41
+ }
42
+ }
43
+
44
+ /** CLI binary name for each harness */
45
+ const HARNESS_BINARY: Record<string, string> = {
46
+ 'claude-code': 'claude',
47
+ 'codex': 'codex',
48
+ 'opencode': 'opencode',
49
+ };
50
+
51
+ /** Install hints for each harness */
52
+ const HARNESS_INSTALL_HINT: Record<string, string> = {
53
+ 'claude-code':
54
+ 'Installation instructions:\n npm install -g @anthropic-ai/claude-code\n\nOr visit: https://www.anthropic.com/claude-code',
55
+ 'codex':
56
+ 'Installation instructions:\n npm install -g @openai/codex\n\nOr visit: https://github.com/openai/codex',
57
+ 'opencode':
58
+ 'Installation instructions:\n npm install -g opencode-ai\n\nOr visit: https://opencode.ai',
59
+ };
60
+
61
+ /**
62
+ * A single steering action logged during the run.
63
+ * Written to `steering-log.json` in the run directory on completion.
64
+ */
65
+ export interface SteeringEntry {
66
+ /** ISO-8601 timestamp when the message was injected */
67
+ timestamp: string;
68
+ /** User-provided steering content */
69
+ content: string;
70
+ }
71
+
20
72
  export interface RunOptions {
21
73
  maxLoops?: number;
22
74
  maxBudgetUsd?: number;
@@ -25,6 +77,26 @@ export interface RunOptions {
25
77
  resumeRunId?: string;
26
78
  outputDir?: string;
27
79
  cliVersion: string;
80
+ /** Spec format override. When omitted, auto-detected from specDir. */
81
+ specFormat?: string;
82
+ /**
83
+ * Agentic harness to use for execution.
84
+ * One of: 'claude-code' (default), 'codex', 'opencode'.
85
+ */
86
+ harness?: string;
87
+ /**
88
+ * Existing working directory to run in instead of a fresh sandbox.
89
+ * When set, spec files are NOT copied — the agent operates directly on this directory.
90
+ * Enables `environmentType: 'existing'` in the run report.
91
+ */
92
+ workdir?: string;
93
+ /**
94
+ * Shared queue for steering messages typed by the user during the run.
95
+ * The caller pushes messages here; the runner drains the queue before each
96
+ * harness execution and injects the messages into the meta-instructions file.
97
+ * Each drained message increments `steeringActionCount` in the run report.
98
+ */
99
+ steeringQueue?: string[];
28
100
  }
29
101
 
30
102
  export interface RunResult {
@@ -38,15 +110,19 @@ export interface RunResult {
38
110
  * The loop:
39
111
  * 1. Creates a sandboxed working directory under ~/.specmarket/runs/<run-id>/
40
112
  * 2. Copies spec files into the working directory
41
- * 3. Initializes git for diff tracking
42
- * 4. Executes: `cat PROMPT.md | claude-code --print` in a loop
43
- * 5. After each loop: captures tokens, duration, git diff
44
- * 6. Checks for completion conditions:
45
- * - SUCCESS: fix_plan.md empty + tests pass + all SUCCESS_CRITERIA.md criteria met
113
+ * 3. Detects spec format (specmarket | speckit | bmad | ralph | custom) and generates
114
+ * `.specmarket-runner.md` format-aware meta-instructions for the AI agent.
115
+ * 4. Initializes git for diff tracking
116
+ * 5. Executes: `cat .specmarket-runner.md | claude --print` in a loop.
117
+ * The meta-instructions tell the agent which files to read, how to find tasks,
118
+ * how to mark completion, and when the run is done — regardless of spec format.
119
+ * 6. After each loop: captures tokens, duration, git diff
120
+ * 7. Checks for completion conditions:
121
+ * - SUCCESS: TASKS.md empty + tests pass + all SUCCESS_CRITERIA.md criteria met
46
122
  * - STALL: 3 consecutive loops with no git diff
47
123
  * - FAILURE: 10 consecutive loops with same failing output
48
124
  * - BUDGET: total tokens > 2x estimated_tokens
49
- * 7. Writes run-report.json on completion
125
+ * 8. Writes run-report.json on completion
50
126
  *
51
127
  * SECURITY: Always prints sandboxing recommendation before starting.
52
128
  */
@@ -70,12 +146,20 @@ export async function runSpec(
70
146
  ? (opts.maxBudgetUsd / specYaml.estimatedCostUsd) * specYaml.estimatedTokens
71
147
  : specYaml.estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER);
72
148
 
149
+ const harness = opts.harness ?? DEFAULT_HARNESS;
73
150
  const runId = opts.resumeRunId ?? randomUUID();
74
151
  const runsBaseDir = join(homedir(), CONFIG_PATHS.RUNS_DIR);
75
- const runDir = opts.outputDir ?? join(runsBaseDir, runId);
76
152
 
77
- await mkdir(runDir, { recursive: true });
78
- debug('Run directory: %s', runDir);
153
+ // --workdir: run in the caller-provided existing directory (no file copying).
154
+ // Without --workdir: create a fresh sandbox under ~/.specmarket/runs/<run-id>/.
155
+ const usingWorkdir = opts.workdir !== undefined;
156
+ const runDir = opts.workdir ?? opts.outputDir ?? join(runsBaseDir, runId);
157
+ const environmentType: 'fresh' | 'existing' = usingWorkdir ? 'existing' : 'fresh';
158
+
159
+ if (!usingWorkdir) {
160
+ await mkdir(runDir, { recursive: true });
161
+ }
162
+ debug('Run directory: %s (environmentType=%s, harness=%s)', runDir, environmentType, harness);
79
163
 
80
164
  if (opts.dryRun) {
81
165
  debug('Dry run mode — skipping execution');
@@ -84,6 +168,11 @@ export async function runSpec(
84
168
  specVersion: specYaml.version,
85
169
  model: opts.model ?? specYaml.minModel,
86
170
  runner: specYaml.runner,
171
+ harness,
172
+ specFormat: opts.specFormat,
173
+ environmentType,
174
+ steeringActionCount: 0,
175
+ isPureRun: false,
87
176
  loopCount: 0,
88
177
  totalTokens: 0,
89
178
  totalCostUsd: 0,
@@ -111,9 +200,19 @@ export async function runSpec(
111
200
  totalTokens = existingReport.totalTokens;
112
201
  debug('Resuming from iteration %d with %d tokens carried over', startIteration, totalTokens);
113
202
  }
203
+ // Ensure meta-instructions exist in the run dir (may be missing for runs
204
+ // created before this feature was added).
205
+ await ensureMetaInstructions(specDir, runDir, opts.specFormat);
206
+ } else if (usingWorkdir) {
207
+ // --workdir: the directory already has the spec files. Just generate/refresh
208
+ // the meta-instructions so the agent knows what format it is working with.
209
+ await ensureMetaInstructions(specDir, runDir, opts.specFormat);
210
+ // Initialize git if not already a repo (best-effort — may be an existing git repo)
211
+ await initGit(runDir);
114
212
  } else {
115
- // Fresh run: copy spec files and initialize git for diff tracking
213
+ // Fresh run: copy spec files, generate meta-instructions, initialize git.
116
214
  await copySpecFiles(specDir, runDir);
215
+ await ensureMetaInstructions(specDir, runDir, opts.specFormat);
117
216
  await initGit(runDir);
118
217
  }
119
218
 
@@ -122,6 +221,14 @@ export async function runSpec(
122
221
  let consecutiveNoChange = 0;
123
222
  let lastOutput = '';
124
223
  let consecutiveSameOutput = 0;
224
+ const steeringLog: SteeringEntry[] = [];
225
+ let steeringActionCount = 0;
226
+ /**
227
+ * Counts how many times the post-task test phase has detected failures after
228
+ * all TASKS.md items were checked. When this reaches TEST_PHASE_MAX_ITERATIONS,
229
+ * the run is declared a failure — the agent could not fix the tests.
230
+ */
231
+ let testPhaseAttempts = 0;
125
232
 
126
233
  let finalStatus: RunReport['status'] = 'failure';
127
234
  let successCriteriaResults: SuccessCriterionResult[] = [];
@@ -131,11 +238,21 @@ export async function runSpec(
131
238
 
132
239
  const iterStart = Date.now();
133
240
 
134
- // Execute: cat PROMPT.md | claude-code --print
135
- const result = await executeClaudeLoop(runDir, opts.model);
241
+ // Drain steering queue and inject any pending messages before this iteration.
242
+ // Messages are appended to the meta-instructions file so the harness sees them.
243
+ const pendingMessages = opts.steeringQueue ? opts.steeringQueue.splice(0) : [];
244
+ if (pendingMessages.length > 0) {
245
+ await injectSteeringMessages(runDir, pendingMessages, steeringLog);
246
+ steeringActionCount += pendingMessages.length;
247
+ debug('Injected %d steering message(s); total steeringActionCount=%d', pendingMessages.length, steeringActionCount);
248
+ }
249
+
250
+ // Execute via the selected harness
251
+ const result = await executeHarness(runDir, harness, opts.model);
136
252
 
137
253
  const iterDuration = Date.now() - iterStart;
138
- const tokensThisLoop = parseTokensFromOutput(result.stdout);
254
+ const activeModel = opts.model ?? specYaml.minModel;
255
+ const tokensThisLoop = parseTokensFromOutput(result.stdout, activeModel);
139
256
  totalTokens += tokensThisLoop;
140
257
 
141
258
  // Capture git diff
@@ -197,17 +314,65 @@ export async function runSpec(
197
314
  lastOutput = currentOutputHash;
198
315
  }
199
316
 
200
- // SUCCESS check
201
- const completionCheck = await checkCompletion(runDir);
202
- if (completionCheck.isComplete) {
203
- debug('Success criteria met at iteration %d', i);
204
- successCriteriaResults = completionCheck.results;
205
- finalStatus = 'success';
206
- break;
207
- }
317
+ // ---- Post-task test phase ----
318
+ //
319
+ // When all TASKS.md items are checked, the runner takes over test execution:
320
+ // 1. Run the test suite and capture output.
321
+ // 2. If tests fail: write specific fix tasks to TASKS.md and TEST_FAILURES.md,
322
+ // then continue the main loop so the agent can address them.
323
+ // 3. If tests pass: check SUCCESS_CRITERIA.md — if all met, declare success.
324
+ //
325
+ // This creates a test→fix→retest cycle driven by the runner, ensuring the
326
+ // agent only receives passing runs when everything is actually green.
327
+ const tasksComplete = await isFixPlanEmpty(runDir);
328
+ if (tasksComplete) {
329
+ const testResult = await runTestsWithOutput(runDir);
330
+
331
+ if (!testResult.passed) {
332
+ testPhaseAttempts++;
333
+ debug(
334
+ 'Post-task test phase attempt %d/%d: tests failing, writing fix tasks',
335
+ testPhaseAttempts,
336
+ RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
337
+ );
338
+
339
+ if (testPhaseAttempts >= RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS) {
340
+ debug(
341
+ 'Test phase exceeded max iterations (%d), declaring failure',
342
+ RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
343
+ );
344
+ successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
345
+ finalStatus = 'failure';
346
+ break;
347
+ }
208
348
 
209
- // Update partial success criteria results for reporting
210
- successCriteriaResults = completionCheck.results;
349
+ // Write actionable fix tasks so the next harness iteration has specific work.
350
+ await writeTestFixTasks(runDir, testResult.output);
351
+ await stageAllChanges(runDir);
352
+ successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
353
+ // Continue main loop — the harness will pick up the new fix tasks.
354
+ } else {
355
+ // Tests pass — evaluate SUCCESS_CRITERIA.md for the final gate.
356
+ const criteriaResults = await evaluateSuccessCriteria(runDir);
357
+ successCriteriaResults = criteriaResults;
358
+
359
+ if (criteriaResults.every((r) => r.passed)) {
360
+ debug('All tasks done, tests pass, criteria met at iteration %d', i);
361
+ finalStatus = 'success';
362
+ break;
363
+ }
364
+
365
+ // Success criteria not yet all checked — continue loop.
366
+ // The agent must update SUCCESS_CRITERIA.md as criteria are satisfied.
367
+ debug(
368
+ 'Tests pass but not all criteria met at iteration %d; continuing',
369
+ i
370
+ );
371
+ }
372
+ } else {
373
+ // Tasks still pending — update partial results for reporting.
374
+ successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
375
+ }
211
376
  }
212
377
 
213
378
  // If we exhausted all loops without a status, mark as failure
@@ -215,15 +380,34 @@ export async function runSpec(
215
380
  successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
216
381
  }
217
382
 
383
+ // Persist steering log if any steering actions occurred during this run
384
+ if (steeringLog.length > 0) {
385
+ await writeFile(
386
+ join(runDir, 'steering-log.json'),
387
+ JSON.stringify(steeringLog, null, 2),
388
+ 'utf-8'
389
+ );
390
+ debug('Steering log written (%d entries)', steeringLog.length);
391
+ }
392
+
218
393
  const totalTimeMinutes = (Date.now() - startTime) / 60000;
219
394
  const costPerToken = specYaml.estimatedCostUsd / specYaml.estimatedTokens;
220
395
  const totalCostUsd = totalTokens * costPerToken;
221
396
 
397
+ // Auto-detect specFormat from the run directory when not provided explicitly
398
+ const detectedSpecFormat =
399
+ opts.specFormat ?? (await detectSpecFormat(runDir)).format;
400
+
222
401
  const report: RunReport = {
223
402
  runId,
224
403
  specVersion: specYaml.version,
225
404
  model: opts.model ?? specYaml.minModel,
226
405
  runner: specYaml.runner,
406
+ harness,
407
+ specFormat: detectedSpecFormat,
408
+ environmentType,
409
+ steeringActionCount,
410
+ isPureRun: finalStatus === 'success' && steeringActionCount === 0,
227
411
  loopCount: iterations.length,
228
412
  totalTokens,
229
413
  totalCostUsd,
@@ -247,10 +431,83 @@ export async function runSpec(
247
431
 
248
432
  // ---- Internal helpers ----
249
433
 
434
+ /**
435
+ * Detects the spec format from specDir and writes `.specmarket-runner.md` to runDir.
436
+ *
437
+ * Idempotent: if the file already exists in runDir it is overwritten so that
438
+ * the instructions stay consistent with the detected format.
439
+ *
440
+ * @param specDir - Source spec directory (used for format detection + sidecar data)
441
+ * @param runDir - Sandboxed run directory where the file is written
442
+ * @param formatOverride - Optional pre-detected format (skips detection when provided)
443
+ */
444
+ export async function ensureMetaInstructions(
445
+ specDir: string,
446
+ runDir: string,
447
+ formatOverride?: string
448
+ ): Promise<void> {
449
+ const format = formatOverride ?? (await detectSpecFormat(specDir)).format;
450
+ debug('Generating meta-instructions for format=%s', format);
451
+ const content = await generateMetaInstructions(specDir, format);
452
+ await writeFile(join(runDir, META_INSTRUCTION_FILENAME), content, 'utf-8');
453
+ debug('Meta-instructions written to %s/%s', runDir, META_INSTRUCTION_FILENAME);
454
+ }
455
+
456
+ /**
457
+ * Injects pending steering messages into the meta-instructions file for the
458
+ * current run directory.
459
+ *
460
+ * A "## Steering Input" section is appended to `.specmarket-runner.md` so the
461
+ * agent reads the user's guidance on its next harness invocation. Each call
462
+ * appends a timestamped section — messages accumulate across iterations so the
463
+ * agent retains the full steering history.
464
+ *
465
+ * Side effects:
466
+ * - Modifies `.specmarket-runner.md` in runDir (appends steering section)
467
+ * - Pushes `SteeringEntry` objects into `steeringLog`
468
+ *
469
+ * @param runDir - Active run directory containing the meta-instructions file
470
+ * @param messages - Steering messages to inject (already spliced from the queue)
471
+ * @param steeringLog - Mutable array collecting all steering entries for this run
472
+ */
473
+ export async function injectSteeringMessages(
474
+ runDir: string,
475
+ messages: string[],
476
+ steeringLog: SteeringEntry[]
477
+ ): Promise<void> {
478
+ if (messages.length === 0) return;
479
+
480
+ const timestamp = new Date().toISOString();
481
+ const entries: SteeringEntry[] = messages.map((content) => ({ timestamp, content }));
482
+ steeringLog.push(...entries);
483
+
484
+ const steeringSection = [
485
+ '',
486
+ `## Steering Input (injected at ${timestamp})`,
487
+ '',
488
+ 'The user has provided the following steering instructions. Incorporate them into your current work:',
489
+ '',
490
+ ...messages.map((m) => `> ${m}`),
491
+ '',
492
+ ].join('\n');
493
+
494
+ const metaPath = join(runDir, META_INSTRUCTION_FILENAME);
495
+ try {
496
+ const existing = await readFile(metaPath, 'utf-8');
497
+ await writeFile(metaPath, existing + steeringSection, 'utf-8');
498
+ } catch {
499
+ // Meta-instructions file missing — create it with just the steering section
500
+ await writeFile(metaPath, steeringSection, 'utf-8');
501
+ }
502
+
503
+ debug('injectSteeringMessages: appended %d message(s) to %s', messages.length, META_INSTRUCTION_FILENAME);
504
+ }
505
+
250
506
  async function copySpecFiles(srcDir: string, destDir: string): Promise<void> {
251
507
  const { cp } = await import('fs/promises');
252
508
  await cp(srcDir, join(destDir, 'spec'), { recursive: true });
253
- // Also copy directly to destDir so PROMPT.md is at root
509
+ // Also copy directly to destDir so spec files are accessible at the root
510
+ // of the run directory alongside the generated meta-instructions.
254
511
  await cp(srcDir, destDir, { recursive: true, force: false });
255
512
  debug('Spec files copied from %s to %s', srcDir, destDir);
256
513
  }
@@ -288,27 +545,62 @@ interface ExecuteResult {
288
545
  exitCode: number;
289
546
  }
290
547
 
291
- async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteResult> {
292
- return new Promise((resolve) => {
293
- const args = ['--print', '--output-format', 'json'];
294
- if (model) {
295
- args.push('--model', model);
548
+ /**
549
+ * Builds the shell command string for the given harness.
550
+ *
551
+ * All harnesses receive the same meta-instructions file via stdin so they
552
+ * know what format they are working with and what tasks to execute.
553
+ *
554
+ * - claude-code: `cat .specmarket-runner.md | claude --print --output-format json [--model <m>]`
555
+ * - codex: `cat .specmarket-runner.md | codex`
556
+ * - opencode: `cat .specmarket-runner.md | opencode`
557
+ */
558
+ function buildHarnessCommand(harness: string, model?: string): string {
559
+ switch (harness) {
560
+ case 'claude-code': {
561
+ const args = ['--print', '--output-format', 'json'];
562
+ if (model) args.push('--model', model);
563
+ return `cat ${META_INSTRUCTION_FILENAME} | claude ${args.join(' ')}`;
296
564
  }
565
+ case 'codex':
566
+ // Codex CLI reads from stdin; model selection is via OPENAI_MODEL env or its own flags
567
+ return `cat ${META_INSTRUCTION_FILENAME} | codex`;
568
+ case 'opencode':
569
+ // opencode reads from stdin
570
+ return `cat ${META_INSTRUCTION_FILENAME} | opencode`;
571
+ default:
572
+ // Unknown harness — fall back to claude-code behaviour
573
+ debug('Unknown harness "%s" — falling back to claude-code', harness);
574
+ return `cat ${META_INSTRUCTION_FILENAME} | claude --print --output-format json`;
575
+ }
576
+ }
297
577
 
298
- // Execute: cat PROMPT.md | claude-code --print --output-format json
299
- // Using --output-format json gives us structured output with token usage metadata.
300
- const proc = spawn('sh', ['-c', `cat PROMPT.md | claude-code ${args.join(' ')}`], {
578
+ /**
579
+ * Executes a single loop iteration via the specified harness.
580
+ *
581
+ * The meta-instructions file (`.specmarket-runner.md`) is piped into the harness
582
+ * binary as stdin. The harness is expected to read the instructions, perform the
583
+ * requested work inside the run directory, and exit with code 0 on success.
584
+ */
585
+ async function executeHarness(dir: string, harness: string, model?: string): Promise<ExecuteResult> {
586
+ const cmd = buildHarnessCommand(harness, model);
587
+ debug('executeHarness: %s (harness=%s)', cmd, harness);
588
+
589
+ return new Promise((resolve) => {
590
+ const proc = spawn('sh', ['-c', cmd], {
301
591
  cwd: dir,
302
- stdio: ['inherit', 'pipe', 'pipe'],
592
+ // stdin is 'ignore': the harness reads its instructions from the meta-instructions file
593
+ // via `cat .specmarket-runner.md | <harness>`, not from parent stdin.
594
+ // Keeping stdin detached from the parent lets the CLI read steering messages
595
+ // from process.stdin without conflict.
596
+ stdio: ['ignore', 'pipe', 'pipe'],
303
597
  });
304
598
 
305
599
  let stdout = '';
306
- let stderr = '';
307
600
  proc.stdout?.on('data', (chunk: Buffer) => {
308
601
  stdout += chunk.toString();
309
602
  });
310
603
  proc.stderr?.on('data', (chunk: Buffer) => {
311
- stderr += chunk.toString();
312
604
  // Write stderr to process stderr for visibility
313
605
  process.stderr.write(chunk);
314
606
  });
@@ -318,7 +610,7 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
318
610
  });
319
611
 
320
612
  proc.on('error', (err) => {
321
- debug('claude-code spawn error: %O', err);
613
+ debug('%s spawn error: %O', harness, err);
322
614
  resolve({ stdout: '', exitCode: 1 });
323
615
  });
324
616
  });
@@ -328,17 +620,34 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
328
620
  * Extracts token count from Claude Code's output.
329
621
  *
330
622
  * Strategy (in priority order):
331
- * 1. Parse JSON output format (claude-code --output-format json) which contains
332
- * structured metadata including token counts in the response.
623
+ * 1. Parse JSON output format (claude --output-format json) which contains
624
+ * structured metadata including token counts or cost_usd in the response.
625
+ * When only cost_usd is available (typical for Claude Code CLI), token count
626
+ * is estimated using model-aware pricing constants from MODEL_COST_PER_TOKEN.
627
+ * This estimate may deviate ±30% from the actual count depending on the
628
+ * input/output token ratio for that specific run.
333
629
  * 2. Match known text patterns from Claude Code's output (total_tokens, etc.)
334
630
  * 3. Estimate from output length as a last-resort heuristic (~4 chars per token).
335
631
  *
632
+ * @param output - Raw stdout from the Claude CLI invocation
633
+ * @param model - Model identifier (e.g. "claude-haiku-4-5", "claude-opus-4-6").
634
+ * Used to select the correct pricing tier for cost→token estimation.
635
+ * Defaults to Sonnet-tier pricing if omitted or unrecognised.
636
+ *
336
637
  * Returns 0 only if the output is empty (no meaningful work was done).
337
638
  */
338
- function parseTokensFromOutput(output: string): number {
639
+ export function parseTokensFromOutput(output: string, model?: string): number {
339
640
  if (!output || output.trim().length === 0) return 0;
340
641
 
341
- // Strategy 1: Parse JSON output format from claude-code --output-format json
642
+ // Resolve cost-per-token for this model (case-insensitive substring match)
643
+ const modelLower = (model ?? '').toLowerCase();
644
+ const costPerToken = modelLower.includes('haiku')
645
+ ? MODEL_COST_PER_TOKEN.haiku
646
+ : modelLower.includes('opus')
647
+ ? MODEL_COST_PER_TOKEN.opus
648
+ : MODEL_COST_PER_TOKEN.default;
649
+
650
+ // Strategy 1: Parse JSON output format from claude --output-format json
342
651
  // Claude Code JSON output may contain token usage info in the response metadata.
343
652
  try {
344
653
  // The output might be a single JSON object or newline-delimited JSON
@@ -368,11 +677,16 @@ function parseTokensFromOutput(output: string): number {
368
677
  const output_tokens = parsed.usage?.output_tokens ?? parsed.usage?.completion_tokens ?? 0;
369
678
  if (input > 0 || output_tokens > 0) return input + output_tokens;
370
679
 
371
- // Cost-based estimation (if cost is reported but not tokens)
372
- // Haiku: ~$0.25/MTok input, $1.25/MTok output avg ~$0.75/MTok
373
- // Sonnet: ~$3/MTok input, $15/MTok output → avg ~$9/MTok
680
+ // Cost-based estimation: Claude Code CLI typically reports cost_usd but not
681
+ // raw token counts. Use model-aware pricing for the best estimate.
374
682
  if (typeof parsed.cost_usd === 'number' && parsed.cost_usd > 0) {
375
- return Math.round(parsed.cost_usd / 0.000009); // Assume Sonnet pricing
683
+ debug(
684
+ 'parseTokensFromOutput: using cost_usd=%f with model=%s (costPerToken=%e)',
685
+ parsed.cost_usd,
686
+ model ?? 'unknown',
687
+ costPerToken
688
+ );
689
+ return Math.round(parsed.cost_usd / costPerToken);
376
690
  }
377
691
  }
378
692
  } catch {
@@ -407,8 +721,8 @@ function parseTokensFromOutput(output: string): number {
407
721
  }
408
722
 
409
723
  // Strategy 3: Estimate from output length
410
- // Rough heuristic: ~4 characters per token for English text
411
- // This is imprecise but better than returning 0 (which breaks budget tracking)
724
+ // Rough heuristic: ~4 characters per token for English text.
725
+ // This is imprecise but better than returning 0 (which breaks budget tracking).
412
726
  const estimatedTokens = Math.ceil(output.length / 4);
413
727
  debug(
414
728
  'parseTokensFromOutput: no explicit token count found, estimating %d from %d chars',
@@ -423,79 +737,171 @@ function parseIntComma(s: string): number {
423
737
  return parseInt(s.replace(/,/g, ''), 10) || 0;
424
738
  }
425
739
 
426
- interface CompletionCheck {
427
- isComplete: boolean;
428
- results: SuccessCriterionResult[];
429
- }
430
-
431
- async function checkCompletion(dir: string): Promise<CompletionCheck> {
432
- // Check 1: fix_plan.md should be empty or have only checked items
433
- const fixPlanEmpty = await isFixPlanEmpty(dir);
434
- if (!fixPlanEmpty) {
435
- return {
436
- isComplete: false,
437
- results: await evaluateSuccessCriteria(dir).catch(() => []),
438
- };
439
- }
440
-
441
- // Check 2: Run test suite if detectable
442
- const testsPass = await runTests(dir);
443
- if (!testsPass) {
444
- return {
445
- isComplete: false,
446
- results: await evaluateSuccessCriteria(dir).catch(() => []),
447
- };
448
- }
449
-
450
- // Check 3: Evaluate SUCCESS_CRITERIA.md
451
- const criteriaResults = await evaluateSuccessCriteria(dir);
452
- const allPassed = criteriaResults.every((r) => r.passed);
453
-
454
- return {
455
- isComplete: allPassed,
456
- results: criteriaResults,
457
- };
458
- }
459
-
460
740
  async function isFixPlanEmpty(dir: string): Promise<boolean> {
461
741
  try {
462
- const content = await readFile(join(dir, 'fix_plan.md'), 'utf-8');
742
+ const content = await readFile(join(dir, 'TASKS.md'), 'utf-8');
463
743
  // Consider empty if: no unchecked items (- [ ] lines)
464
744
  const hasUncheckedItems = /^- \[ \]/m.test(content);
465
745
  return !hasUncheckedItems;
466
746
  } catch {
467
- // No fix_plan.md = considered empty
747
+ // No TASKS.md = considered empty
468
748
  return true;
469
749
  }
470
750
  }
471
751
 
472
- async function runTests(dir: string): Promise<boolean> {
473
- // Try to detect and run tests
752
+ /**
753
+ * Runs the test suite in `dir` and captures the raw output.
754
+ *
755
+ * Probes for known test runner config files in priority order; skips to the
756
+ * next runner on spawn or timeout errors. Returns `{ passed: true, output: '' }`
757
+ * when no test runner is detected (cannot verify — assume passing).
758
+ *
759
+ * The raw `output` is used by `writeTestFixTasks` to extract failure details
760
+ * and write them as actionable fix tasks for the agent.
761
+ */
762
+ export async function runTestsWithOutput(dir: string): Promise<{ passed: boolean; output: string }> {
474
763
  const testRunners = [
475
- { file: 'package.json', cmd: 'npm test -- --run 2>&1 || true' },
476
- { file: 'vitest.config.ts', cmd: 'npx vitest run 2>&1 || true' },
477
- { file: 'pytest.ini', cmd: 'python -m pytest --tb=no -q 2>&1 || true' },
478
- { file: 'Makefile', cmd: 'make test 2>&1 || true' },
764
+ { file: 'package.json', cmd: 'npm test -- --run 2>&1' },
765
+ { file: 'vitest.config.ts', cmd: 'npx vitest run 2>&1' },
766
+ { file: 'pytest.ini', cmd: 'python -m pytest --tb=short -q 2>&1' },
767
+ { file: 'Makefile', cmd: 'make test 2>&1' },
479
768
  ];
480
769
 
481
770
  for (const runner of testRunners) {
482
771
  try {
483
772
  await access(join(dir, runner.file));
773
+ } catch {
774
+ continue; // Config file doesn't exist — try next runner
775
+ }
776
+
777
+ try {
484
778
  const { stdout, stderr } = await execAsync(runner.cmd, {
485
779
  cwd: dir,
486
780
  timeout: 120000,
487
781
  });
488
782
  const combined = stdout + stderr;
489
- // Heuristic: "failed" or "error" in output means tests failed
490
- const hasFailed = /\d+ failed|\d+ error|FAILED|ERROR/i.test(combined);
491
- return !hasFailed;
492
- } catch {
783
+ const hasFailed = /\d+ failed|\d+ error/i.test(combined);
784
+ return { passed: !hasFailed, output: combined };
785
+ } catch (err: unknown) {
786
+ if (err && typeof err === 'object') {
787
+ const execErr = err as { code?: number; signal?: string; stdout?: string; stderr?: string };
788
+ if (typeof execErr.code === 'number' && execErr.signal == null) {
789
+ // Process exited with a non-zero exit code — genuine test failures.
790
+ const combined = (execErr.stdout ?? '') + (execErr.stderr ?? '');
791
+ return { passed: false, output: combined };
792
+ }
793
+ }
794
+ // Timeout or spawn error — skip to next runner
493
795
  continue;
494
796
  }
495
797
  }
496
798
 
497
- // No test runner found — assume passing
498
- return true;
799
+ // No test runner detected — assume passing
800
+ return { passed: true, output: '' };
801
+ }
802
+
803
+ /**
804
+ * Extract a short list of failing test identifiers from raw test runner output.
805
+ *
806
+ * Supports:
807
+ * - Vitest/Jest: "FAIL src/foo.test.ts" file-level failures
808
+ * - Vitest/Jest: "× test name" / "✗ test name" individual test failures
809
+ * - Pytest: "FAILED tests/foo.py::test_name"
810
+ * - Generic: "N failed" summary line (fallback)
811
+ *
812
+ * Returns at most 10 entries. When specific failures cannot be parsed, returns
813
+ * a single generic entry directing the agent to TEST_FAILURES.md.
814
+ */
815
+ export function extractTestFailures(output: string): string[] {
816
+ const failures: string[] = [];
817
+
818
+ // Vitest/Jest: "FAIL src/foo.test.ts" (file-level failure)
819
+ const failFileMatches = output.match(/^FAIL\s+\S+/gm) ?? [];
820
+ for (const m of failFileMatches) {
821
+ const name = m.replace(/^FAIL\s+/, '').trim();
822
+ if (name && !failures.includes(name)) failures.push(name);
823
+ }
824
+
825
+ // Vitest/Jest: individual test "× test name" or "✗ test name" or "✕ test name"
826
+ const failTestMatches = output.match(/^[\s]*[×✗✕]\s+(.+)/gm) ?? [];
827
+ for (const m of failTestMatches) {
828
+ const name = m.replace(/^[\s]*[×✗✕]\s+/, '').trim();
829
+ if (name && !failures.includes(name)) failures.push(name);
830
+ }
831
+
832
+ // Pytest: "FAILED tests/foo.py::test_bar"
833
+ const pytestMatches = output.match(/^FAILED\s+\S+/gm) ?? [];
834
+ for (const m of pytestMatches) {
835
+ const name = m.replace(/^FAILED\s+/, '').trim();
836
+ if (name && !failures.includes(name)) failures.push(name);
837
+ }
838
+
839
+ // Generic fallback when specific test names couldn't be parsed
840
+ if (failures.length === 0) {
841
+ const summaryMatch = output.match(/(\d+)\s+failed/i);
842
+ if (summaryMatch) {
843
+ failures.push(`${summaryMatch[1]} test(s) failed — see TEST_FAILURES.md for details`);
844
+ }
845
+ }
846
+
847
+ return failures.slice(0, 10);
848
+ }
849
+
850
+ /**
851
+ * Write test failures as actionable fix tasks into TASKS.md after the runner
852
+ * detects that all implementation tasks are done but tests are still failing.
853
+ *
854
+ * Side effects:
855
+ * - Writes `TEST_FAILURES.md` with the full test output for agent reference.
856
+ * - Appends (or replaces) a "## Test Failures (Auto-Generated)" section in
857
+ * `TASKS.md` containing one `- [ ] Fix: <name>` item per failing test.
858
+ * Any previous auto-generated section is replaced to avoid duplication.
859
+ *
860
+ * The agent will see TASKS.md has unchecked items, read TEST_FAILURES.md for
861
+ * context, and work to resolve each failure before marking them `[x]`.
862
+ */
863
+ export async function writeTestFixTasks(dir: string, testOutput: string): Promise<void> {
864
+ // Always write the full output to TEST_FAILURES.md so the agent has context.
865
+ await writeFile(
866
+ join(dir, 'TEST_FAILURES.md'),
867
+ [
868
+ '# Test Failures',
869
+ '',
870
+ '> Auto-generated by SpecMarket runner. Delete this file when all tests pass.',
871
+ '',
872
+ '## Raw Test Output',
873
+ '',
874
+ '```',
875
+ testOutput.slice(0, 8000),
876
+ '```',
877
+ ].join('\n'),
878
+ 'utf-8'
879
+ );
880
+
881
+ const failures = extractTestFailures(testOutput);
882
+ if (failures.length === 0) return;
883
+
884
+ const testFixSection = [
885
+ '',
886
+ '## Test Failures (Auto-Generated)',
887
+ '> These tasks were created by the runner after detecting test failures.',
888
+ '> Fix each failing test, then delete this section and TEST_FAILURES.md.',
889
+ '',
890
+ ...failures.map((f) => `- [ ] Fix: ${f}`),
891
+ ].join('\n');
892
+
893
+ try {
894
+ const existing = await readFile(join(dir, 'TASKS.md'), 'utf-8');
895
+ // Replace any previous auto-generated section to avoid duplication.
896
+ const withoutPrevious = existing.replace(
897
+ /\n## Test Failures \(Auto-Generated\)[\s\S]*/,
898
+ ''
899
+ );
900
+ await writeFile(join(dir, 'TASKS.md'), withoutPrevious + testFixSection, 'utf-8');
901
+ } catch {
902
+ // TASKS.md doesn't exist — create it.
903
+ await writeFile(join(dir, 'TASKS.md'), `# Tasks${testFixSection}`, 'utf-8');
904
+ }
499
905
  }
500
906
 
501
907
  async function evaluateSuccessCriteria(dir: string): Promise<SuccessCriterionResult[]> {