@specmarket/cli 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-DLEMNRTH.js → chunk-OTXWWFAO.js} +24 -2
- package/dist/chunk-OTXWWFAO.js.map +1 -0
- package/dist/{config-OAU6SJLC.js → config-5JMI3YAR.js} +2 -2
- package/dist/index.js +1283 -389
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/commands/init.test.ts +162 -23
- package/src/commands/init.ts +349 -17
- package/src/commands/issues.test.ts +8 -3
- package/src/commands/issues.ts +2 -9
- package/src/commands/login.ts +2 -6
- package/src/commands/publish.test.ts +14 -1
- package/src/commands/publish.ts +1 -0
- package/src/commands/run.test.ts +206 -0
- package/src/commands/run.ts +63 -3
- package/src/commands/validate.test.ts +83 -6
- package/src/commands/validate.ts +96 -114
- package/src/lib/format-detection.test.ts +4 -4
- package/src/lib/format-detection.ts +3 -3
- package/src/lib/meta-instructions.test.ts +340 -0
- package/src/lib/meta-instructions.ts +562 -0
- package/src/lib/ralph-loop.test.ts +404 -0
- package/src/lib/ralph-loop.ts +475 -98
- package/src/lib/telemetry.ts +5 -0
- package/dist/chunk-DLEMNRTH.js.map +0 -1
- /package/dist/{config-OAU6SJLC.js.map → config-5JMI3YAR.js.map} +0 -0
package/src/lib/ralph-loop.ts
CHANGED
|
@@ -11,30 +11,64 @@ import {
|
|
|
11
11
|
RUN_DEFAULTS,
|
|
12
12
|
EXIT_CODES,
|
|
13
13
|
RUNNER_ID,
|
|
14
|
+
MODEL_COST_PER_TOKEN,
|
|
15
|
+
DEFAULT_HARNESS,
|
|
14
16
|
} from '@specmarket/shared';
|
|
15
17
|
import createDebug from 'debug';
|
|
18
|
+
import { generateMetaInstructions, META_INSTRUCTION_FILENAME } from './meta-instructions.js';
|
|
19
|
+
import { detectSpecFormat } from './format-detection.js';
|
|
16
20
|
|
|
17
21
|
const debug = createDebug('specmarket:runner');
|
|
18
22
|
const execAsync = promisify(exec);
|
|
19
23
|
|
|
20
24
|
/**
 * Pre-flight check that the CLI binary backing the selected harness can be
 * located on the PATH.
 *
 * The binary name is looked up in `HARNESS_BINARY`; a harness with no entry
 * there is probed as `claude`.
 *
 * @param harness - Harness identifier. Falls back to `DEFAULT_HARNESS` when omitted.
 * @throws Error naming the harness and the binary, followed by an install
 *   hint, when the `which` lookup fails.
 */
export async function checkClaudeCliInstalled(harness?: string): Promise<void> {
  const selected = harness ?? DEFAULT_HARNESS;
  const binaryName = HARNESS_BINARY[selected] ?? 'claude';

  let located = true;
  try {
    await execAsync(`which ${binaryName}`);
  } catch {
    // Any failure of the lookup is reported as "binary not available".
    located = false;
  }
  if (located) return;

  const installHint =
    HARNESS_INSTALL_HINT[selected] ?? `Install ${binaryName} and ensure it is in your PATH.`;
  throw new Error(
    `Harness "${selected}" binary "${binaryName}" is not installed or not in your PATH.\n\n` +
      `${installHint}\n`
  );
}
|
|
37
43
|
|
|
44
|
+
/**
 * Maps a harness identifier to the executable name that is probed on the PATH.
 * Lookups for identifiers without an entry yield `undefined`; callers supply
 * their own fallback.
 */
const HARNESS_BINARY: Record<string, string> = {
  'claude-code': 'claude',
  codex: 'codex',
  opencode: 'opencode',
};
|
|
50
|
+
|
|
51
|
+
/**
 * Human-readable installation hint per harness identifier, shown when the
 * harness binary cannot be found. Each hint lists an npm install command and
 * a URL, separated by a blank line.
 */
const HARNESS_INSTALL_HINT: Record<string, string> = {
  'claude-code': [
    'Installation instructions:',
    ' npm install -g @anthropic-ai/claude-code',
    '',
    'Or visit: https://www.anthropic.com/claude-code',
  ].join('\n'),
  codex: [
    'Installation instructions:',
    ' npm install -g @openai/codex',
    '',
    'Or visit: https://github.com/openai/codex',
  ].join('\n'),
  opencode: [
    'Installation instructions:',
    ' npm install -g opencode-ai',
    '',
    'Or visit: https://opencode.ai',
  ].join('\n'),
};
|
|
60
|
+
|
|
61
|
+
/**
 * One user steering message recorded while a run is in progress.
 *
 * The collected entries are serialized to `steering-log.json` in the run
 * directory once the run has finished.
 */
export interface SteeringEntry {
  /** Injection time as an ISO-8601 string. */
  timestamp: string;
  /** The steering text supplied by the user. */
  content: string;
}
|
|
71
|
+
|
|
38
72
|
export interface RunOptions {
|
|
39
73
|
maxLoops?: number;
|
|
40
74
|
maxBudgetUsd?: number;
|
|
@@ -43,6 +77,26 @@ export interface RunOptions {
|
|
|
43
77
|
resumeRunId?: string;
|
|
44
78
|
outputDir?: string;
|
|
45
79
|
cliVersion: string;
|
|
80
|
+
/** Spec format override. When omitted, auto-detected from specDir. */
|
|
81
|
+
specFormat?: string;
|
|
82
|
+
/**
|
|
83
|
+
* Agentic harness to use for execution.
|
|
84
|
+
* One of: 'claude-code' (default), 'codex', 'opencode'.
|
|
85
|
+
*/
|
|
86
|
+
harness?: string;
|
|
87
|
+
/**
|
|
88
|
+
* Existing working directory to run in instead of a fresh sandbox.
|
|
89
|
+
* When set, spec files are NOT copied — the agent operates directly on this directory.
|
|
90
|
+
* Enables `environmentType: 'existing'` in the run report.
|
|
91
|
+
*/
|
|
92
|
+
workdir?: string;
|
|
93
|
+
/**
|
|
94
|
+
* Shared queue for steering messages typed by the user during the run.
|
|
95
|
+
* The caller pushes messages here; the runner drains the queue before each
|
|
96
|
+
* harness execution and injects the messages into the meta-instructions file.
|
|
97
|
+
* Each drained message increments `steeringActionCount` in the run report.
|
|
98
|
+
*/
|
|
99
|
+
steeringQueue?: string[];
|
|
46
100
|
}
|
|
47
101
|
|
|
48
102
|
export interface RunResult {
|
|
@@ -56,15 +110,19 @@ export interface RunResult {
|
|
|
56
110
|
* The loop:
|
|
57
111
|
* 1. Creates a sandboxed working directory under ~/.specmarket/runs/<run-id>/
|
|
58
112
|
* 2. Copies spec files into the working directory
|
|
59
|
-
* 3.
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
113
|
+
* 3. Detects spec format (specmarket | speckit | bmad | ralph | custom) and generates
|
|
114
|
+
* `.specmarket-runner.md` — format-aware meta-instructions for the AI agent.
|
|
115
|
+
* 4. Initializes git for diff tracking
|
|
116
|
+
* 5. Executes: `cat .specmarket-runner.md | claude --print` in a loop.
|
|
117
|
+
* The meta-instructions tell the agent which files to read, how to find tasks,
|
|
118
|
+
* how to mark completion, and when the run is done — regardless of spec format.
|
|
119
|
+
* 6. After each loop: captures tokens, duration, git diff
|
|
120
|
+
* 7. Checks for completion conditions:
|
|
63
121
|
* - SUCCESS: TASKS.md empty + tests pass + all SUCCESS_CRITERIA.md criteria met
|
|
64
122
|
* - STALL: 3 consecutive loops with no git diff
|
|
65
123
|
* - FAILURE: 10 consecutive loops with same failing output
|
|
66
124
|
* - BUDGET: total tokens > 2x estimated_tokens
|
|
67
|
-
*
|
|
125
|
+
* 8. Writes run-report.json on completion
|
|
68
126
|
*
|
|
69
127
|
* SECURITY: Always prints sandboxing recommendation before starting.
|
|
70
128
|
*/
|
|
@@ -88,12 +146,20 @@ export async function runSpec(
|
|
|
88
146
|
? (opts.maxBudgetUsd / specYaml.estimatedCostUsd) * specYaml.estimatedTokens
|
|
89
147
|
: specYaml.estimatedTokens * RUN_DEFAULTS.BUDGET_MULTIPLIER);
|
|
90
148
|
|
|
149
|
+
const harness = opts.harness ?? DEFAULT_HARNESS;
|
|
91
150
|
const runId = opts.resumeRunId ?? randomUUID();
|
|
92
151
|
const runsBaseDir = join(homedir(), CONFIG_PATHS.RUNS_DIR);
|
|
93
|
-
const runDir = opts.outputDir ?? join(runsBaseDir, runId);
|
|
94
152
|
|
|
95
|
-
|
|
96
|
-
|
|
153
|
+
// --workdir: run in the caller-provided existing directory (no file copying).
|
|
154
|
+
// Without --workdir: create a fresh sandbox under ~/.specmarket/runs/<run-id>/.
|
|
155
|
+
const usingWorkdir = opts.workdir !== undefined;
|
|
156
|
+
const runDir = opts.workdir ?? opts.outputDir ?? join(runsBaseDir, runId);
|
|
157
|
+
const environmentType: 'fresh' | 'existing' = usingWorkdir ? 'existing' : 'fresh';
|
|
158
|
+
|
|
159
|
+
if (!usingWorkdir) {
|
|
160
|
+
await mkdir(runDir, { recursive: true });
|
|
161
|
+
}
|
|
162
|
+
debug('Run directory: %s (environmentType=%s, harness=%s)', runDir, environmentType, harness);
|
|
97
163
|
|
|
98
164
|
if (opts.dryRun) {
|
|
99
165
|
debug('Dry run mode — skipping execution');
|
|
@@ -102,6 +168,11 @@ export async function runSpec(
|
|
|
102
168
|
specVersion: specYaml.version,
|
|
103
169
|
model: opts.model ?? specYaml.minModel,
|
|
104
170
|
runner: specYaml.runner,
|
|
171
|
+
harness,
|
|
172
|
+
specFormat: opts.specFormat,
|
|
173
|
+
environmentType,
|
|
174
|
+
steeringActionCount: 0,
|
|
175
|
+
isPureRun: false,
|
|
105
176
|
loopCount: 0,
|
|
106
177
|
totalTokens: 0,
|
|
107
178
|
totalCostUsd: 0,
|
|
@@ -129,9 +200,19 @@ export async function runSpec(
|
|
|
129
200
|
totalTokens = existingReport.totalTokens;
|
|
130
201
|
debug('Resuming from iteration %d with %d tokens carried over', startIteration, totalTokens);
|
|
131
202
|
}
|
|
203
|
+
// Ensure meta-instructions exist in the run dir (may be missing for runs
|
|
204
|
+
// created before this feature was added).
|
|
205
|
+
await ensureMetaInstructions(specDir, runDir, opts.specFormat);
|
|
206
|
+
} else if (usingWorkdir) {
|
|
207
|
+
// --workdir: the directory already has the spec files. Just generate/refresh
|
|
208
|
+
// the meta-instructions so the agent knows what format it is working with.
|
|
209
|
+
await ensureMetaInstructions(specDir, runDir, opts.specFormat);
|
|
210
|
+
// Initialize git if not already a repo (best-effort — may be an existing git repo)
|
|
211
|
+
await initGit(runDir);
|
|
132
212
|
} else {
|
|
133
|
-
// Fresh run: copy spec files
|
|
213
|
+
// Fresh run: copy spec files, generate meta-instructions, initialize git.
|
|
134
214
|
await copySpecFiles(specDir, runDir);
|
|
215
|
+
await ensureMetaInstructions(specDir, runDir, opts.specFormat);
|
|
135
216
|
await initGit(runDir);
|
|
136
217
|
}
|
|
137
218
|
|
|
@@ -140,6 +221,14 @@ export async function runSpec(
|
|
|
140
221
|
let consecutiveNoChange = 0;
|
|
141
222
|
let lastOutput = '';
|
|
142
223
|
let consecutiveSameOutput = 0;
|
|
224
|
+
const steeringLog: SteeringEntry[] = [];
|
|
225
|
+
let steeringActionCount = 0;
|
|
226
|
+
/**
|
|
227
|
+
* Counts how many times the post-task test phase has detected failures after
|
|
228
|
+
* all TASKS.md items were checked. When this reaches TEST_PHASE_MAX_ITERATIONS,
|
|
229
|
+
* the run is declared a failure — the agent could not fix the tests.
|
|
230
|
+
*/
|
|
231
|
+
let testPhaseAttempts = 0;
|
|
143
232
|
|
|
144
233
|
let finalStatus: RunReport['status'] = 'failure';
|
|
145
234
|
let successCriteriaResults: SuccessCriterionResult[] = [];
|
|
@@ -149,11 +238,21 @@ export async function runSpec(
|
|
|
149
238
|
|
|
150
239
|
const iterStart = Date.now();
|
|
151
240
|
|
|
152
|
-
//
|
|
153
|
-
|
|
241
|
+
// Drain steering queue and inject any pending messages before this iteration.
|
|
242
|
+
// Messages are appended to the meta-instructions file so the harness sees them.
|
|
243
|
+
const pendingMessages = opts.steeringQueue ? opts.steeringQueue.splice(0) : [];
|
|
244
|
+
if (pendingMessages.length > 0) {
|
|
245
|
+
await injectSteeringMessages(runDir, pendingMessages, steeringLog);
|
|
246
|
+
steeringActionCount += pendingMessages.length;
|
|
247
|
+
debug('Injected %d steering message(s); total steeringActionCount=%d', pendingMessages.length, steeringActionCount);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Execute via the selected harness
|
|
251
|
+
const result = await executeHarness(runDir, harness, opts.model);
|
|
154
252
|
|
|
155
253
|
const iterDuration = Date.now() - iterStart;
|
|
156
|
-
const
|
|
254
|
+
const activeModel = opts.model ?? specYaml.minModel;
|
|
255
|
+
const tokensThisLoop = parseTokensFromOutput(result.stdout, activeModel);
|
|
157
256
|
totalTokens += tokensThisLoop;
|
|
158
257
|
|
|
159
258
|
// Capture git diff
|
|
@@ -215,17 +314,65 @@ export async function runSpec(
|
|
|
215
314
|
lastOutput = currentOutputHash;
|
|
216
315
|
}
|
|
217
316
|
|
|
218
|
-
//
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
317
|
+
// ---- Post-task test phase ----
|
|
318
|
+
//
|
|
319
|
+
// When all TASKS.md items are checked, the runner takes over test execution:
|
|
320
|
+
// 1. Run the test suite and capture output.
|
|
321
|
+
// 2. If tests fail: write specific fix tasks to TASKS.md and TEST_FAILURES.md,
|
|
322
|
+
// then continue the main loop so the agent can address them.
|
|
323
|
+
// 3. If tests pass: check SUCCESS_CRITERIA.md — if all met, declare success.
|
|
324
|
+
//
|
|
325
|
+
// This creates a test→fix→retest cycle driven by the runner, ensuring the
|
|
326
|
+
// agent only receives passing runs when everything is actually green.
|
|
327
|
+
const tasksComplete = await isFixPlanEmpty(runDir);
|
|
328
|
+
if (tasksComplete) {
|
|
329
|
+
const testResult = await runTestsWithOutput(runDir);
|
|
330
|
+
|
|
331
|
+
if (!testResult.passed) {
|
|
332
|
+
testPhaseAttempts++;
|
|
333
|
+
debug(
|
|
334
|
+
'Post-task test phase attempt %d/%d: tests failing, writing fix tasks',
|
|
335
|
+
testPhaseAttempts,
|
|
336
|
+
RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
|
|
337
|
+
);
|
|
338
|
+
|
|
339
|
+
if (testPhaseAttempts >= RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS) {
|
|
340
|
+
debug(
|
|
341
|
+
'Test phase exceeded max iterations (%d), declaring failure',
|
|
342
|
+
RUN_DEFAULTS.TEST_PHASE_MAX_ITERATIONS
|
|
343
|
+
);
|
|
344
|
+
successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
|
|
345
|
+
finalStatus = 'failure';
|
|
346
|
+
break;
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
// Write actionable fix tasks so the next harness iteration has specific work.
|
|
350
|
+
await writeTestFixTasks(runDir, testResult.output);
|
|
351
|
+
await stageAllChanges(runDir);
|
|
352
|
+
successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
|
|
353
|
+
// Continue main loop — the harness will pick up the new fix tasks.
|
|
354
|
+
} else {
|
|
355
|
+
// Tests pass — evaluate SUCCESS_CRITERIA.md for the final gate.
|
|
356
|
+
const criteriaResults = await evaluateSuccessCriteria(runDir);
|
|
357
|
+
successCriteriaResults = criteriaResults;
|
|
358
|
+
|
|
359
|
+
if (criteriaResults.every((r) => r.passed)) {
|
|
360
|
+
debug('All tasks done, tests pass, criteria met at iteration %d', i);
|
|
361
|
+
finalStatus = 'success';
|
|
362
|
+
break;
|
|
363
|
+
}
|
|
226
364
|
|
|
227
|
-
|
|
228
|
-
|
|
365
|
+
// Success criteria not yet all checked — continue loop.
|
|
366
|
+
// The agent must update SUCCESS_CRITERIA.md as criteria are satisfied.
|
|
367
|
+
debug(
|
|
368
|
+
'Tests pass but not all criteria met at iteration %d; continuing',
|
|
369
|
+
i
|
|
370
|
+
);
|
|
371
|
+
}
|
|
372
|
+
} else {
|
|
373
|
+
// Tasks still pending — update partial results for reporting.
|
|
374
|
+
successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
|
|
375
|
+
}
|
|
229
376
|
}
|
|
230
377
|
|
|
231
378
|
// If we exhausted all loops without a status, mark as failure
|
|
@@ -233,15 +380,34 @@ export async function runSpec(
|
|
|
233
380
|
successCriteriaResults = await evaluateSuccessCriteria(runDir).catch(() => []);
|
|
234
381
|
}
|
|
235
382
|
|
|
383
|
+
// Persist steering log if any steering actions occurred during this run
|
|
384
|
+
if (steeringLog.length > 0) {
|
|
385
|
+
await writeFile(
|
|
386
|
+
join(runDir, 'steering-log.json'),
|
|
387
|
+
JSON.stringify(steeringLog, null, 2),
|
|
388
|
+
'utf-8'
|
|
389
|
+
);
|
|
390
|
+
debug('Steering log written (%d entries)', steeringLog.length);
|
|
391
|
+
}
|
|
392
|
+
|
|
236
393
|
const totalTimeMinutes = (Date.now() - startTime) / 60000;
|
|
237
394
|
const costPerToken = specYaml.estimatedCostUsd / specYaml.estimatedTokens;
|
|
238
395
|
const totalCostUsd = totalTokens * costPerToken;
|
|
239
396
|
|
|
397
|
+
// Auto-detect specFormat from the run directory when not provided explicitly
|
|
398
|
+
const detectedSpecFormat =
|
|
399
|
+
opts.specFormat ?? (await detectSpecFormat(runDir)).format;
|
|
400
|
+
|
|
240
401
|
const report: RunReport = {
|
|
241
402
|
runId,
|
|
242
403
|
specVersion: specYaml.version,
|
|
243
404
|
model: opts.model ?? specYaml.minModel,
|
|
244
405
|
runner: specYaml.runner,
|
|
406
|
+
harness,
|
|
407
|
+
specFormat: detectedSpecFormat,
|
|
408
|
+
environmentType,
|
|
409
|
+
steeringActionCount,
|
|
410
|
+
isPureRun: finalStatus === 'success' && steeringActionCount === 0,
|
|
245
411
|
loopCount: iterations.length,
|
|
246
412
|
totalTokens,
|
|
247
413
|
totalCostUsd,
|
|
@@ -265,10 +431,83 @@ export async function runSpec(
|
|
|
265
431
|
|
|
266
432
|
// ---- Internal helpers ----
|
|
267
433
|
|
|
434
|
+
/**
 * Generates the meta-instructions for a spec and writes them to
 * `META_INSTRUCTION_FILENAME` inside the run directory.
 *
 * The file is written unconditionally, so a copy that already exists in
 * `runDir` is replaced by the freshly generated content.
 *
 * @param specDir - Spec directory handed to format detection and to the generator
 * @param runDir - Directory that receives the meta-instructions file
 * @param formatOverride - Format to use as-is; detection on `specDir` only
 *   runs when this is omitted
 */
export async function ensureMetaInstructions(
  specDir: string,
  runDir: string,
  formatOverride?: string
): Promise<void> {
  const format =
    formatOverride == null ? (await detectSpecFormat(specDir)).format : formatOverride;
  debug('Generating meta-instructions for format=%s', format);

  const targetPath = join(runDir, META_INSTRUCTION_FILENAME);
  const content = await generateMetaInstructions(specDir, format);
  await writeFile(targetPath, content, 'utf-8');

  debug('Meta-instructions written to %s/%s', runDir, META_INSTRUCTION_FILENAME);
}
|
|
455
|
+
|
|
456
|
+
/**
 * Appends pending steering messages to the meta-instructions file of a run.
 *
 * A "## Steering Input" section stamped with the current time is added to the
 * end of `META_INSTRUCTION_FILENAME` in `runDir`. Existing content is kept, so
 * sections from earlier calls accumulate. When the file does not exist yet it
 * is created containing only the new section.
 *
 * Every line of every message is rendered as a Markdown blockquote line, so a
 * multi-line message cannot break out of the quote and be read as part of the
 * surrounding instructions.
 *
 * Side effects:
 * - Rewrites the meta-instructions file in `runDir`.
 * - Pushes one `SteeringEntry` per message into `steeringLog`, but only after
 *   the file has been written, so the log never lists a message that was not
 *   actually injected.
 *
 * @param runDir - Run directory containing (or receiving) the meta-instructions file
 * @param messages - Steering messages to inject; an empty array is a no-op
 * @param steeringLog - Mutable array collecting the steering entries of the run
 * @throws Re-throws any filesystem error other than the file being absent on read
 */
export async function injectSteeringMessages(
  runDir: string,
  messages: string[],
  steeringLog: SteeringEntry[]
): Promise<void> {
  if (messages.length === 0) return;

  const timestamp = new Date().toISOString();

  // Prefix each physical line so embedded newlines stay inside the blockquote.
  const quotedLines = messages.flatMap((message) =>
    message.split(/\r?\n/).map((line) => `> ${line}`)
  );

  const steeringSection = [
    '',
    `## Steering Input (injected at ${timestamp})`,
    '',
    'The user has provided the following steering instructions. Incorporate them into your current work:',
    '',
    ...quotedLines,
    '',
  ].join('\n');

  const metaPath = join(runDir, META_INSTRUCTION_FILENAME);

  let existing = '';
  try {
    existing = await readFile(metaPath, 'utf-8');
  } catch (err: unknown) {
    // Only a missing file means "start from scratch". Any other read failure
    // must not lead to overwriting the instructions with just this section.
    const code = (err as { code?: unknown } | null)?.code;
    if (code !== 'ENOENT') throw err;
  }

  await writeFile(metaPath, existing + steeringSection, 'utf-8');

  // Record the entries only once the injection has actually been persisted.
  steeringLog.push(...messages.map((content) => ({ timestamp, content })));

  debug('injectSteeringMessages: appended %d message(s) to %s', messages.length, META_INSTRUCTION_FILENAME);
}
|
|
505
|
+
|
|
268
506
|
/**
 * Copies the spec directory into the run directory twice: once under a
 * `spec/` subdirectory and once directly at the root of `destDir`, so the
 * spec files sit next to the generated meta-instructions.
 *
 * @param srcDir - Source spec directory
 * @param destDir - Run directory that receives both copies
 */
async function copySpecFiles(srcDir: string, destDir: string): Promise<void> {
  const fsPromises = await import('fs/promises');

  const specSubdir = join(destDir, 'spec');
  await fsPromises.cp(srcDir, specSubdir, { recursive: true });

  // Root-level copy is made with `force: false` (no overwrite of existing
  // destinations, per Node's fs.cp option semantics).
  await fsPromises.cp(srcDir, destDir, { recursive: true, force: false });

  debug('Spec files copied from %s to %s', srcDir, destDir);
}
|
|
@@ -306,27 +545,62 @@ interface ExecuteResult {
|
|
|
306
545
|
exitCode: number;
|
|
307
546
|
}
|
|
308
547
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
548
|
+
/**
 * Builds the shell command string used to run one iteration of a harness.
 *
 * Every harness gets the meta-instructions file piped to its stdin:
 * - claude-code: `cat <file> | claude --print --output-format json [--model <m>]`
 * - codex:       `cat <file> | codex`
 * - opencode:    `cat <file> | opencode`
 *
 * An unrecognised harness is treated exactly like `claude-code`, including
 * the `--model` flag, after a debug message is logged.
 *
 * The returned string is executed through `sh -c`, so the model value is
 * single-quoted whenever it contains anything other than plain identifier
 * characters. Ordinary model names are emitted unchanged.
 *
 * @param harness - Harness identifier
 * @param model - Optional model name; only forwarded to the `claude` binary
 * @returns Command line to hand to `sh -c`
 */
function buildHarnessCommand(harness: string, model?: string): string {
  // Quote for POSIX sh: leave safe tokens untouched, otherwise wrap in single
  // quotes and escape embedded single quotes as '\''.
  const quoteForShell = (value: string): string =>
    /^[A-Za-z0-9_.:\/@=+-]+$/.test(value) ? value : `'${value.replace(/'/g, `'\\''`)}'`;

  switch (harness) {
    case 'codex':
      // No model flag is passed to codex; it only receives the instructions on stdin.
      return `cat ${META_INSTRUCTION_FILENAME} | codex`;
    case 'opencode':
      // opencode likewise only receives the instructions on stdin.
      return `cat ${META_INSTRUCTION_FILENAME} | opencode`;
    default: {
      if (harness !== 'claude-code') {
        // Unknown harness — fall back to claude-code behaviour
        debug('Unknown harness "%s" — falling back to claude-code', harness);
      }
      const args = ['--print', '--output-format', 'json'];
      if (model) args.push('--model', quoteForShell(model));
      return `cat ${META_INSTRUCTION_FILENAME} | claude ${args.join(' ')}`;
    }
  }
}
|
|
315
577
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
578
|
+
/**
|
|
579
|
+
* Executes a single loop iteration via the specified harness.
|
|
580
|
+
*
|
|
581
|
+
* The meta-instructions file (`.specmarket-runner.md`) is piped into the harness
|
|
582
|
+
* binary as stdin. The harness is expected to read the instructions, perform the
|
|
583
|
+
* requested work inside the run directory, and exit with code 0 on success.
|
|
584
|
+
*/
|
|
585
|
+
async function executeHarness(dir: string, harness: string, model?: string): Promise<ExecuteResult> {
|
|
586
|
+
const cmd = buildHarnessCommand(harness, model);
|
|
587
|
+
debug('executeHarness: %s (harness=%s)', cmd, harness);
|
|
588
|
+
|
|
589
|
+
return new Promise((resolve) => {
|
|
590
|
+
const proc = spawn('sh', ['-c', cmd], {
|
|
319
591
|
cwd: dir,
|
|
320
|
-
|
|
592
|
+
// stdin is 'ignore': the harness reads its instructions from the meta-instructions file
|
|
593
|
+
// via `cat .specmarket-runner.md | <harness>`, not from parent stdin.
|
|
594
|
+
// Keeping stdin detached from the parent lets the CLI read steering messages
|
|
595
|
+
// from process.stdin without conflict.
|
|
596
|
+
stdio: ['ignore', 'pipe', 'pipe'],
|
|
321
597
|
});
|
|
322
598
|
|
|
323
599
|
let stdout = '';
|
|
324
|
-
let stderr = '';
|
|
325
600
|
proc.stdout?.on('data', (chunk: Buffer) => {
|
|
326
601
|
stdout += chunk.toString();
|
|
327
602
|
});
|
|
328
603
|
proc.stderr?.on('data', (chunk: Buffer) => {
|
|
329
|
-
stderr += chunk.toString();
|
|
330
604
|
// Write stderr to process stderr for visibility
|
|
331
605
|
process.stderr.write(chunk);
|
|
332
606
|
});
|
|
@@ -336,7 +610,7 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
|
|
|
336
610
|
});
|
|
337
611
|
|
|
338
612
|
proc.on('error', (err) => {
|
|
339
|
-
debug('
|
|
613
|
+
debug('%s spawn error: %O', harness, err);
|
|
340
614
|
resolve({ stdout: '', exitCode: 1 });
|
|
341
615
|
});
|
|
342
616
|
});
|
|
@@ -347,15 +621,32 @@ async function executeClaudeLoop(dir: string, model?: string): Promise<ExecuteRe
|
|
|
347
621
|
*
|
|
348
622
|
* Strategy (in priority order):
|
|
349
623
|
* 1. Parse JSON output format (claude --output-format json) which contains
|
|
350
|
-
* structured metadata including token counts in the response.
|
|
624
|
+
* structured metadata including token counts or cost_usd in the response.
|
|
625
|
+
* When only cost_usd is available (typical for Claude Code CLI), token count
|
|
626
|
+
* is estimated using model-aware pricing constants from MODEL_COST_PER_TOKEN.
|
|
627
|
+
* This estimate may deviate ±30% from the actual count depending on the
|
|
628
|
+
* input/output token ratio for that specific run.
|
|
351
629
|
* 2. Match known text patterns from Claude Code's output (total_tokens, etc.)
|
|
352
630
|
* 3. Estimate from output length as a last-resort heuristic (~4 chars per token).
|
|
353
631
|
*
|
|
632
|
+
* @param output - Raw stdout from the Claude CLI invocation
|
|
633
|
+
* @param model - Model identifier (e.g. "claude-haiku-4-5", "claude-opus-4-6").
|
|
634
|
+
* Used to select the correct pricing tier for cost→token estimation.
|
|
635
|
+
* Defaults to Sonnet-tier pricing if omitted or unrecognised.
|
|
636
|
+
*
|
|
354
637
|
* Returns 0 only if the output is empty (no meaningful work was done).
|
|
355
638
|
*/
|
|
356
|
-
function parseTokensFromOutput(output: string): number {
|
|
639
|
+
export function parseTokensFromOutput(output: string, model?: string): number {
|
|
357
640
|
if (!output || output.trim().length === 0) return 0;
|
|
358
641
|
|
|
642
|
+
// Resolve cost-per-token for this model (case-insensitive substring match)
|
|
643
|
+
const modelLower = (model ?? '').toLowerCase();
|
|
644
|
+
const costPerToken = modelLower.includes('haiku')
|
|
645
|
+
? MODEL_COST_PER_TOKEN.haiku
|
|
646
|
+
: modelLower.includes('opus')
|
|
647
|
+
? MODEL_COST_PER_TOKEN.opus
|
|
648
|
+
: MODEL_COST_PER_TOKEN.default;
|
|
649
|
+
|
|
359
650
|
// Strategy 1: Parse JSON output format from claude --output-format json
|
|
360
651
|
// Claude Code JSON output may contain token usage info in the response metadata.
|
|
361
652
|
try {
|
|
@@ -386,11 +677,16 @@ function parseTokensFromOutput(output: string): number {
|
|
|
386
677
|
const output_tokens = parsed.usage?.output_tokens ?? parsed.usage?.completion_tokens ?? 0;
|
|
387
678
|
if (input > 0 || output_tokens > 0) return input + output_tokens;
|
|
388
679
|
|
|
389
|
-
// Cost-based estimation
|
|
390
|
-
//
|
|
391
|
-
// Sonnet: ~$3/MTok input, $15/MTok output → avg ~$9/MTok
|
|
680
|
+
// Cost-based estimation: Claude Code CLI typically reports cost_usd but not
|
|
681
|
+
// raw token counts. Use model-aware pricing for the best estimate.
|
|
392
682
|
if (typeof parsed.cost_usd === 'number' && parsed.cost_usd > 0) {
|
|
393
|
-
|
|
683
|
+
debug(
|
|
684
|
+
'parseTokensFromOutput: using cost_usd=%f with model=%s (costPerToken=%e)',
|
|
685
|
+
parsed.cost_usd,
|
|
686
|
+
model ?? 'unknown',
|
|
687
|
+
costPerToken
|
|
688
|
+
);
|
|
689
|
+
return Math.round(parsed.cost_usd / costPerToken);
|
|
394
690
|
}
|
|
395
691
|
}
|
|
396
692
|
} catch {
|
|
@@ -425,8 +721,8 @@ function parseTokensFromOutput(output: string): number {
|
|
|
425
721
|
}
|
|
426
722
|
|
|
427
723
|
// Strategy 3: Estimate from output length
|
|
428
|
-
// Rough heuristic: ~4 characters per token for English text
|
|
429
|
-
// This is imprecise but better than returning 0 (which breaks budget tracking)
|
|
724
|
+
// Rough heuristic: ~4 characters per token for English text.
|
|
725
|
+
// This is imprecise but better than returning 0 (which breaks budget tracking).
|
|
430
726
|
const estimatedTokens = Math.ceil(output.length / 4);
|
|
431
727
|
debug(
|
|
432
728
|
'parseTokensFromOutput: no explicit token count found, estimating %d from %d chars',
|
|
@@ -441,40 +737,6 @@ function parseIntComma(s: string): number {
|
|
|
441
737
|
return parseInt(s.replace(/,/g, ''), 10) || 0;
|
|
442
738
|
}
|
|
443
739
|
|
|
444
|
-
interface CompletionCheck {
|
|
445
|
-
isComplete: boolean;
|
|
446
|
-
results: SuccessCriterionResult[];
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
async function checkCompletion(dir: string): Promise<CompletionCheck> {
|
|
450
|
-
// Check 1: TASKS.md should be empty or have only checked items
|
|
451
|
-
const fixPlanEmpty = await isFixPlanEmpty(dir);
|
|
452
|
-
if (!fixPlanEmpty) {
|
|
453
|
-
return {
|
|
454
|
-
isComplete: false,
|
|
455
|
-
results: await evaluateSuccessCriteria(dir).catch(() => []),
|
|
456
|
-
};
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
// Check 2: Run test suite if detectable
|
|
460
|
-
const testsPass = await runTests(dir);
|
|
461
|
-
if (!testsPass) {
|
|
462
|
-
return {
|
|
463
|
-
isComplete: false,
|
|
464
|
-
results: await evaluateSuccessCriteria(dir).catch(() => []),
|
|
465
|
-
};
|
|
466
|
-
}
|
|
467
|
-
|
|
468
|
-
// Check 3: Evaluate SUCCESS_CRITERIA.md
|
|
469
|
-
const criteriaResults = await evaluateSuccessCriteria(dir);
|
|
470
|
-
const allPassed = criteriaResults.every((r) => r.passed);
|
|
471
|
-
|
|
472
|
-
return {
|
|
473
|
-
isComplete: allPassed,
|
|
474
|
-
results: criteriaResults,
|
|
475
|
-
};
|
|
476
|
-
}
|
|
477
|
-
|
|
478
740
|
async function isFixPlanEmpty(dir: string): Promise<boolean> {
|
|
479
741
|
try {
|
|
480
742
|
const content = await readFile(join(dir, 'TASKS.md'), 'utf-8');
|
|
@@ -487,13 +749,21 @@ async function isFixPlanEmpty(dir: string): Promise<boolean> {
|
|
|
487
749
|
}
|
|
488
750
|
}
|
|
489
751
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
752
|
+
/**
|
|
753
|
+
* Runs the test suite in `dir` and captures the raw output.
|
|
754
|
+
*
|
|
755
|
+
* Probes for known test runner config files in priority order; skips to the
|
|
756
|
+
* next runner on spawn or timeout errors. Returns `{ passed: true, output: '' }`
|
|
757
|
+
* when no test runner is detected (cannot verify — assume passing).
|
|
758
|
+
*
|
|
759
|
+
* The raw `output` is used by `writeTestFixTasks` to extract failure details
|
|
760
|
+
* and write them as actionable fix tasks for the agent.
|
|
761
|
+
*/
|
|
762
|
+
export async function runTestsWithOutput(dir: string): Promise<{ passed: boolean; output: string }> {
|
|
493
763
|
const testRunners = [
|
|
494
764
|
{ file: 'package.json', cmd: 'npm test -- --run 2>&1' },
|
|
495
765
|
{ file: 'vitest.config.ts', cmd: 'npx vitest run 2>&1' },
|
|
496
|
-
{ file: 'pytest.ini', cmd: 'python -m pytest --tb=
|
|
766
|
+
{ file: 'pytest.ini', cmd: 'python -m pytest --tb=short -q 2>&1' },
|
|
497
767
|
{ file: 'Makefile', cmd: 'make test 2>&1' },
|
|
498
768
|
];
|
|
499
769
|
|
|
@@ -509,22 +779,129 @@ async function runTests(dir: string): Promise<boolean> {
|
|
|
509
779
|
cwd: dir,
|
|
510
780
|
timeout: 120000,
|
|
511
781
|
});
|
|
512
|
-
// Exit code 0 — check output as secondary signal
|
|
513
782
|
const combined = stdout + stderr;
|
|
514
783
|
const hasFailed = /\d+ failed|\d+ error/i.test(combined);
|
|
515
|
-
return !hasFailed;
|
|
784
|
+
return { passed: !hasFailed, output: combined };
|
|
516
785
|
} catch (err: unknown) {
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
786
|
+
if (err && typeof err === 'object') {
|
|
787
|
+
const execErr = err as { code?: number; signal?: string; stdout?: string; stderr?: string };
|
|
788
|
+
if (typeof execErr.code === 'number' && execErr.signal == null) {
|
|
789
|
+
// Process exited with a non-zero exit code — genuine test failures.
|
|
790
|
+
const combined = (execErr.stdout ?? '') + (execErr.stderr ?? '');
|
|
791
|
+
return { passed: false, output: combined };
|
|
792
|
+
}
|
|
520
793
|
}
|
|
521
|
-
// Timeout or
|
|
794
|
+
// Timeout or spawn error — skip to next runner
|
|
522
795
|
continue;
|
|
523
796
|
}
|
|
524
797
|
}
|
|
525
798
|
|
|
526
|
-
// No test runner
|
|
527
|
-
return true;
|
|
799
|
+
// No test runner detected — assume passing
|
|
800
|
+
return { passed: true, output: '' };
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
/**
 * Extract a short list of failing test identifiers from raw test runner output.
 *
 * Recognised patterns, collected in this order:
 *   - Vitest/Jest: "FAIL src/foo.test.ts" file-level failures. Leading blanks
 *     are tolerated because Vitest indents its " FAIL " badge.
 *   - Vitest/Jest: "× test name" / "✗ test name" / "✕ test name" individual
 *     test failures (optionally indented).
 *   - Pytest: "FAILED tests/foo.py::test_name"
 *   - Generic: "N failed" summary line — used only when none of the patterns
 *     above produced an entry.
 *
 * Every pattern is matched within a single line, so a bare "FAIL" line never
 * captures the first token of the following line.
 *
 * @param output - Raw (combined stdout + stderr) text printed by the test runner.
 * @returns At most 10 unique entries. When no specific failure can be parsed
 *   but an "N failed" summary exists, a single generic entry pointing at
 *   TEST_FAILURES.md is returned. When nothing matches at all, the result is
 *   an empty array.
 */
export function extractTestFailures(output: string): string[] {
  const failures: string[] = [];
  const seen = new Set<string>();

  // Record a candidate once, preserving first-seen order.
  const record = (raw: string | undefined): void => {
    const name = (raw ?? '').trim();
    if (name && !seen.has(name)) {
      seen.add(name);
      failures.push(name);
    }
  };

  // Vitest/Jest: "FAIL src/foo.test.ts" (file-level failure).
  // `[ \t]` instead of `\s` keeps the match on one line.
  for (const m of output.matchAll(/^[ \t]*FAIL[ \t]+(\S+)/gm)) record(m[1]);

  // Vitest/Jest: individual test "× test name" or "✗ test name" or "✕ test name"
  for (const m of output.matchAll(/^[ \t]*[×✗✕][ \t]+(.+)/gm)) record(m[1]);

  // Pytest: "FAILED tests/foo.py::test_bar"
  for (const m of output.matchAll(/^FAILED[ \t]+(\S+)/gm)) record(m[1]);

  // Generic fallback when specific test names couldn't be parsed
  if (failures.length === 0) {
    const summaryMatch = output.match(/(\d+)\s+failed/i);
    if (summaryMatch) {
      failures.push(`${summaryMatch[1]} test(s) failed — see TEST_FAILURES.md for details`);
    }
  }

  return failures.slice(0, 10);
}
|
|
849
|
+
|
|
850
|
+
/**
 * Write test failures as actionable fix tasks into TASKS.md after the runner
 * detects that all implementation tasks are done but tests are still failing.
 *
 * Side effects:
 *   - Writes `TEST_FAILURES.md` with the first 8000 characters of the test
 *     output for agent reference. This happens even when no individual
 *     failure can be parsed.
 *   - Appends a "## Test Failures (Auto-Generated)" section to `TASKS.md`
 *     containing one `- [ ] Fix: <name>` item per failing test. Previous
 *     auto-generated sections are removed first to avoid duplication; removal
 *     stops at the next level-2 heading so unrelated sections that follow the
 *     auto-generated one are preserved.
 *   - Creates `TASKS.md` when it does not exist.
 *
 * The agent will see TASKS.md has unchecked items, read TEST_FAILURES.md for
 * context, and work to resolve each failure before marking them `[x]`.
 *
 * @param dir - Directory containing (or to contain) TASKS.md and TEST_FAILURES.md.
 * @param testOutput - Raw output captured from the test runner.
 * @throws Propagates file-system errors other than a missing TASKS.md, so an
 *   unreadable or unwritable TASKS.md is never replaced by a stub.
 */
export async function writeTestFixTasks(dir: string, testOutput: string): Promise<void> {
  // Always write the full output to TEST_FAILURES.md so the agent has context.
  await writeFile(
    join(dir, 'TEST_FAILURES.md'),
    [
      '# Test Failures',
      '',
      '> Auto-generated by SpecMarket runner. Delete this file when all tests pass.',
      '',
      '## Raw Test Output',
      '',
      '```',
      testOutput.slice(0, 8000),
      '```',
    ].join('\n'),
    'utf-8'
  );

  const failures = extractTestFailures(testOutput);
  if (failures.length === 0) return;

  const testFixSection = [
    '',
    '## Test Failures (Auto-Generated)',
    '> These tasks were created by the runner after detecting test failures.',
    '> Fix each failing test, then delete this section and TEST_FAILURES.md.',
    '',
    ...failures.map((f) => `- [ ] Fix: ${f}`),
  ].join('\n');

  const tasksPath = join(dir, 'TASKS.md');

  // Only the read is guarded: a failed *write* must never fall through to the
  // "create a fresh file" path, which would discard the existing task list.
  let existing: string | undefined;
  try {
    existing = await readFile(tasksPath, 'utf-8');
  } catch (err: unknown) {
    const code =
      err !== null && typeof err === 'object' ? (err as { code?: unknown }).code : undefined;
    // Anything other than "file not found" is a real problem — surface it.
    if (code !== 'ENOENT') throw err;
  }

  if (existing === undefined) {
    // TASKS.md doesn't exist — create it.
    await writeFile(tasksPath, `# Tasks${testFixSection}`, 'utf-8');
    return;
  }

  // Replace any previous auto-generated section to avoid duplication. The lazy
  // match ends at the next "## " heading (or end of file) so content that was
  // added after the auto-generated section survives.
  const withoutPrevious = existing.replace(
    /\n## Test Failures \(Auto-Generated\)[\s\S]*?(?=\n## |$)/g,
    ''
  );
  await writeFile(tasksPath, withoutPrevious + testFixSection, 'utf-8');
}
|
|
529
906
|
|
|
530
907
|
async function evaluateSuccessCriteria(dir: string): Promise<SuccessCriterionResult[]> {
|