@smartmemory/compose 0.2.7-beta → 0.2.8-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/gsd.js CHANGED
@@ -13,7 +13,7 @@
13
13
  // V1 limitation: runtime task-to-task handoff is not implemented; tasks see
14
14
  // only spec-level upstream context (Boundary Map declarations) per blueprint.
15
15
 
16
- import { readFileSync, existsSync, readdirSync } from 'node:fs';
16
+ import { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, rmSync } from 'node:fs';
17
17
  import { join, resolve, dirname } from 'node:path';
18
18
  import { fileURLToPath } from 'node:url';
19
19
  import { execSync } from 'node:child_process';
@@ -22,8 +22,11 @@ import { StratumMcpClient } from './stratum-mcp-client.js';
22
22
  import { validateBoundaryMap } from './boundary-map.js';
23
23
  import { enrichTaskGraph } from './gsd-decompose-enrich.js';
24
24
  import { buildTaskDescription } from './gsd-prompt.js';
25
- import { writeAll, validate as validateTaskResult } from './gsd-blackboard.js';
25
+ import { writeAll, validate as validateTaskResult, read as readBlackboard } from './gsd-blackboard.js';
26
26
  import { executeParallelDispatchServer, executeShipStep } from './build.js';
27
+ import { GsdStuckDetector, DEFAULT_THRESHOLDS } from './gsd-stuck.js';
28
+ import { readGsdBudgetConfig, buildBudgetBlock, injectBudget, composeBudgetDiagnostic } from './gsd-budget.js';
29
+ import { recordGsdUsage, checkGsdCumulativeBudget } from './budget-ledger.js';
27
30
 
28
31
  const __dirname = dirname(fileURLToPath(import.meta.url));
29
32
  const PACKAGE_ROOT = resolve(__dirname, '..');
@@ -62,12 +65,36 @@ export async function runGsd(featureCode, opts = {}) {
62
65
  );
63
66
  }
64
67
 
65
- // 2. Refuse to start in a dirty workspace BEFORE any Stratum side effects.
68
+ // 2. COMP-GSD-5 resume branch runs BEFORE the dirty-tree check so a
69
+ // pid/mode-guard failure (the more specific precondition) is reported first.
70
+ // --resume reads pause.json, guards on ownership (no live pid) +
71
+ // mode==='gsd' (mirrors `compose fix --resume`), and seeds a precomputed task
72
+ // graph = decomposedTasks MINUS completedTaskIds so the execute step
73
+ // re-dispatches only the unfinished work. Completed results already live in
74
+ // the blackboard. resumeTaskGraph (when set) makes runOneStep skip the
75
+ // decompose agent entirely → stable task IDs, no re-decompose.
76
+ // COMP-GSD-4: read+guard the resume graph here for guard-ordering, but DEFER
77
+ // the atomic pause.lock claim (claim:false) — runGsd claims inside its try so
78
+ // the finally always releases it (no strand on re-halt/refusal/throw).
79
+ let resumeTaskGraph = null;
80
+ if (opts.resume) {
81
+ resumeTaskGraph = loadResumeTaskGraph(cwd, featureCode, { claim: false });
82
+ }
83
+
84
+ // 3. Refuse to start in a dirty workspace BEFORE any Stratum side effects.
66
85
  // v1 rationale: alternatives (baseline subtract + post-execute delta) drop
67
86
  // legitimate edits to pre-existing dirty files. Refuse-if-dirty makes
68
87
  // post-execute dirty set unambiguous: every entry is GSD-produced.
88
+ //
89
+ // On --resume the GSD control plane (.compose/gsd/<feature>/) legitimately
90
+ // carries the prior run's pause.json/blackboard.json/results — that's the
91
+ // resume STATE, not an unrelated edit — so exclude it from the dirty set.
69
92
  if (!opts.allowDirtyWorkspace) {
70
- const startingDirty = collectChangedFiles(cwd);
93
+ let startingDirty = collectChangedFiles(cwd);
94
+ if (opts.resume) {
95
+ const ctrlPrefix = `.compose/gsd/${featureCode}/`;
96
+ startingDirty = startingDirty.filter((f) => !f.startsWith(ctrlPrefix));
97
+ }
71
98
  if (startingDirty.length > 0) {
72
99
  throw new Error(
73
100
  `runGsd: working tree must be clean to ensure ship_gsd stages only GSD-produced changes. ` +
@@ -77,19 +104,54 @@ export async function runGsd(featureCode, opts = {}) {
77
104
  }
78
105
  }
79
106
 
80
- // 3. Resolve gateCommands. loadProjectConfig() does not merge defaults, so
107
+ // 4. Resolve gateCommands. loadProjectConfig() does not merge defaults, so
81
108
  // explicit fallback here.
82
109
  const gateCommands = resolveGateCommands(cwd, opts.gateCommands);
83
110
 
84
111
  // 4. Load pipeline spec
85
112
  const specPath = join(PACKAGE_ROOT, 'pipelines', 'gsd.stratum.yaml');
86
- const specYaml = readFileSync(specPath, 'utf-8');
113
+ // 4a. COMP-GSD-4: inject the stratum flow budget block from `gsd.budget.*`.
114
+ // injectBudget is IDENTITY when nothing is configured, so an un-budgeted gsd
115
+ // run (and plain `compose build`) is byte-identical.
116
+ const budgetCfg = readGsdBudgetConfig(cwd);
117
+ const specYaml = injectBudget(readFileSync(specPath, 'utf-8'), budgetCfg);
118
+
119
+ // 4a. COMP-GSD-4: cumulative cross-session ceiling pre-check (tokens/cost).
120
+ // Refuse to start/resume a run that has already spent its lifetime budget —
121
+ // re-dispatching would immediately re-trip. Runs before the try, so no
122
+ // pause.lock is held yet (the claim is the first statement inside the try).
123
+ const cumulative = buildBudgetBlock(budgetCfg).cumulative;
124
+ if (cumulative) {
125
+ const chk = checkGsdCumulativeBudget(join(cwd, '.compose'), featureCode, cumulative);
126
+ if (chk.exceeded) {
127
+ writeCumulativeRefusal(cwd, featureCode, chk, cumulative);
128
+ return { status: 'budget', flowId: null, axis: 'cumulative', reason: chk.reason };
129
+ }
130
+ }
131
+
132
+ // 4b. COMP-GSD-5 stuck detector — thresholds from .compose/compose.json
133
+ // `gsd.stuck.*` with documented defaults. ONLY gsd passes this into the
134
+ // shared executeParallelDispatchServer, so build mode is byte-identical.
135
+ const stuckDetector = buildStuckDetector(cwd);
87
136
 
88
137
  // 5. Connect Stratum + plan (only after preconditions pass)
89
138
  const stratum = opts.stratum ?? new StratumMcpClient();
90
139
  const ownsStratum = !opts.stratum;
91
140
  if (ownsStratum) await stratum.connect();
141
+ // COMP-GSD-4: ownership flag — release the resume lock in finally ONLY if THIS
142
+ // process successfully claimed it (set below). Prevents (a) a non-resume run
143
+ // from clobbering a concurrent resume's valid claim and (b) a claim-race loser
144
+ // (EEXIST) from deleting the winner's lock on its way out.
145
+ let lockClaimed = false;
92
146
  try {
147
+ // COMP-GSD-4: claim the resume lock HERE (first statement in the try) so the
148
+ // finally releases it on EVERY exit — budget/stuck re-halt, throw, or clean
149
+ // finish. loadResumeTaskGraph above already read+guarded (claim:false).
150
+ if (opts.resume) {
151
+ claimResumeLock(cwd, featureCode); // throws EEXIST → finally sees lockClaimed=false
152
+ lockClaimed = true;
153
+ }
154
+
93
155
  let response = await stratum.plan(specYaml, 'gsd', {
94
156
  featureCode,
95
157
  gateCommands,
@@ -102,13 +164,46 @@ export async function runGsd(featureCode, opts = {}) {
102
164
  const stepCtx = {
103
165
  stratum, cwd, featureCode, blueprintText, gateCommands,
104
166
  filesChanged: [],
167
+ stuckDetector,
168
+ resumeTaskGraph,
169
+ stuck: null, // set by runOneStep on a stuck verdict
105
170
  };
106
171
 
107
- // 5. Status loop
108
- while (response.status !== 'complete' && response.status !== 'killed') {
172
+ // 5. Status loop. `stuck` (COMP-GSD-5) and `budget_exhausted` (COMP-GSD-4)
173
+ // are terminal statuses. `stuck` is set compose-side by runOneStep; budget
174
+ // is the stratum flow-budget terminal, surfaced verbatim through the advance/
175
+ // poll envelopes (and carries budget_state).
176
+ while (
177
+ response.status !== 'complete' &&
178
+ response.status !== 'killed' &&
179
+ response.status !== 'stuck' &&
180
+ response.status !== 'budget_exhausted'
181
+ ) {
109
182
  response = await runOneStep(response, stepCtx);
110
183
  }
111
184
 
185
+ if (response.status === 'stuck') {
186
+ // Artifacts (stuck.md/json + pause.json) were written by runOneStep.
187
+ return {
188
+ status: 'stuck',
189
+ flowId,
190
+ stuckTaskId: stepCtx.stuck?.taskId ?? null,
191
+ signal: stepCtx.stuck?.signal ?? null,
192
+ };
193
+ }
194
+
195
+ if (response.status === 'budget_exhausted') {
196
+ // COMP-GSD-4: the stratum flow budget tripped. The flow already
197
+ // cascade-cancelled in-flight siblings. Persist budget.{md,json} +
198
+ // pause.json (kind:budget) for --resume, record cumulative usage, and
199
+ // return a terminal `budget` envelope. pause.lock is released by finally.
200
+ const budgetState = response.budget_state ?? {};
201
+ writeBudgetArtifacts(stepCtx, response, budgetState);
202
+ recordGsdUsageFromState(cwd, featureCode, budgetState);
203
+ const axis = composeBudgetDiagnostic(budgetState, { feature: featureCode }).json.axis;
204
+ return { status: 'budget', flowId, axis, consumed: budgetState.consumed ?? {}, caps: budgetState.caps ?? {} };
205
+ }
206
+
112
207
  // 6. Post-step blackboard finalization — read each task's TaskResult JSON
113
208
  // and write the consolidated blackboard.
114
209
  const blackboard = collectBlackboard(cwd, featureCode);
@@ -116,12 +211,26 @@ export async function runGsd(featureCode, opts = {}) {
116
211
  await writeAll(featureCode, blackboard, { cwd });
117
212
  }
118
213
 
214
+ // 6b. COMP-GSD-5: a clean (non-stuck) finish clears any pause.json — the
215
+ // resume completed, or a fresh run superseded a stale pause.
216
+ if (response.status === 'complete') {
217
+ // COMP-GSD-4: record this run's cumulative usage (best-effort; no-op when
218
+ // the complete envelope carries no budget_state, e.g. un-budgeted runs).
219
+ recordGsdUsageFromState(cwd, featureCode, response.budget_state);
220
+ clearPauseFile(cwd, featureCode);
221
+ }
222
+
119
223
  return {
120
224
  status: response.status,
121
225
  flowId,
122
226
  blackboardEntries: Object.keys(blackboard).length,
123
227
  };
124
228
  } finally {
229
+ // COMP-GSD-4: release the resume claim ONLY if THIS process claimed it
230
+ // (ownership-aware — never clobber a concurrent run's valid claim, and don't
231
+ // release after losing the claim race). pause.json persists for --resume
232
+ // unless a clean complete cleared it above.
233
+ if (lockClaimed) releasePauseLock(cwd, featureCode);
125
234
  if (ownsStratum) {
126
235
  try { await stratum.disconnect?.(); } catch { /* best-effort */ }
127
236
  }
@@ -153,6 +262,7 @@ async function runOneStep(response, ctx) {
153
262
  const flowId = response.flow_id;
154
263
  const stepId = response.step_id;
155
264
  const stepType = response.type ?? response.step_type;
265
+ if (stepId) ctx.lastStepId = stepId; // COMP-GSD-4: for the budget pause's stepId
156
266
 
157
267
  if (response.status === 'execute_step') {
158
268
  // parallel_dispatch step (the `execute` step)
@@ -164,7 +274,18 @@ async function runOneStep(response, ctx) {
164
274
  null, // progress
165
275
  { write: () => {} }, // streamWriter — no-op for v1
166
276
  cwd,
277
+ { stuckDetector: ctx.stuckDetector }, // COMP-GSD-5 (null in non-gsd callers)
167
278
  );
279
+
280
+ // COMP-GSD-5: a stuck verdict halts the run. Persist the diagnostic +
281
+ // resume state, then return a terminal `stuck` envelope so runGsd's loop
282
+ // exits. The task was already cancelled (conflict) inside dispatch.
283
+ if (outcome && outcome.stuck) {
284
+ ctx.stuck = outcome.stuck;
285
+ writeStuckArtifacts(ctx, response, outcome.stuck);
286
+ return { status: 'stuck', flow_id: flowId, step_id: stepId };
287
+ }
288
+
168
289
  // After diffs are merged, capture the touched files for ship_gsd
169
290
  // staging. The clean-workspace precondition above guarantees every
170
291
  // file in the post-execute dirty set is genuinely a GSD-produced change.
@@ -191,6 +312,17 @@ async function runOneStep(response, ctx) {
191
312
  return await stratum.stepDone(flowId, stepId, shipResult);
192
313
  }
193
314
 
315
+ // COMP-GSD-5 resume: skip the decompose AGENT entirely and substitute the
316
+ // persisted task graph (already enriched/repaired during the original run
317
+ // and already filtered to exclude completedTaskIds). We do NOT re-run
318
+ // validateAndRepairTaskGraph: enrichTaskGraph would flag the completed
319
+ // tasks' Boundary Map slices as orphaned (no task in the SUBSET owns them).
320
+ // Stable task IDs + no re-decompose are the whole point.
321
+ if (stepId === 'decompose_gsd' && ctx.resumeTaskGraph) {
322
+ ctx.lastTaskGraph = ctx.resumeTaskGraph;
323
+ return await stratum.stepDone(flowId, stepId, ctx.resumeTaskGraph);
324
+ }
325
+
194
326
  // Single-agent step: dispatch via runAgentText. The agent returns text;
195
327
  // we expect JSON matching the step's output_contract.
196
328
  const prompt = response.intent ?? '';
@@ -207,6 +339,10 @@ async function runOneStep(response, ctx) {
207
339
  // T6 step 7: validate decompose_gsd output and repair missing descriptions.
208
340
  if (stepId === 'decompose_gsd') {
209
341
  result = validateAndRepairTaskGraph(result, blueprintText, gateCommands);
342
+ // COMP-GSD-5: remember the ENRICHED graph so a later stuck halt can
343
+ // persist the full task definitions (with descriptions/produces/consumes)
344
+ // into pause.json — resume re-dispatches these without re-enriching.
345
+ ctx.lastTaskGraph = result;
210
346
  }
211
347
 
212
348
  return await stratum.stepDone(flowId, stepId, result);
@@ -362,3 +498,358 @@ function collectBlackboard(cwd, featureCode) {
362
498
  }
363
499
  return out;
364
500
  }
501
+
502
+ // ===========================================================================
503
+ // COMP-GSD-5: stuck detection + resume
504
+ // ===========================================================================
505
+
506
/**
 * Resolve the per-feature GSD state directory: `<cwd>/.compose/gsd/<featureCode>`.
 *
 * @param {string} cwd - Project root the `.compose` directory lives under.
 * @param {string} featureCode - Feature identifier, used as the leaf directory name.
 * @returns {string} Path of the feature's GSD directory (not created here).
 */
function gsdDir(cwd, featureCode) {
  const segments = ['.compose', 'gsd', featureCode];
  return join(cwd, ...segments);
}
509
+
510
/**
 * Construct the GsdStuckDetector for a project.
 *
 * Thresholds are read from the `gsd.stuck` section of `.compose/compose.json`
 * (snake_case keys) and handed to the detector under camelCase names. Any key
 * that is missing or null falls back to the matching DEFAULT_THRESHOLDS entry;
 * other values (including 0) are passed through as configured.
 *
 * @param {string} cwd - Project root containing `.compose/compose.json`.
 * @returns {GsdStuckDetector} Detector configured with the resolved thresholds.
 */
export function buildStuckDetector(cwd) {
  const configured = readGsdStuckConfig(cwd);
  // `??` (not `||`) so an explicit 0 in the config is honoured.
  const threshold = (configKey, defaultKey) =>
    configured[configKey] ?? DEFAULT_THRESHOLDS[defaultKey];

  const thresholds = {
    sameFileEdits: threshold('same_file_edits', 'sameFileEdits'),
    errorRepeats: threshold('error_repeats', 'errorRepeats'),
    noProgressCalls: threshold('no_progress_calls', 'noProgressCalls'),
    wallClockMs: threshold('wall_clock_ms', 'wallClockMs'),
  };
  return new GsdStuckDetector(thresholds);
}
525
+
526
/**
 * Read the `gsd.stuck` section of `<cwd>/.compose/compose.json`.
 *
 * Deliberately lenient: a missing file, unreadable file, invalid JSON, or a
 * config without a `gsd.stuck` entry all yield an empty object, so callers can
 * apply their own defaults.
 *
 * @param {string} cwd - Project root.
 * @returns {object} The configured `gsd.stuck` value, or `{}` when unavailable.
 */
function readGsdStuckConfig(cwd) {
  const configFile = join(cwd, '.compose', 'compose.json');
  let parsed = null;
  if (existsSync(configFile)) {
    try {
      parsed = JSON.parse(readFileSync(configFile, 'utf-8'));
    } catch {
      // Unreadable or malformed config is treated the same as no config.
      parsed = null;
    }
  }
  const stuckSection = parsed?.gsd?.stuck;
  return stuckSection ?? {};
}
536
+
537
/**
 * List the task ids that already have a known result.
 *
 * The answer is the union of (a) the keys of the persisted blackboard and
 * (b) every `<gsdDir>/results/<taskId>.json` file whose parsed content passes
 * validateTaskResult. Result files that cannot be read, parsed, or validated
 * are skipped rather than raising.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature whose GSD state is inspected.
 * @returns {string[]} De-duplicated task ids (blackboard keys first).
 */
function collectCompletedTaskIds(cwd, featureCode) {
  const blackboard = readBlackboard(featureCode, { cwd }) ?? {};
  const completed = new Set(Object.keys(blackboard));

  const resultsDir = join(gsdDir(cwd, featureCode), 'results');
  if (!existsSync(resultsDir)) return [...completed];

  for (const fileName of readdirSync(resultsDir)) {
    if (!fileName.endsWith('.json')) continue;
    try {
      const candidate = JSON.parse(readFileSync(join(resultsDir, fileName), 'utf-8'));
      if (validateTaskResult(candidate).ok) {
        // The task id is the file name without its `.json` extension.
        completed.add(fileName.replace(/\.json$/, ''));
      }
    } catch {
      /* skip unreadable */
    }
  }
  return [...completed];
}
556
+
557
/**
 * Best-effort capture of `git diff HEAD` for the working tree at `cwd`
 * (used for the stuck triage artifact).
 *
 * The command runs with a 5 s timeout and a 4 MiB output cap; stderr is
 * discarded. Any failure of the command yields an empty string instead of
 * an exception.
 *
 * @param {string} cwd - Directory to run git in.
 * @returns {string} Trimmed diff text, or '' when the diff could not be taken.
 */
function captureWorkingDiff(cwd) {
  const execOptions = {
    cwd,
    encoding: 'utf-8',
    timeout: 5000,
    maxBuffer: 4 * 1024 * 1024,
    stdio: ['ignore', 'pipe', 'ignore'],
  };
  let output;
  try {
    output = execSync('git diff HEAD', execOptions);
  } catch {
    return '';
  }
  return output.trim();
}
568
+
569
/**
 * Persist everything a stuck halt leaves behind in the feature's GSD directory:
 *
 *   - `stuck.json` — the diagnostic (feature, task, signal, detail, attempt
 *     counts, timestamp, plus `partialDiff` when a working-tree diff exists);
 *   - `pause.json` — the resume state (flow/step ids, stuck verdict, task
 *     graph, completed task ids, writer pid, mode 'gsd', timestamp);
 *   - `stuck.md`   — the human-readable rendering of the two.
 *
 * The persisted task list prefers the graph remembered on `ctx.lastTaskGraph`
 * and only falls back to `dispatchResponse.tasks` (then to an empty list) when
 * no graph was captured. Each task is shallow-copied before being written.
 *
 * @param {object} ctx - Step context; reads `cwd`, `featureCode`, `lastTaskGraph`.
 * @param {object} dispatchResponse - Dispatch envelope; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} verdict - Stuck verdict; reads `taskId`, `signal`, `detail`, `attemptCounts`.
 * @returns {void}
 */
function writeStuckArtifacts(ctx, dispatchResponse, verdict) {
  const { cwd, featureCode } = ctx;
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });
  const ts = new Date().toISOString();
  const writeJson = (fileName, payload) => {
    writeFileSync(join(outDir, fileName), JSON.stringify(payload, null, 2) + '\n');
  };

  // Gather all inputs before the first write.
  const graphTasks = ctx.lastTaskGraph?.tasks ?? dispatchResponse.tasks ?? [];
  const decomposedTasks = graphTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);
  const partialDiff = captureWorkingDiff(cwd);

  const { taskId, signal, detail } = verdict;
  const stuck = {
    feature: featureCode,
    taskId,
    signal,
    detail,
    attemptCounts: verdict.attemptCounts ?? {},
    ts,
    // Only carry the diff when there is one, so the key is absent otherwise.
    ...(partialDiff ? { partialDiff } : {}),
  };
  writeJson('stuck.json', stuck);

  const pause = {
    flowId: dispatchResponse.flow_id,
    stepId: dispatchResponse.step_id,
    stuckTaskId: taskId,
    signal,
    detail,
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  };
  writeJson('pause.json', pause);

  writeFileSync(join(outDir, 'stuck.md'), renderStuckMarkdown(stuck, pause));
}
617
+
618
/**
 * Render the human-readable `stuck.md` report.
 *
 * @param {object} stuck - Diagnostic; reads `feature`, `signal`, `taskId`, `ts`,
 *   `detail`, and `attemptCounts` (missing counters are shown as 0).
 * @param {object} pause - Resume state; reads `decomposedTasks` (each with an
 *   `id`) and `completedTaskIds`.
 * @returns {string} Markdown text ending in a newline.
 */
function renderStuckMarkdown(stuck, pause) {
  // Format a list of ids as inline-code items, or "(none)" when empty.
  const formatIds = (ids) => (ids.length ? ids.map((x) => `\`${x}\``).join(', ') : '(none)');

  // Tasks still to run = persisted tasks whose id is not recorded as completed.
  const pendingIds = [];
  for (const taskId of pause.decomposedTasks.map((t) => t.id)) {
    if (!pause.completedTaskIds.includes(taskId)) pendingIds.push(taskId);
  }

  const counts = stuck.attemptCounts;
  const sameFileEdits = counts.sameFileEdits ?? 0;
  const errorRepeats = counts.errorRepeats ?? 0;
  const noProgressCalls = counts.noProgressCalls ?? 0;
  const completedList = formatIds(pause.completedTaskIds);
  const pendingList = formatIds(pendingIds);

  return `# GSD stuck: ${stuck.feature}

**Signal:** \`${stuck.signal}\`
**Stuck task:** \`${stuck.taskId}\`
**Detected:** ${stuck.ts}

## What happened

${stuck.detail}

Attempt counts at halt:
- same-file edits (max across files): ${sameFileEdits}
- error repeats (max across hashes): ${errorRepeats}
- consecutive no-progress calls: ${noProgressCalls}

The in-flight task was cancelled and the run halted cleanly.

## Resume or abort

Completed tasks (already in the blackboard, will be skipped): ${completedList}
Tasks that will re-dispatch on resume: ${pendingList}

- **Resume:** \`compose gsd ${stuck.feature} --resume\` — re-dispatches the unfinished tasks into fresh worktrees.
- **Abort:** delete \`.compose/gsd/${stuck.feature}/pause.json\` and start over.

State for resume is in \`pause.json\` (schema: \`contracts/gsd-stuck.json#/definitions/pause\`).
`;
}
650
+
651
/**
 * --resume: read `pause.json`, enforce the mode + ownership guards, and return
 * the persisted task graph with already-completed tasks removed.
 *
 * Guards, in order:
 *   1. `pause.json` must exist and parse to a JSON object;
 *   2. `pause.mode`, when present, must be 'gsd';
 *   3. a numeric `pause.pid` must not belong to a live process (no exception
 *      is made for our own pid);
 *   4. `decomposedTasks` must be a non-empty array with at least one task
 *      whose id is not in `completedTaskIds`.
 *
 * Completed ids are also stripped from each remaining task's `depends_on`, so
 * a remaining task never waits on a task that will not be re-dispatched.
 * Task objects that need no change are returned as-is; changed ones are
 * shallow copies.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature to resume.
 * @param {object} [options]
 * @param {boolean} [options.claim=true] - When true, take the `pause.lock`
 *   ownership claim (claimResumeLock) after all guards pass. Pass `false` to
 *   defer the claim to the caller.
 * @returns {{tasks: object[]}} The unfinished subset of the persisted graph.
 * @throws {Error} When there is nothing to resume, `pause.json` is unreadable
 *   or not an object, a guard fails, or (with `claim`) the lock cannot be taken.
 */
export function loadResumeTaskGraph(cwd, featureCode, { claim = true } = {}) {
  const pausePath = join(gsdDir(cwd, featureCode), 'pause.json');
  if (!existsSync(pausePath)) {
    throw new Error(
      `runGsd: no pause.json to resume for ${featureCode}. ` +
      `Nothing to resume — run \`compose gsd ${featureCode}\` to start fresh.`,
    );
  }
  let pause;
  try {
    pause = JSON.parse(readFileSync(pausePath, 'utf-8'));
  } catch (err) {
    throw new Error(`runGsd: pause.json for ${featureCode} is unreadable: ${err.message}`);
  }
  // Valid JSON that is not an object (notably `null`) would otherwise crash on
  // the property reads below with a raw TypeError; report it as unreadable.
  if (pause === null || typeof pause !== 'object') {
    throw new Error(
      `runGsd: pause.json for ${featureCode} is unreadable: expected a JSON object.`,
    );
  }

  // Mode guard: refuse to resume a non-gsd pause file.
  if (pause.mode && pause.mode !== 'gsd') {
    throw new Error(
      `runGsd: cannot --resume: pause.json for ${featureCode} is in ${pause.mode} mode, not gsd.`,
    );
  }

  // Ownership guard: a live recorded pid means another run still owns this
  // feature. No self-pid exception is made.
  if (typeof pause.pid === 'number' && isPidAlive(pause.pid)) {
    throw new Error(
      `runGsd: cannot --resume: pid ${pause.pid} still owns this gsd run (process is live). ` +
      `Wait for it to exit (or remove a stale pause.json) before resuming.`,
    );
  }

  const tasks = Array.isArray(pause.decomposedTasks) ? pause.decomposedTasks : [];
  if (tasks.length === 0) {
    throw new Error(`runGsd: pause.json for ${featureCode} has no decomposedTasks to resume.`);
  }
  // Treat a malformed (non-array) completedTaskIds as "nothing completed",
  // matching how decomposedTasks is validated above.
  const completed = new Set(Array.isArray(pause.completedTaskIds) ? pause.completedTaskIds : []);
  const remaining = tasks
    .filter((t) => !completed.has(t.id))
    .map((t) => {
      // A completed dependency is already satisfied; drop it so the
      // re-dispatched subgraph is self-consistent.
      if (!Array.isArray(t.depends_on) || t.depends_on.length === 0) return t;
      const deps = t.depends_on.filter((id) => !completed.has(id));
      return deps.length === t.depends_on.length ? t : { ...t, depends_on: deps };
    });
  if (remaining.length === 0) {
    // Everything already completed — nothing to re-dispatch.
    throw new Error(
      `runGsd: all tasks for ${featureCode} are already completed; nothing to re-dispatch. ` +
      `Delete pause.json to finish.`,
    );
  }
  if (claim) claimResumeLock(cwd, featureCode);
  return { tasks: remaining };
}
723
+
724
/**
 * Take the resume ownership claim by creating the `pause.lock` directory
 * inside the feature's GSD directory.
 *
 * The claim relies on a plain (non-recursive) `mkdirSync`, which fails with
 * EEXIST when the directory is already there; in that case the claim is
 * refused with guidance rather than taken over. An existing claim is never
 * removed here — a stale one has to be cleared manually.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature whose resume is being claimed.
 * @returns {void}
 * @throws {Error} When a claim already exists (EEXIST), with a message naming
 *   the lock path; any other filesystem error is rethrown unchanged.
 */
export function claimResumeLock(cwd, featureCode) {
  const lockDir = join(gsdDir(cwd, featureCode), 'pause.lock');
  try {
    mkdirSync(lockDir);
    return;
  } catch (err) {
    // Anything other than "already exists" is unexpected — surface it as-is.
    if (err.code !== 'EEXIST') throw err;
  }
  throw new Error(
    `runGsd: a resume claim already exists for ${featureCode} ` +
    `(.compose/gsd/${featureCode}/pause.lock). Another --resume may be in progress; ` +
    `if none is, remove that directory to clear a stale claim, then retry.`,
  );
}
748
+
749
/**
 * Remove the `pause.lock` claim directory for a feature, leaving `pause.json`
 * untouched. Best-effort: `force` makes a missing lock a no-op and any other
 * removal error is swallowed.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature whose claim is released.
 * @returns {void}
 */
function releasePauseLock(cwd, featureCode) {
  const lockDir = join(gsdDir(cwd, featureCode), 'pause.lock');
  try {
    rmSync(lockDir, { recursive: true, force: true });
  } catch {
    /* best-effort */
  }
}
758
+
759
/**
 * Persist the artifacts of a budget halt in the feature's GSD directory:
 *
 *   - `budget.json` / `budget.md` — the diagnostic produced by
 *     composeBudgetDiagnostic for this budget state;
 *   - `pause.json` — the resume state, tagged `kind: 'budget'`, carrying the
 *     tripped axis, caps and consumed totals alongside the task graph.
 *
 * The task list prefers `ctx.lastTaskGraph.tasks`, then `response.tasks`, then
 * an empty list; tasks are shallow-copied. The recorded step id falls back
 * from the response to `ctx.lastStepId` to the literal 'execute'.
 *
 * @param {object} ctx - Step context; reads `cwd`, `featureCode`, `lastTaskGraph`, `lastStepId`.
 * @param {object} response - Terminal envelope; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} budgetState - Budget state; reads `caps` and `consumed`.
 * @returns {void}
 */
function writeBudgetArtifacts(ctx, response, budgetState) {
  const { cwd, featureCode } = ctx;
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });
  const ts = new Date().toISOString();
  const toJsonText = (value) => JSON.stringify(value, null, 2) + '\n';

  const graphTasks = ctx.lastTaskGraph?.tasks ?? response.tasks ?? [];
  const decomposedTasks = graphTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);

  const diagnostic = composeBudgetDiagnostic(budgetState, {
    feature: featureCode,
    decomposedTasks,
    completedTaskIds,
  });
  const { json, md } = diagnostic;
  writeFileSync(join(outDir, 'budget.json'), toJsonText(json));
  writeFileSync(join(outDir, 'budget.md'), md);

  const budget = {
    axis: json.axis,
    caps: budgetState.caps ?? {},
    consumed: budgetState.consumed ?? {},
  };
  const pause = {
    flowId: response.flow_id ?? null,
    stepId: response.step_id ?? ctx.lastStepId ?? 'execute',
    kind: 'budget',
    budget,
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  };
  writeFileSync(join(outDir, 'pause.json'), toJsonText(pause));
}
793
+
794
/**
 * Forward a run's consumed totals to the cumulative usage ledger.
 *
 * Reads `budgetState.consumed` and maps it onto the ledger's field names:
 * `tokens` → tokens, `dollars` → costUsd, `dispatches` → dispatches, and
 * `wall_s` (seconds) → timeMs (rounded milliseconds). Missing fields count
 * as 0. Does nothing when there is no budget state or no `consumed` entry.
 *
 * @param {string} cwd - Project root; the ledger lives under `<cwd>/.compose`.
 * @param {string} featureCode - Feature the usage is recorded against.
 * @param {object} [budgetState] - Budget state carrying `consumed`, if any.
 * @returns {void}
 */
function recordGsdUsageFromState(cwd, featureCode, budgetState) {
  const consumed = budgetState?.consumed;
  if (!consumed) return;

  const { tokens, dollars, dispatches, wall_s: wallSeconds } = consumed;
  const usage = {
    tokens: tokens ?? 0,
    costUsd: dollars ?? 0,
    dispatches: dispatches ?? 0,
    timeMs: Math.round((wallSeconds ?? 0) * 1000),
  };
  recordGsdUsage(join(cwd, '.compose'), featureCode, usage);
}
809
+
810
/**
 * Write the refusal diagnostic for a run that was stopped before dispatch
 * because the cumulative budget is already spent: `budget.json` (structured)
 * and `budget.md` (human-readable) in the feature's GSD directory. No
 * `pause.json` is written by this function.
 *
 * The usage figures in the markdown default to 0 when `chk.usage` or one of
 * its totals is missing, so writing the diagnostic cannot itself fail on an
 * incomplete check result.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature that was refused.
 * @param {object} chk - Cumulative check result; reads `reason` and `usage`
 *   (`totalTokens`, `totalCostUsd`).
 * @param {object} limits - The configured cumulative limits, recorded verbatim.
 * @returns {void}
 */
function writeCumulativeRefusal(cwd, featureCode, chk, limits) {
  const dir = gsdDir(cwd, featureCode);
  mkdirSync(dir, { recursive: true });
  const ts = new Date().toISOString();
  const json = { feature: featureCode, kind: 'budget', axis: 'cumulative', reason: chk.reason, usage: chk.usage, limits, ts };
  writeFileSync(join(dir, 'budget.json'), JSON.stringify(json, null, 2) + '\n');

  // Default both totals the same way; previously only the cost was guarded.
  const totalTokens = chk.usage?.totalTokens ?? 0;
  const totalCostUsd = chk.usage?.totalCostUsd ?? 0;
  const md = [
    `# GSD budget refusal — ${featureCode}`,
    '',
    `**${chk.reason}**`,
    '',
    `Cumulative usage: ${totalTokens} tokens, $${totalCostUsd.toFixed(4)}.`,
    '',
    'This feature has already spent its cumulative `gsd.budget.cumulative.*` ceiling.',
    'Raise the cap in `.compose/compose.json`, or clear the ledger:',
    '',
    '```',
    `compose gsd ${featureCode} --reset-budget`,
    '```',
    '',
  ].join('\n');
  writeFileSync(join(dir, 'budget.md'), md);
}
838
+
839
/**
 * Report whether a process with the given pid currently exists.
 *
 * Uses `process.kill(pid, 0)`: signal 0 performs the existence/permission
 * check without delivering a signal. EPERM means the process exists but
 * belongs to someone else, so it still counts as alive; every other failure
 * (e.g. ESRCH) counts as not alive.
 *
 * Only positive integers are probed. A pid of 0 or a negative pid addresses a
 * process group (or every process) rather than a single process, so the probe
 * would succeed and wrongly report a live owner; such values, and non-integer
 * input, are reported as not alive.
 *
 * @param {number} pid - Process id to probe.
 * @returns {boolean} True when a process with that pid exists.
 */
function isPidAlive(pid) {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    // signal 0 probes existence without sending a signal.
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // ESRCH = no such process; EPERM = exists but not ours (still alive).
    return err.code === 'EPERM';
  }
}
849
+
850
/**
 * Remove a feature's resume state: the `pause.json` file and the `pause.lock`
 * claim directory. Each removal is best-effort and independent — a failure
 * to remove one does not prevent the attempt on the other, and missing
 * entries are ignored.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature whose pause state is cleared.
 * @returns {void}
 */
function clearPauseFile(cwd, featureCode) {
  const stateDir = gsdDir(cwd, featureCode);
  const removals = [
    ['pause.json', { force: true }],
    // The claim is a directory, so it needs a recursive removal.
    ['pause.lock', { recursive: true, force: true }],
  ];
  for (const [entry, rmOptions] of removals) {
    try {
      rmSync(join(stateDir, entry), rmOptions);
    } catch {
      /* best-effort */
    }
  }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@smartmemory/compose",
3
- "version": "0.2.7-beta",
3
+ "version": "0.2.8-beta",
4
4
  "description": "Structured AI dev pipeline — goal-to-product orchestration with gates, iteration loops, and feature lifecycle management.",
5
5
  "author": "SmartMemory",
6
6
  "license": "MIT",