@smartmemory/compose 0.2.7-beta → 0.2.9-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/bin/compose.js +112 -3
  2. package/contracts/gsd-state.json +140 -0
  3. package/contracts/gsd-stuck.json +141 -0
  4. package/dist/assets/{App-D3ehVPvi.js → App-CG-2euMe.js} +164 -164
  5. package/dist/assets/{arc-Dmf69iHG.js → arc-7QBWoLra.js} +1 -1
  6. package/dist/assets/{architectureDiagram-3BPJPVTR-xYo993Yw.js → architectureDiagram-3BPJPVTR-CUw-7uLm.js} +1 -1
  7. package/dist/assets/{blockDiagram-GPEHLZMM-UX4EF98O.js → blockDiagram-GPEHLZMM-COU1vmr7.js} +1 -1
  8. package/dist/assets/{c4Diagram-AAUBKEIU-DaP9CGWb.js → c4Diagram-AAUBKEIU-XPO9PSJL.js} +1 -1
  9. package/dist/assets/channel-Bcu04MIK.js +1 -0
  10. package/dist/assets/{chunk-2J33WTMH-CKk_RN3A.js → chunk-2J33WTMH-zMzVB2a6.js} +1 -1
  11. package/dist/assets/{chunk-4BX2VUAB-DboAwYKw.js → chunk-4BX2VUAB-Kke_qcHU.js} +1 -1
  12. package/dist/assets/{chunk-55IACEB6-Dsy9RYvI.js → chunk-55IACEB6-hMeFx5Nh.js} +1 -1
  13. package/dist/assets/{chunk-727SXJPM-fAH0QO9v.js → chunk-727SXJPM-DesUnrEw.js} +1 -1
  14. package/dist/assets/{chunk-AQP2D5EJ-DyZYerFP.js → chunk-AQP2D5EJ-1uGGvkxW.js} +1 -1
  15. package/dist/assets/{chunk-FMBD7UC4-BnboGO5t.js → chunk-FMBD7UC4-DYHv1PcZ.js} +1 -1
  16. package/dist/assets/{chunk-ND2GUHAM-Di9tYXme.js → chunk-ND2GUHAM-D0MENOLX.js} +1 -1
  17. package/dist/assets/{chunk-QZHKN3VN-zRPRlAIL.js → chunk-QZHKN3VN-8nn3HP-N.js} +1 -1
  18. package/dist/assets/classDiagram-4FO5ZUOK-DU4yxldU.js +1 -0
  19. package/dist/assets/classDiagram-v2-Q7XG4LA2-DU4yxldU.js +1 -0
  20. package/dist/assets/{cose-bilkent-S5V4N54A-C7Hqukaf.js → cose-bilkent-S5V4N54A-BoZPVIny.js} +1 -1
  21. package/dist/assets/{dagre-BM42HDAG-B-cR-BjI.js → dagre-BM42HDAG-BgZzdLG9.js} +1 -1
  22. package/dist/assets/{diagram-2AECGRRQ-B6-5onDk.js → diagram-2AECGRRQ-CknAnpSu.js} +1 -1
  23. package/dist/assets/{diagram-5GNKFQAL-DoZZgFAM.js → diagram-5GNKFQAL-CZUEbKim.js} +1 -1
  24. package/dist/assets/{diagram-KO2AKTUF-77jEGlJh.js → diagram-KO2AKTUF-DCs-pLdH.js} +1 -1
  25. package/dist/assets/{diagram-LMA3HP47-D3S7XDRD.js → diagram-LMA3HP47-lRaDjIfM.js} +1 -1
  26. package/dist/assets/{diagram-OG6HWLK6-KbYL9aCY.js → diagram-OG6HWLK6-CIGqmehP.js} +1 -1
  27. package/dist/assets/{erDiagram-TEJ5UH35-DezFbJP-.js → erDiagram-TEJ5UH35-Lx3c2N6F.js} +1 -1
  28. package/dist/assets/{flowDiagram-I6XJVG4X-4x31cK9j.js → flowDiagram-I6XJVG4X-VoluKqSq.js} +1 -1
  29. package/dist/assets/{ganttDiagram-6RSMTGT7-FopfSTyZ.js → ganttDiagram-6RSMTGT7-D7hETiNZ.js} +1 -1
  30. package/dist/assets/{gitGraphDiagram-PVQCEYII-DSiQGKbN.js → gitGraphDiagram-PVQCEYII-DenEcUvY.js} +1 -1
  31. package/dist/assets/{index-ClX6LVAf.js → index-B4dv3acY.js} +2 -2
  32. package/dist/assets/{infoDiagram-5YYISTIA-DE6BqzK_.js → infoDiagram-5YYISTIA-v7cq9Er9.js} +1 -1
  33. package/dist/assets/{ishikawaDiagram-YF4QCWOH-Dml8NwQI.js → ishikawaDiagram-YF4QCWOH-CfCCXt2x.js} +1 -1
  34. package/dist/assets/{journeyDiagram-JHISSGLW-CwWeJgjE.js → journeyDiagram-JHISSGLW-Bbokl_xO.js} +1 -1
  35. package/dist/assets/{kanban-definition-UN3LZRKU-DnG956Wh.js → kanban-definition-UN3LZRKU-DhkOZ2hg.js} +1 -1
  36. package/dist/assets/{linear-CA3N7Rpi.js → linear-bHjluRm2.js} +1 -1
  37. package/dist/assets/{mindmap-definition-RKZ34NQL-CxfIOjLX.js → mindmap-definition-RKZ34NQL-C1bHpoXH.js} +1 -1
  38. package/dist/assets/{pieDiagram-4H26LBE5-O7aIwy1x.js → pieDiagram-4H26LBE5-CZb1i55T.js} +1 -1
  39. package/dist/assets/{quadrantDiagram-W4KKPZXB-CPQ2qq7c.js → quadrantDiagram-W4KKPZXB-o37AwRHB.js} +1 -1
  40. package/dist/assets/{requirementDiagram-4Y6WPE33-C23horL4.js → requirementDiagram-4Y6WPE33-BVErWDzU.js} +1 -1
  41. package/dist/assets/{sankeyDiagram-5OEKKPKP-DPY04kOW.js → sankeyDiagram-5OEKKPKP-BhBK8gHQ.js} +1 -1
  42. package/dist/assets/{sequenceDiagram-3UESZ5HK-BKaTfIvo.js → sequenceDiagram-3UESZ5HK-CsICF23P.js} +1 -1
  43. package/dist/assets/{stateDiagram-AJRCARHV-B9na_6mY.js → stateDiagram-AJRCARHV-TN1AXwim.js} +1 -1
  44. package/dist/assets/stateDiagram-v2-BHNVJYJU-BLR6AkKX.js +1 -0
  45. package/dist/assets/{timeline-definition-PNZ67QCA-BBWPqd7X.js → timeline-definition-PNZ67QCA-DftAajbU.js} +1 -1
  46. package/dist/assets/{vennDiagram-CIIHVFJN-tWqiHsOZ.js → vennDiagram-CIIHVFJN-cFTMstT7.js} +1 -1
  47. package/dist/assets/{wardley-L42UT6IY-DorxG6os.js → wardley-L42UT6IY-DL8CivzO.js} +1 -1
  48. package/dist/assets/{wardleyDiagram-YWT4CUSO-B49f8GzW.js → wardleyDiagram-YWT4CUSO-BDZT1hQj.js} +1 -1
  49. package/dist/assets/{xychartDiagram-2RQKCTM6-BgKSj8Qb.js → xychartDiagram-2RQKCTM6-DQQSkfC4.js} +1 -1
  50. package/dist/index.html +1 -1
  51. package/lib/budget-ledger.js +84 -0
  52. package/lib/build-stream-schema.js +5 -3
  53. package/lib/build.js +122 -2
  54. package/lib/feature-validator.js +40 -8
  55. package/lib/gsd-budget.js +205 -0
  56. package/lib/gsd-diff-capture.js +34 -0
  57. package/lib/gsd-events.js +61 -0
  58. package/lib/gsd-headless-config.js +110 -0
  59. package/lib/gsd-milestone-report.js +323 -0
  60. package/lib/gsd-state.js +165 -0
  61. package/lib/gsd-stuck.js +275 -0
  62. package/lib/gsd-supervisor.js +223 -0
  63. package/lib/gsd-timing.js +89 -0
  64. package/lib/gsd.js +908 -16
  65. package/package.json +1 -1
  66. package/dist/assets/channel-D_RXsFFT.js +0 -1
  67. package/dist/assets/classDiagram-4FO5ZUOK-K6wdB4ic.js +0 -1
  68. package/dist/assets/classDiagram-v2-Q7XG4LA2-K6wdB4ic.js +0 -1
  69. package/dist/assets/stateDiagram-v2-BHNVJYJU-Cf84VDiH.js +0 -1
package/lib/gsd.js CHANGED
@@ -13,7 +13,7 @@
13
13
  // V1 limitation: runtime task-to-task handoff is not implemented; tasks see
14
14
  // only spec-level upstream context (Boundary Map declarations) per blueprint.
15
15
 
16
- import { readFileSync, existsSync, readdirSync } from 'node:fs';
16
+ import { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, rmSync, statSync, renameSync } from 'node:fs';
17
17
  import { join, resolve, dirname } from 'node:path';
18
18
  import { fileURLToPath } from 'node:url';
19
19
  import { execSync } from 'node:child_process';
@@ -22,8 +22,17 @@ import { StratumMcpClient } from './stratum-mcp-client.js';
22
22
  import { validateBoundaryMap } from './boundary-map.js';
23
23
  import { enrichTaskGraph } from './gsd-decompose-enrich.js';
24
24
  import { buildTaskDescription } from './gsd-prompt.js';
25
- import { writeAll, validate as validateTaskResult } from './gsd-blackboard.js';
25
+ import { writeAll, validate as validateTaskResult, read as readBlackboard } from './gsd-blackboard.js';
26
26
  import { executeParallelDispatchServer, executeShipStep } from './build.js';
27
+ import { GsdStuckDetector, DEFAULT_THRESHOLDS } from './gsd-stuck.js';
28
+ import { readGsdBudgetConfig, buildBudgetBlock, injectBudget, composeBudgetDiagnostic } from './gsd-budget.js';
29
+ import { recordGsdUsage, checkGsdCumulativeBudget } from './budget-ledger.js';
30
+ // COMP-GSD-6: continuous run-state checkpoint + canonical pid-liveness probe.
31
+ // pidAlive is canonical in gsd-state.js (EPERM=alive) and imported one-way here.
32
+ import { writeGsdState, readGsdState, gsdStatePath, pidAlive, clearGsdHaltArtifacts } from './gsd-state.js';
33
+ import { generateGsdMilestoneReport } from './gsd-milestone-report.js';
34
+ import { readHeadlessConfig } from './gsd-headless-config.js';
35
+ import { appendGsdEvent, clearGsdEvents } from './gsd-events.js';
27
36
 
28
37
  const __dirname = dirname(fileURLToPath(import.meta.url));
29
38
  const PACKAGE_ROOT = resolve(__dirname, '..');
@@ -38,6 +47,15 @@ export async function runGsd(featureCode, opts = {}) {
38
47
  }
39
48
  const cwd = opts.cwd ?? process.cwd();
40
49
 
50
+ // COMP-GSD-6: a FRESH (non-resume) run must not inherit a prior run's
51
+ // state.json. Clear it up front so that if a precondition below throws BEFORE
52
+ // the planning checkpoint, NO running state remains → the headless supervisor
53
+ // (and `query`) read 'absent' → fatal-by-absence, never a stale 'complete'
54
+ // success. A resume keeps the old state.json (the crash-bridge may need it).
55
+ if (!opts.resume) {
56
+ try { rmSync(gsdStatePath(cwd, featureCode), { force: true }); } catch { /* ignore */ }
57
+ }
58
+
41
59
  // 1. Validate preconditions: blueprint exists + Boundary Map ok
42
60
  const blueprintPath = join(cwd, 'docs', 'features', featureCode, 'blueprint.md');
43
61
  if (!existsSync(blueprintPath)) {
@@ -62,12 +80,36 @@ export async function runGsd(featureCode, opts = {}) {
62
80
  );
63
81
  }
64
82
 
65
- // 2. Refuse to start in a dirty workspace BEFORE any Stratum side effects.
83
+ // 2. COMP-GSD-5 resume branch runs BEFORE the dirty-tree check so a
84
+ // pid/mode-guard failure (the more specific precondition) is reported first.
85
+ // --resume reads pause.json, guards on ownership (no live pid) +
86
+ // mode==='gsd' (mirrors `compose fix --resume`), and seeds a precomputed task
87
+ // graph = decomposedTasks MINUS completedTaskIds so the execute step
88
+ // re-dispatches only the unfinished work. Completed results already live in
89
+ // the blackboard. resumeTaskGraph (when set) makes runOneStep skip the
90
+ // decompose agent entirely → stable task IDs, no re-decompose.
91
+ // COMP-GSD-4: read+guard the resume graph here for guard-ordering, but DEFER
92
+ // the atomic pause.lock claim (claim:false) — runGsd claims inside its try so
93
+ // the finally always releases it (no strand on re-halt/refusal/throw).
94
+ let resumeTaskGraph = null;
95
+ if (opts.resume) {
96
+ resumeTaskGraph = loadResumeTaskGraph(cwd, featureCode, { claim: false });
97
+ }
98
+
99
+ // 3. Refuse to start in a dirty workspace BEFORE any Stratum side effects.
66
100
  // v1 rationale: alternatives (baseline subtract + post-execute delta) drop
67
101
  // legitimate edits to pre-existing dirty files. Refuse-if-dirty makes
68
102
  // post-execute dirty set unambiguous: every entry is GSD-produced.
103
+ //
104
+ // On --resume the GSD control plane (.compose/gsd/<feature>/) legitimately
105
+ // carries the prior run's pause.json/blackboard.json/results — that's the
106
+ // resume STATE, not an unrelated edit — so exclude it from the dirty set.
69
107
  if (!opts.allowDirtyWorkspace) {
70
- const startingDirty = collectChangedFiles(cwd);
108
+ let startingDirty = collectChangedFiles(cwd);
109
+ if (opts.resume) {
110
+ const ctrlPrefix = `.compose/gsd/${featureCode}/`;
111
+ startingDirty = startingDirty.filter((f) => !f.startsWith(ctrlPrefix));
112
+ }
71
113
  if (startingDirty.length > 0) {
72
114
  throw new Error(
73
115
  `runGsd: working tree must be clean to ensure ship_gsd stages only GSD-produced changes. ` +
@@ -77,38 +119,197 @@ export async function runGsd(featureCode, opts = {}) {
77
119
  }
78
120
  }
79
121
 
80
- // 3. Resolve gateCommands. loadProjectConfig() does not merge defaults, so
122
+ // 4. Resolve gateCommands. loadProjectConfig() does not merge defaults, so
81
123
  // explicit fallback here.
82
124
  const gateCommands = resolveGateCommands(cwd, opts.gateCommands);
83
125
 
84
126
  // 4. Load pipeline spec
85
127
  const specPath = join(PACKAGE_ROOT, 'pipelines', 'gsd.stratum.yaml');
86
- const specYaml = readFileSync(specPath, 'utf-8');
128
+ // 4a. COMP-GSD-4: inject the stratum flow budget block from `gsd.budget.*`.
129
+ // injectBudget is IDENTITY when nothing is configured, so an un-budgeted gsd
130
+ // run (and plain `compose build`) is byte-identical.
131
+ const budgetCfg = readGsdBudgetConfig(cwd);
132
+ const specYaml = injectBudget(readFileSync(specPath, 'utf-8'), budgetCfg);
133
+
134
+ // 4a. COMP-GSD-4: cumulative cross-session ceiling pre-check (tokens/cost).
135
+ // Refuse to start/resume a run that has already spent its lifetime budget —
136
+ // re-dispatching would immediately re-trip. Runs before the try, so no
137
+ // pause.lock is held yet (the claim is the first statement inside the try).
138
+ const cumulative = buildBudgetBlock(budgetCfg).cumulative;
139
+ if (cumulative) {
140
+ const chk = checkGsdCumulativeBudget(join(cwd, '.compose'), featureCode, cumulative);
141
+ if (chk.exceeded) {
142
+ writeCumulativeRefusal(cwd, featureCode, chk, cumulative);
143
+ return { status: 'budget', flowId: null, axis: 'cumulative', reason: chk.reason };
144
+ }
145
+ }
146
+
147
+ // 4b. COMP-GSD-5 stuck detector — thresholds from .compose/compose.json
148
+ // `gsd.stuck.*` with documented defaults. ONLY gsd passes this into the
149
+ // shared executeParallelDispatchServer, so build mode is byte-identical.
150
+ const stuckDetector = buildStuckDetector(cwd);
87
151
 
88
152
  // 5. Connect Stratum + plan (only after preconditions pass)
89
153
  const stratum = opts.stratum ?? new StratumMcpClient();
90
154
  const ownsStratum = !opts.stratum;
91
155
  if (ownsStratum) await stratum.connect();
156
+ // COMP-GSD-4: ownership flag — release the resume lock in finally ONLY if THIS
157
+ // process successfully claimed it (set below). Prevents (a) a non-resume run
158
+ // from clobbering a concurrent resume's valid claim and (b) a claim-race loser
159
+ // (EEXIST) from deleting the winner's lock on its way out.
160
+ let lockClaimed = false;
161
+ let runLockClaimed = false;
162
+ // COMP-GSD-6: the in-memory run-state, threaded through stepCtx and flushed to
163
+ // state.json. Declared here so the catch/finally can read it.
164
+ let stepCtx = null;
165
+ // COMP-GSD-6-WATCHDOG: independent wall-clock heartbeat timer (see below).
166
+ // Declared here so the finally can always clear it.
167
+ let heartbeatTimer = null;
92
168
  try {
93
- let response = await stratum.plan(specYaml, 'gsd', {
94
- featureCode,
95
- gateCommands,
96
- });
97
- const flowId = response.flow_id;
169
+ // COMP-GSD-6: claim the live-run lock BEFORE any stratum side effect, so two
170
+ // fresh `compose gsd <same-feature>` runs can't race the results dir. Takes
171
+ // over a stale lock (dead owner) and refuses a live one.
172
+ claimRunLock(cwd, featureCode);
173
+ runLockClaimed = true;
174
+
175
+ // COMP-GSD-4: claim the resume lock HERE (first statement in the try) so the
176
+ // finally releases it on EVERY exit — budget/stuck re-halt, throw, or clean
177
+ // finish. loadResumeTaskGraph above already read+guarded (claim:false).
178
+ if (opts.resume) {
179
+ claimResumeLock(cwd, featureCode); // throws EEXIST → finally sees lockClaimed=false
180
+ lockClaimed = true;
181
+ }
182
+
183
+ // COMP-GSD-6: pre-plan "planning" checkpoint. A crash during plan/decompose
184
+ // now leaves a dead-pid state.json — the failed-vs-fatal boundary. A throw
185
+ // BEFORE this point (preconditions) leaves no running state → fatal by
186
+ // absence; a throw AFTER → the catch converts it to status:"failed".
187
+ // On resume, seed the planning checkpoint from the (in-memory) resume graph
188
+ // so that if THIS resume re-crashes before its decompose step repopulates
189
+ // state.json, the crash-bridge still has a task graph to recover from
190
+ // (otherwise the fresh empty checkpoint would clobber the prior good data).
191
+ const resumeTasks = opts.resume ? (resumeTaskGraph?.tasks ?? []).map((t) => ({ ...t })) : [];
192
+ const initialState = {
193
+ feature: featureCode,
194
+ flowId: null,
195
+ pid: process.pid,
196
+ mode: 'gsd',
197
+ phase: 'planning',
198
+ status: 'running',
199
+ startedAt: new Date().toISOString(),
200
+ headless: !!opts.headless,
201
+ attempt: opts.attempt ?? 1,
202
+ resumeReady: opts.resume && resumeTasks.length > 0,
203
+ decomposedTasks: resumeTasks,
204
+ completedTaskIds: collectCompletedTaskIds(cwd, featureCode),
205
+ };
98
206
 
99
207
  // Track files merged into the base cwd by the execute step so ship_gsd
100
208
  // can stage them. executeShipStep's default filter only stages feature
101
209
  // docs unless context.filesChanged is provided.
102
- const stepCtx = {
210
+ stepCtx = {
103
211
  stratum, cwd, featureCode, blueprintText, gateCommands,
104
212
  filesChanged: [],
213
+ stuckDetector,
214
+ resumeTaskGraph,
215
+ stuck: null, // set by runOneStep on a stuck verdict
216
+ runState: initialState, // COMP-GSD-6: flushState merges into this
217
+ // COMP-GSD-7-EVENTLOG: tasks already completed at run start (a resume
218
+ // preloads them) are seeded as already-emitted so the appended log never
219
+ // re-fires task_completed for prior-session completions.
220
+ emittedCompletions: new Set(initialState.completedTaskIds),
221
+ // COMP-GSD-7-EVENTLOG: phases already announced (dedupe — runState.phase is
222
+ // set to 'execute' before the merge checkpoint, so it can't gate emission).
223
+ emittedPhases: new Set(),
105
224
  };
225
+ flushState(stepCtx, {}); // write the planning checkpoint
226
+
227
+ // COMP-GSD-7-EVENTLOG: at the planning checkpoint — AFTER preconditions
228
+ // passed (so a failed fresh invocation never wipes a prior run's history) —
229
+ // a fresh run truncates the event log and clears stale halt artifacts so the
230
+ // timeline reflects only this run; a resume appends to the existing log.
231
+ if (!opts.resume) {
232
+ clearGsdEvents(cwd, featureCode);
233
+ clearGsdHaltArtifacts(cwd, featureCode);
234
+ }
235
+ appendGsdEvent(cwd, featureCode, 'run_started', {
236
+ mode: opts.resume ? 'resume' : 'fresh',
237
+ attempt: opts.attempt ?? 1,
238
+ });
239
+
240
+ // COMP-GSD-6-WATCHDOG: an INDEPENDENT wall-clock heartbeat. The existing
241
+ // heartbeat only advances on agent push-events (onHeartbeat below), so a
242
+ // quiet-but-healthy task would look stale. This timer restamps state.json's
243
+ // heartbeat on a fixed cadence whenever the event loop is still turning — so
244
+ // a stale heartbeat genuinely means the loop is WEDGED (or the process dead),
245
+ // which is what the headless watchdog keys its hung-kill on. .unref() so it
246
+ // never holds the process open; cleared in finally. Same empty-patch restamp
247
+ // onHeartbeat uses, so it's behavior-compatible.
248
+ //
249
+ // Gated to SUPERVISED children only (GSD_HEADLESS_ATTEMPT, set by the
250
+ // supervisor's spawner) — the supervisor is the sole watcher, so an
251
+ // interactive `compose gsd` stays byte-identical (no extra state.json writes).
252
+ if (process.env.GSD_HEADLESS_ATTEMPT != null) {
253
+ const hbMs = readHeadlessConfig(cwd).watchdogHeartbeatMs;
254
+ heartbeatTimer = setInterval(() => {
255
+ try { if (stepCtx?.runState) flushState(stepCtx, {}); } catch { /* best-effort */ }
256
+ }, hbMs);
257
+ heartbeatTimer.unref?.();
258
+ }
106
259
 
107
- // 5. Status loop
108
- while (response.status !== 'complete' && response.status !== 'killed') {
260
+ let response = await stratum.plan(specYaml, 'gsd', {
261
+ featureCode,
262
+ gateCommands,
263
+ });
264
+ const flowId = response.flow_id;
265
+ flushState(stepCtx, { flowId, phase: 'decompose' });
266
+ emitPhaseOnce(stepCtx, 'decompose'); // COMP-GSD-7-EVENTLOG
267
+
268
+ // 5. Status loop. `stuck` (COMP-GSD-5) and `budget_exhausted` (COMP-GSD-4)
269
+ // are terminal statuses. `stuck` is set compose-side by runOneStep; budget
270
+ // is the stratum flow-budget terminal, surfaced verbatim through the advance/
271
+ // poll envelopes (and carries budget_state).
272
+ while (
273
+ response.status !== 'complete' &&
274
+ response.status !== 'killed' &&
275
+ response.status !== 'stuck' &&
276
+ response.status !== 'budget_exhausted'
277
+ ) {
109
278
  response = await runOneStep(response, stepCtx);
110
279
  }
111
280
 
281
+ if (response.status === 'stuck') {
282
+ // Artifacts (stuck.md/json + pause.json) were written by runOneStep.
283
+ // COMP-GSD-7-EVENTLOG: flush any completions that finished before the stuck
284
+ // verdict (the stuck path returns early, before the execute-merge delta),
285
+ // then record the pause.
286
+ emitCompletionDeltas(stepCtx);
287
+ appendGsdEvent(cwd, featureCode, 'paused', { pauseKind: 'stuck', taskId: stepCtx.stuck?.taskId ?? null });
288
+ flushState(stepCtx, { status: 'stuck' }); // COMP-GSD-6 terminal checkpoint
289
+ return {
290
+ status: 'stuck',
291
+ flowId,
292
+ stuckTaskId: stepCtx.stuck?.taskId ?? null,
293
+ signal: stepCtx.stuck?.signal ?? null,
294
+ };
295
+ }
296
+
297
+ if (response.status === 'budget_exhausted') {
298
+ // COMP-GSD-4: the stratum flow budget tripped. The flow already
299
+ // cascade-cancelled in-flight siblings. Persist budget.{md,json} +
300
+ // pause.json (kind:budget) for --resume, record cumulative usage, and
301
+ // return a terminal `budget` envelope. pause.lock is released by finally.
302
+ const budgetState = response.budget_state ?? {};
303
+ writeBudgetArtifacts(stepCtx, response, budgetState);
304
+ recordGsdUsageFromState(cwd, featureCode, budgetState);
305
+ const axis = composeBudgetDiagnostic(budgetState, { feature: featureCode }).json.axis;
306
+ // COMP-GSD-7-EVENTLOG: flush pre-halt completions, then record the pause.
307
+ emitCompletionDeltas(stepCtx);
308
+ appendGsdEvent(cwd, featureCode, 'paused', { pauseKind: 'budget', axis });
309
+ flushState(stepCtx, { status: 'budget' }); // COMP-GSD-6 terminal checkpoint
310
+ return { status: 'budget', flowId, axis, consumed: budgetState.consumed ?? {}, caps: budgetState.caps ?? {} };
311
+ }
312
+
112
313
  // 6. Post-step blackboard finalization — read each task's TaskResult JSON
113
314
  // and write the consolidated blackboard.
114
315
  const blackboard = collectBlackboard(cwd, featureCode);
@@ -116,12 +317,85 @@ export async function runGsd(featureCode, opts = {}) {
116
317
  await writeAll(featureCode, blackboard, { cwd });
117
318
  }
118
319
 
320
+ // 6b. COMP-GSD-5: a clean (non-stuck) finish clears any pause.json — the
321
+ // resume completed, or a fresh run superseded a stale pause.
322
+ if (response.status === 'complete') {
323
+ // COMP-GSD-4: record this run's cumulative usage (best-effort; no-op when
324
+ // the complete envelope carries no budget_state, e.g. un-budgeted runs).
325
+ recordGsdUsageFromState(cwd, featureCode, response.budget_state);
326
+ clearPauseFile(cwd, featureCode);
327
+ // COMP-GSD-7: on a clean complete, budget.json is NOT written (only halts
328
+ // write it). Persist a budget-final.json snapshot so the milestone report
329
+ // (auto + retroactive `gsd report`) has actuals-vs-caps. No-op when the
330
+ // envelope carries no budget_state (un-budgeted run). Best-effort: this is
331
+ // a derived report input — a write failure must NEVER demote a successful
332
+ // run to 'failed' via the outer catch.
333
+ if (response.budget_state) {
334
+ try {
335
+ writeBudgetFinalSnapshot(stepCtx, response.budget_state);
336
+ } catch (err) {
337
+ console.warn(`[gsd] budget-final snapshot failed: ${err.message}`);
338
+ }
339
+ }
340
+ }
341
+
342
+ // COMP-GSD-6: terminal state.json flush. Only 'complete' is a success; any
343
+ // other terminal here (e.g. stratum 'killed') maps to 'failed' so we stay
344
+ // within the closed status vocabulary the contract + supervisor share.
345
+ // COMP-GSD-7: stamp completedAt so retroactive reports can recover wall-clock.
346
+ const terminalStatus = response.status === 'complete' ? 'complete' : 'failed';
347
+ // COMP-GSD-7-EVENTLOG: emit the terminal event. complete → final completion
348
+ // deltas + 'completed'; any other terminal (e.g. stratum 'killed') → 'failed'.
349
+ if (terminalStatus === 'complete') {
350
+ emitCompletionDeltas(stepCtx);
351
+ appendGsdEvent(cwd, featureCode, 'completed', {});
352
+ } else {
353
+ appendGsdEvent(cwd, featureCode, 'failed', { reason: response.status ?? 'unknown' });
354
+ }
355
+ flushState(stepCtx, { status: terminalStatus, phase: 'done', completedAt: new Date().toISOString() });
356
+
357
+ // COMP-GSD-7: best-effort milestone report on a clean complete. A report
358
+ // failure must never fail the run — it is a derived artifact.
359
+ if (terminalStatus === 'complete') {
360
+ try {
361
+ const r = generateGsdMilestoneReport(featureCode, cwd);
362
+ if (!r.ok) console.warn(`[gsd] milestone report skipped: ${r.error}`);
363
+ } catch (err) {
364
+ console.warn(`[gsd] milestone report generation failed: ${err.message}`);
365
+ }
366
+ }
367
+
368
+ // Return the normalized closed-vocabulary status (not the raw stratum status)
369
+ // so the CLI/callers don't mistake a 'killed' terminal for success.
119
370
  return {
120
- status: response.status,
371
+ status: terminalStatus,
121
372
  flowId,
122
373
  blackboardEntries: Object.keys(blackboard).length,
123
374
  };
375
+ } catch (err) {
376
+ // COMP-GSD-6: an orderly throw AFTER the planning checkpoint becomes a
377
+ // terminal status:"failed" so the supervisor treats it as non-recoverable
378
+ // (vs a hard crash → status stays "running" + dead pid → reader-derived
379
+ // "crashed"). Guard on a persisted running state so pre-checkpoint throws
380
+ // (which left no running state) stay fatal-by-absence, not "failed".
381
+ if (stepCtx?.runState && readGsdState(cwd, featureCode)?.status === 'running') {
382
+ try { flushState(stepCtx, { status: 'failed' }); } catch { /* best-effort */ }
383
+ // COMP-GSD-7-EVENTLOG: record the failure (only when a run actually started
384
+ // — a pre-checkpoint throw left no running state and gets no event). Append
385
+ // is best-effort; never mask the original error.
386
+ appendGsdEvent(cwd, featureCode, 'failed', { reason: err?.message ?? 'error' });
387
+ }
388
+ throw err;
124
389
  } finally {
390
+ // COMP-GSD-6-WATCHDOG: stop the independent heartbeat timer.
391
+ if (heartbeatTimer) clearInterval(heartbeatTimer);
392
+ // COMP-GSD-6: release the live-run lock if THIS process claimed it.
393
+ if (runLockClaimed) releaseRunLock(cwd, featureCode);
394
+ // COMP-GSD-4: release the resume claim ONLY if THIS process claimed it
395
+ // (ownership-aware — never clobber a concurrent run's valid claim, and don't
396
+ // release after losing the claim race). pause.json persists for --resume
397
+ // unless a clean complete cleared it above.
398
+ if (lockClaimed) releasePauseLock(cwd, featureCode);
125
399
  if (ownsStratum) {
126
400
  try { await stratum.disconnect?.(); } catch { /* best-effort */ }
127
401
  }
@@ -153,6 +427,7 @@ async function runOneStep(response, ctx) {
153
427
  const flowId = response.flow_id;
154
428
  const stepId = response.step_id;
155
429
  const stepType = response.type ?? response.step_type;
430
+ if (stepId) ctx.lastStepId = stepId; // COMP-GSD-4: for the budget pause's stepId
156
431
 
157
432
  if (response.status === 'execute_step') {
158
433
  // parallel_dispatch step (the `execute` step)
@@ -160,15 +435,40 @@ async function runOneStep(response, ctx) {
160
435
  const outcome = await executeParallelDispatchServer(
161
436
  response,
162
437
  stratum,
163
- { cwd, featureCode },
438
+ { cwd, featureCode, gsd: true }, // COMP-GSD-7: gates timing+diff capture
164
439
  null, // progress
165
440
  { write: () => {} }, // streamWriter — no-op for v1
166
441
  cwd,
442
+ {
443
+ stuckDetector: ctx.stuckDetector, // COMP-GSD-5 (null in non-gsd callers)
444
+ // COMP-GSD-6: bump state.json's heartbeat on every task event so a long
445
+ // task sitting in the dispatch poll loop isn't mistaken for crashed.
446
+ onHeartbeat: ctx.runState ? () => { try { flushState(ctx, {}); } catch { /* best-effort */ } } : null,
447
+ },
167
448
  );
449
+
450
+ // COMP-GSD-5: a stuck verdict halts the run. Persist the diagnostic +
451
+ // resume state, then return a terminal `stuck` envelope so runGsd's loop
452
+ // exits. The task was already cancelled (conflict) inside dispatch.
453
+ if (outcome && outcome.stuck) {
454
+ ctx.stuck = outcome.stuck;
455
+ writeStuckArtifacts(ctx, response, outcome.stuck);
456
+ return { status: 'stuck', flow_id: flowId, step_id: stepId };
457
+ }
458
+
168
459
  // After diffs are merged, capture the touched files for ship_gsd
169
460
  // staging. The clean-workspace precondition above guarantees every
170
461
  // file in the post-execute dirty set is genuinely a GSD-produced change.
171
462
  ctx.filesChanged = collectChangedFiles(cwd);
463
+ // COMP-GSD-6: checkpoint completed tasks after the execute merge.
464
+ // COMP-GSD-7-EVENTLOG: emit the execute-phase transition once, then a
465
+ // task_completed event per newly-completed task.
466
+ if (ctx.runState) {
467
+ const completed = collectCompletedTaskIds(cwd, featureCode);
468
+ flushState(ctx, { phase: 'execute', completedTaskIds: completed });
469
+ emitPhaseOnce(ctx, 'execute'); // dedupes; runState.phase can't gate this
470
+ emitCompletionDeltas(ctx, completed);
471
+ }
172
472
  // executeParallelDispatchServer returns the next-step dispatch envelope
173
473
  return outcome;
174
474
  }
@@ -191,6 +491,26 @@ async function runOneStep(response, ctx) {
191
491
  return await stratum.stepDone(flowId, stepId, shipResult);
192
492
  }
193
493
 
494
+ // COMP-GSD-5 resume: skip the decompose AGENT entirely and substitute the
495
+ // persisted task graph (already enriched/repaired during the original run
496
+ // and already filtered to exclude completedTaskIds). We do NOT re-run
497
+ // validateAndRepairTaskGraph: enrichTaskGraph would flag the completed
498
+ // tasks' Boundary Map slices as orphaned (no task in the SUBSET owns them).
499
+ // Stable task IDs + no re-decompose are the whole point.
500
+ if (stepId === 'decompose_gsd' && ctx.resumeTaskGraph) {
501
+ ctx.lastTaskGraph = ctx.resumeTaskGraph;
502
+ // COMP-GSD-6: a resume already has the (filtered) task graph — mark
503
+ // resumeReady so a re-crash during execute resumes rather than restarts.
504
+ if (ctx.runState) {
505
+ flushState(ctx, {
506
+ phase: 'execute',
507
+ resumeReady: true,
508
+ decomposedTasks: (ctx.resumeTaskGraph.tasks ?? []).map((t) => ({ ...t })),
509
+ });
510
+ }
511
+ return await stratum.stepDone(flowId, stepId, ctx.resumeTaskGraph);
512
+ }
513
+
194
514
  // Single-agent step: dispatch via runAgentText. The agent returns text;
195
515
  // we expect JSON matching the step's output_contract.
196
516
  const prompt = response.intent ?? '';
@@ -207,6 +527,19 @@ async function runOneStep(response, ctx) {
207
527
  // T6 step 7: validate decompose_gsd output and repair missing descriptions.
208
528
  if (stepId === 'decompose_gsd') {
209
529
  result = validateAndRepairTaskGraph(result, blueprintText, gateCommands);
530
+ // COMP-GSD-5: remember the ENRICHED graph so a later stuck halt can
531
+ // persist the full task definitions (with descriptions/produces/consumes)
532
+ // into pause.json — resume re-dispatches these without re-enriching.
533
+ ctx.lastTaskGraph = result;
534
+ // COMP-GSD-6: the task graph now exists → resumeReady true; persist it so a
535
+ // crash during execute can synthesize a resume graph from state.json.
536
+ if (ctx.runState) {
537
+ flushState(ctx, {
538
+ phase: 'execute',
539
+ resumeReady: true,
540
+ decomposedTasks: (result.tasks ?? []).map((t) => ({ ...t })),
541
+ });
542
+ }
210
543
  }
211
544
 
212
545
  return await stratum.stepDone(flowId, stepId, result);
@@ -362,3 +695,562 @@ function collectBlackboard(cwd, featureCode) {
362
695
  }
363
696
  return out;
364
697
  }
698
+
699
+ // ===========================================================================
700
+ // COMP-GSD-5: stuck detection + resume
701
+ // ===========================================================================
702
+
703
/**
 * Resolve the per-feature GSD working directory.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier used as the leaf directory name.
 * @returns {string} `<cwd>/.compose/gsd/<featureCode>`.
 */
function gsdDir(cwd, featureCode) {
  const segments = ['.compose', 'gsd', featureCode];
  return join(cwd, ...segments);
}
706
+
707
+ // ===========================================================================
708
+ // COMP-GSD-6: run.lock (live-run exclusivity) + state.json flush helpers
709
+ // ===========================================================================
710
+
711
// Stale window, in milliseconds (90 s). A lock directory that has no usable
// owner record is treated as stale once its mtime is older than this.
const RUN_LOCK_STALE_MS = 90 * 1000;
712
+
713
/**
 * Path of the `run.lock` directory inside the feature's GSD directory.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {string} Path ending in `run.lock`.
 */
function runLockDir(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  return join(featureDir, 'run.lock');
}
716
+
717
/**
 * Take over a lock directory by renaming it aside and re-creating it.
 *
 * Steps: (1) rename `lockPath` to a unique `<lockPath>.stale.<pid>.<now>` name —
 * if the rename fails for any reason (another racer already moved it, or it
 * vanished) we report a loss; (2) best-effort removal of the renamed copy;
 * (3) `mkdirSync(lockPath)` — EEXIST means a fresh claimant took the freed
 * name first, which is also a loss. Any other mkdir error is rethrown.
 *
 * NOTE(review): this function does not judge staleness itself — the caller
 * decides that before calling. If another process completes its own takeover
 * between the caller's decision and the rename here, the rename would move
 * that fresh lock aside. Confirm callers tolerate this window.
 *
 * @param {string} lockPath - Lock directory to take over.
 * @returns {boolean} true iff this call re-created the lock directory.
 */
function takeoverStaleLock(lockPath) {
  const parkedPath = [lockPath, 'stale', process.pid, Date.now()].join('.');

  try {
    renameSync(lockPath, parkedPath);
  } catch {
    // Nothing to rename: someone else already took it over, or it is gone.
    return false;
  }

  try {
    rmSync(parkedPath, { recursive: true, force: true });
  } catch {
    /* best-effort */
  }

  let recreated = false;
  try {
    mkdirSync(lockPath);
    recreated = true;
  } catch (mkdirError) {
    // EEXIST: a fresh claimant won the freed name. Anything else is unexpected.
    if (mkdirError.code !== 'EEXIST') throw mkdirError;
  }
  return recreated;
}
740
+
741
/**
 * Look up the pid recorded as owner of the feature's run.lock.
 *
 * Precedence: `run.lock/owner.json` (the lock-local record) first; when that
 * file is missing, unparseable, or has no numeric `pid`, fall back to the
 * `pid` field of the state returned by readGsdState.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {number|null} The recorded pid, or null when neither source has a numeric pid.
 */
function runLockOwnerPid(cwd, featureCode) {
  const ownerFile = join(runLockDir(cwd, featureCode), 'owner.json');

  if (existsSync(ownerFile)) {
    try {
      const { pid } = JSON.parse(readFileSync(ownerFile, 'utf-8'));
      if (typeof pid === 'number') return pid;
    } catch {
      /* unreadable owner record — fall through to state.json */
    }
  }

  const persistedPid = readGsdState(cwd, featureCode)?.pid;
  return typeof persistedPid === 'number' ? persistedPid : null;
}
754
+
755
/**
 * Claim exclusive ownership of a live gsd run for a feature (COMP-GSD-6).
 *
 * The claim is the creation of the `run.lock` directory: `mkdirSync` fails with
 * EEXIST when the directory already exists, so only one claimant can create it.
 * On success `run.lock/owner.json` is written with `{pid, startedAt}`.
 *
 * When the lock already exists it is considered STALE if
 *   - an owner pid is recorded (see runLockOwnerPid) and that pid is not alive, or
 *   - no owner pid is recorded and the lock directory's mtime is older than
 *     RUN_LOCK_STALE_MS (or the directory cannot be stat'ed).
 * A stale lock is taken over via takeoverStaleLock; a live one is refused.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 * @throws {Error} When a live owner holds the lock, when the takeover race is
 *   lost, or on any unexpected filesystem error.
 */
export function claimRunLock(cwd, featureCode) {
  mkdirSync(gsdDir(cwd, featureCode), { recursive: true });
  const lockPath = runLockDir(cwd, featureCode);

  const recordOwner = () => {
    const owner = { pid: process.pid, startedAt: new Date().toISOString() };
    writeFileSync(join(lockPath, 'owner.json'), JSON.stringify(owner, null, 2));
  };

  // Fast path: nobody holds the lock.
  try {
    mkdirSync(lockPath);
    recordOwner();
    return;
  } catch (claimError) {
    if (claimError.code !== 'EEXIST') throw claimError;
  }

  // The lock exists — work out whether its holder is still around.
  const ownerPid = runLockOwnerPid(cwd, featureCode);
  let stale;
  if (typeof ownerPid === 'number') {
    stale = !pidAlive(ownerPid);
  } else {
    // No owner record: judge by the age of the lock directory instead.
    try {
      const ageMs = Date.now() - statSync(lockPath).mtimeMs;
      stale = ageMs > RUN_LOCK_STALE_MS;
    } catch {
      stale = true;
    }
  }

  if (!stale) {
    const message =
      `runGsd: another gsd run owns ${featureCode} (.compose/gsd/${featureCode}/run.lock, ` +
      `pid ${ownerPid ?? 'unknown'} alive). Refusing to start a concurrent run.`;
    throw new Error(message);
  }

  // Stale: rename-aside takeover. Losing that race means someone else owns it now.
  const wonTakeover = takeoverStaleLock(lockPath);
  if (!wonTakeover) {
    const message =
      `runGsd: another gsd run claimed ${featureCode} during stale-lock takeover. ` +
      `Refusing to start a concurrent run.`;
    throw new Error(message);
  }
  recordOwner();
}
804
+
805
/**
 * Release the live-run claim by recursively removing the feature's `run.lock`
 * directory. `force: true` makes a missing directory a no-op.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
export function releaseRunLock(cwd, featureCode) {
  const lockPath = runLockDir(cwd, featureCode);
  rmSync(lockPath, { recursive: true, force: true });
}
808
+
809
/**
 * Shallow-merge `patch` over `ctx.runState`, store the result back on `ctx`,
 * and hand it to writeGsdState for persistence. `ctx.runState` is the in-memory
 * copy that every flush is derived from.
 *
 * NOTE(review): the surrounding file describes the flush as atomic and as
 * restamping `heartbeatAt`; both would happen inside writeGsdState, which is
 * not visible here — confirm there.
 *
 * @param {object} ctx - Run context; reads `runState`, `cwd`, `featureCode`; writes `runState`.
 * @param {object} patch - Fields to overlay on the current run state.
 * @returns {void}
 */
function flushState(ctx, patch) {
  const previous = ctx.runState ?? {};
  const next = { ...previous, ...patch };
  ctx.runState = next;
  writeGsdState(ctx.cwd, ctx.featureCode, next);
}
815
+
816
/**
 * COMP-GSD-7-EVENTLOG: append one `task_completed` event per task id that has
 * not been emitted yet.
 *
 * `ctx.emittedCompletions` (a Set) is the dedupe record: ids already in it are
 * skipped, and each newly emitted id is added before its event is appended.
 * Falsy ids are ignored. When `ctx` has no `emittedCompletions`, nothing happens.
 *
 * @param {object} ctx - Run context; reads `emittedCompletions`, `cwd`, `featureCode`.
 * @param {Iterable<string>} [completedIds] - Ids to consider; when nullish they
 *   are gathered with collectCompletedTaskIds.
 * @returns {void}
 */
function emitCompletionDeltas(ctx, completedIds) {
  const alreadyEmitted = ctx?.emittedCompletions;
  if (!alreadyEmitted) return;

  const candidates = completedIds ?? collectCompletedTaskIds(ctx.cwd, ctx.featureCode);
  for (const taskId of candidates) {
    const isNew = Boolean(taskId) && !alreadyEmitted.has(taskId);
    if (isNew) {
      alreadyEmitted.add(taskId);
      appendGsdEvent(ctx.cwd, ctx.featureCode, 'task_completed', { taskId });
    }
  }
}
830
+
831
/**
 * COMP-GSD-7-EVENTLOG: append a `phase` event the first time `phase` is seen.
 *
 * Dedupe is tracked in `ctx.emittedPhases` (a Set) rather than in the run
 * state. When `ctx` has no `emittedPhases`, or the phase is already recorded,
 * nothing happens.
 *
 * @param {object} ctx - Run context; reads `emittedPhases`, `cwd`, `featureCode`.
 * @param {string} phase - Phase name to announce.
 * @returns {void}
 */
function emitPhaseOnce(ctx, phase) {
  const entered = ctx?.emittedPhases;
  if (!entered) return;
  if (entered.has(phase)) return;

  entered.add(phase);
  appendGsdEvent(ctx.cwd, ctx.featureCode, 'phase', { phase });
}
839
+
840
/**
 * Construct a GsdStuckDetector configured from the `gsd.stuck` section of
 * `.compose/compose.json`.
 *
 * Config keys are snake_case (`same_file_edits`, `error_repeats`,
 * `no_progress_calls`, `wall_clock_ms`); the detector receives the camelCase
 * equivalents. Any key that is null/undefined falls back to the matching entry
 * of DEFAULT_THRESHOLDS.
 *
 * @param {string} cwd - Project root containing `.compose/compose.json`.
 * @returns {GsdStuckDetector} A detector with the resolved thresholds.
 */
export function buildStuckDetector(cwd) {
  const overrides = readGsdStuckConfig(cwd);
  const resolve = (snakeKey, camelKey) => overrides[snakeKey] ?? DEFAULT_THRESHOLDS[camelKey];

  const thresholds = {
    sameFileEdits: resolve('same_file_edits', 'sameFileEdits'),
    errorRepeats: resolve('error_repeats', 'errorRepeats'),
    noProgressCalls: resolve('no_progress_calls', 'noProgressCalls'),
    wallClockMs: resolve('wall_clock_ms', 'wallClockMs'),
  };
  return new GsdStuckDetector(thresholds);
}
855
+
856
/**
 * Read the `gsd.stuck` section of `<cwd>/.compose/compose.json`.
 *
 * @param {string} cwd - Project root.
 * @returns {object} The `gsd.stuck` value, or `{}` when the file is missing,
 *   cannot be read or parsed, or has no such section.
 */
function readGsdStuckConfig(cwd) {
  const composeJson = join(cwd, '.compose', 'compose.json');

  let stuckSection;
  if (existsSync(composeJson)) {
    try {
      const parsed = JSON.parse(readFileSync(composeJson, 'utf-8'));
      stuckSection = parsed?.gsd?.stuck;
    } catch {
      // Unreadable or malformed config: behave as if nothing was configured.
      stuckSection = undefined;
    }
  }
  return stuckSection ?? {};
}
866
+
867
/**
 * Collect the ids of tasks that already have a result.
 *
 * The answer is the union of (a) the keys of the blackboard returned by
 * readBlackboard and (b) the basename of every `results/*.json` file in the
 * feature's GSD directory whose parsed content passes validateTaskResult.
 * Result files that cannot be read, parsed, or validated are skipped rather
 * than raising.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {string[]} De-duplicated task ids.
 */
function collectCompletedTaskIds(cwd, featureCode) {
  const blackboard = readBlackboard(featureCode, { cwd }) ?? {};
  const completed = new Set(Object.keys(blackboard));

  const resultsDir = join(gsdDir(cwd, featureCode), 'results');
  if (!existsSync(resultsDir)) return [...completed];

  for (const fileName of readdirSync(resultsDir)) {
    if (!fileName.endsWith('.json')) continue;
    try {
      const candidate = JSON.parse(readFileSync(join(resultsDir, fileName), 'utf-8'));
      if (validateTaskResult(candidate).ok) {
        // Task id is the file name without its `.json` suffix.
        completed.add(fileName.slice(0, -'.json'.length));
      }
    } catch {
      /* skip unreadable */
    }
  }
  return [...completed];
}
886
+
887
/**
 * Best-effort capture of `git diff HEAD` for the working tree at `cwd`.
 *
 * The command is limited to a 5 s timeout and a 4 MiB output buffer, with
 * stdin and stderr ignored. Any failure (non-zero exit, timeout, buffer
 * overflow, spawn error) yields an empty string.
 *
 * @param {string} cwd - Directory to run git in.
 * @returns {string} Trimmed diff text, or `''` on failure.
 */
function captureWorkingDiff(cwd) {
  const execOptions = {
    cwd,
    encoding: 'utf-8',
    timeout: 5000,
    maxBuffer: 4 * 1024 * 1024,
    stdio: ['ignore', 'pipe', 'ignore'],
  };

  let diffText = '';
  try {
    diffText = execSync('git diff HEAD', execOptions).trim();
  } catch {
    diffText = '';
  }
  return diffText;
}
898
+
899
/**
 * Persist the stuck diagnostic (stuck.json + stuck.md, per
 * contracts/gsd-stuck.json#stuck) AND the resume state (pause.json, per #pause)
 * into the feature's GSD directory.
 *
 * - `decomposedTasks` is a shallow copy of the task list captured on
 *   `ctx.lastTaskGraph` (COMP-GSD-5), falling back to `dispatchResponse.tasks`
 *   and then to an empty list, so --resume does not need to re-decompose.
 * - `completedTaskIds` comes from collectCompletedTaskIds.
 * - `partialDiff` (from captureWorkingDiff) is added to stuck.json only when
 *   non-empty.
 *
 * pause.json is what --resume reads, so it is written to a temp file and
 * renamed into place: a crash mid-write must not leave a truncated pause.json
 * behind.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `lastTaskGraph`.
 * @param {object} dispatchResponse - Dispatch envelope; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} verdict - Stuck verdict; reads `taskId`, `signal`, `detail`, `attemptCounts`.
 * @returns {void}
 */
function writeStuckArtifacts(ctx, dispatchResponse, verdict) {
  const { cwd, featureCode } = ctx;
  const dir = gsdDir(cwd, featureCode);
  mkdirSync(dir, { recursive: true });
  const ts = new Date().toISOString();

  // Prefer the fully-enriched graph captured at decompose; fall back to the
  // dispatch envelope's tasks only if enrichment wasn't seen.
  const sourceTasks = ctx.lastTaskGraph?.tasks ?? dispatchResponse.tasks ?? [];
  const decomposedTasks = sourceTasks.map((t) => ({ ...t }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);
  const partialDiff = captureWorkingDiff(cwd);

  const stuck = {
    feature: featureCode,
    taskId: verdict.taskId,
    signal: verdict.signal,
    detail: verdict.detail,
    attemptCounts: verdict.attemptCounts ?? {},
    ts,
  };
  if (partialDiff) stuck.partialDiff = partialDiff;
  writeFileSync(join(dir, 'stuck.json'), JSON.stringify(stuck, null, 2) + '\n');

  const pause = {
    flowId: dispatchResponse.flow_id,
    stepId: dispatchResponse.step_id,
    stuckTaskId: verdict.taskId,
    signal: verdict.signal,
    detail: verdict.detail,
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  };
  // Atomic replace: write the full payload aside, then rename over pause.json.
  const pausePath = join(dir, 'pause.json');
  const pauseTmp = `${pausePath}.tmp`;
  writeFileSync(pauseTmp, JSON.stringify(pause, null, 2) + '\n');
  renameSync(pauseTmp, pausePath);

  writeFileSync(join(dir, 'stuck.md'), renderStuckMarkdown(stuck, pause));
}
947
+
948
/**
 * Render the human-readable stuck.md triage document.
 *
 * @param {object} stuck - Stuck diagnostic; reads `feature`, `signal`, `taskId`,
 *   `ts`, `detail`, and `attemptCounts` (missing counts render as 0).
 * @param {object} pause - Resume state; reads `decomposedTasks` (objects with
 *   an `id`) and `completedTaskIds`. Missing lists are treated as empty.
 * @returns {string} Markdown text ending in a newline.
 */
function renderStuckMarkdown(stuck, pause) {
  const completedIds = pause.completedTaskIds ?? [];
  // Set lookup keeps the remaining-task filter linear instead of rescanning
  // the completed list for every task.
  const completed = new Set(completedIds);
  const remaining = (pause.decomposedTasks ?? [])
    .map((t) => t.id)
    .filter((id) => !completed.has(id));
  const counts = stuck.attemptCounts ?? {};
  const formatIds = (ids) => (ids.length ? ids.map((x) => `\`${x}\``).join(', ') : '(none)');

  return `# GSD stuck: ${stuck.feature}

**Signal:** \`${stuck.signal}\`
**Stuck task:** \`${stuck.taskId}\`
**Detected:** ${stuck.ts}

## What happened

${stuck.detail}

Attempt counts at halt:
- same-file edits (max across files): ${counts.sameFileEdits ?? 0}
- error repeats (max across hashes): ${counts.errorRepeats ?? 0}
- consecutive no-progress calls: ${counts.noProgressCalls ?? 0}

The in-flight task was cancelled and the run halted cleanly.

## Resume or abort

Completed tasks (already in the blackboard, will be skipped): ${formatIds(completedIds)}
Tasks that will re-dispatch on resume: ${formatIds(remaining)}

- **Resume:** \`compose gsd ${stuck.feature} --resume\` — re-dispatches the unfinished tasks into fresh worktrees.
- **Abort:** delete \`.compose/gsd/${stuck.feature}/pause.json\` and start over.

State for resume is in \`pause.json\` (schema: \`contracts/gsd-stuck.json#/definitions/pause\`).
`;
}
980
+
981
/**
 * --resume entry point: load the persisted pause state, run the resume guards,
 * and return the task graph that still needs dispatching.
 *
 * Source of the pause state:
 *   1. `pause.json` in the feature's GSD directory, when present.
 *   2. Otherwise (COMP-GSD-6 crash bridge) a pause-shaped object synthesized
 *      from the state returned by readGsdState, but only when that state has
 *      `status === 'running'`, a pid that is not alive, and a non-empty
 *      `decomposedTasks` array.
 *
 * Guards, in order: mode must be absent or `'gsd'`; a numeric `pid` recorded in
 * the pause state must not be alive (no exception is made for our own pid);
 * `decomposedTasks` must be non-empty; at least one task must be unfinished.
 *
 * Returned tasks exclude every id in `completedTaskIds`, and completed ids are
 * also stripped from each remaining task's `depends_on` so the subgraph does
 * not wait on tasks that will not be re-dispatched.
 *
 * COMP-GSD-4: when `claim` is true (the default) claimResumeLock is called
 * after all guards pass; pass `{ claim: false }` to claim separately.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @param {{claim?: boolean}} [options] - `claim` defaults to true.
 * @returns {{tasks: object[]}} The unfinished tasks.
 * @throws {Error} When there is nothing to resume, pause.json is unreadable,
 *   or any guard fails.
 */
export function loadResumeTaskGraph(cwd, featureCode, { claim = true } = {}) {
  const pausePath = join(gsdDir(cwd, featureCode), 'pause.json');

  let pause;
  if (!existsSync(pausePath)) {
    // Crash bridge: a hard crash never writes pause.json, so try state.json.
    const state = readGsdState(cwd, featureCode);
    const crashedMidRun =
      Boolean(state) &&
      state.status === 'running' &&
      !pidAlive(state.pid) &&
      Array.isArray(state.decomposedTasks) &&
      state.decomposedTasks.length > 0;
    if (!crashedMidRun) {
      throw new Error(
        `runGsd: no pause.json to resume for ${featureCode}. ` +
          `Nothing to resume — run \`compose gsd ${featureCode}\` to start fresh.`,
      );
    }
    pause = {
      flowId: state.flowId ?? null,
      stepId: state.lastStepId ?? 'execute',
      decomposedTasks: state.decomposedTasks,
      completedTaskIds: state.completedTaskIds ?? [],
      pid: state.pid,
      mode: 'gsd',
      ts: state.heartbeatAt ?? new Date().toISOString(),
    };
  } else {
    try {
      pause = JSON.parse(readFileSync(pausePath, 'utf-8'));
    } catch (err) {
      throw new Error(`runGsd: pause.json for ${featureCode} is unreadable: ${err.message}`);
    }
  }

  // Mode guard: only gsd pause files may be resumed here.
  if (pause.mode && pause.mode !== 'gsd') {
    throw new Error(
      `runGsd: cannot --resume: pause.json for ${featureCode} is in ${pause.mode} mode, not gsd.`,
    );
  }

  // Ownership guard: a live recorded pid means the writer has not exited.
  const recordedPidIsLive = typeof pause.pid === 'number' && pidAlive(pause.pid);
  if (recordedPidIsLive) {
    throw new Error(
      `runGsd: cannot --resume: pid ${pause.pid} still owns this gsd run (process is live). ` +
        `Wait for it to exit (or remove a stale pause.json) before resuming.`,
    );
  }

  const allTasks = Array.isArray(pause.decomposedTasks) ? pause.decomposedTasks : [];
  if (allTasks.length === 0) {
    throw new Error(`runGsd: pause.json for ${featureCode} has no decomposedTasks to resume.`);
  }

  const completed = new Set(pause.completedTaskIds ?? []);
  const remaining = [];
  for (const task of allTasks) {
    if (completed.has(task.id)) continue;

    const deps = task.depends_on;
    if (!Array.isArray(deps) || deps.length === 0) {
      remaining.push(task);
      continue;
    }
    // Drop dependencies that are already satisfied; copy only when something changed.
    const pendingDeps = deps.filter((depId) => !completed.has(depId));
    remaining.push(pendingDeps.length === deps.length ? task : { ...task, depends_on: pendingDeps });
  }

  if (remaining.length === 0) {
    throw new Error(
      `runGsd: all tasks for ${featureCode} are already completed; nothing to re-dispatch. ` +
        `Delete pause.json to finish.`,
    );
  }

  if (claim) claimResumeLock(cwd, featureCode);
  return { tasks: remaining };
}
1077
+
1078
/**
 * Claim exclusive ownership of a --resume for a feature (COMP-GSD-5/6).
 *
 * The claim is the creation of the `pause.lock` directory: `mkdirSync` fails
 * with EEXIST when it already exists, so two concurrent --resume invocations
 * cannot both create it. The holder then records its own pid in
 * `pause.lock/owner.json` (best-effort — a failed write is ignored).
 *
 * When the lock already exists it is considered STALE if
 *   - owner.json holds a numeric pid and that pid is not alive, or
 *   - no usable owner record exists and the lock directory's mtime is older
 *     than RUN_LOCK_STALE_MS (or the directory cannot be stat'ed).
 * A stale claim is taken over with takeoverStaleLock (rename-aside, then
 * re-create); a live claim is refused.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 * @throws {Error} When a live holder owns the claim, when the takeover race is
 *   lost, or on an unexpected mkdir error.
 */
export function claimResumeLock(cwd, featureCode) {
  const lockPath = join(gsdDir(cwd, featureCode), 'pause.lock');
  const ownerFile = join(lockPath, 'owner.json');

  const recordHolder = () => {
    try {
      const holder = { pid: process.pid, ts: new Date().toISOString() };
      writeFileSync(ownerFile, JSON.stringify(holder, null, 2));
    } catch {
      /* best-effort; mtime fallback still protects takeover */
    }
  };

  // Fast path: no claim exists yet.
  try {
    mkdirSync(lockPath);
    recordHolder();
    return;
  } catch (claimError) {
    if (claimError.code !== 'EEXIST') throw claimError;
  }

  // A claim exists — identify its holder from the lock's own owner record.
  let holderPid = null;
  if (existsSync(ownerFile)) {
    try {
      const record = JSON.parse(readFileSync(ownerFile, 'utf-8'));
      if (typeof record.pid === 'number') holderPid = record.pid;
    } catch {
      /* fall through to mtime */
    }
  }

  let stale;
  if (holderPid !== null) {
    stale = !pidAlive(holderPid);
  } else {
    try {
      const ageMs = Date.now() - statSync(lockPath).mtimeMs;
      stale = ageMs > RUN_LOCK_STALE_MS;
    } catch {
      stale = true;
    }
  }

  if (!stale) {
    const message =
      `runGsd: a resume claim already exists for ${featureCode} ` +
      `(.compose/gsd/${featureCode}/pause.lock, pid ${holderPid ?? 'unknown'} alive). ` +
      `Another --resume may be in progress; if none is, remove that directory to clear a stale claim.`;
    throw new Error(message);
  }

  // Stale: rename-aside takeover. Losing that race means another resume owns it.
  const wonTakeover = takeoverStaleLock(lockPath);
  if (!wonTakeover) {
    throw new Error(
      `runGsd: another --resume claimed ${featureCode} during stale-claim takeover; retry.`,
    );
  }
  recordHolder();
}
1141
+
1142
/**
 * COMP-GSD-4: release only the resume ownership claim (`pause.lock`); this
 * function does not touch pause.json. Best-effort: every error is swallowed,
 * and `force: true` makes a missing lock a no-op.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
function releasePauseLock(cwd, featureCode) {
  try {
    const lockPath = join(gsdDir(cwd, featureCode), 'pause.lock');
    rmSync(lockPath, { recursive: true, force: true });
  } catch {
    /* best-effort */
  }
}
1151
+
1152
/**
 * COMP-GSD-4: persist the budget halt diagnostic (budget.json + budget.md, both
 * produced by composeBudgetDiagnostic) AND the resume state (pause.json with
 * `kind: 'budget'`) into the feature's GSD directory.
 *
 * `decomposedTasks` is a shallow copy of the task list captured on
 * `ctx.lastTaskGraph`, falling back to `response.tasks` and then to an empty
 * list. `completedTaskIds` comes from collectCompletedTaskIds.
 *
 * pause.json is what --resume reads, so it is written to a temp file and
 * renamed into place: a crash mid-write must not leave a truncated pause.json
 * behind.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `lastTaskGraph`, `lastStepId`.
 * @param {object} response - Step response; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} budgetState - Budget state; reads `caps` and `consumed`.
 * @returns {void}
 */
function writeBudgetArtifacts(ctx, response, budgetState) {
  const { cwd, featureCode } = ctx;
  const dir = gsdDir(cwd, featureCode);
  mkdirSync(dir, { recursive: true });
  const ts = new Date().toISOString();

  const sourceTasks = ctx.lastTaskGraph?.tasks ?? response.tasks ?? [];
  const decomposedTasks = sourceTasks.map((t) => ({ ...t }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);

  const { json, md } = composeBudgetDiagnostic(budgetState, { feature: featureCode, decomposedTasks, completedTaskIds });
  writeFileSync(join(dir, 'budget.json'), JSON.stringify(json, null, 2) + '\n');
  writeFileSync(join(dir, 'budget.md'), md);

  const pause = {
    flowId: response.flow_id ?? null,
    stepId: response.step_id ?? ctx.lastStepId ?? 'execute',
    kind: 'budget',
    budget: { axis: json.axis, caps: budgetState.caps ?? {}, consumed: budgetState.consumed ?? {} },
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  };
  // Atomic replace: write the full payload aside, then rename over pause.json.
  const pausePath = join(dir, 'pause.json');
  const pauseTmp = `${pausePath}.tmp`;
  writeFileSync(pauseTmp, JSON.stringify(pause, null, 2) + '\n');
  renameSync(pauseTmp, pausePath);
}
1186
+
1187
/**
 * COMP-GSD-7: write `budget-final.json` — the `json` half of
 * composeBudgetDiagnostic for the given budget state — into the feature's GSD
 * directory. The filename is deliberately distinct from the halt artifact
 * `budget.json`.
 *
 * The task list is a shallow copy of `ctx.runState.decomposedTasks` (empty when
 * absent); completed ids come from collectCompletedTaskIds. The file is written
 * to `budget-final.json.tmp` and renamed into place so readers never observe a
 * partial write.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `runState`.
 * @param {object} budgetState - Budget state passed through to composeBudgetDiagnostic.
 * @returns {void}
 */
export function writeBudgetFinalSnapshot(ctx, budgetState) {
  const { cwd, featureCode } = ctx;
  const featureDir = gsdDir(cwd, featureCode);
  mkdirSync(featureDir, { recursive: true });

  const knownTasks = ctx.runState?.decomposedTasks ?? [];
  const decomposedTasks = knownTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);
  const diagnostic = composeBudgetDiagnostic(budgetState, {
    feature: featureCode,
    decomposedTasks,
    completedTaskIds,
  });

  const finalPath = join(featureDir, 'budget-final.json');
  const stagingPath = `${finalPath}.tmp`;
  const payload = JSON.stringify(diagnostic.json, null, 2) + '\n';
  writeFileSync(stagingPath, payload);
  renameSync(stagingPath, finalPath);
}
1205
+
1206
/**
 * COMP-GSD-4: forward a run's consumed usage to recordGsdUsage for the
 * cumulative ledger under `<cwd>/.compose`.
 *
 * Field mapping from `budgetState.consumed`: `tokens` → `tokens`,
 * `dollars` → `costUsd`, `dispatches` → `dispatches`, and `wall_s` (seconds)
 * → `timeMs` (rounded milliseconds). Missing fields count as 0. Does nothing
 * when `budgetState` or its `consumed` block is absent.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @param {object} [budgetState] - Budget state; only `consumed` is read.
 * @returns {void}
 */
function recordGsdUsageFromState(cwd, featureCode, budgetState) {
  const consumed = budgetState?.consumed;
  if (!consumed) return;

  const ledgerRoot = join(cwd, '.compose');
  const wallSeconds = consumed.wall_s ?? 0;
  const usage = {
    tokens: consumed.tokens ?? 0,
    costUsd: consumed.dollars ?? 0,
    dispatches: consumed.dispatches ?? 0,
    timeMs: Math.round(wallSeconds * 1000),
  };
  recordGsdUsage(ledgerRoot, featureCode, usage);
}
1221
+
1222
/**
 * COMP-GSD-4: write the budget refusal diagnostic (budget.json + budget.md)
 * for a feature whose cumulative ceiling is already spent. This function
 * writes no pause.json.
 *
 * budget.json records `{feature, kind: 'budget', axis: 'cumulative', reason,
 * usage, limits, ts}`; budget.md is a short explanation pointing at the
 * `gsd.budget.cumulative.*` caps and the `--reset-budget` command.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @param {object} chk - Ceiling check result; reads `reason` and `usage`
 *   (`usage.totalTokens`, `usage.totalCostUsd`).
 * @param {object} limits - Configured limits, stored verbatim in budget.json.
 * @returns {void}
 */
function writeCumulativeRefusal(cwd, featureCode, chk, limits) {
  const featureDir = gsdDir(cwd, featureCode);
  mkdirSync(featureDir, { recursive: true });

  const refusal = {
    feature: featureCode,
    kind: 'budget',
    axis: 'cumulative',
    reason: chk.reason,
    usage: chk.usage,
    limits,
    ts: new Date().toISOString(),
  };
  writeFileSync(join(featureDir, 'budget.json'), JSON.stringify(refusal, null, 2) + '\n');

  const spentUsd = (chk.usage.totalCostUsd ?? 0).toFixed(4);
  const lines = [];
  lines.push(`# GSD budget refusal — ${featureCode}`, '');
  lines.push(`**${chk.reason}**`, '');
  lines.push(`Cumulative usage: ${chk.usage.totalTokens} tokens, $${spentUsd}.`, '');
  lines.push('This feature has already spent its cumulative `gsd.budget.cumulative.*` ceiling.');
  lines.push('Raise the cap in `.compose/compose.json`, or clear the ledger:', '');
  lines.push('```', `compose gsd ${featureCode} --reset-budget`, '```');
  lines.push(''); // trailing newline after the final join
  writeFileSync(join(featureDir, 'budget.md'), lines.join('\n'));
}
1250
+
1251
/**
 * Remove the resume state for a feature: `pause.json` and the `pause.lock`
 * claim directory (COMP-GSD-5). Each removal is best-effort and independent —
 * a failure on one does not prevent the other.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
function clearPauseFile(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  const removals = [
    ['pause.json', { force: true }],
    ['pause.lock', { recursive: true, force: true }],
  ];
  for (const [entryName, rmOptions] of removals) {
    try {
      rmSync(join(featureDir, entryName), rmOptions);
    } catch {
      /* best-effort */
    }
  }
}