@smartmemory/compose 0.2.7-beta → 0.2.9-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/bin/compose.js +112 -3
  2. package/contracts/gsd-state.json +140 -0
  3. package/contracts/gsd-stuck.json +141 -0
  4. package/dist/assets/{App-D3ehVPvi.js → App-CG-2euMe.js} +164 -164
  5. package/dist/assets/{arc-Dmf69iHG.js → arc-7QBWoLra.js} +1 -1
  6. package/dist/assets/{architectureDiagram-3BPJPVTR-xYo993Yw.js → architectureDiagram-3BPJPVTR-CUw-7uLm.js} +1 -1
  7. package/dist/assets/{blockDiagram-GPEHLZMM-UX4EF98O.js → blockDiagram-GPEHLZMM-COU1vmr7.js} +1 -1
  8. package/dist/assets/{c4Diagram-AAUBKEIU-DaP9CGWb.js → c4Diagram-AAUBKEIU-XPO9PSJL.js} +1 -1
  9. package/dist/assets/channel-Bcu04MIK.js +1 -0
  10. package/dist/assets/{chunk-2J33WTMH-CKk_RN3A.js → chunk-2J33WTMH-zMzVB2a6.js} +1 -1
  11. package/dist/assets/{chunk-4BX2VUAB-DboAwYKw.js → chunk-4BX2VUAB-Kke_qcHU.js} +1 -1
  12. package/dist/assets/{chunk-55IACEB6-Dsy9RYvI.js → chunk-55IACEB6-hMeFx5Nh.js} +1 -1
  13. package/dist/assets/{chunk-727SXJPM-fAH0QO9v.js → chunk-727SXJPM-DesUnrEw.js} +1 -1
  14. package/dist/assets/{chunk-AQP2D5EJ-DyZYerFP.js → chunk-AQP2D5EJ-1uGGvkxW.js} +1 -1
  15. package/dist/assets/{chunk-FMBD7UC4-BnboGO5t.js → chunk-FMBD7UC4-DYHv1PcZ.js} +1 -1
  16. package/dist/assets/{chunk-ND2GUHAM-Di9tYXme.js → chunk-ND2GUHAM-D0MENOLX.js} +1 -1
  17. package/dist/assets/{chunk-QZHKN3VN-zRPRlAIL.js → chunk-QZHKN3VN-8nn3HP-N.js} +1 -1
  18. package/dist/assets/classDiagram-4FO5ZUOK-DU4yxldU.js +1 -0
  19. package/dist/assets/classDiagram-v2-Q7XG4LA2-DU4yxldU.js +1 -0
  20. package/dist/assets/{cose-bilkent-S5V4N54A-C7Hqukaf.js → cose-bilkent-S5V4N54A-BoZPVIny.js} +1 -1
  21. package/dist/assets/{dagre-BM42HDAG-B-cR-BjI.js → dagre-BM42HDAG-BgZzdLG9.js} +1 -1
  22. package/dist/assets/{diagram-2AECGRRQ-B6-5onDk.js → diagram-2AECGRRQ-CknAnpSu.js} +1 -1
  23. package/dist/assets/{diagram-5GNKFQAL-DoZZgFAM.js → diagram-5GNKFQAL-CZUEbKim.js} +1 -1
  24. package/dist/assets/{diagram-KO2AKTUF-77jEGlJh.js → diagram-KO2AKTUF-DCs-pLdH.js} +1 -1
  25. package/dist/assets/{diagram-LMA3HP47-D3S7XDRD.js → diagram-LMA3HP47-lRaDjIfM.js} +1 -1
  26. package/dist/assets/{diagram-OG6HWLK6-KbYL9aCY.js → diagram-OG6HWLK6-CIGqmehP.js} +1 -1
  27. package/dist/assets/{erDiagram-TEJ5UH35-DezFbJP-.js → erDiagram-TEJ5UH35-Lx3c2N6F.js} +1 -1
  28. package/dist/assets/{flowDiagram-I6XJVG4X-4x31cK9j.js → flowDiagram-I6XJVG4X-VoluKqSq.js} +1 -1
  29. package/dist/assets/{ganttDiagram-6RSMTGT7-FopfSTyZ.js → ganttDiagram-6RSMTGT7-D7hETiNZ.js} +1 -1
  30. package/dist/assets/{gitGraphDiagram-PVQCEYII-DSiQGKbN.js → gitGraphDiagram-PVQCEYII-DenEcUvY.js} +1 -1
  31. package/dist/assets/{index-ClX6LVAf.js → index-B4dv3acY.js} +2 -2
  32. package/dist/assets/{infoDiagram-5YYISTIA-DE6BqzK_.js → infoDiagram-5YYISTIA-v7cq9Er9.js} +1 -1
  33. package/dist/assets/{ishikawaDiagram-YF4QCWOH-Dml8NwQI.js → ishikawaDiagram-YF4QCWOH-CfCCXt2x.js} +1 -1
  34. package/dist/assets/{journeyDiagram-JHISSGLW-CwWeJgjE.js → journeyDiagram-JHISSGLW-Bbokl_xO.js} +1 -1
  35. package/dist/assets/{kanban-definition-UN3LZRKU-DnG956Wh.js → kanban-definition-UN3LZRKU-DhkOZ2hg.js} +1 -1
  36. package/dist/assets/{linear-CA3N7Rpi.js → linear-bHjluRm2.js} +1 -1
  37. package/dist/assets/{mindmap-definition-RKZ34NQL-CxfIOjLX.js → mindmap-definition-RKZ34NQL-C1bHpoXH.js} +1 -1
  38. package/dist/assets/{pieDiagram-4H26LBE5-O7aIwy1x.js → pieDiagram-4H26LBE5-CZb1i55T.js} +1 -1
  39. package/dist/assets/{quadrantDiagram-W4KKPZXB-CPQ2qq7c.js → quadrantDiagram-W4KKPZXB-o37AwRHB.js} +1 -1
  40. package/dist/assets/{requirementDiagram-4Y6WPE33-C23horL4.js → requirementDiagram-4Y6WPE33-BVErWDzU.js} +1 -1
  41. package/dist/assets/{sankeyDiagram-5OEKKPKP-DPY04kOW.js → sankeyDiagram-5OEKKPKP-BhBK8gHQ.js} +1 -1
  42. package/dist/assets/{sequenceDiagram-3UESZ5HK-BKaTfIvo.js → sequenceDiagram-3UESZ5HK-CsICF23P.js} +1 -1
  43. package/dist/assets/{stateDiagram-AJRCARHV-B9na_6mY.js → stateDiagram-AJRCARHV-TN1AXwim.js} +1 -1
  44. package/dist/assets/stateDiagram-v2-BHNVJYJU-BLR6AkKX.js +1 -0
  45. package/dist/assets/{timeline-definition-PNZ67QCA-BBWPqd7X.js → timeline-definition-PNZ67QCA-DftAajbU.js} +1 -1
  46. package/dist/assets/{vennDiagram-CIIHVFJN-tWqiHsOZ.js → vennDiagram-CIIHVFJN-cFTMstT7.js} +1 -1
  47. package/dist/assets/{wardley-L42UT6IY-DorxG6os.js → wardley-L42UT6IY-DL8CivzO.js} +1 -1
  48. package/dist/assets/{wardleyDiagram-YWT4CUSO-B49f8GzW.js → wardleyDiagram-YWT4CUSO-BDZT1hQj.js} +1 -1
  49. package/dist/assets/{xychartDiagram-2RQKCTM6-BgKSj8Qb.js → xychartDiagram-2RQKCTM6-DQQSkfC4.js} +1 -1
  50. package/dist/index.html +1 -1
  51. package/lib/budget-ledger.js +84 -0
  52. package/lib/build-stream-schema.js +5 -3
  53. package/lib/build.js +122 -2
  54. package/lib/feature-validator.js +40 -8
  55. package/lib/gsd-budget.js +205 -0
  56. package/lib/gsd-diff-capture.js +34 -0
  57. package/lib/gsd-events.js +61 -0
  58. package/lib/gsd-headless-config.js +110 -0
  59. package/lib/gsd-milestone-report.js +323 -0
  60. package/lib/gsd-state.js +165 -0
  61. package/lib/gsd-stuck.js +275 -0
  62. package/lib/gsd-supervisor.js +223 -0
  63. package/lib/gsd-timing.js +89 -0
  64. package/lib/gsd.js +908 -16
  65. package/package.json +1 -1
  66. package/dist/assets/channel-D_RXsFFT.js +0 -1
  67. package/dist/assets/classDiagram-4FO5ZUOK-K6wdB4ic.js +0 -1
  68. package/dist/assets/classDiagram-v2-Q7XG4LA2-K6wdB4ic.js +0 -1
  69. package/dist/assets/stateDiagram-v2-BHNVJYJU-Cf84VDiH.js +0 -1
@@ -0,0 +1,275 @@
1
+ /**
2
+ * gsd-stuck.js — GsdStuckDetector for COMP-GSD-5.
3
+ *
4
+ * Detects, in real time during per-task `compose gsd` dispatch, that an agent
5
+ * is spinning, and emits a structured verdict so the run loop can halt cleanly.
6
+ *
7
+ * Four signals (thresholds tunable via constructor opts; defaults 3/3/8/600000):
8
+ * - same_file: one file_path edited >= sameFileEdits times.
9
+ * - error_recurrence: a normalized error hash recurs >= errorRepeats.
10
+ * - no_progress: >= noProgressCalls consecutive non-file-changing tool calls.
11
+ * - wall_clock: nowMs - startedAt(taskId) >= wallClockMs.
12
+ *
13
+ * The same-file signal REUSES FixChainDetector (lib/debug-discipline.js) for its
14
+ * per-key file-hit counting — keyed here by taskId. Error-recurrence and
15
+ * no-progress are the only new bookkeeping.
16
+ *
17
+ * Consumes BuildStreamEvents from stratum.onEvent inside
18
+ * executeParallelDispatchServer, keyed by event.task_id. gsd runs the execute
19
+ * step max_concurrent:1, so per-task state is unambiguous. Telemetry contract
20
+ * (schema 0.2.7, STRAT-PAR-STREAM-TOOLDETAIL):
21
+ * tool_use_summary.metadata = { tool, summary, ok, duration_ms, input, tool_use_id }
22
+ * input.file_path present for Edit/Write/MultiEdit/Read
23
+ * tool_result.metadata = { tool_use_id, ok, output }
24
+ *
25
+ * See: docs/features/COMP-GSD-5/{design,blueprint,plan}.md
26
+ * contracts/gsd-stuck.json (`stuck` diagnostic shape)
27
+ */
28
+
29
+ import { createHash } from 'node:crypto';
30
+ import { FixChainDetector } from './debug-discipline.js';
31
+
32
+ // Default thresholds (Decision 4 in design.md).
33
+ export const DEFAULT_THRESHOLDS = Object.freeze({
34
+ sameFileEdits: 3,
35
+ errorRepeats: 3,
36
+ noProgressCalls: 8,
37
+ wallClockMs: 600000,
38
+ });
39
+
40
+ // Tools that change files on disk — they reset no-progress and feed same-file.
41
+ // Read is deliberately excluded: it touches a file_path but makes no change.
42
+ const FILE_CHANGING_TOOLS = new Set(['Edit', 'Write', 'MultiEdit']);
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Error normalization — collapse cosmetic diffs (volatile paths, line:col
46
+ // numbers, whitespace, hex/temp ids) so the SAME logical failure hashes the
47
+ // same across repeats.
48
+ // ---------------------------------------------------------------------------
49
+
50
+ export function normalizeError(output) {
51
+ if (output == null) return '';
52
+ let s = String(output);
53
+ // Absolute/relative file paths -> a stable token. Catches /Users/..,
54
+ // /tmp/.., /var/.., ./rel/path, C:\... etc. up to a :line:col or space.
55
+ s = s.replace(/(?:[A-Za-z]:)?(?:\/|\\)[^\s:]+(?:[/\\][^\s:]+)*/g, '<path>');
56
+ // Bare relative module-ish paths (a/b/c.js) that didn't start with a slash.
57
+ s = s.replace(/\b[\w.-]+(?:\/[\w.-]+)+\.\w+\b/g, '<path>');
58
+ // line:col suffixes (e.g. :12:5 or :12).
59
+ s = s.replace(/:\d+(?::\d+)?\b/g, ':<n>');
60
+ // Standalone long digit runs (ids, ports, offsets) and hex blobs.
61
+ s = s.replace(/0x[0-9a-fA-F]+/g, '<hex>');
62
+ s = s.replace(/\b\d{2,}\b/g, '<n>');
63
+ // Collapse all whitespace (incl. the em-dash-adjacent spacing) to single spaces.
64
+ s = s.replace(/\s+/g, ' ').trim().toLowerCase();
65
+ return s;
66
+ }
67
+
68
+ function hashError(output) {
69
+ return createHash('sha1').update(normalizeError(output)).digest('hex').slice(0, 16);
70
+ }
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // GsdStuckDetector
74
+ // ---------------------------------------------------------------------------
75
+
/**
 * Real-time "is this agent spinning?" detector for per-task `compose gsd`
 * dispatch.
 *
 * Feed it BuildStreamEvents via `record()`, mark dispatch start via
 * `startTask()`, and poll `check()` to get a structured verdict. All state is
 * keyed by task id, so one detector instance can track several tasks.
 *
 * Signals (thresholds come from constructor opts, else DEFAULT_THRESHOLDS):
 *   - same_file:        one file edited >= sameFileEdits times
 *   - error_recurrence: one normalized error hash seen >= errorRepeats times
 *   - no_progress:      >= noProgressCalls consecutive non-file-changing tool
 *                       calls, counted only after the task's first edit
 *   - wall_clock:       nowMs - startedAt >= wallClockMs
 */
export class GsdStuckDetector {
  /**
   * @param {object} [opts]
   * @param {number} [opts.sameFileEdits]
   * @param {number} [opts.errorRepeats]
   * @param {number} [opts.noProgressCalls]
   * @param {number} [opts.wallClockMs]
   */
  constructor(opts = {}) {
    this.sameFileEdits = opts.sameFileEdits ?? DEFAULT_THRESHOLDS.sameFileEdits;
    this.errorRepeats = opts.errorRepeats ?? DEFAULT_THRESHOLDS.errorRepeats;
    this.noProgressCalls = opts.noProgressCalls ?? DEFAULT_THRESHOLDS.noProgressCalls;
    this.wallClockMs = opts.wallClockMs ?? DEFAULT_THRESHOLDS.wallClockMs;

    // same-file: reuse FixChainDetector's per-key file-hit counter, keyed by taskId.
    this._fixChain = new FixChainDetector();
    // error-recurrence: per-task Map<normalizedHash, count>.
    /** @type {Map<string, Map<string, number>>} */
    this._errorHits = new Map();
    // no-progress: per-task consecutive non-file-changing call count.
    /** @type {Map<string, number>} */
    this._noProgress = new Map();
    // wall-clock baseline per task.
    /** @type {Map<string, number>} */
    this._startedAt = new Map();
  }

  /**
   * Mark a task's dispatch start — establishes the wall-clock baseline.
   * The first call per task wins; later calls never move the baseline.
   */
  startTask(taskId, nowMs) {
    if (!taskId) return;
    if (!this._startedAt.has(taskId)) {
      this._startedAt.set(taskId, nowMs);
    }
  }

  /**
   * Route a BuildStreamEvent into per-task state. Only tool_use_summary and
   * tool_result are meaningful; everything else (and any untagged event) is
   * ignored. Keyed by event.task_id.
   */
  record(event) {
    if (!event || typeof event !== 'object') return;
    const taskId = event.task_id;
    if (!taskId) return; // gsd is max_concurrent:1 but be defensive about attribution.
    const md = event.metadata ?? {};

    if (event.kind === 'tool_use_summary') {
      const tool = md.tool;
      const filePath = md.input?.file_path;
      if (FILE_CHANGING_TOOLS.has(tool)) {
        // same-file: count the file hit (reuse FixChainDetector per-key counter).
        if (filePath) {
          this._fixChain.recordIterationForBug(taskId, [filePath]);
        }
        // no-progress: a file-changing tool resets the consecutive run.
        this._noProgress.set(taskId, 0);
      } else if ((this._fixChain.byBug.get(taskId)?.fileHits?.size ?? 0) > 0) {
        // "No progress" = non-file-changing calls (Bash, Grep, Read, Glob, ...)
        // AFTER the task has started editing. A task's initial read/grep/test
        // exploration is legitimate work, not a stall — counting it would
        // false-positive and abort productive TDD loops (COMP-GSD-5 Codex
        // review). A task that NEVER edits is caught by the wall_clock backstop.
        this._noProgress.set(taskId, (this._noProgress.get(taskId) ?? 0) + 1);
      }
      return;
    }

    if (event.kind === 'tool_result') {
      if (md.ok === false) {
        const hash = hashError(md.output);
        let m = this._errorHits.get(taskId);
        if (!m) {
          m = new Map();
          this._errorHits.set(taskId, m);
        }
        m.set(hash, (m.get(hash) ?? 0) + 1);
      }
      return;
    }
  }

  /**
   * Evaluate the stuck signals for a task. Returns the FIRST signal that has
   * tripped (precedence: same_file, error_recurrence, no_progress, wall_clock).
   * @returns {{stuck:true, signal:string, detail:string} | {stuck:false}}
   */
  check(taskId, nowMs) {
    // --- same_file ---
    const fileHits = this._fixChain.byBug.get(taskId)?.fileHits;
    if (fileHits) {
      for (const [file, count] of fileHits.entries()) {
        if (count >= this.sameFileEdits) {
          return {
            stuck: true,
            signal: 'same_file',
            detail: `file ${file} edited ${count} times (>= ${this.sameFileEdits}) without converging`,
          };
        }
      }
    }

    // --- error_recurrence ---
    const errs = this._errorHits.get(taskId);
    if (errs) {
      for (const [hash, count] of errs.entries()) {
        if (count >= this.errorRepeats) {
          return {
            stuck: true,
            signal: 'error_recurrence',
            detail: `the same error recurred ${count} times (>= ${this.errorRepeats}); normalized hash ${hash}`,
          };
        }
      }
    }

    // --- no_progress ---
    const np = this._noProgress.get(taskId) ?? 0;
    if (np >= this.noProgressCalls) {
      return {
        stuck: true,
        signal: 'no_progress',
        detail: `${np} consecutive tool calls (>= ${this.noProgressCalls}) with no file-changing edit`,
      };
    }

    // --- wall_clock ---
    // A task without a recorded baseline can never trip this signal.
    const startedAt = this._startedAt.get(taskId);
    if (startedAt != null && nowMs - startedAt >= this.wallClockMs) {
      return {
        stuck: true,
        signal: 'wall_clock',
        detail: `task ran ${nowMs - startedAt}ms (>= ${this.wallClockMs}ms) without finishing`,
      };
    }

    return { stuck: false };
  }

  /**
   * Build the `attemptCounts` snapshot for the stuck.json diagnostic
   * (contracts/gsd-stuck.json#/definitions/stuck/attemptCounts).
   * Each counter is the per-task maximum, or 0 when nothing was recorded.
   */
  attemptCounts(taskId) {
    const fileHits = this._fixChain.byBug.get(taskId)?.fileHits;
    const maxFileEdits = fileHits ? Math.max(0, ...fileHits.values()) : 0;
    const errs = this._errorHits.get(taskId);
    const maxErrorRepeats = errs ? Math.max(0, ...errs.values()) : 0;
    return {
      sameFileEdits: maxFileEdits,
      errorRepeats: maxErrorRepeats,
      noProgressCalls: this._noProgress.get(taskId) ?? 0,
    };
  }

  /** Clear all state for one task without touching others. */
  reset(taskId) {
    this._fixChain.resetForBug(taskId);
    this._errorHits.delete(taskId);
    this._noProgress.delete(taskId);
    this._startedAt.delete(taskId);
  }

  // --- Serialization (resume) ----------------------------------------------

  /** Snapshot thresholds and all per-task counters as plain JSON-safe data. */
  toJSON() {
    return {
      thresholds: {
        sameFileEdits: this.sameFileEdits,
        errorRepeats: this.errorRepeats,
        noProgressCalls: this.noProgressCalls,
        wallClockMs: this.wallClockMs,
      },
      fixChain: this._fixChain.toJSON(),
      errorHits: Object.fromEntries(
        [...this._errorHits.entries()].map(([t, m]) => [t, Object.fromEntries(m)]),
      ),
      noProgress: Object.fromEntries(this._noProgress),
      startedAt: Object.fromEntries(this._startedAt),
    };
  }

  /**
   * Rebuild a detector from a `toJSON()` snapshot. Tolerates a missing or
   * malformed snapshot: anything unusable falls back to a fresh default.
   *
   * Every restored counter is coerced to a number. `record()` increments with
   * `+ 1`, so a count restored as the string "2" would become "21" and trip
   * error_recurrence on the next failure.
   */
  static fromJSON(json) {
    const t = (json && typeof json === 'object' && json.thresholds) || {};
    const d = new GsdStuckDetector({
      sameFileEdits: t.sameFileEdits,
      errorRepeats: t.errorRepeats,
      noProgressCalls: t.noProgressCalls,
      wallClockMs: t.wallClockMs,
    });
    if (!json || typeof json !== 'object') return d;

    d._fixChain = FixChainDetector.fromJSON(json.fixChain ?? {});
    if (json.errorHits && typeof json.errorHits === 'object') {
      for (const [taskId, hashes] of Object.entries(json.errorHits)) {
        const counts = new Map();
        for (const [hash, raw] of Object.entries(hashes ?? {})) {
          const n = Number(raw);
          // Drop unusable counts rather than restoring NaN/strings.
          if (Number.isFinite(n) && n > 0) counts.set(hash, n);
        }
        d._errorHits.set(taskId, counts);
      }
    }
    if (json.noProgress && typeof json.noProgress === 'object') {
      for (const [taskId, n] of Object.entries(json.noProgress)) {
        d._noProgress.set(taskId, Number(n) || 0);
      }
    }
    if (json.startedAt && typeof json.startedAt === 'object') {
      for (const [taskId, ms] of Object.entries(json.startedAt)) {
        const n = Number(ms);
        // An unparseable baseline must NOT collapse to 0 (the epoch): that
        // would make wall_clock trip on the very first check after resume.
        // Leaving it unset lets startTask() establish a fresh baseline.
        if (ms != null && Number.isFinite(n)) d._startedAt.set(taskId, n);
      }
    }
    return d;
  }
}
@@ -0,0 +1,223 @@
1
+ // lib/gsd-supervisor.js
2
+ //
3
+ // COMP-GSD-6 S06: the `--headless` supervisor. An outer loop that owns child
4
+ // run attempts — a self-resuming in-process loop can't survive a hard crash, so
5
+ // the supervisor spawns each attempt and re-spawns on a recoverable non-clean
6
+ // exit with exponential backoff and per-kind attempt caps.
7
+ //
8
+ // Classification is driven by the TERMINAL state.json status (not exit code
9
+ // alone): complete | stuck | budget | failed | crashed | (no-state ⇒ fatal).
10
+ // Budget never auto-resumes unless explicitly opted in (protects the GSD-4
11
+ // ceiling). A crash re-spawns --resume only when resumeReady, else fresh.
12
+
13
+ import { spawn } from 'node:child_process';
14
+ import { join, dirname } from 'node:path';
15
+ import { fileURLToPath } from 'node:url';
16
+
17
+ import { buildGsdQuery, pidAlive, clearGsdPause } from './gsd-state.js';
18
+ import { readHeadlessConfig, backoffMs } from './gsd-headless-config.js';
19
+
20
+ const COMPOSE_BIN = join(dirname(fileURLToPath(import.meta.url)), '..', 'bin', 'compose.js');
21
+
22
+ // Map a derived run status to a recovery decision. Pure — takes the per-kind
23
+ // retry counts already consumed, returns what the loop should do next.
24
+ //
25
+ // Returns one of:
26
+ // { terminal: true, status } — done / non-recoverable
27
+ // { terminal: false, status, kind, mode, capExhausted } — retry (or capped)
28
+ export function classifyOutcome(derivedStatus, state, cfg, counts) {
29
+ switch (derivedStatus) {
30
+ case 'complete':
31
+ return { terminal: true, status: 'complete', ok: true };
32
+ case 'failed':
33
+ // Orderly fatal exit (dirty workspace, parse error) — re-running re-fails.
34
+ return { terminal: true, status: 'failed', ok: false };
35
+ case 'absent':
36
+ // No running checkpoint was ever written → a pre-checkpoint failure
37
+ // (bad args, dirty tree before planning). Non-recoverable by absence.
38
+ return { terminal: true, status: 'fatal', ok: false };
39
+ case 'stuck':
40
+ return retryDecision('stuck', 'stuck', 'resume', cfg, counts);
41
+ case 'budget':
42
+ return retryDecision('budget', 'budget', 'resume', cfg, counts);
43
+ case 'crashed': {
44
+ // resumeReady gates --resume (task graph exists) vs fresh restart
45
+ // (crashed during plan/decompose — nothing merged yet).
46
+ const mode = state?.resumeReady ? 'resume' : 'fresh';
47
+ return retryDecision('crash', 'crashed', mode, cfg, counts);
48
+ }
49
+ default:
50
+ // running with a live pid after the child exited shouldn't happen.
51
+ return { terminal: true, status: derivedStatus ?? 'unknown', ok: false };
52
+ }
53
+ }
54
+
55
+ function retryDecision(policyKey, status, mode, cfg, counts) {
56
+ const policy = cfg.autoResume[policyKey];
57
+ if (!policy || !policy.enabled) {
58
+ return { terminal: true, status, ok: false, reason: 'auto-resume disabled' };
59
+ }
60
+ const used = counts[policyKey] ?? 0;
61
+ if (used >= policy.maxAttempts) {
62
+ return { terminal: true, status, ok: false, reason: 'maxAttempts exhausted', capExhausted: true };
63
+ }
64
+ return { terminal: false, status, kind: policyKey, mode };
65
+ }
66
+
67
+ // Default real spawner — runs a PLAIN `compose gsd <feature> [--resume]` child
68
+ // (NOT --headless: that would recurse into another supervisor). Resolves with
69
+ // { code, signal } on exit.
70
+ function defaultSpawnRun({ feature, resume, cwd, attempt }) {
71
+ return new Promise((resolve) => {
72
+ const args = [COMPOSE_BIN, 'gsd', feature];
73
+ if (resume) args.push('--resume');
74
+ if (cwd) args.push('--cwd', cwd);
75
+ const child = spawn(process.execPath, args, {
76
+ stdio: 'inherit',
77
+ cwd: cwd ?? process.cwd(),
78
+ env: { ...process.env, GSD_HEADLESS_ATTEMPT: String(attempt) },
79
+ });
80
+ child.on('exit', (code, signal) => resolve({ code, signal }));
81
+ child.on('error', (err) => resolve({ code: 1, signal: null, error: err }));
82
+ });
83
+ }
84
+
85
+ const defaultSleep = (ms) => new Promise((r) => setTimeout(r, ms));
86
+
// Abort-aware, unref'd sleep for the watchdog poll: resolves immediately when the
// signal aborts (so a clean child exit doesn't leave the supervisor waiting a full
// poll interval) and never holds the process open (unref).
//
// The abort listener is detached again when the timer fires normally. The
// watchdog calls this once per poll with the SAME signal, so leaving a listener
// behind on every poll would accumulate them for the whole life of the attempt
// (and trip Node's MaxListenersExceededWarning after ~10 polls).
//
// `signal` is optional; without it this is a plain unref'd sleep.
function abortableSleep(ms, signal) {
  return new Promise((resolve) => {
    if (signal?.aborted) {
      resolve();
      return;
    }
    let timer;
    const onAbort = () => {
      clearTimeout(timer);
      resolve();
    };
    timer = setTimeout(() => {
      signal?.removeEventListener('abort', onAbort);
      resolve();
    }, ms);
    if (timer.unref) timer.unref();
    signal?.addEventListener('abort', onAbort, { once: true });
  });
}
98
+
99
+ // COMP-GSD-6-WATCHDOG: poll the child's state.json for a HUNG run — heartbeat
100
+ // frozen on a still-alive pid. Resolves the hung snapshot, or null when aborted
101
+ // (the child exited first). Requires the child's independent heartbeat timer
102
+ // (gsd.js) for `heartbeatStale` to be a sound signal.
103
+ //
104
+ // Confirm-poll: declares hung only after TWO consecutive stale polls with an
105
+ // UNCHANGED heartbeatAt. A host suspend / forward clock jump can momentarily make
106
+ // `now - heartbeatAt > staleMs` true before the just-woken child re-stamps its
107
+ // heartbeat; requiring the heartbeat to stay frozen across two polls lets a
108
+ // healthy child clear the alarm, so only a truly wedged loop is killed.
109
+ export async function defaultWatch({ feature, cwd, cfg, signal, sleep, buildQuery }) {
110
+ const query = buildQuery ?? buildGsdQuery;
111
+ // Default to an abort-aware unref'd sleep so a clean exit ends the poll at once
112
+ // and the timer never holds the process open. Tests inject an instant sleep.
113
+ const nap = sleep ?? ((ms) => abortableSleep(ms, signal));
114
+ let prevHb = null;
115
+ while (!signal.aborted) {
116
+ await nap(cfg.watchdogPollMs);
117
+ if (signal.aborted) return null;
118
+ const s = query(cwd, feature, { staleMs: cfg.heartbeatStaleMs });
119
+ if (s.status === 'running' && s.heartbeatStale) {
120
+ if (prevHb !== null && s.heartbeatAt === prevHb) return s; // frozen across 2 polls → hung
121
+ prevHb = s.heartbeatAt;
122
+ } else {
123
+ prevHb = null; // healthy, or heartbeat advanced (suspend/wake) → reset
124
+ }
125
+ }
126
+ return null;
127
+ }
128
+
129
+ // COMP-GSD-6-WATCHDOG: kill a hung child by pid. SIGTERM, wait the grace window,
130
+ // then SIGKILL if it's still alive. The supervisor never holds the child handle
131
+ // (defaultSpawnRun discards it), so the kill is pid-based. Best-effort: a child
132
+ // that already exited (ESRCH) is fine.
133
+ export async function defaultKillChild(pid, cfg, deps = {}) {
134
+ if (!pid) return;
135
+ const kill = deps.kill ?? process.kill.bind(process);
136
+ const nap = deps.sleep ?? defaultSleep;
137
+ const alive = deps.isAlive ?? pidAlive;
138
+ try { kill(pid, 'SIGTERM'); } catch { /* already gone */ }
139
+ await nap(cfg.watchdogKillGraceMs);
140
+ if (alive(pid)) {
141
+ try { kill(pid, 'SIGKILL'); } catch { /* gone during grace */ }
142
+ }
143
+ }
144
+
/**
 * The `--headless` supervisor loop: spawn a run attempt, classify how it ended,
 * and re-spawn on a recoverable outcome until a terminal outcome or the total
 * attempt backstop is reached.
 *
 * Injectable for tests via `opts`: `config`, `spawnRun`, `sleep`, `watch`,
 * `killChild`, `log`. Also honoured: `cwd`, `resume` (start in resume mode) and
 * `maxTotalAttempts` (default 50).
 *
 * @param {string} feature - feature code passed through to the child run.
 * @param {object} [opts]
 * @returns {Promise<{status: string, ok: boolean, attempts: number, history: object[]}>}
 */
export async function runGsdHeadless(feature, opts = {}) {
  const cwd = opts.cwd ?? process.cwd();
  const cfg = opts.config ?? readHeadlessConfig(cwd);
  const spawnRun = opts.spawnRun ?? defaultSpawnRun;
  const sleep = opts.sleep ?? defaultSleep;
  const watch = opts.watch ?? defaultWatch; // COMP-GSD-6-WATCHDOG
  const killChild = opts.killChild ?? defaultKillChild;
  const log = opts.log ?? ((m) => console.error(`[gsd-headless] ${m}`));
  const maxTotalAttempts = opts.maxTotalAttempts ?? 50; // hard backstop

  const counts = { crash: 0, stuck: 0, budget: 0, hung: 0 };
  const history = [];
  let mode = opts.resume ? 'resume' : 'fresh';
  let attempt = 0;

  while (attempt < maxTotalAttempts) {
    attempt += 1;
    log(`attempt ${attempt} (${mode})`);

    // COMP-GSD-6-WATCHDOG: race the child's exit against the hung watchdog. When
    // the watchdog wins, kill+reap the child and classify as a 'hung' recovery.
    // When the watchdog is disabled, this degrades to the plain `await spawnRun`.
    const exitP = spawnRun({ feature, resume: mode === 'resume', cwd, attempt });
    let exit;
    let snap;
    let outcome;
    const hungPolicy = cfg.autoResume.hung;

    if (hungPolicy && hungPolicy.enabled) {
      const ac = new AbortController();
      // No `sleep` passed → defaultWatch uses its abort-aware unref'd poll sleep.
      const watchP = Promise.resolve(watch({ feature, cwd, cfg, signal: ac.signal })).then(
        (s) => (s ? { type: 'hung', snap: s } : { type: 'idle' }),
        (err) => {
          // A failing watcher must not abandon a child that is still running
          // (nor surface later as an unhandled rejection once exit has won).
          // Treat it as "no verdict" and keep waiting for the child's exit.
          log(`watchdog: poll failed (${err?.message ?? err}) — continuing without it for this attempt`);
          return { type: 'idle' };
        },
      );
      const raced = await Promise.race([
        exitP.then((e) => ({ type: 'exit', exit: e })),
        watchP,
      ]);
      if (raced.type === 'hung') {
        log(`watchdog: hung run (heartbeat frozen, pid ${raced.snap.pid}) — killing`);
        await killChild(raced.snap.pid, cfg);
        exit = await exitP; // reap the killed child
        clearGsdPause(cwd, feature); // crash-bridge uses current state.json
        const m = raced.snap.resumeReady ? 'resume' : 'fresh';
        snap = { status: 'hung' };
        outcome = retryDecision('hung', 'hung', m, cfg, counts);
      } else {
        ac.abort(); // stop the watcher
        // 'idle' means the watcher settled with no verdict BEFORE the child
        // exited — there is no exit record on the race result, so the child
        // still has to be awaited. Only 'exit' carries it directly.
        exit = raced.type === 'exit' ? raced.exit : await exitP;
      }
    } else {
      exit = await exitP; // watchdog off → today's path
    }

    if (!outcome) {
      // Classify via the full query precedence (state.json -> pause.json ->
      // budget.json -> absent), NOT raw state alone, so a pre-dispatch cumulative-
      // budget refusal (budget.json, no state.json) is seen as 'budget', not
      // 'absent'. The snapshot also carries resumeReady for the crashed branch.
      snap = buildGsdQuery(cwd, feature, { staleMs: cfg.heartbeatStaleMs });
      outcome = classifyOutcome(snap.status, snap, cfg, counts);
    }
    history.push({ attempt, mode, exitCode: exit.code ?? null, derived: snap.status, outcome: outcome.status });

    if (outcome.terminal) {
      if (outcome.ok) log(`run complete after ${attempt} attempt(s)`);
      else log(`stopping: ${outcome.status}${outcome.reason ? ` (${outcome.reason})` : ''}`);
      return { status: outcome.status, ok: !!outcome.ok, attempts: attempt, history };
    }

    // Recoverable — consume a retry for this kind, back off, re-spawn.
    counts[outcome.kind] += 1;
    mode = outcome.mode;
    const wait = backoffMs(cfg, counts[outcome.kind]);
    log(`recovering ${outcome.status} via ${mode} (retry ${counts[outcome.kind]}); backoff ${wait}ms`);
    await sleep(wait);
  }

  log(`hit maxTotalAttempts (${maxTotalAttempts}) — giving up`);
  return { status: 'aborted', ok: false, attempts: attempt, history };
}
@@ -0,0 +1,89 @@
1
+ // lib/gsd-timing.js
2
+ //
3
+ // COMP-GSD-7 S1: per-task timing sidecar for the milestone report.
4
+ //
5
+ // Stratum's parallel_poll response does NOT carry per-task timing, and the
6
+ // blackboard is rebuilt from agent-written results/*.json validated against
7
+ // contracts/task-result.json (which forbids extra fields). So compose's own
8
+ // poll-loop observations have nowhere to land on the blackboard. This sidecar
9
+ // is the carrier: `.compose/gsd/<feature>/timing.json` = a plain
10
+ // { [taskId]: { startedAt, completedAt, durationMs } }
11
+ // map, written by the gsd dispatch loop and read by the report assembler.
12
+ //
13
+ // Caveat: completedAt is bounded by the dispatch poll interval (seconds), so
14
+ // durations are approximate-to-poll-granularity. The report footer documents it.
15
+ //
16
+ // Atomic write: tmp+rename, mirrors lib/gsd-state.js:44.
17
+
18
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, renameSync, unlinkSync } from 'node:fs';
19
+ import { join } from 'node:path';
20
+
21
+ const TERMINAL_STATES = new Set(['complete', 'failed', 'cancelled']);
22
+
23
+ function gsdDir(cwd, featureCode) {
24
+ return join(cwd, '.compose', 'gsd', featureCode);
25
+ }
26
+
27
+ export function timingSidecarPath(cwd, featureCode) {
28
+ return join(gsdDir(cwd, featureCode), 'timing.json');
29
+ }
30
+
31
+ // Atomic write: write to a tmp sibling, rename into place. Clears a stale tmp
32
+ // from a previously-crashed write first.
33
+ export function writeTimingSidecar(cwd, featureCode, timingMap) {
34
+ const dir = gsdDir(cwd, featureCode);
35
+ mkdirSync(dir, { recursive: true });
36
+ const target = timingSidecarPath(cwd, featureCode);
37
+ const tmp = `${target}.tmp`;
38
+ if (existsSync(tmp)) {
39
+ try { unlinkSync(tmp); } catch { /* ignore */ }
40
+ }
41
+ writeFileSync(tmp, JSON.stringify(timingMap, null, 2));
42
+ renameSync(tmp, target);
43
+ return target;
44
+ }
45
+
// Read-or-{}. A missing file, corrupt/unreadable JSON, or JSON that parses to
// anything other than a plain object (null, an array, a bare number/string)
// degrades to {} — the report renders an empty timing column rather than
// failing. Callers index the result by task id, so a non-object would throw
// (null) or silently misbehave (array) downstream.
export function readTimingSidecar(cwd, featureCode) {
  const p = timingSidecarPath(cwd, featureCode);
  if (!existsSync(p)) return {};
  try {
    const parsed = JSON.parse(readFileSync(p, 'utf-8'));
    const isTimingMap = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isTimingMap ? parsed : {};
  } catch {
    return {};
  }
}
57
+
/**
 * Pure poll accumulator. Called once per poll with the current task-state map.
 *
 * - First sight of a task → record `startedAt = nowIso`.
 * - First time a task is seen in a terminal state (complete|failed|cancelled) →
 *   record `completedAt = nowIso` and `durationMs` (completedAt − startedAt,
 *   floored at 0; 0 when either timestamp does not parse). A task first seen
 *   already-terminal gets startedAt==completedAt and durationMs 0.
 *
 * Idempotent: an existing startedAt/completedAt is never overwritten (first
 * observation wins), so re-polling the same terminal task is a no-op.
 *
 * Only the map's OWN entries count as "already seen". `timingMap` is a plain
 * object used as a dictionary, so a bare `timingMap[taskId]` lookup would
 * resolve ids like "constructor" or "toString" to inherited functions — the
 * task would never get an entry and the timing fields would be written onto
 * the shared built-in instead.
 *
 * @param {object} timingMap - { [taskId]: { startedAt, completedAt, durationMs } }
 * @param {object} pollTasks - { [taskId]: { state } } from the current poll
 * @param {string} nowIso - timestamp to stamp new observations with
 * @returns the same `timingMap` reference (mutated in place).
 */
export function recordTaskStates(timingMap, pollTasks, nowIso) {
  if (!pollTasks || typeof pollTasks !== 'object') return timingMap;
  for (const [taskId, ts] of Object.entries(pollTasks)) {
    const state = ts?.state;
    const seen = Object.prototype.hasOwnProperty.call(timingMap, taskId);
    let entry = seen ? timingMap[taskId] : undefined;
    if (!entry) {
      entry = { startedAt: nowIso };
      timingMap[taskId] = entry;
    }
    if (entry.completedAt == null && TERMINAL_STATES.has(state)) {
      entry.completedAt = nowIso;
      const start = Date.parse(entry.startedAt);
      const end = Date.parse(nowIso);
      entry.durationMs = Number.isNaN(start) || Number.isNaN(end) ? 0 : Math.max(0, end - start);
    }
  }
  return timingMap;
}