@smartmemory/compose 0.2.8-beta → 0.2.9-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/compose.js +75 -1
- package/contracts/gsd-state.json +140 -0
- package/dist/assets/{App-D3ehVPvi.js → App-CG-2euMe.js} +164 -164
- package/dist/assets/{arc-Dmf69iHG.js → arc-7QBWoLra.js} +1 -1
- package/dist/assets/{architectureDiagram-3BPJPVTR-xYo993Yw.js → architectureDiagram-3BPJPVTR-CUw-7uLm.js} +1 -1
- package/dist/assets/{blockDiagram-GPEHLZMM-UX4EF98O.js → blockDiagram-GPEHLZMM-COU1vmr7.js} +1 -1
- package/dist/assets/{c4Diagram-AAUBKEIU-DaP9CGWb.js → c4Diagram-AAUBKEIU-XPO9PSJL.js} +1 -1
- package/dist/assets/channel-Bcu04MIK.js +1 -0
- package/dist/assets/{chunk-2J33WTMH-CKk_RN3A.js → chunk-2J33WTMH-zMzVB2a6.js} +1 -1
- package/dist/assets/{chunk-4BX2VUAB-DboAwYKw.js → chunk-4BX2VUAB-Kke_qcHU.js} +1 -1
- package/dist/assets/{chunk-55IACEB6-Dsy9RYvI.js → chunk-55IACEB6-hMeFx5Nh.js} +1 -1
- package/dist/assets/{chunk-727SXJPM-fAH0QO9v.js → chunk-727SXJPM-DesUnrEw.js} +1 -1
- package/dist/assets/{chunk-AQP2D5EJ-DyZYerFP.js → chunk-AQP2D5EJ-1uGGvkxW.js} +1 -1
- package/dist/assets/{chunk-FMBD7UC4-BnboGO5t.js → chunk-FMBD7UC4-DYHv1PcZ.js} +1 -1
- package/dist/assets/{chunk-ND2GUHAM-Di9tYXme.js → chunk-ND2GUHAM-D0MENOLX.js} +1 -1
- package/dist/assets/{chunk-QZHKN3VN-zRPRlAIL.js → chunk-QZHKN3VN-8nn3HP-N.js} +1 -1
- package/dist/assets/classDiagram-4FO5ZUOK-DU4yxldU.js +1 -0
- package/dist/assets/classDiagram-v2-Q7XG4LA2-DU4yxldU.js +1 -0
- package/dist/assets/{cose-bilkent-S5V4N54A-C7Hqukaf.js → cose-bilkent-S5V4N54A-BoZPVIny.js} +1 -1
- package/dist/assets/{dagre-BM42HDAG-B-cR-BjI.js → dagre-BM42HDAG-BgZzdLG9.js} +1 -1
- package/dist/assets/{diagram-2AECGRRQ-B6-5onDk.js → diagram-2AECGRRQ-CknAnpSu.js} +1 -1
- package/dist/assets/{diagram-5GNKFQAL-DoZZgFAM.js → diagram-5GNKFQAL-CZUEbKim.js} +1 -1
- package/dist/assets/{diagram-KO2AKTUF-77jEGlJh.js → diagram-KO2AKTUF-DCs-pLdH.js} +1 -1
- package/dist/assets/{diagram-LMA3HP47-D3S7XDRD.js → diagram-LMA3HP47-lRaDjIfM.js} +1 -1
- package/dist/assets/{diagram-OG6HWLK6-KbYL9aCY.js → diagram-OG6HWLK6-CIGqmehP.js} +1 -1
- package/dist/assets/{erDiagram-TEJ5UH35-DezFbJP-.js → erDiagram-TEJ5UH35-Lx3c2N6F.js} +1 -1
- package/dist/assets/{flowDiagram-I6XJVG4X-4x31cK9j.js → flowDiagram-I6XJVG4X-VoluKqSq.js} +1 -1
- package/dist/assets/{ganttDiagram-6RSMTGT7-FopfSTyZ.js → ganttDiagram-6RSMTGT7-D7hETiNZ.js} +1 -1
- package/dist/assets/{gitGraphDiagram-PVQCEYII-DSiQGKbN.js → gitGraphDiagram-PVQCEYII-DenEcUvY.js} +1 -1
- package/dist/assets/{index-ClX6LVAf.js → index-B4dv3acY.js} +2 -2
- package/dist/assets/{infoDiagram-5YYISTIA-DE6BqzK_.js → infoDiagram-5YYISTIA-v7cq9Er9.js} +1 -1
- package/dist/assets/{ishikawaDiagram-YF4QCWOH-Dml8NwQI.js → ishikawaDiagram-YF4QCWOH-CfCCXt2x.js} +1 -1
- package/dist/assets/{journeyDiagram-JHISSGLW-CwWeJgjE.js → journeyDiagram-JHISSGLW-Bbokl_xO.js} +1 -1
- package/dist/assets/{kanban-definition-UN3LZRKU-DnG956Wh.js → kanban-definition-UN3LZRKU-DhkOZ2hg.js} +1 -1
- package/dist/assets/{linear-CA3N7Rpi.js → linear-bHjluRm2.js} +1 -1
- package/dist/assets/{mindmap-definition-RKZ34NQL-CxfIOjLX.js → mindmap-definition-RKZ34NQL-C1bHpoXH.js} +1 -1
- package/dist/assets/{pieDiagram-4H26LBE5-O7aIwy1x.js → pieDiagram-4H26LBE5-CZb1i55T.js} +1 -1
- package/dist/assets/{quadrantDiagram-W4KKPZXB-CPQ2qq7c.js → quadrantDiagram-W4KKPZXB-o37AwRHB.js} +1 -1
- package/dist/assets/{requirementDiagram-4Y6WPE33-C23horL4.js → requirementDiagram-4Y6WPE33-BVErWDzU.js} +1 -1
- package/dist/assets/{sankeyDiagram-5OEKKPKP-DPY04kOW.js → sankeyDiagram-5OEKKPKP-BhBK8gHQ.js} +1 -1
- package/dist/assets/{sequenceDiagram-3UESZ5HK-BKaTfIvo.js → sequenceDiagram-3UESZ5HK-CsICF23P.js} +1 -1
- package/dist/assets/{stateDiagram-AJRCARHV-B9na_6mY.js → stateDiagram-AJRCARHV-TN1AXwim.js} +1 -1
- package/dist/assets/stateDiagram-v2-BHNVJYJU-BLR6AkKX.js +1 -0
- package/dist/assets/{timeline-definition-PNZ67QCA-BBWPqd7X.js → timeline-definition-PNZ67QCA-DftAajbU.js} +1 -1
- package/dist/assets/{vennDiagram-CIIHVFJN-tWqiHsOZ.js → vennDiagram-CIIHVFJN-cFTMstT7.js} +1 -1
- package/dist/assets/{wardley-L42UT6IY-DorxG6os.js → wardley-L42UT6IY-DL8CivzO.js} +1 -1
- package/dist/assets/{wardleyDiagram-YWT4CUSO-B49f8GzW.js → wardleyDiagram-YWT4CUSO-BDZT1hQj.js} +1 -1
- package/dist/assets/{xychartDiagram-2RQKCTM6-BgKSj8Qb.js → xychartDiagram-2RQKCTM6-DQQSkfC4.js} +1 -1
- package/dist/index.html +1 -1
- package/lib/build.js +36 -2
- package/lib/gsd-diff-capture.js +34 -0
- package/lib/gsd-events.js +61 -0
- package/lib/gsd-headless-config.js +110 -0
- package/lib/gsd-milestone-report.js +323 -0
- package/lib/gsd-state.js +165 -0
- package/lib/gsd-supervisor.js +223 -0
- package/lib/gsd-timing.js +89 -0
- package/lib/gsd.js +446 -45
- package/package.json +1 -1
- package/dist/assets/channel-D_RXsFFT.js +0 -1
- package/dist/assets/classDiagram-4FO5ZUOK-K6wdB4ic.js +0 -1
- package/dist/assets/classDiagram-v2-Q7XG4LA2-K6wdB4ic.js +0 -1
- package/dist/assets/stateDiagram-v2-BHNVJYJU-Cf84VDiH.js +0 -1
package/lib/gsd-state.js
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
// lib/gsd-state.js
|
|
2
|
+
//
|
|
3
|
+
// COMP-GSD-6 S01: continuous gsd run-state checkpoint (.compose/gsd/<f>/state.json).
|
|
4
|
+
//
|
|
5
|
+
// The load-bearing primitive for headless crash recovery, `compose gsd query`,
|
|
6
|
+
// and the live-run lock. A run flushes this file continuously (init pre-plan,
|
|
7
|
+
// per-task heartbeat, post-decompose, terminal); a hard crash leaves the last
|
|
8
|
+
// checkpoint with status:"running" + a dead pid, which readers derive as
|
|
9
|
+
// "crashed". Plain JSON, atomic tmp+rename — no SQLite (mirrors writeActiveBuild
|
|
10
|
+
// in lib/build.js). `pidAlive` lives here canonically (gsd.js imports it) to
|
|
11
|
+
// keep the gsd.js <-> gsd-state.js dependency one-directional.
|
|
12
|
+
|
|
13
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync, renameSync, rmSync } from 'node:fs';
|
|
14
|
+
import { join } from 'node:path';
|
|
15
|
+
|
|
16
|
+
// Default heartbeat-staleness threshold in milliseconds (90 seconds), used by
// the readers below whenever the caller does not pass `staleMs`.
const DEFAULT_STALE_MS = 90 * 1000;
|
|
17
|
+
|
|
18
|
+
// Directory holding one feature's gsd artifacts: <cwd>/.compose/gsd/<featureCode>.
function gsdDir(cwd, featureCode) {
  const segments = [cwd, '.compose', 'gsd', featureCode];
  return join(...segments);
}
|
|
21
|
+
|
|
22
|
+
// Path of the run-state checkpoint (state.json) inside the feature's gsd directory.
export function gsdStatePath(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  return join(featureDir, 'state.json');
}
|
|
25
|
+
|
|
26
|
+
// COMP-GSD-6-WATCHDOG: delete the feature's pause.json if it exists, ignoring
// any failure. The supervisor calls this after killing a hung child so that the
// resume crash-bridge (loadResumeTaskGraph, which prefers pause.json when it is
// present) recovers from the current state.json instead. The helper lives in
// this module because gsd-state owns the gsd directory layout, which spares the
// supervisor an import of the heavy gsd.js.
export function clearGsdPause(cwd, featureCode) {
  const pausePath = join(gsdDir(cwd, featureCode), 'pause.json');
  try {
    if (!existsSync(pausePath)) return;
    rmSync(pausePath, { force: true });
  } catch {
    // best-effort: a failed removal must never break the caller
  }
}
|
|
35
|
+
|
|
36
|
+
// COMP-GSD-7-EVENTLOG: delete halt artifacts left behind by an earlier run
// (stuck.json, stuck.md, budget.json, budget.md), ignoring any failure. A fresh
// run clears them at its planning checkpoint so that the milestone report's
// timeline, and the snapshot fallback that reads these files, describe only the
// current run. Without this they can linger, because a clean complete removes
// only pause.json.
export function clearGsdHaltArtifacts(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  const haltFiles = ['stuck.json', 'stuck.md', 'budget.json', 'budget.md'];
  haltFiles
    .map((fileName) => join(featureDir, fileName))
    .forEach((artifactPath) => {
      try {
        if (existsSync(artifactPath)) rmSync(artifactPath, { force: true });
      } catch {
        // best-effort: keep going with the remaining artifacts
      }
    });
}
|
|
48
|
+
|
|
49
|
+
// Authoritative liveness probe. Signal 0 checks existence without delivering a
// signal. EPERM => the process exists but isn't ours (still alive) — this is the
// semantics crash detection needs (cf. build.js isProcessAlive, which returns
// false on EPERM and is therefore wrong for this purpose).
//
// Only a positive integer identifies a single process. Zero and negative values
// address process groups (and -1 addresses every signalable process), so a
// corrupt or hand-edited pid must never be passed to process.kill: such a value
// would be reported "alive" for reasons unrelated to the run being probed.
//
// Returns true when the process exists, false otherwise (including for any
// value that is not a positive integer).
export function pidAlive(pid) {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // ESRCH (no such process) and argument errors both mean "not alive".
    return err.code === 'EPERM';
  }
}
|
|
62
|
+
|
|
63
|
+
// Persist a checkpoint atomically: copy `state`, stamp `heartbeatAt` with the
// current ISO time, write the JSON to a `.tmp` sibling, then rename it over
// state.json. The feature directory is created if missing.
//
// `pid` is whatever the caller put in `state` (the runtime sets process.pid;
// tests inject a synthetic pid) — unlike writeActiveBuild, process.pid is NOT
// forced here.
//
// Returns the object exactly as persisted (including heartbeatAt).
export function writeGsdState(cwd, featureCode, state) {
  mkdirSync(gsdDir(cwd, featureCode), { recursive: true });

  const heartbeatAt = new Date().toISOString();
  const persisted = { ...state, heartbeatAt };

  const finalPath = gsdStatePath(cwd, featureCode);
  const stagingPath = `${finalPath}.tmp`;
  const payload = JSON.stringify(persisted, null, 2);
  writeFileSync(stagingPath, payload);
  renameSync(stagingPath, finalPath);

  return persisted;
}
|
|
77
|
+
|
|
78
|
+
// Load the feature's state.json checkpoint. Returns the parsed JSON value, or
// null when the file does not exist or cannot be read/parsed.
export function readGsdState(cwd, featureCode) {
  const statePath = gsdStatePath(cwd, featureCode);
  if (existsSync(statePath)) {
    try {
      const raw = readFileSync(statePath, 'utf-8');
      return JSON.parse(raw);
    } catch {
      // unreadable or corrupt checkpoint — reported the same as a missing one
    }
  }
  return null;
}
|
|
87
|
+
|
|
88
|
+
// Reader-side status derivation. A dead pid is the ONLY crash signal; a stale
// heartbeat on a live pid is merely advisory (heartbeatStale) and never a crash
// verdict, because a healthy long task legitimately sits in the dispatch poll
// loop.
//
// Options: `staleMs` (staleness threshold, default DEFAULT_STALE_MS) and `now`
// (epoch ms to compare against, default Date.now()).
//
// Returns { status, heartbeatStale }:
//   null state / no status          -> { absent, false }
//   any status other than "running" -> { <that status>, false }  (complete|stuck|budget|failed)
//   running + dead pid              -> { crashed, false }
//   running + live pid + fresh hb   -> { running, false }
//   running + live pid + stale hb   -> { running, true }
export function deriveRunStatus(state, { staleMs = DEFAULT_STALE_MS, now = Date.now() } = {}) {
  const recorded = state ? state.status : undefined;
  if (!recorded) {
    return { status: 'absent', heartbeatStale: false };
  }
  if (recorded !== 'running') {
    return { status: recorded, heartbeatStale: false };
  }
  if (!pidAlive(state.pid)) {
    return { status: 'crashed', heartbeatStale: false };
  }

  // A missing or unparseable heartbeat is never considered stale.
  let heartbeatStale = false;
  if (state.heartbeatAt) {
    const stampedAt = Date.parse(state.heartbeatAt);
    heartbeatStale = !Number.isNaN(stampedAt) && now - stampedAt > staleMs;
  }
  return { status: 'running', heartbeatStale };
}
|
|
110
|
+
|
|
111
|
+
// COMP-GSD-6: assemble the `compose gsd query` snapshot (contracts/
// gsd-state.json#/definitions/query) from the files in the feature's gsd
// directory. Sources are consulted in a fixed order —
//   state.json -> pause.json -> budget.json -> absent
// — so that a pre-dispatch cumulative-budget refusal (budget.json written, no
// state.json) is reported as 'budget' rather than 'absent'. A pause.json or
// budget.json that cannot be read or parsed is skipped and the next source is
// tried. Only synchronous file reads are performed — no LLM/server/Stratum.
//
// Options `staleMs` and `now` are forwarded to deriveRunStatus.
export function buildGsdQuery(cwd, featureCode, { staleMs = DEFAULT_STALE_MS, now = Date.now() } = {}) {
  const featureDir = gsdDir(cwd, featureCode);

  // 1. state.json — the live checkpoint wins whenever it parses to something truthy.
  const checkpoint = readGsdState(cwd, featureCode);
  if (checkpoint) {
    const derived = deriveRunStatus(checkpoint, { staleMs, now });
    const lengthOf = (list) => (Array.isArray(list) ? list.length : 0);
    return {
      feature: featureCode,
      status: derived.status,
      phase: checkpoint.phase ?? null,
      heartbeatStale: derived.heartbeatStale,
      progress: {
        completed: lengthOf(checkpoint.completedTaskIds),
        total: lengthOf(checkpoint.decomposedTasks),
      },
      resumeReady: !!checkpoint.resumeReady,
      pid: checkpoint.pid ?? null,
      flowId: checkpoint.flowId ?? null,
      heartbeatAt: checkpoint.heartbeatAt ?? null,
      budget: checkpoint.budget ?? null,
    };
  }

  // 2. pause.json (paused, no live state.json — e.g. a pre-GSD-6 halt).
  const pauseFile = join(featureDir, 'pause.json');
  if (existsSync(pauseFile)) {
    try {
      const { kind: rawKind, detail, stuckTaskId } = JSON.parse(readFileSync(pauseFile, 'utf-8'));
      // Anything other than an explicit 'budget' pause is reported as 'stuck'.
      const kind = rawKind === 'budget' ? 'budget' : 'stuck';
      return {
        feature: featureCode,
        status: kind,
        phase: 'execute',
        pause: { kind, detail: detail ?? null, stuckTaskId: stuckTaskId ?? null },
      };
    } catch {
      // fall through to the next source
    }
  }

  // 3. budget.json (pre-dispatch cumulative refusal — writes budget.json, no state).
  const budgetFile = join(featureDir, 'budget.json');
  if (existsSync(budgetFile)) {
    try {
      const budget = JSON.parse(readFileSync(budgetFile, 'utf-8'));
      return { feature: featureCode, status: 'budget', budget };
    } catch {
      // fall through to the next source
    }
  }

  // 4. Nothing usable on disk.
  return { feature: featureCode, status: 'absent' };
}
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
// lib/gsd-supervisor.js
|
|
2
|
+
//
|
|
3
|
+
// COMP-GSD-6 S06: the `--headless` supervisor. An outer loop that owns child
|
|
4
|
+
// run attempts — a self-resuming in-process loop can't survive a hard crash, so
|
|
5
|
+
// the supervisor spawns each attempt and re-spawns on a recoverable non-clean
|
|
6
|
+
// exit with exponential backoff and per-kind attempt caps.
|
|
7
|
+
//
|
|
8
|
+
// Classification is driven by the TERMINAL state.json status (not exit code
|
|
9
|
+
// alone): complete | stuck | budget | failed | crashed | (no-state ⇒ fatal).
|
|
10
|
+
// Budget never auto-resumes unless explicitly opted in (protects the GSD-4
|
|
11
|
+
// ceiling). A crash re-spawns --resume only when resumeReady, else fresh.
|
|
12
|
+
|
|
13
|
+
import { spawn } from 'node:child_process';
|
|
14
|
+
import { join, dirname } from 'node:path';
|
|
15
|
+
import { fileURLToPath } from 'node:url';
|
|
16
|
+
|
|
17
|
+
import { buildGsdQuery, pidAlive, clearGsdPause } from './gsd-state.js';
|
|
18
|
+
import { readHeadlessConfig, backoffMs } from './gsd-headless-config.js';
|
|
19
|
+
|
|
20
|
+
// Path of the compose CLI entry script, resolved relative to this module's own
// directory: <this dir>/../bin/compose.js.
const COMPOSE_BIN = join(
  dirname(fileURLToPath(import.meta.url)),
  '..',
  'bin',
  'compose.js',
);
|
|
21
|
+
|
|
22
|
+
// Translate a derived run status into the supervisor's next move. Pure: it
// looks only at its arguments (`counts` holds the retries already consumed per
// kind) and performs no I/O.
//
// Returns one of:
//   { terminal: true, status, ok, reason?, capExhausted? }  — stop (done or non-recoverable)
//   { terminal: false, status, kind, mode }                  — retry `kind` in `mode`
//
// Mapping:
//   complete -> terminal, ok
//   failed   -> terminal; an orderly fatal exit (dirty workspace, parse error)
//               would simply fail again on a re-run
//   absent   -> terminal 'fatal'; no checkpoint was ever written, i.e. a
//               pre-checkpoint failure (bad args, dirty tree before planning)
//   stuck    -> retry policy 'stuck', resume mode
//   budget   -> retry policy 'budget', resume mode
//   crashed  -> retry policy 'crash'; resume when the snapshot is resumeReady
//               (a task graph exists), otherwise a fresh restart (the crash
//               happened during plan/decompose — nothing merged yet)
//   other    -> terminal with that status ('unknown' when nullish); e.g.
//               'running' with a live pid after the child exited, which
//               shouldn't happen
export function classifyOutcome(derivedStatus, state, cfg, counts) {
  if (derivedStatus === 'complete') {
    return { terminal: true, status: 'complete', ok: true };
  }
  if (derivedStatus === 'failed') {
    return { terminal: true, status: 'failed', ok: false };
  }
  if (derivedStatus === 'absent') {
    return { terminal: true, status: 'fatal', ok: false };
  }
  if (derivedStatus === 'stuck' || derivedStatus === 'budget') {
    // For these two the policy key and the reported status share one name.
    return retryDecision(derivedStatus, derivedStatus, 'resume', cfg, counts);
  }
  if (derivedStatus === 'crashed') {
    const restartMode = state?.resumeReady ? 'resume' : 'fresh';
    return retryDecision('crash', 'crashed', restartMode, cfg, counts);
  }
  return { terminal: true, status: derivedStatus ?? 'unknown', ok: false };
}
|
|
54
|
+
|
|
55
|
+
// Decide whether a recoverable outcome may be retried under the auto-resume
// policy `cfg.autoResume[policyKey]`.
//
//   policy missing or not enabled         -> terminal, reason 'auto-resume disabled'
//   counts[policyKey] >= maxAttempts      -> terminal, reason 'maxAttempts exhausted',
//                                            capExhausted: true
//   otherwise                             -> { terminal: false, status, kind: policyKey, mode }
//
// A missing counts entry is treated as zero retries used.
function retryDecision(policyKey, status, mode, cfg, counts) {
  const policy = cfg.autoResume[policyKey];
  const stop = (reason, extra = {}) => ({ terminal: true, status, ok: false, reason, ...extra });

  if (!policy?.enabled) {
    return stop('auto-resume disabled');
  }

  const consumed = counts[policyKey] ?? 0;
  if (consumed >= policy.maxAttempts) {
    return stop('maxAttempts exhausted', { capExhausted: true });
  }

  return { terminal: false, status, kind: policyKey, mode };
}
|
|
66
|
+
|
|
67
|
+
// Default real spawner: launches a PLAIN `compose gsd <feature>` child under the
// current Node executable, adding `--resume` and `--cwd <cwd>` when those values
// are truthy. It is deliberately NOT `--headless`, which would recurse into
// another supervisor. The child inherits stdio and receives the attempt number
// in the GSD_HEADLESS_ATTEMPT environment variable.
//
// The returned promise never rejects for spawn/exit events: it resolves with
// { code, signal } on exit, or { code: 1, signal: null, error } if the child
// emits 'error'.
function defaultSpawnRun({ feature, resume, cwd, attempt }) {
  return new Promise((resolve) => {
    const argv = [
      COMPOSE_BIN,
      'gsd',
      feature,
      ...(resume ? ['--resume'] : []),
      ...(cwd ? ['--cwd', cwd] : []),
    ];
    const spawnOptions = {
      stdio: 'inherit',
      cwd: cwd ?? process.cwd(),
      env: { ...process.env, GSD_HEADLESS_ATTEMPT: String(attempt) },
    };
    const child = spawn(process.execPath, argv, spawnOptions);
    child.on('exit', (code, signal) => {
      resolve({ code, signal });
    });
    child.on('error', (err) => {
      resolve({ code: 1, signal: null, error: err });
    });
  });
}
|
|
84
|
+
|
|
85
|
+
// Promise-based delay: resolves with no value once `ms` milliseconds have passed.
const defaultSleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
|
86
|
+
|
|
87
|
+
// Abort-aware, unref'd sleep for the watchdog poll. Resolves (with no value)
// after `ms` milliseconds, or immediately when `signal` is already aborted or
// becomes aborted while waiting — so a clean child exit doesn't leave the
// supervisor waiting a full poll interval. The timer is unref'd so it never
// holds the process open. `signal` is optional.
//
// The abort listener is detached as soon as the timer fires. The watchdog calls
// this once per poll with the SAME signal, so leaving the listener attached
// would pile up one listener per poll for the lifetime of the run (and trip
// Node's max-listeners warning on the AbortSignal).
function abortableSleep(ms, signal) {
  return new Promise((resolve) => {
    if (signal?.aborted) {
      resolve();
      return;
    }
    let timer;
    const onAbort = () => {
      clearTimeout(timer);
      resolve();
    };
    timer = setTimeout(() => {
      signal?.removeEventListener('abort', onAbort);
      resolve();
    }, ms);
    if (timer.unref) timer.unref();
    signal?.addEventListener('abort', onAbort, { once: true });
  });
}
|
|
98
|
+
|
|
99
|
+
// COMP-GSD-6-WATCHDOG: poll the run snapshot for a HUNG run — heartbeat frozen
// on a still-alive pid. Resolves with the hung snapshot, or with null once
// `signal` is aborted (the child exited first). `heartbeatStale` is only a
// sound signal if the child keeps its own independent heartbeat timer (gsd.js).
//
// Confirm-poll: a run is declared hung only after TWO consecutive stale polls
// that report the SAME heartbeatAt. A host suspend or forward clock jump can
// briefly make `now - heartbeatAt > staleMs` true before the just-woken child
// re-stamps its heartbeat; insisting that the heartbeat stay frozen across two
// polls lets a healthy child clear the alarm, so only a truly wedged loop is
// reported.
//
// `sleep` and `buildQuery` are optional injection points. Without `sleep` the
// poll uses an abort-aware unref'd sleep, so a clean exit ends the poll at once
// and the timer never holds the process open; tests inject an instant sleep.
export async function defaultWatch({ feature, cwd, cfg, signal, sleep, buildQuery }) {
  const snapshotOf = buildQuery ?? buildGsdQuery;
  const pause = sleep ?? ((ms) => abortableSleep(ms, signal));

  // heartbeatAt seen on the previous poll if that poll was stale, else null.
  let frozenHeartbeat = null;

  for (;;) {
    if (signal.aborted) break;
    await pause(cfg.watchdogPollMs);
    if (signal.aborted) break;

    const snap = snapshotOf(cwd, feature, { staleMs: cfg.heartbeatStaleMs });
    const looksStale = snap.status === 'running' && snap.heartbeatStale;
    if (!looksStale) {
      // healthy, or the heartbeat advanced (suspend/wake) → start over
      frozenHeartbeat = null;
      continue;
    }
    if (frozenHeartbeat !== null && snap.heartbeatAt === frozenHeartbeat) {
      return snap; // frozen across 2 polls → hung
    }
    frozenHeartbeat = snap.heartbeatAt;
  }
  return null;
}
|
|
128
|
+
|
|
129
|
+
// COMP-GSD-6-WATCHDOG: kill a hung child by pid. Sends SIGTERM, waits
// cfg.watchdogKillGraceMs, then sends SIGKILL if the pid is still alive. The
// supervisor never holds the child handle (defaultSpawnRun discards it), so the
// kill is pid-based. Best-effort: errors from either signal (e.g. ESRCH because
// the child already exited) are ignored.
//
// The pid comes from a state.json snapshot, so it is validated first: only a
// positive integer names a single process. Zero or a negative value would
// signal a whole process group (-1: every signalable process), so any such
// value — like a missing pid — makes this a no-op.
//
// `deps` optionally overrides `kill`, `sleep` and `isAlive` (used by tests).
export async function defaultKillChild(pid, cfg, deps = {}) {
  if (!Number.isInteger(pid) || pid <= 0) return;
  const kill = deps.kill ?? process.kill.bind(process);
  const nap = deps.sleep ?? defaultSleep;
  const alive = deps.isAlive ?? pidAlive;
  try {
    kill(pid, 'SIGTERM');
  } catch {
    /* already gone */
  }
  await nap(cfg.watchdogKillGraceMs);
  if (alive(pid)) {
    try {
      kill(pid, 'SIGKILL');
    } catch {
      /* gone during grace */
    }
  }
}
|
|
144
|
+
|
|
145
|
+
// The supervisor loop: spawn a run attempt, classify how it ended, then either
// stop or consume a retry, back off and re-spawn.
//
// opts (all optional; the function-valued ones are injectable for tests):
//   cwd              — project root (default process.cwd())
//   config           — headless config (default readHeadlessConfig(cwd))
//   resume           — run the first attempt in 'resume' mode instead of 'fresh'
//   spawnRun, sleep, watch, killChild, log — replaceable collaborators
//   maxTotalAttempts — hard backstop on spawned attempts (default 50)
//
// Returns { status, ok, attempts, history }. `status` is the terminal outcome's
// status, or 'aborted' when maxTotalAttempts is reached. `history` holds one
// { attempt, mode, exitCode, derived, outcome } record per attempt.
export async function runGsdHeadless(feature, opts = {}) {
  const cwd = opts.cwd ?? process.cwd();
  const cfg = opts.config ?? readHeadlessConfig(cwd);
  const spawnRun = opts.spawnRun ?? defaultSpawnRun;
  const sleep = opts.sleep ?? defaultSleep;
  const watch = opts.watch ?? defaultWatch; // COMP-GSD-6-WATCHDOG
  const killChild = opts.killChild ?? defaultKillChild;
  const log = opts.log ?? ((m) => console.error(`[gsd-headless] ${m}`));
  const maxTotalAttempts = opts.maxTotalAttempts ?? 50; // hard backstop

  // Retries consumed so far, per recovery kind.
  const counts = { crash: 0, stuck: 0, budget: 0, hung: 0 };
  const history = [];
  let mode = opts.resume ? 'resume' : 'fresh';
  let attempt = 0;

  while (attempt < maxTotalAttempts) {
    attempt += 1;
    log(`attempt ${attempt} (${mode})`);

    // COMP-GSD-6-WATCHDOG: race the child's exit against the hung watchdog. When
    // the watchdog wins, kill+reap the child and classify as a 'hung' recovery.
    // When the watchdog is disabled, this degrades to the plain `await spawnRun`.
    const exitP = spawnRun({ feature, resume: mode === 'resume', cwd, attempt });
    let exit;
    let snap;
    let outcome;
    const hungPolicy = cfg.autoResume.hung;

    if (hungPolicy && hungPolicy.enabled) {
      const ac = new AbortController();
      // No `sleep` passed → defaultWatch uses its abort-aware unref'd poll sleep.
      const watchP = watch({ feature, cwd, cfg, signal: ac.signal });
      const raced = await Promise.race([
        exitP.then((e) => ({ type: 'exit', exit: e })),
        watchP.then((s) => (s ? { type: 'hung', snap: s } : { type: 'idle' })),
      ]);
      if (raced.type === 'hung') {
        log(`watchdog: hung run (heartbeat frozen, pid ${raced.snap.pid}) — killing`);
        await killChild(raced.snap.pid, cfg);
        exit = await exitP; // reap the killed child
        clearGsdPause(cwd, feature); // crash-bridge uses current state.json
        const restartMode = raced.snap.resumeReady ? 'resume' : 'fresh';
        snap = { status: 'hung' };
        outcome = retryDecision('hung', 'hung', restartMode, cfg, counts);
      } else {
        ac.abort(); // stop the watcher
        // 'idle' means the watcher ended with no verdict BEFORE the child
        // exited, so there is no exit result yet — keep waiting for the child
        // rather than carrying on with an undefined `exit`.
        exit = raced.type === 'exit' ? raced.exit : await exitP;
      }
    } else {
      exit = await exitP; // watchdog off → today's path
    }

    if (!outcome) {
      // Classify via the full query precedence (state.json -> pause.json ->
      // budget.json -> absent), NOT raw state alone, so a pre-dispatch cumulative-
      // budget refusal (budget.json, no state.json) is seen as 'budget', not
      // 'absent'. The snapshot also carries resumeReady for the crashed branch.
      snap = buildGsdQuery(cwd, feature, { staleMs: cfg.heartbeatStaleMs });
      outcome = classifyOutcome(snap.status, snap, cfg, counts);
    }
    history.push({ attempt, mode, exitCode: exit.code ?? null, derived: snap.status, outcome: outcome.status });

    if (outcome.terminal) {
      if (outcome.ok) log(`run complete after ${attempt} attempt(s)`);
      else log(`stopping: ${outcome.status}${outcome.reason ? ` (${outcome.reason})` : ''}`);
      return { status: outcome.status, ok: !!outcome.ok, attempts: attempt, history };
    }

    // Recoverable — consume a retry for this kind, back off, re-spawn.
    counts[outcome.kind] += 1;
    mode = outcome.mode;
    const wait = backoffMs(cfg, counts[outcome.kind]);
    log(`recovering ${outcome.status} via ${mode} (retry ${counts[outcome.kind]}); backoff ${wait}ms`);
    await sleep(wait);
  }

  log(`hit maxTotalAttempts (${maxTotalAttempts}) — giving up`);
  return { status: 'aborted', ok: false, attempts: attempt, history };
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// lib/gsd-timing.js
|
|
2
|
+
//
|
|
3
|
+
// COMP-GSD-7 S1: per-task timing sidecar for the milestone report.
|
|
4
|
+
//
|
|
5
|
+
// Stratum's parallel_poll response does NOT carry per-task timing, and the
|
|
6
|
+
// blackboard is rebuilt from agent-written results/*.json validated against
|
|
7
|
+
// contracts/task-result.json (which forbids extra fields). So compose's own
|
|
8
|
+
// poll-loop observations have nowhere to land on the blackboard. This sidecar
|
|
9
|
+
// is the carrier: `.compose/gsd/<feature>/timing.json` = a plain
|
|
10
|
+
// { [taskId]: { startedAt, completedAt, durationMs } }
|
|
11
|
+
// map, written by the gsd dispatch loop and read by the report assembler.
|
|
12
|
+
//
|
|
13
|
+
// Caveat: completedAt is bounded by the dispatch poll interval (seconds), so
|
|
14
|
+
// durations are approximate-to-poll-granularity. The report footer documents it.
|
|
15
|
+
//
|
|
16
|
+
// Atomic write: tmp+rename, mirrors writeGsdState in lib/gsd-state.js.
|
|
17
|
+
|
|
18
|
+
import { readFileSync, writeFileSync, existsSync, mkdirSync, renameSync, unlinkSync } from 'node:fs';
|
|
19
|
+
import { join } from 'node:path';
|
|
20
|
+
|
|
21
|
+
// Task states that count as finished; recordTaskStates stamps completedAt the
// first time it sees a task in one of them.
const TERMINAL_STATES = new Set([
  'complete',
  'failed',
  'cancelled',
]);
|
|
22
|
+
|
|
23
|
+
// Directory holding one feature's gsd artifacts: <cwd>/.compose/gsd/<featureCode>.
function gsdDir(cwd, featureCode) {
  const gsdRoot = join(cwd, '.compose', 'gsd');
  return join(gsdRoot, featureCode);
}
|
|
26
|
+
|
|
27
|
+
// Path of the per-task timing sidecar (timing.json) inside the feature's gsd directory.
export function timingSidecarPath(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  return join(featureDir, 'timing.json');
}
|
|
30
|
+
|
|
31
|
+
// Persist the timing map atomically: serialize it to a `.tmp` sibling of
// timing.json, then rename that file into place. The feature directory is
// created if missing, and a `.tmp` file left over from a previously-crashed
// write is removed first (failure to remove it is ignored).
//
// Returns the path of the sidecar file that was written.
export function writeTimingSidecar(cwd, featureCode, timingMap) {
  mkdirSync(gsdDir(cwd, featureCode), { recursive: true });

  const sidecarPath = timingSidecarPath(cwd, featureCode);
  const stagingPath = `${sidecarPath}.tmp`;
  try {
    if (existsSync(stagingPath)) unlinkSync(stagingPath);
  } catch {
    // ignore — the write below replaces the stale file's contents anyway
  }

  const payload = JSON.stringify(timingMap, null, 2);
  writeFileSync(stagingPath, payload);
  renameSync(stagingPath, sidecarPath);
  return sidecarPath;
}
|
|
45
|
+
|
|
46
|
+
// Read-or-{}. Returns the parsed timing map, or {} when the sidecar is missing,
// unreadable, corrupt, or holds valid JSON that is not a plain object (null, an
// array, a bare string/number). The report then renders an empty timing column
// rather than failing, and callers can always index the result by task id.
export function readTimingSidecar(cwd, featureCode) {
  const p = timingSidecarPath(cwd, featureCode);
  if (!existsSync(p)) return {};
  try {
    const parsed = JSON.parse(readFileSync(p, 'utf-8'));
    const isTimingMap = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isTimingMap ? parsed : {};
  } catch {
    return {};
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Pure poll accumulator. Called once per poll with the current task-state map.
 *
 * - First sight of a task → record `startedAt = nowIso`.
 * - First time a task is seen in a terminal state (complete|failed|cancelled) →
 *   record `completedAt = nowIso` and `durationMs` (completedAt − startedAt,
 *   floored at 0; 0 when either timestamp is unparseable). A task first seen
 *   already-terminal gets startedAt==completedAt and durationMs 0.
 *
 * Idempotent: an existing startedAt/completedAt is never overwritten (first
 * observation wins), so re-polling the same terminal task is a no-op.
 *
 * Task ids are treated strictly as OWN keys of `timingMap`. Ids that collide
 * with inherited members (`toString`, `constructor`, `__proto__`, …) are
 * recorded like any other task instead of resolving to — and being written
 * onto — Object.prototype. An existing entry that is not an object (e.g. from a
 * damaged sidecar) is replaced with a fresh one.
 *
 * @param {object} timingMap - `{ [taskId]: { startedAt, completedAt, durationMs } }`, mutated in place.
 * @param {object} pollTasks - `{ [taskId]: { state } }`; any non-object value is ignored.
 * @param {string} nowIso - timestamp to record for this poll.
 * @returns the same `timingMap` reference (mutated in place).
 */
export function recordTaskStates(timingMap, pollTasks, nowIso) {
  if (!pollTasks || typeof pollTasks !== 'object') return timingMap;
  for (const [taskId, ts] of Object.entries(pollTasks)) {
    const state = ts?.state;
    let entry = Object.prototype.hasOwnProperty.call(timingMap, taskId)
      ? timingMap[taskId]
      : undefined;
    if (!entry || typeof entry !== 'object') {
      entry = { startedAt: nowIso };
      // defineProperty (not assignment) so that a key such as "__proto__"
      // becomes an ordinary own property instead of replacing the prototype.
      Object.defineProperty(timingMap, taskId, {
        value: entry,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
    if (entry.completedAt == null && TERMINAL_STATES.has(state)) {
      entry.completedAt = nowIso;
      const start = Date.parse(entry.startedAt);
      const end = Date.parse(nowIso);
      entry.durationMs = Number.isNaN(start) || Number.isNaN(end) ? 0 : Math.max(0, end - start);
    }
  }
  return timingMap;
}
|