@smartmemory/compose 0.2.7-beta → 0.2.9-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/compose.js +112 -3
- package/contracts/gsd-state.json +140 -0
- package/contracts/gsd-stuck.json +141 -0
- package/dist/assets/{App-D3ehVPvi.js → App-CG-2euMe.js} +164 -164
- package/dist/assets/{arc-Dmf69iHG.js → arc-7QBWoLra.js} +1 -1
- package/dist/assets/{architectureDiagram-3BPJPVTR-xYo993Yw.js → architectureDiagram-3BPJPVTR-CUw-7uLm.js} +1 -1
- package/dist/assets/{blockDiagram-GPEHLZMM-UX4EF98O.js → blockDiagram-GPEHLZMM-COU1vmr7.js} +1 -1
- package/dist/assets/{c4Diagram-AAUBKEIU-DaP9CGWb.js → c4Diagram-AAUBKEIU-XPO9PSJL.js} +1 -1
- package/dist/assets/channel-Bcu04MIK.js +1 -0
- package/dist/assets/{chunk-2J33WTMH-CKk_RN3A.js → chunk-2J33WTMH-zMzVB2a6.js} +1 -1
- package/dist/assets/{chunk-4BX2VUAB-DboAwYKw.js → chunk-4BX2VUAB-Kke_qcHU.js} +1 -1
- package/dist/assets/{chunk-55IACEB6-Dsy9RYvI.js → chunk-55IACEB6-hMeFx5Nh.js} +1 -1
- package/dist/assets/{chunk-727SXJPM-fAH0QO9v.js → chunk-727SXJPM-DesUnrEw.js} +1 -1
- package/dist/assets/{chunk-AQP2D5EJ-DyZYerFP.js → chunk-AQP2D5EJ-1uGGvkxW.js} +1 -1
- package/dist/assets/{chunk-FMBD7UC4-BnboGO5t.js → chunk-FMBD7UC4-DYHv1PcZ.js} +1 -1
- package/dist/assets/{chunk-ND2GUHAM-Di9tYXme.js → chunk-ND2GUHAM-D0MENOLX.js} +1 -1
- package/dist/assets/{chunk-QZHKN3VN-zRPRlAIL.js → chunk-QZHKN3VN-8nn3HP-N.js} +1 -1
- package/dist/assets/classDiagram-4FO5ZUOK-DU4yxldU.js +1 -0
- package/dist/assets/classDiagram-v2-Q7XG4LA2-DU4yxldU.js +1 -0
- package/dist/assets/{cose-bilkent-S5V4N54A-C7Hqukaf.js → cose-bilkent-S5V4N54A-BoZPVIny.js} +1 -1
- package/dist/assets/{dagre-BM42HDAG-B-cR-BjI.js → dagre-BM42HDAG-BgZzdLG9.js} +1 -1
- package/dist/assets/{diagram-2AECGRRQ-B6-5onDk.js → diagram-2AECGRRQ-CknAnpSu.js} +1 -1
- package/dist/assets/{diagram-5GNKFQAL-DoZZgFAM.js → diagram-5GNKFQAL-CZUEbKim.js} +1 -1
- package/dist/assets/{diagram-KO2AKTUF-77jEGlJh.js → diagram-KO2AKTUF-DCs-pLdH.js} +1 -1
- package/dist/assets/{diagram-LMA3HP47-D3S7XDRD.js → diagram-LMA3HP47-lRaDjIfM.js} +1 -1
- package/dist/assets/{diagram-OG6HWLK6-KbYL9aCY.js → diagram-OG6HWLK6-CIGqmehP.js} +1 -1
- package/dist/assets/{erDiagram-TEJ5UH35-DezFbJP-.js → erDiagram-TEJ5UH35-Lx3c2N6F.js} +1 -1
- package/dist/assets/{flowDiagram-I6XJVG4X-4x31cK9j.js → flowDiagram-I6XJVG4X-VoluKqSq.js} +1 -1
- package/dist/assets/{ganttDiagram-6RSMTGT7-FopfSTyZ.js → ganttDiagram-6RSMTGT7-D7hETiNZ.js} +1 -1
- package/dist/assets/{gitGraphDiagram-PVQCEYII-DSiQGKbN.js → gitGraphDiagram-PVQCEYII-DenEcUvY.js} +1 -1
- package/dist/assets/{index-ClX6LVAf.js → index-B4dv3acY.js} +2 -2
- package/dist/assets/{infoDiagram-5YYISTIA-DE6BqzK_.js → infoDiagram-5YYISTIA-v7cq9Er9.js} +1 -1
- package/dist/assets/{ishikawaDiagram-YF4QCWOH-Dml8NwQI.js → ishikawaDiagram-YF4QCWOH-CfCCXt2x.js} +1 -1
- package/dist/assets/{journeyDiagram-JHISSGLW-CwWeJgjE.js → journeyDiagram-JHISSGLW-Bbokl_xO.js} +1 -1
- package/dist/assets/{kanban-definition-UN3LZRKU-DnG956Wh.js → kanban-definition-UN3LZRKU-DhkOZ2hg.js} +1 -1
- package/dist/assets/{linear-CA3N7Rpi.js → linear-bHjluRm2.js} +1 -1
- package/dist/assets/{mindmap-definition-RKZ34NQL-CxfIOjLX.js → mindmap-definition-RKZ34NQL-C1bHpoXH.js} +1 -1
- package/dist/assets/{pieDiagram-4H26LBE5-O7aIwy1x.js → pieDiagram-4H26LBE5-CZb1i55T.js} +1 -1
- package/dist/assets/{quadrantDiagram-W4KKPZXB-CPQ2qq7c.js → quadrantDiagram-W4KKPZXB-o37AwRHB.js} +1 -1
- package/dist/assets/{requirementDiagram-4Y6WPE33-C23horL4.js → requirementDiagram-4Y6WPE33-BVErWDzU.js} +1 -1
- package/dist/assets/{sankeyDiagram-5OEKKPKP-DPY04kOW.js → sankeyDiagram-5OEKKPKP-BhBK8gHQ.js} +1 -1
- package/dist/assets/{sequenceDiagram-3UESZ5HK-BKaTfIvo.js → sequenceDiagram-3UESZ5HK-CsICF23P.js} +1 -1
- package/dist/assets/{stateDiagram-AJRCARHV-B9na_6mY.js → stateDiagram-AJRCARHV-TN1AXwim.js} +1 -1
- package/dist/assets/stateDiagram-v2-BHNVJYJU-BLR6AkKX.js +1 -0
- package/dist/assets/{timeline-definition-PNZ67QCA-BBWPqd7X.js → timeline-definition-PNZ67QCA-DftAajbU.js} +1 -1
- package/dist/assets/{vennDiagram-CIIHVFJN-tWqiHsOZ.js → vennDiagram-CIIHVFJN-cFTMstT7.js} +1 -1
- package/dist/assets/{wardley-L42UT6IY-DorxG6os.js → wardley-L42UT6IY-DL8CivzO.js} +1 -1
- package/dist/assets/{wardleyDiagram-YWT4CUSO-B49f8GzW.js → wardleyDiagram-YWT4CUSO-BDZT1hQj.js} +1 -1
- package/dist/assets/{xychartDiagram-2RQKCTM6-BgKSj8Qb.js → xychartDiagram-2RQKCTM6-DQQSkfC4.js} +1 -1
- package/dist/index.html +1 -1
- package/lib/budget-ledger.js +84 -0
- package/lib/build-stream-schema.js +5 -3
- package/lib/build.js +122 -2
- package/lib/feature-validator.js +40 -8
- package/lib/gsd-budget.js +205 -0
- package/lib/gsd-diff-capture.js +34 -0
- package/lib/gsd-events.js +61 -0
- package/lib/gsd-headless-config.js +110 -0
- package/lib/gsd-milestone-report.js +323 -0
- package/lib/gsd-state.js +165 -0
- package/lib/gsd-stuck.js +275 -0
- package/lib/gsd-supervisor.js +223 -0
- package/lib/gsd-timing.js +89 -0
- package/lib/gsd.js +908 -16
- package/package.json +1 -1
- package/dist/assets/channel-D_RXsFFT.js +0 -1
- package/dist/assets/classDiagram-4FO5ZUOK-K6wdB4ic.js +0 -1
- package/dist/assets/classDiagram-v2-Q7XG4LA2-K6wdB4ic.js +0 -1
- package/dist/assets/stateDiagram-v2-BHNVJYJU-Cf84VDiH.js +0 -1
package/lib/gsd.js
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
// V1 limitation: runtime task-to-task handoff is not implemented; tasks see
|
|
14
14
|
// only spec-level upstream context (Boundary Map declarations) per blueprint.
|
|
15
15
|
|
|
16
|
-
import { readFileSync, existsSync, readdirSync } from 'node:fs';
|
|
16
|
+
import { readFileSync, writeFileSync, existsSync, readdirSync, mkdirSync, rmSync, statSync, renameSync } from 'node:fs';
|
|
17
17
|
import { join, resolve, dirname } from 'node:path';
|
|
18
18
|
import { fileURLToPath } from 'node:url';
|
|
19
19
|
import { execSync } from 'node:child_process';
|
|
@@ -22,8 +22,17 @@ import { StratumMcpClient } from './stratum-mcp-client.js';
|
|
|
22
22
|
import { validateBoundaryMap } from './boundary-map.js';
|
|
23
23
|
import { enrichTaskGraph } from './gsd-decompose-enrich.js';
|
|
24
24
|
import { buildTaskDescription } from './gsd-prompt.js';
|
|
25
|
-
import { writeAll, validate as validateTaskResult } from './gsd-blackboard.js';
|
|
25
|
+
import { writeAll, validate as validateTaskResult, read as readBlackboard } from './gsd-blackboard.js';
|
|
26
26
|
import { executeParallelDispatchServer, executeShipStep } from './build.js';
|
|
27
|
+
import { GsdStuckDetector, DEFAULT_THRESHOLDS } from './gsd-stuck.js';
|
|
28
|
+
import { readGsdBudgetConfig, buildBudgetBlock, injectBudget, composeBudgetDiagnostic } from './gsd-budget.js';
|
|
29
|
+
import { recordGsdUsage, checkGsdCumulativeBudget } from './budget-ledger.js';
|
|
30
|
+
// COMP-GSD-6: continuous run-state checkpoint + canonical pid-liveness probe.
|
|
31
|
+
// pidAlive is canonical in gsd-state.js (EPERM=alive) and imported one-way here.
|
|
32
|
+
import { writeGsdState, readGsdState, gsdStatePath, pidAlive, clearGsdHaltArtifacts } from './gsd-state.js';
|
|
33
|
+
import { generateGsdMilestoneReport } from './gsd-milestone-report.js';
|
|
34
|
+
import { readHeadlessConfig } from './gsd-headless-config.js';
|
|
35
|
+
import { appendGsdEvent, clearGsdEvents } from './gsd-events.js';
|
|
27
36
|
|
|
28
37
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
29
38
|
const PACKAGE_ROOT = resolve(__dirname, '..');
|
|
@@ -38,6 +47,15 @@ export async function runGsd(featureCode, opts = {}) {
|
|
|
38
47
|
}
|
|
39
48
|
const cwd = opts.cwd ?? process.cwd();
|
|
40
49
|
|
|
50
|
+
// COMP-GSD-6: a FRESH (non-resume) run must not inherit a prior run's
|
|
51
|
+
// state.json. Clear it up front so that if a precondition below throws BEFORE
|
|
52
|
+
// the planning checkpoint, NO running state remains → the headless supervisor
|
|
53
|
+
// (and `query`) read 'absent' → fatal-by-absence, never a stale 'complete'
|
|
54
|
+
// success. A resume keeps the old state.json (the crash-bridge may need it).
|
|
55
|
+
if (!opts.resume) {
|
|
56
|
+
try { rmSync(gsdStatePath(cwd, featureCode), { force: true }); } catch { /* ignore */ }
|
|
57
|
+
}
|
|
58
|
+
|
|
41
59
|
// 1. Validate preconditions: blueprint exists + Boundary Map ok
|
|
42
60
|
const blueprintPath = join(cwd, 'docs', 'features', featureCode, 'blueprint.md');
|
|
43
61
|
if (!existsSync(blueprintPath)) {
|
|
@@ -62,12 +80,36 @@ export async function runGsd(featureCode, opts = {}) {
|
|
|
62
80
|
);
|
|
63
81
|
}
|
|
64
82
|
|
|
65
|
-
// 2.
|
|
83
|
+
// 2. COMP-GSD-5 resume branch — runs BEFORE the dirty-tree check so a
|
|
84
|
+
// pid/mode-guard failure (the more specific precondition) is reported first.
|
|
85
|
+
// --resume reads pause.json, guards on ownership (no live pid) +
|
|
86
|
+
// mode==='gsd' (mirrors `compose fix --resume`), and seeds a precomputed task
|
|
87
|
+
// graph = decomposedTasks MINUS completedTaskIds so the execute step
|
|
88
|
+
// re-dispatches only the unfinished work. Completed results already live in
|
|
89
|
+
// the blackboard. resumeTaskGraph (when set) makes runOneStep skip the
|
|
90
|
+
// decompose agent entirely → stable task IDs, no re-decompose.
|
|
91
|
+
// COMP-GSD-4: read+guard the resume graph here for guard-ordering, but DEFER
|
|
92
|
+
// the atomic pause.lock claim (claim:false) — runGsd claims inside its try so
|
|
93
|
+
// the finally always releases it (no strand on re-halt/refusal/throw).
|
|
94
|
+
let resumeTaskGraph = null;
|
|
95
|
+
if (opts.resume) {
|
|
96
|
+
resumeTaskGraph = loadResumeTaskGraph(cwd, featureCode, { claim: false });
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// 3. Refuse to start in a dirty workspace BEFORE any Stratum side effects.
|
|
66
100
|
// v1 rationale: alternatives (baseline subtract + post-execute delta) drop
|
|
67
101
|
// legitimate edits to pre-existing dirty files. Refuse-if-dirty makes
|
|
68
102
|
// post-execute dirty set unambiguous: every entry is GSD-produced.
|
|
103
|
+
//
|
|
104
|
+
// On --resume the GSD control plane (.compose/gsd/<feature>/) legitimately
|
|
105
|
+
// carries the prior run's pause.json/blackboard.json/results — that's the
|
|
106
|
+
// resume STATE, not an unrelated edit — so exclude it from the dirty set.
|
|
69
107
|
if (!opts.allowDirtyWorkspace) {
|
|
70
|
-
|
|
108
|
+
let startingDirty = collectChangedFiles(cwd);
|
|
109
|
+
if (opts.resume) {
|
|
110
|
+
const ctrlPrefix = `.compose/gsd/${featureCode}/`;
|
|
111
|
+
startingDirty = startingDirty.filter((f) => !f.startsWith(ctrlPrefix));
|
|
112
|
+
}
|
|
71
113
|
if (startingDirty.length > 0) {
|
|
72
114
|
throw new Error(
|
|
73
115
|
`runGsd: working tree must be clean to ensure ship_gsd stages only GSD-produced changes. ` +
|
|
@@ -77,38 +119,197 @@ export async function runGsd(featureCode, opts = {}) {
|
|
|
77
119
|
}
|
|
78
120
|
}
|
|
79
121
|
|
|
80
|
-
//
|
|
122
|
+
// 4. Resolve gateCommands. loadProjectConfig() does not merge defaults, so
|
|
81
123
|
// explicit fallback here.
|
|
82
124
|
const gateCommands = resolveGateCommands(cwd, opts.gateCommands);
|
|
83
125
|
|
|
84
126
|
// 4. Load pipeline spec
|
|
85
127
|
const specPath = join(PACKAGE_ROOT, 'pipelines', 'gsd.stratum.yaml');
|
|
86
|
-
|
|
128
|
+
// 4a. COMP-GSD-4: inject the stratum flow budget block from `gsd.budget.*`.
|
|
129
|
+
// injectBudget is IDENTITY when nothing is configured, so an un-budgeted gsd
|
|
130
|
+
// run (and plain `compose build`) is byte-identical.
|
|
131
|
+
const budgetCfg = readGsdBudgetConfig(cwd);
|
|
132
|
+
const specYaml = injectBudget(readFileSync(specPath, 'utf-8'), budgetCfg);
|
|
133
|
+
|
|
134
|
+
// 4a. COMP-GSD-4: cumulative cross-session ceiling pre-check (tokens/cost).
|
|
135
|
+
// Refuse to start/resume a run that has already spent its lifetime budget —
|
|
136
|
+
// re-dispatching would immediately re-trip. Runs before the try, so no
|
|
137
|
+
// pause.lock is held yet (the resume claim happens inside the try, right after the run-lock claim).
|
|
138
|
+
const cumulative = buildBudgetBlock(budgetCfg).cumulative;
|
|
139
|
+
if (cumulative) {
|
|
140
|
+
const chk = checkGsdCumulativeBudget(join(cwd, '.compose'), featureCode, cumulative);
|
|
141
|
+
if (chk.exceeded) {
|
|
142
|
+
writeCumulativeRefusal(cwd, featureCode, chk, cumulative);
|
|
143
|
+
return { status: 'budget', flowId: null, axis: 'cumulative', reason: chk.reason };
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// 4b. COMP-GSD-5 stuck detector — thresholds from .compose/compose.json
|
|
148
|
+
// `gsd.stuck.*` with documented defaults. ONLY gsd passes this into the
|
|
149
|
+
// shared executeParallelDispatchServer, so build mode is byte-identical.
|
|
150
|
+
const stuckDetector = buildStuckDetector(cwd);
|
|
87
151
|
|
|
88
152
|
// 5. Connect Stratum + plan (only after preconditions pass)
|
|
89
153
|
const stratum = opts.stratum ?? new StratumMcpClient();
|
|
90
154
|
const ownsStratum = !opts.stratum;
|
|
91
155
|
if (ownsStratum) await stratum.connect();
|
|
156
|
+
// COMP-GSD-4: ownership flag — release the resume lock in finally ONLY if THIS
|
|
157
|
+
// process successfully claimed it (set below). Prevents (a) a non-resume run
|
|
158
|
+
// from clobbering a concurrent resume's valid claim and (b) a claim-race loser
|
|
159
|
+
// (EEXIST) from deleting the winner's lock on its way out.
|
|
160
|
+
let lockClaimed = false;
|
|
161
|
+
let runLockClaimed = false;
|
|
162
|
+
// COMP-GSD-6: the in-memory run-state, threaded through stepCtx and flushed to
|
|
163
|
+
// state.json. Declared here so the catch/finally can read it.
|
|
164
|
+
let stepCtx = null;
|
|
165
|
+
// COMP-GSD-6-WATCHDOG: independent wall-clock heartbeat timer (see below).
|
|
166
|
+
// Declared here so the finally can always clear it.
|
|
167
|
+
let heartbeatTimer = null;
|
|
92
168
|
try {
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
169
|
+
// COMP-GSD-6: claim the live-run lock BEFORE any stratum side effect, so two
|
|
170
|
+
// fresh `compose gsd <same-feature>` runs can't race the results dir. Takes
|
|
171
|
+
// over a stale lock (dead owner) and refuses a live one.
|
|
172
|
+
claimRunLock(cwd, featureCode);
|
|
173
|
+
runLockClaimed = true;
|
|
174
|
+
|
|
175
|
+
// COMP-GSD-4: claim the resume lock HERE (inside the try, right after the run-lock claim) so the
|
|
176
|
+
// finally releases it on EVERY exit — budget/stuck re-halt, throw, or clean
|
|
177
|
+
// finish. loadResumeTaskGraph above already read+guarded (claim:false).
|
|
178
|
+
if (opts.resume) {
|
|
179
|
+
claimResumeLock(cwd, featureCode); // throws EEXIST → finally sees lockClaimed=false
|
|
180
|
+
lockClaimed = true;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// COMP-GSD-6: pre-plan "planning" checkpoint. A crash during plan/decompose
|
|
184
|
+
// now leaves a dead-pid state.json — the failed-vs-fatal boundary. A throw
|
|
185
|
+
// BEFORE this point (preconditions) leaves no running state → fatal by
|
|
186
|
+
// absence; a throw AFTER → the catch converts it to status:"failed".
|
|
187
|
+
// On resume, seed the planning checkpoint from the (in-memory) resume graph
|
|
188
|
+
// so that if THIS resume re-crashes before its decompose step repopulates
|
|
189
|
+
// state.json, the crash-bridge still has a task graph to recover from
|
|
190
|
+
// (otherwise the fresh empty checkpoint would clobber the prior good data).
|
|
191
|
+
const resumeTasks = opts.resume ? (resumeTaskGraph?.tasks ?? []).map((t) => ({ ...t })) : [];
|
|
192
|
+
const initialState = {
|
|
193
|
+
feature: featureCode,
|
|
194
|
+
flowId: null,
|
|
195
|
+
pid: process.pid,
|
|
196
|
+
mode: 'gsd',
|
|
197
|
+
phase: 'planning',
|
|
198
|
+
status: 'running',
|
|
199
|
+
startedAt: new Date().toISOString(),
|
|
200
|
+
headless: !!opts.headless,
|
|
201
|
+
attempt: opts.attempt ?? 1,
|
|
202
|
+
resumeReady: opts.resume && resumeTasks.length > 0,
|
|
203
|
+
decomposedTasks: resumeTasks,
|
|
204
|
+
completedTaskIds: collectCompletedTaskIds(cwd, featureCode),
|
|
205
|
+
};
|
|
98
206
|
|
|
99
207
|
// Track files merged into the base cwd by the execute step so ship_gsd
|
|
100
208
|
// can stage them. executeShipStep's default filter only stages feature
|
|
101
209
|
// docs unless context.filesChanged is provided.
|
|
102
|
-
|
|
210
|
+
stepCtx = {
|
|
103
211
|
stratum, cwd, featureCode, blueprintText, gateCommands,
|
|
104
212
|
filesChanged: [],
|
|
213
|
+
stuckDetector,
|
|
214
|
+
resumeTaskGraph,
|
|
215
|
+
stuck: null, // set by runOneStep on a stuck verdict
|
|
216
|
+
runState: initialState, // COMP-GSD-6: flushState merges into this
|
|
217
|
+
// COMP-GSD-7-EVENTLOG: tasks already completed at run start (a resume
|
|
218
|
+
// preloads them) are seeded as already-emitted so the appended log never
|
|
219
|
+
// re-fires task_completed for prior-session completions.
|
|
220
|
+
emittedCompletions: new Set(initialState.completedTaskIds),
|
|
221
|
+
// COMP-GSD-7-EVENTLOG: phases already announced (dedupe — runState.phase is
|
|
222
|
+
// set to 'execute' before the merge checkpoint, so it can't gate emission).
|
|
223
|
+
emittedPhases: new Set(),
|
|
105
224
|
};
|
|
225
|
+
flushState(stepCtx, {}); // write the planning checkpoint
|
|
226
|
+
|
|
227
|
+
// COMP-GSD-7-EVENTLOG: at the planning checkpoint — AFTER preconditions
|
|
228
|
+
// passed (so a failed fresh invocation never wipes a prior run's history) —
|
|
229
|
+
// a fresh run truncates the event log and clears stale halt artifacts so the
|
|
230
|
+
// timeline reflects only this run; a resume appends to the existing log.
|
|
231
|
+
if (!opts.resume) {
|
|
232
|
+
clearGsdEvents(cwd, featureCode);
|
|
233
|
+
clearGsdHaltArtifacts(cwd, featureCode);
|
|
234
|
+
}
|
|
235
|
+
appendGsdEvent(cwd, featureCode, 'run_started', {
|
|
236
|
+
mode: opts.resume ? 'resume' : 'fresh',
|
|
237
|
+
attempt: opts.attempt ?? 1,
|
|
238
|
+
});
|
|
239
|
+
|
|
240
|
+
// COMP-GSD-6-WATCHDOG: an INDEPENDENT wall-clock heartbeat. The existing
|
|
241
|
+
// heartbeat only advances on agent push-events (onHeartbeat below), so a
|
|
242
|
+
// quiet-but-healthy task would look stale. This timer restamps state.json's
|
|
243
|
+
// heartbeat on a fixed cadence whenever the event loop is still turning — so
|
|
244
|
+
// a stale heartbeat genuinely means the loop is WEDGED (or the process dead),
|
|
245
|
+
// which is what the headless watchdog keys its hung-kill on. .unref() so it
|
|
246
|
+
// never holds the process open; cleared in finally. Same empty-patch restamp
|
|
247
|
+
// onHeartbeat uses, so it's behavior-compatible.
|
|
248
|
+
//
|
|
249
|
+
// Gated to SUPERVISED children only (GSD_HEADLESS_ATTEMPT, set by the
|
|
250
|
+
// supervisor's spawner) — the supervisor is the sole watcher, so an
|
|
251
|
+
// interactive `compose gsd` stays byte-identical (no extra state.json writes).
|
|
252
|
+
if (process.env.GSD_HEADLESS_ATTEMPT != null) {
|
|
253
|
+
const hbMs = readHeadlessConfig(cwd).watchdogHeartbeatMs;
|
|
254
|
+
heartbeatTimer = setInterval(() => {
|
|
255
|
+
try { if (stepCtx?.runState) flushState(stepCtx, {}); } catch { /* best-effort */ }
|
|
256
|
+
}, hbMs);
|
|
257
|
+
heartbeatTimer.unref?.();
|
|
258
|
+
}
|
|
106
259
|
|
|
107
|
-
|
|
108
|
-
|
|
260
|
+
let response = await stratum.plan(specYaml, 'gsd', {
|
|
261
|
+
featureCode,
|
|
262
|
+
gateCommands,
|
|
263
|
+
});
|
|
264
|
+
const flowId = response.flow_id;
|
|
265
|
+
flushState(stepCtx, { flowId, phase: 'decompose' });
|
|
266
|
+
emitPhaseOnce(stepCtx, 'decompose'); // COMP-GSD-7-EVENTLOG
|
|
267
|
+
|
|
268
|
+
// 5b. Status loop. `stuck` (COMP-GSD-5) and `budget_exhausted` (COMP-GSD-4)
|
|
269
|
+
// are terminal statuses. `stuck` is set compose-side by runOneStep; budget
|
|
270
|
+
// is the stratum flow-budget terminal, surfaced verbatim through the advance/
|
|
271
|
+
// poll envelopes (and carries budget_state).
|
|
272
|
+
while (
|
|
273
|
+
response.status !== 'complete' &&
|
|
274
|
+
response.status !== 'killed' &&
|
|
275
|
+
response.status !== 'stuck' &&
|
|
276
|
+
response.status !== 'budget_exhausted'
|
|
277
|
+
) {
|
|
109
278
|
response = await runOneStep(response, stepCtx);
|
|
110
279
|
}
|
|
111
280
|
|
|
281
|
+
if (response.status === 'stuck') {
|
|
282
|
+
// Artifacts (stuck.md/json + pause.json) were written by runOneStep.
|
|
283
|
+
// COMP-GSD-7-EVENTLOG: flush any completions that finished before the stuck
|
|
284
|
+
// verdict (the stuck path returns early, before the execute-merge delta),
|
|
285
|
+
// then record the pause.
|
|
286
|
+
emitCompletionDeltas(stepCtx);
|
|
287
|
+
appendGsdEvent(cwd, featureCode, 'paused', { pauseKind: 'stuck', taskId: stepCtx.stuck?.taskId ?? null });
|
|
288
|
+
flushState(stepCtx, { status: 'stuck' }); // COMP-GSD-6 terminal checkpoint
|
|
289
|
+
return {
|
|
290
|
+
status: 'stuck',
|
|
291
|
+
flowId,
|
|
292
|
+
stuckTaskId: stepCtx.stuck?.taskId ?? null,
|
|
293
|
+
signal: stepCtx.stuck?.signal ?? null,
|
|
294
|
+
};
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if (response.status === 'budget_exhausted') {
|
|
298
|
+
// COMP-GSD-4: the stratum flow budget tripped. The flow already
|
|
299
|
+
// cascade-cancelled in-flight siblings. Persist budget.{md,json} +
|
|
300
|
+
// pause.json (kind:budget) for --resume, record cumulative usage, and
|
|
301
|
+
// return a terminal `budget` envelope. pause.lock is released by finally.
|
|
302
|
+
const budgetState = response.budget_state ?? {};
|
|
303
|
+
writeBudgetArtifacts(stepCtx, response, budgetState);
|
|
304
|
+
recordGsdUsageFromState(cwd, featureCode, budgetState);
|
|
305
|
+
const axis = composeBudgetDiagnostic(budgetState, { feature: featureCode }).json.axis;
|
|
306
|
+
// COMP-GSD-7-EVENTLOG: flush pre-halt completions, then record the pause.
|
|
307
|
+
emitCompletionDeltas(stepCtx);
|
|
308
|
+
appendGsdEvent(cwd, featureCode, 'paused', { pauseKind: 'budget', axis });
|
|
309
|
+
flushState(stepCtx, { status: 'budget' }); // COMP-GSD-6 terminal checkpoint
|
|
310
|
+
return { status: 'budget', flowId, axis, consumed: budgetState.consumed ?? {}, caps: budgetState.caps ?? {} };
|
|
311
|
+
}
|
|
312
|
+
|
|
112
313
|
// 6. Post-step blackboard finalization — read each task's TaskResult JSON
|
|
113
314
|
// and write the consolidated blackboard.
|
|
114
315
|
const blackboard = collectBlackboard(cwd, featureCode);
|
|
@@ -116,12 +317,85 @@ export async function runGsd(featureCode, opts = {}) {
|
|
|
116
317
|
await writeAll(featureCode, blackboard, { cwd });
|
|
117
318
|
}
|
|
118
319
|
|
|
320
|
+
// 6b. COMP-GSD-5: a clean (non-stuck) finish clears any pause.json — the
|
|
321
|
+
// resume completed, or a fresh run superseded a stale pause.
|
|
322
|
+
if (response.status === 'complete') {
|
|
323
|
+
// COMP-GSD-4: record this run's cumulative usage (best-effort; no-op when
|
|
324
|
+
// the complete envelope carries no budget_state, e.g. un-budgeted runs).
|
|
325
|
+
recordGsdUsageFromState(cwd, featureCode, response.budget_state);
|
|
326
|
+
clearPauseFile(cwd, featureCode);
|
|
327
|
+
// COMP-GSD-7: on a clean complete, budget.json is NOT written (only halts
|
|
328
|
+
// write it). Persist a budget-final.json snapshot so the milestone report
|
|
329
|
+
// (auto + retroactive `gsd report`) has actuals-vs-caps. No-op when the
|
|
330
|
+
// envelope carries no budget_state (un-budgeted run). Best-effort: this is
|
|
331
|
+
// a derived report input — a write failure must NEVER demote a successful
|
|
332
|
+
// run to 'failed' via the outer catch.
|
|
333
|
+
if (response.budget_state) {
|
|
334
|
+
try {
|
|
335
|
+
writeBudgetFinalSnapshot(stepCtx, response.budget_state);
|
|
336
|
+
} catch (err) {
|
|
337
|
+
console.warn(`[gsd] budget-final snapshot failed: ${err.message}`);
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
// COMP-GSD-6: terminal state.json flush. Only 'complete' is a success; any
|
|
343
|
+
// other terminal here (e.g. stratum 'killed') maps to 'failed' so we stay
|
|
344
|
+
// within the closed status vocabulary the contract + supervisor share.
|
|
345
|
+
// COMP-GSD-7: stamp completedAt so retroactive reports can recover wall-clock.
|
|
346
|
+
const terminalStatus = response.status === 'complete' ? 'complete' : 'failed';
|
|
347
|
+
// COMP-GSD-7-EVENTLOG: emit the terminal event. complete → final completion
|
|
348
|
+
// deltas + 'completed'; any other terminal (e.g. stratum 'killed') → 'failed'.
|
|
349
|
+
if (terminalStatus === 'complete') {
|
|
350
|
+
emitCompletionDeltas(stepCtx);
|
|
351
|
+
appendGsdEvent(cwd, featureCode, 'completed', {});
|
|
352
|
+
} else {
|
|
353
|
+
appendGsdEvent(cwd, featureCode, 'failed', { reason: response.status ?? 'unknown' });
|
|
354
|
+
}
|
|
355
|
+
flushState(stepCtx, { status: terminalStatus, phase: 'done', completedAt: new Date().toISOString() });
|
|
356
|
+
|
|
357
|
+
// COMP-GSD-7: best-effort milestone report on a clean complete. A report
|
|
358
|
+
// failure must never fail the run — it is a derived artifact.
|
|
359
|
+
if (terminalStatus === 'complete') {
|
|
360
|
+
try {
|
|
361
|
+
const r = generateGsdMilestoneReport(featureCode, cwd);
|
|
362
|
+
if (!r.ok) console.warn(`[gsd] milestone report skipped: ${r.error}`);
|
|
363
|
+
} catch (err) {
|
|
364
|
+
console.warn(`[gsd] milestone report generation failed: ${err.message}`);
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
// Return the normalized closed-vocabulary status (not the raw stratum status)
|
|
369
|
+
// so the CLI/callers don't mistake a 'killed' terminal for success.
|
|
119
370
|
return {
|
|
120
|
-
status:
|
|
371
|
+
status: terminalStatus,
|
|
121
372
|
flowId,
|
|
122
373
|
blackboardEntries: Object.keys(blackboard).length,
|
|
123
374
|
};
|
|
375
|
+
} catch (err) {
|
|
376
|
+
// COMP-GSD-6: an orderly throw AFTER the planning checkpoint becomes a
|
|
377
|
+
// terminal status:"failed" so the supervisor treats it as non-recoverable
|
|
378
|
+
// (vs a hard crash → status stays "running" + dead pid → reader-derived
|
|
379
|
+
// "crashed"). Guard on a persisted running state so pre-checkpoint throws
|
|
380
|
+
// (which left no running state) stay fatal-by-absence, not "failed".
|
|
381
|
+
if (stepCtx?.runState && readGsdState(cwd, featureCode)?.status === 'running') {
|
|
382
|
+
try { flushState(stepCtx, { status: 'failed' }); } catch { /* best-effort */ }
|
|
383
|
+
// COMP-GSD-7-EVENTLOG: record the failure (only when a run actually started
|
|
384
|
+
// — a pre-checkpoint throw left no running state and gets no event). Append
|
|
385
|
+
// is best-effort; never mask the original error.
|
|
386
|
+
appendGsdEvent(cwd, featureCode, 'failed', { reason: err?.message ?? 'error' });
|
|
387
|
+
}
|
|
388
|
+
throw err;
|
|
124
389
|
} finally {
|
|
390
|
+
// COMP-GSD-6-WATCHDOG: stop the independent heartbeat timer.
|
|
391
|
+
if (heartbeatTimer) clearInterval(heartbeatTimer);
|
|
392
|
+
// COMP-GSD-6: release the live-run lock if THIS process claimed it.
|
|
393
|
+
if (runLockClaimed) releaseRunLock(cwd, featureCode);
|
|
394
|
+
// COMP-GSD-4: release the resume claim ONLY if THIS process claimed it
|
|
395
|
+
// (ownership-aware — never clobber a concurrent run's valid claim, and don't
|
|
396
|
+
// release after losing the claim race). pause.json persists for --resume
|
|
397
|
+
// unless a clean complete cleared it above.
|
|
398
|
+
if (lockClaimed) releasePauseLock(cwd, featureCode);
|
|
125
399
|
if (ownsStratum) {
|
|
126
400
|
try { await stratum.disconnect?.(); } catch { /* best-effort */ }
|
|
127
401
|
}
|
|
@@ -153,6 +427,7 @@ async function runOneStep(response, ctx) {
|
|
|
153
427
|
const flowId = response.flow_id;
|
|
154
428
|
const stepId = response.step_id;
|
|
155
429
|
const stepType = response.type ?? response.step_type;
|
|
430
|
+
if (stepId) ctx.lastStepId = stepId; // COMP-GSD-4: for the budget pause's stepId
|
|
156
431
|
|
|
157
432
|
if (response.status === 'execute_step') {
|
|
158
433
|
// parallel_dispatch step (the `execute` step)
|
|
@@ -160,15 +435,40 @@ async function runOneStep(response, ctx) {
|
|
|
160
435
|
const outcome = await executeParallelDispatchServer(
|
|
161
436
|
response,
|
|
162
437
|
stratum,
|
|
163
|
-
{ cwd, featureCode },
|
|
438
|
+
{ cwd, featureCode, gsd: true }, // COMP-GSD-7: gates timing+diff capture
|
|
164
439
|
null, // progress
|
|
165
440
|
{ write: () => {} }, // streamWriter — no-op for v1
|
|
166
441
|
cwd,
|
|
442
|
+
{
|
|
443
|
+
stuckDetector: ctx.stuckDetector, // COMP-GSD-5 (null in non-gsd callers)
|
|
444
|
+
// COMP-GSD-6: bump state.json's heartbeat on every task event so a long
|
|
445
|
+
// task sitting in the dispatch poll loop isn't mistaken for crashed.
|
|
446
|
+
onHeartbeat: ctx.runState ? () => { try { flushState(ctx, {}); } catch { /* best-effort */ } } : null,
|
|
447
|
+
},
|
|
167
448
|
);
|
|
449
|
+
|
|
450
|
+
// COMP-GSD-5: a stuck verdict halts the run. Persist the diagnostic +
|
|
451
|
+
// resume state, then return a terminal `stuck` envelope so runGsd's loop
|
|
452
|
+
// exits. The task was already cancelled (conflict) inside dispatch.
|
|
453
|
+
if (outcome && outcome.stuck) {
|
|
454
|
+
ctx.stuck = outcome.stuck;
|
|
455
|
+
writeStuckArtifacts(ctx, response, outcome.stuck);
|
|
456
|
+
return { status: 'stuck', flow_id: flowId, step_id: stepId };
|
|
457
|
+
}
|
|
458
|
+
|
|
168
459
|
// After diffs are merged, capture the touched files for ship_gsd
|
|
169
460
|
// staging. The clean-workspace precondition above guarantees every
|
|
170
461
|
// file in the post-execute dirty set is genuinely a GSD-produced change.
|
|
171
462
|
ctx.filesChanged = collectChangedFiles(cwd);
|
|
463
|
+
// COMP-GSD-6: checkpoint completed tasks after the execute merge.
|
|
464
|
+
// COMP-GSD-7-EVENTLOG: emit the execute-phase transition once, then a
|
|
465
|
+
// task_completed event per newly-completed task.
|
|
466
|
+
if (ctx.runState) {
|
|
467
|
+
const completed = collectCompletedTaskIds(cwd, featureCode);
|
|
468
|
+
flushState(ctx, { phase: 'execute', completedTaskIds: completed });
|
|
469
|
+
emitPhaseOnce(ctx, 'execute'); // dedupes; runState.phase can't gate this
|
|
470
|
+
emitCompletionDeltas(ctx, completed);
|
|
471
|
+
}
|
|
172
472
|
// executeParallelDispatchServer returns the next-step dispatch envelope
|
|
173
473
|
return outcome;
|
|
174
474
|
}
|
|
@@ -191,6 +491,26 @@ async function runOneStep(response, ctx) {
|
|
|
191
491
|
return await stratum.stepDone(flowId, stepId, shipResult);
|
|
192
492
|
}
|
|
193
493
|
|
|
494
|
+
// COMP-GSD-5 resume: skip the decompose AGENT entirely and substitute the
|
|
495
|
+
// persisted task graph (already enriched/repaired during the original run
|
|
496
|
+
// and already filtered to exclude completedTaskIds). We do NOT re-run
|
|
497
|
+
// validateAndRepairTaskGraph: enrichTaskGraph would flag the completed
|
|
498
|
+
// tasks' Boundary Map slices as orphaned (no task in the SUBSET owns them).
|
|
499
|
+
// Stable task IDs + no re-decompose are the whole point.
|
|
500
|
+
if (stepId === 'decompose_gsd' && ctx.resumeTaskGraph) {
|
|
501
|
+
ctx.lastTaskGraph = ctx.resumeTaskGraph;
|
|
502
|
+
// COMP-GSD-6: a resume already has the (filtered) task graph — mark
|
|
503
|
+
// resumeReady so a re-crash during execute resumes rather than restarts.
|
|
504
|
+
if (ctx.runState) {
|
|
505
|
+
flushState(ctx, {
|
|
506
|
+
phase: 'execute',
|
|
507
|
+
resumeReady: true,
|
|
508
|
+
decomposedTasks: (ctx.resumeTaskGraph.tasks ?? []).map((t) => ({ ...t })),
|
|
509
|
+
});
|
|
510
|
+
}
|
|
511
|
+
return await stratum.stepDone(flowId, stepId, ctx.resumeTaskGraph);
|
|
512
|
+
}
|
|
513
|
+
|
|
194
514
|
// Single-agent step: dispatch via runAgentText. The agent returns text;
|
|
195
515
|
// we expect JSON matching the step's output_contract.
|
|
196
516
|
const prompt = response.intent ?? '';
|
|
@@ -207,6 +527,19 @@ async function runOneStep(response, ctx) {
|
|
|
207
527
|
// T6 step 7: validate decompose_gsd output and repair missing descriptions.
|
|
208
528
|
if (stepId === 'decompose_gsd') {
|
|
209
529
|
result = validateAndRepairTaskGraph(result, blueprintText, gateCommands);
|
|
530
|
+
// COMP-GSD-5: remember the ENRICHED graph so a later stuck halt can
|
|
531
|
+
// persist the full task definitions (with descriptions/produces/consumes)
|
|
532
|
+
// into pause.json — resume re-dispatches these without re-enriching.
|
|
533
|
+
ctx.lastTaskGraph = result;
|
|
534
|
+
// COMP-GSD-6: the task graph now exists → resumeReady true; persist it so a
|
|
535
|
+
// crash during execute can synthesize a resume graph from state.json.
|
|
536
|
+
if (ctx.runState) {
|
|
537
|
+
flushState(ctx, {
|
|
538
|
+
phase: 'execute',
|
|
539
|
+
resumeReady: true,
|
|
540
|
+
decomposedTasks: (result.tasks ?? []).map((t) => ({ ...t })),
|
|
541
|
+
});
|
|
542
|
+
}
|
|
210
543
|
}
|
|
211
544
|
|
|
212
545
|
return await stratum.stepDone(flowId, stepId, result);
|
|
@@ -362,3 +695,562 @@ function collectBlackboard(cwd, featureCode) {
|
|
|
362
695
|
}
|
|
363
696
|
return out;
|
|
364
697
|
}
|
|
698
|
+
|
|
699
|
+
// ===========================================================================
|
|
700
|
+
// COMP-GSD-5: stuck detection + resume
|
|
701
|
+
// ===========================================================================
|
|
702
|
+
|
|
703
|
+
/**
 * Resolve the per-feature GSD working directory.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier used as the directory name.
 * @returns {string} `<cwd>/.compose/gsd/<featureCode>`.
 */
function gsdDir(cwd, featureCode) {
  const segments = [cwd, '.compose', 'gsd', featureCode];
  return join(...segments);
}
|
|
706
|
+
|
|
707
|
+
// ===========================================================================
|
|
708
|
+
// COMP-GSD-6: run.lock (live-run exclusivity) + state.json flush helpers
|
|
709
|
+
// ===========================================================================
|
|
710
|
+
|
|
711
|
+
// Age threshold in milliseconds: a lock directory that has no readable owner
// pid and whose mtime is older than this is treated as stale by the claimers.
const RUN_LOCK_STALE_MS = 90000;
|
|
712
|
+
|
|
713
|
+
/**
 * Path of the live-run lock directory for a feature.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {string} The `run.lock` path inside the feature's GSD directory.
 */
function runLockDir(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  return join(featureDir, 'run.lock');
}
|
|
716
|
+
|
|
717
|
+
/**
 * Take over a stale lock directory using a rename-aside step.
 *
 * The stale directory is first renamed to a pid/timestamp-suffixed sibling; if
 * that rename fails for any reason (another reclaimer got there first, or the
 * directory vanished) we report a loss. The parked copy is then removed on a
 * best-effort basis and the lock directory is re-created; an EEXIST on that
 * mkdir means a fresh claimant took the freed name first, which also counts as
 * a loss.
 *
 * @param {string} lockPath - Path of the lock directory to take over.
 * @returns {boolean} true only when this call re-created the lock directory.
 * @throws Any mkdir error other than EEXIST.
 */
function takeoverStaleLock(lockPath) {
  const parkedPath = `${lockPath}.stale.${process.pid}.${Date.now()}`;

  try {
    renameSync(lockPath, parkedPath);
  } catch {
    // Lost the rename: someone else moved it, or it no longer exists.
    return false;
  }

  try {
    rmSync(parkedPath, { recursive: true, force: true });
  } catch {
    // Best-effort cleanup of the parked copy.
  }

  let recreated = true;
  try {
    mkdirSync(lockPath);
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
    // A new claimant created the lock between our rename and mkdir.
    recreated = false;
  }
  return recreated;
}
|
|
740
|
+
|
|
741
|
+
/**
 * Look up the pid recorded as owner of a feature's run.lock.
 *
 * Checks `run.lock/owner.json` first; when that file is absent, unparseable,
 * or has no numeric `pid`, falls back to the `pid` field of the state object
 * returned by readGsdState.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {number|null} The recorded pid, or null when neither source has one.
 */
function runLockOwnerPid(cwd, featureCode) {
  const recordPath = join(runLockDir(cwd, featureCode), 'owner.json');

  if (existsSync(recordPath)) {
    try {
      const { pid } = JSON.parse(readFileSync(recordPath, 'utf-8'));
      if (typeof pid === 'number') return pid;
    } catch {
      // Unreadable owner record: fall through to state.json.
    }
  }

  const statePid = readGsdState(cwd, featureCode)?.pid;
  return typeof statePid === 'number' ? statePid : null;
}
|
|
754
|
+
|
|
755
|
+
/**
 * Claim the live-run lock for a feature, refusing when another run holds it.
 *
 * The claim is an exclusive `mkdirSync` of the lock directory; an EEXIST means
 * somebody already holds it. In that case the lock is considered stale when
 * the recorded owner pid is not alive, or — when no owner pid can be read —
 * when the lock directory's mtime is older than RUN_LOCK_STALE_MS (a failed
 * stat also counts as stale). A stale lock is taken over via
 * takeoverStaleLock. After a successful claim or takeover,
 * `run.lock/owner.json` is written with `{ pid, startedAt }`.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 * @throws {Error} When a live owner holds the lock, or the takeover race is lost.
 */
export function claimRunLock(cwd, featureCode) {
  mkdirSync(gsdDir(cwd, featureCode), { recursive: true });
  const lockPath = runLockDir(cwd, featureCode);

  const recordOwnership = () => {
    const record = { pid: process.pid, startedAt: new Date().toISOString() };
    writeFileSync(join(lockPath, 'owner.json'), JSON.stringify(record, null, 2));
  };

  try {
    mkdirSync(lockPath);
    recordOwnership();
    return;
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
  }

  // The lock already exists: work out whether its holder is still around.
  const ownerPid = runLockOwnerPid(cwd, featureCode);
  const ageExceedsWindow = () => {
    try {
      return Date.now() - statSync(lockPath).mtimeMs > RUN_LOCK_STALE_MS;
    } catch {
      return true;
    }
  };
  const isStale = typeof ownerPid === 'number' ? !pidAlive(ownerPid) : ageExceedsWindow();

  if (!isStale) {
    const message =
      `runGsd: another gsd run owns ${featureCode} (.compose/gsd/${featureCode}/run.lock, ` +
      `pid ${ownerPid ?? 'unknown'} alive). Refusing to start a concurrent run.`;
    throw new Error(message);
  }

  // Losing the takeover means another run now owns the feature.
  const wonTakeover = takeoverStaleLock(lockPath);
  if (!wonTakeover) {
    const message =
      `runGsd: another gsd run claimed ${featureCode} during stale-lock takeover. ` +
      `Refusing to start a concurrent run.`;
    throw new Error(message);
  }

  recordOwnership();
}
|
|
804
|
+
|
|
805
|
+
/**
 * Remove a feature's run.lock directory and everything inside it.
 * `force` makes this a no-op when the lock does not exist.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
export function releaseRunLock(cwd, featureCode) {
  const lockPath = runLockDir(cwd, featureCode);
  rmSync(lockPath, { recursive: true, force: true });
}
|
|
808
|
+
|
|
809
|
+
/**
 * Shallow-merge `patch` over `ctx.runState`, store the merged object back on
 * the context, and hand it to writeGsdState for persistence.
 *
 * NOTE(review): atomicity of the write and any heartbeat stamping would have
 * to happen inside writeGsdState — neither is visible in this function.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `runState`.
 * @param {object} patch - Fields to overlay on the current run state.
 * @returns {void}
 */
function flushState(ctx, patch) {
  const previous = ctx.runState ?? {};
  const next = { ...previous, ...patch };
  ctx.runState = next;
  writeGsdState(ctx.cwd, ctx.featureCode, next);
}
|
|
815
|
+
|
|
816
|
+
/**
 * COMP-GSD-7-EVENTLOG: append one `task_completed` event for every completed
 * task id that has not been emitted yet.
 *
 * `ctx.emittedCompletions` is the dedupe set: ids already in it are skipped,
 * new ids are added before their event is appended. When the context has no
 * such set the function does nothing. Falsy ids are ignored.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `emittedCompletions`.
 * @param {Iterable<string>} [completedIds] - Completed ids; when nullish they
 *   are gathered with collectCompletedTaskIds.
 * @returns {void}
 */
function emitCompletionDeltas(ctx, completedIds) {
  const alreadyEmitted = ctx?.emittedCompletions;
  if (!alreadyEmitted) return;

  const candidates = completedIds ?? collectCompletedTaskIds(ctx.cwd, ctx.featureCode);
  for (const taskId of candidates) {
    const isNew = Boolean(taskId) && !alreadyEmitted.has(taskId);
    if (isNew) {
      alreadyEmitted.add(taskId);
      appendGsdEvent(ctx.cwd, ctx.featureCode, 'task_completed', { taskId });
    }
  }
}
|
|
830
|
+
|
|
831
|
+
/**
 * COMP-GSD-7-EVENTLOG: append a `phase` event the first time `phase` is seen.
 *
 * Dedupes through the `ctx.emittedPhases` set: a phase already in the set is
 * skipped, otherwise it is added and the event appended. Does nothing when the
 * context has no such set.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `emittedPhases`.
 * @param {string} phase - Phase name to announce.
 * @returns {void}
 */
function emitPhaseOnce(ctx, phase) {
  const entered = ctx?.emittedPhases;
  const nothingToEmit = !entered || entered.has(phase);
  if (nothingToEmit) return;

  entered.add(phase);
  appendGsdEvent(ctx.cwd, ctx.featureCode, 'phase', { phase });
}
|
|
839
|
+
|
|
840
|
+
/**
 * Construct a GsdStuckDetector from the `gsd.stuck` section of
 * `.compose/compose.json`.
 *
 * Config keys are snake_case (`same_file_edits`, `error_repeats`,
 * `no_progress_calls`, `wall_clock_ms`); the detector receives the camelCase
 * equivalents. Any key that is null or undefined falls back to the matching
 * entry of DEFAULT_THRESHOLDS.
 *
 * @param {string} cwd - Project root containing `.compose/compose.json`.
 * @returns {GsdStuckDetector} Detector configured with the resolved thresholds.
 */
export function buildStuckDetector(cwd) {
  const {
    same_file_edits: sameFileEdits,
    error_repeats: errorRepeats,
    no_progress_calls: noProgressCalls,
    wall_clock_ms: wallClockMs,
  } = readGsdStuckConfig(cwd);

  const thresholds = {
    sameFileEdits: sameFileEdits ?? DEFAULT_THRESHOLDS.sameFileEdits,
    errorRepeats: errorRepeats ?? DEFAULT_THRESHOLDS.errorRepeats,
    noProgressCalls: noProgressCalls ?? DEFAULT_THRESHOLDS.noProgressCalls,
    wallClockMs: wallClockMs ?? DEFAULT_THRESHOLDS.wallClockMs,
  };
  return new GsdStuckDetector(thresholds);
}
|
|
855
|
+
|
|
856
|
+
/**
 * Read the `gsd.stuck` section of `<cwd>/.compose/compose.json`.
 *
 * @param {string} cwd - Project root.
 * @returns {object} The `gsd.stuck` value, or `{}` when the file is missing,
 *   cannot be read or parsed, or has no such section.
 */
function readGsdStuckConfig(cwd) {
  const composeJson = join(cwd, '.compose', 'compose.json');
  if (!existsSync(composeJson)) return {};

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(composeJson, 'utf-8'));
  } catch {
    // Unreadable or malformed config behaves like an absent one.
    return {};
  }
  return parsed?.gsd?.stuck ?? {};
}
|
|
866
|
+
|
|
867
|
+
/**
 * Gather the ids of tasks that already have a known result.
 *
 * The result is the union of (a) the keys of the blackboard returned by
 * readBlackboard and (b) the basenames of `results/*.json` files whose parsed
 * content passes validateTaskResult. A result file that cannot be read,
 * parsed, or validated is skipped rather than raising.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {string[]} De-duplicated task ids.
 */
function collectCompletedTaskIds(cwd, featureCode) {
  const board = readBlackboard(featureCode, { cwd }) ?? {};
  const completed = new Set(Object.keys(board));

  const resultsDir = join(gsdDir(cwd, featureCode), 'results');
  if (!existsSync(resultsDir)) return [...completed];

  const suffix = '.json';
  for (const entry of readdirSync(resultsDir)) {
    if (!entry.endsWith(suffix)) continue;
    try {
      const payload = JSON.parse(readFileSync(join(resultsDir, entry), 'utf-8'));
      if (validateTaskResult(payload).ok) {
        completed.add(entry.slice(0, -suffix.length));
      }
    } catch {
      // Skip result files that cannot be read or parsed.
    }
  }
  return [...completed];
}
|
|
886
|
+
|
|
887
|
+
/**
 * Best-effort capture of `git diff HEAD` for the working tree (used for the
 * stuck.md triage).
 *
 * The command runs in `cwd` with a 5 s timeout and a 4 MiB output cap; stderr
 * is discarded. Any failure yields an empty string.
 *
 * @param {string} cwd - Directory to run git in.
 * @returns {string} Trimmed diff text, or '' when the command fails.
 */
function captureWorkingDiff(cwd) {
  const options = {
    cwd,
    encoding: 'utf-8',
    timeout: 5000,
    maxBuffer: 4 * 1024 * 1024,
    stdio: ['ignore', 'pipe', 'ignore'],
  };

  let output;
  try {
    output = execSync('git diff HEAD', options);
  } catch {
    return '';
  }
  return output.trim();
}
|
|
898
|
+
|
|
899
|
+
/**
 * Persist the stuck-halt artifacts into the feature's GSD directory:
 * `stuck.json` (diagnostic), `pause.json` (resume state) and `stuck.md`
 * (human-readable summary), written in that order.
 *
 * The persisted task list is a shallow copy of `ctx.lastTaskGraph.tasks` when
 * present, otherwise of the dispatch envelope's `tasks`, otherwise empty.
 * Completed ids come from collectCompletedTaskIds. A `partialDiff` field is
 * added to the diagnostic only when captureWorkingDiff returns non-empty text.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `lastTaskGraph`.
 * @param {object} dispatchResponse - Dispatch envelope; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} verdict - Stuck verdict; reads `taskId`, `signal`, `detail`, `attemptCounts`.
 * @returns {void}
 */
function writeStuckArtifacts(ctx, dispatchResponse, verdict) {
  const { cwd, featureCode } = ctx;
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });
  const ts = new Date().toISOString();

  const emitJson = (fileName, payload) => {
    writeFileSync(join(outDir, fileName), JSON.stringify(payload, null, 2) + '\n');
  };

  // Prefer the graph captured at decompose time; the envelope is the fallback.
  const graphTasks = ctx.lastTaskGraph?.tasks ?? dispatchResponse.tasks ?? [];
  const decomposedTasks = graphTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);
  const partialDiff = captureWorkingDiff(cwd);
  const { taskId, signal, detail } = verdict;

  const stuck = {
    feature: featureCode,
    taskId,
    signal,
    detail,
    attemptCounts: verdict.attemptCounts ?? {},
    ts,
    ...(partialDiff ? { partialDiff } : {}),
  };
  emitJson('stuck.json', stuck);

  const pause = {
    flowId: dispatchResponse.flow_id,
    stepId: dispatchResponse.step_id,
    stuckTaskId: taskId,
    signal,
    detail,
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  };
  emitJson('pause.json', pause);

  writeFileSync(join(outDir, 'stuck.md'), renderStuckMarkdown(stuck, pause));
}
|
|
947
|
+
|
|
948
|
+
/**
 * Render the `stuck.md` triage document.
 *
 * @param {object} stuck - Diagnostic; reads `feature`, `signal`, `taskId`,
 *   `ts`, `detail` and `attemptCounts` (missing counters print as 0).
 * @param {object} pause - Resume state; reads `decomposedTasks` (each with an
 *   `id`) and `completedTaskIds`.
 * @returns {string} Markdown text ending with a newline.
 */
function renderStuckMarkdown(stuck, pause) {
  const completedIds = pause.completedTaskIds;
  const completedLookup = new Set(completedIds);
  const pendingIds = pause.decomposedTasks
    .map((task) => task.id)
    .filter((id) => !completedLookup.has(id));

  // Comma-separated, backtick-quoted ids, or a placeholder for an empty list.
  const listIds = (ids) => (ids.length ? ids.map((x) => `\`${x}\``).join(', ') : '(none)');
  const counts = stuck.attemptCounts;

  const lines = [
    `# GSD stuck: ${stuck.feature}`,
    '',
    `**Signal:** \`${stuck.signal}\``,
    `**Stuck task:** \`${stuck.taskId}\``,
    `**Detected:** ${stuck.ts}`,
    '',
    '## What happened',
    '',
    `${stuck.detail}`,
    '',
    'Attempt counts at halt:',
    `- same-file edits (max across files): ${counts.sameFileEdits ?? 0}`,
    `- error repeats (max across hashes): ${counts.errorRepeats ?? 0}`,
    `- consecutive no-progress calls: ${counts.noProgressCalls ?? 0}`,
    '',
    'The in-flight task was cancelled and the run halted cleanly.',
    '',
    '## Resume or abort',
    '',
    `Completed tasks (already in the blackboard, will be skipped): ${listIds(completedIds)}`,
    `Tasks that will re-dispatch on resume: ${listIds(pendingIds)}`,
    '',
    `- **Resume:** \`compose gsd ${stuck.feature} --resume\` — re-dispatches the unfinished tasks into fresh worktrees.`,
    `- **Abort:** delete \`.compose/gsd/${stuck.feature}/pause.json\` and start over.`,
    '',
    'State for resume is in `pause.json` (schema: `contracts/gsd-stuck.json#/definitions/pause`).',
    '',
  ];
  return lines.join('\n');
}
|
|
980
|
+
|
|
981
|
+
/**
 * --resume: load the persisted pause state, apply the mode and ownership
 * guards, and return the task graph minus already-completed tasks.
 *
 * Source of the pause state:
 *  - `pause.json` in the feature's GSD directory when it exists;
 *  - otherwise (COMP-GSD-6 crash bridge) a pause-shaped object synthesized
 *    from the state returned by readGsdState, but only when that state has
 *    status 'running', its pid is not alive, and it carries a non-empty
 *    `decomposedTasks` array.
 *
 * Remaining tasks have completed ids stripped from `depends_on`, so no task in
 * the returned subset waits on one that will not be re-dispatched.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @param {{claim?: boolean}} [options] - `claim` (default true) also takes the
 *   pause.lock claim via claimResumeLock before returning.
 * @returns {{tasks: object[]}} The unfinished subset of the task graph.
 * @throws {Error} When there is nothing to resume, pause.json is unreadable,
 *   the pause is not in gsd mode, the recorded pid is still alive, the task
 *   list is empty, or every task is already completed.
 */
export function loadResumeTaskGraph(cwd, featureCode, { claim = true } = {}) {
  const pauseFile = join(gsdDir(cwd, featureCode), 'pause.json');
  let pause;

  if (!existsSync(pauseFile)) {
    // Crash bridge: no pause.json was written, so fall back to state.json.
    const state = readGsdState(cwd, featureCode);
    const crashedWithGraph =
      state && state.status === 'running' && !pidAlive(state.pid) &&
      Array.isArray(state.decomposedTasks) && state.decomposedTasks.length > 0;
    if (!crashedWithGraph) {
      throw new Error(
        `runGsd: no pause.json to resume for ${featureCode}. ` +
        `Nothing to resume — run \`compose gsd ${featureCode}\` to start fresh.`,
      );
    }
    pause = {
      flowId: state.flowId ?? null,
      stepId: state.lastStepId ?? 'execute',
      decomposedTasks: state.decomposedTasks,
      completedTaskIds: state.completedTaskIds ?? [],
      pid: state.pid,
      mode: 'gsd',
      ts: state.heartbeatAt ?? new Date().toISOString(),
    };
  } else {
    try {
      pause = JSON.parse(readFileSync(pauseFile, 'utf-8'));
    } catch (err) {
      throw new Error(`runGsd: pause.json for ${featureCode} is unreadable: ${err.message}`);
    }
  }

  // Mode guard: a pause written by another mode is not ours to resume.
  if (pause.mode && pause.mode !== 'gsd') {
    throw new Error(
      `runGsd: cannot --resume: pause.json for ${featureCode} is in ${pause.mode} mode, not gsd.`,
    );
  }

  // Ownership guard: a live recorded pid blocks the resume, with no
  // exception for a pid equal to our own.
  const ownerStillLive = typeof pause.pid === 'number' && pidAlive(pause.pid);
  if (ownerStillLive) {
    throw new Error(
      `runGsd: cannot --resume: pid ${pause.pid} still owns this gsd run (process is live). ` +
      `Wait for it to exit (or remove a stale pause.json) before resuming.`,
    );
  }

  const allTasks = Array.isArray(pause.decomposedTasks) ? pause.decomposedTasks : [];
  if (allTasks.length === 0) {
    throw new Error(`runGsd: pause.json for ${featureCode} has no decomposedTasks to resume.`);
  }

  const doneIds = new Set(pause.completedTaskIds ?? []);
  const remaining = [];
  for (const task of allTasks) {
    if (doneIds.has(task.id)) continue;
    const deps = task.depends_on;
    if (!Array.isArray(deps) || deps.length === 0) {
      remaining.push(task);
      continue;
    }
    // Drop dependencies that are already completed; keep the original object
    // when nothing was dropped.
    const openDeps = deps.filter((id) => !doneIds.has(id));
    remaining.push(openDeps.length === deps.length ? task : { ...task, depends_on: openDeps });
  }

  if (remaining.length === 0) {
    // Every task is already done, so there is nothing left to re-dispatch.
    throw new Error(
      `runGsd: all tasks for ${featureCode} are already completed; nothing to re-dispatch. ` +
      `Delete pause.json to finish.`,
    );
  }

  if (claim) claimResumeLock(cwd, featureCode);
  return { tasks: remaining };
}
|
|
1077
|
+
|
|
1078
|
+
/**
 * Claim the resume ownership lock (`pause.lock`) for a feature.
 *
 * The claim is an exclusive `mkdirSync`; a second claimant gets EEXIST. On
 * EEXIST the existing claim is judged by the HOLDER's own record in
 * `pause.lock/owner.json` (not by `pause.json`): it is stale when that pid is
 * not alive, or — when no numeric pid can be read — when the lock directory's
 * mtime is older than RUN_LOCK_STALE_MS (a failed stat also counts as stale).
 * A stale claim is taken over through takeoverStaleLock (rename-aside).
 *
 * Writing `owner.json` is best-effort: a failed write is ignored, leaving the
 * mtime fallback to govern later takeovers.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 * @throws {Error} When a live holder owns the claim, or the takeover race is lost.
 */
export function claimResumeLock(cwd, featureCode) {
  const claimPath = join(gsdDir(cwd, featureCode), 'pause.lock');
  const ownerRecordPath = join(claimPath, 'owner.json');

  const recordHolder = () => {
    try {
      const record = { pid: process.pid, ts: new Date().toISOString() };
      writeFileSync(join(claimPath, 'owner.json'), JSON.stringify(record, null, 2));
    } catch {
      // Best-effort; the mtime fallback still protects takeover.
    }
  };

  try {
    mkdirSync(claimPath);
    recordHolder();
    return;
  } catch (err) {
    if (err.code !== 'EEXIST') throw err;
  }

  // A claim already exists: read the pid its holder recorded, if any.
  const readHolderPid = () => {
    if (!existsSync(ownerRecordPath)) return null;
    try {
      const record = JSON.parse(readFileSync(ownerRecordPath, 'utf-8'));
      return typeof record.pid === 'number' ? record.pid : null;
    } catch {
      return null;
    }
  };
  const ageExceedsWindow = () => {
    try {
      return Date.now() - statSync(claimPath).mtimeMs > RUN_LOCK_STALE_MS;
    } catch {
      return true;
    }
  };

  const holderPid = readHolderPid();
  const isStale = typeof holderPid === 'number' ? !pidAlive(holderPid) : ageExceedsWindow();

  if (!isStale) {
    const message =
      `runGsd: a resume claim already exists for ${featureCode} ` +
      `(.compose/gsd/${featureCode}/pause.lock, pid ${holderPid ?? 'unknown'} alive). ` +
      `Another --resume may be in progress; if none is, remove that directory to clear a stale claim.`;
    throw new Error(message);
  }

  // Losing the takeover means another --resume claimed it in the meantime.
  const wonTakeover = takeoverStaleLock(claimPath);
  if (!wonTakeover) {
    throw new Error(
      `runGsd: another --resume claimed ${featureCode} during stale-claim takeover; retry.`,
    );
  }

  recordHolder();
}
|
|
1141
|
+
|
|
1142
|
+
/**
 * COMP-GSD-4: remove only the resume ownership claim (`pause.lock`); it does
 * not touch `pause.json`. Removal is forced and any error is swallowed, so
 * calling it when no lock exists is a no-op.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
function releasePauseLock(cwd, featureCode) {
  try {
    const claimPath = join(gsdDir(cwd, featureCode), 'pause.lock');
    rmSync(claimPath, { recursive: true, force: true });
  } catch {
    // Best-effort release.
  }
}
|
|
1151
|
+
|
|
1152
|
+
/**
 * COMP-GSD-4: persist the budget-halt artifacts into the feature's GSD
 * directory: `budget.json` and `budget.md` (both produced by
 * composeBudgetDiagnostic) followed by `pause.json` with `kind: 'budget'`.
 *
 * The persisted task list is a shallow copy of `ctx.lastTaskGraph.tasks` when
 * present, otherwise of `response.tasks`, otherwise empty. Completed ids come
 * from collectCompletedTaskIds.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `lastTaskGraph`, `lastStepId`.
 * @param {object} response - Step response; reads `flow_id`, `step_id`, `tasks`.
 * @param {object} budgetState - Budget state; reads `caps` and `consumed`.
 * @returns {void}
 */
function writeBudgetArtifacts(ctx, response, budgetState) {
  const { cwd, featureCode } = ctx;
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });
  const ts = new Date().toISOString();

  const emitJson = (fileName, payload) => {
    writeFileSync(join(outDir, fileName), JSON.stringify(payload, null, 2) + '\n');
  };

  const graphTasks = ctx.lastTaskGraph?.tasks ?? response.tasks ?? [];
  const decomposedTasks = graphTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);

  const { json, md } = composeBudgetDiagnostic(budgetState, {
    feature: featureCode,
    decomposedTasks,
    completedTaskIds,
  });
  emitJson('budget.json', json);
  writeFileSync(join(outDir, 'budget.md'), md);

  const budget = {
    axis: json.axis,
    caps: budgetState.caps ?? {},
    consumed: budgetState.consumed ?? {},
  };
  emitJson('pause.json', {
    flowId: response.flow_id ?? null,
    stepId: response.step_id ?? ctx.lastStepId ?? 'execute',
    kind: 'budget',
    budget,
    decomposedTasks,
    completedTaskIds,
    pid: process.pid,
    mode: 'gsd',
    ts,
  });
}
|
|
1186
|
+
|
|
1187
|
+
/**
 * COMP-GSD-7: write `budget-final.json`, a snapshot of the budget diagnostic
 * JSON produced by composeBudgetDiagnostic for the given budget state.
 *
 * The file is written to a `.tmp` sibling first and then renamed into place.
 * The task list is a shallow copy of `ctx.runState.decomposedTasks` (empty
 * when absent); completed ids come from collectCompletedTaskIds.
 *
 * @param {object} ctx - Run context; reads `cwd`, `featureCode`, `runState`.
 * @param {object} budgetState - Budget state handed to composeBudgetDiagnostic.
 * @returns {void}
 */
export function writeBudgetFinalSnapshot(ctx, budgetState) {
  const { cwd, featureCode } = ctx;
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });

  const persistedTasks = ctx.runState?.decomposedTasks ?? [];
  const decomposedTasks = persistedTasks.map((task) => ({ ...task }));
  const completedTaskIds = collectCompletedTaskIds(cwd, featureCode);
  const { json: snapshot } = composeBudgetDiagnostic(budgetState, {
    feature: featureCode,
    decomposedTasks,
    completedTaskIds,
  });

  // Stage then rename so readers never observe a half-written file.
  const finalPath = join(outDir, 'budget-final.json');
  const stagingPath = `${finalPath}.tmp`;
  writeFileSync(stagingPath, JSON.stringify(snapshot, null, 2) + '\n');
  renameSync(stagingPath, finalPath);
}
|
|
1205
|
+
|
|
1206
|
+
/**
 * COMP-GSD-4: forward a run's consumed usage to recordGsdUsage.
 *
 * Reads `budgetState.consumed` ({tokens, dispatches, wall_s, dollars}) and
 * maps it to {tokens, costUsd, dispatches, timeMs}; missing fields count as 0
 * and `wall_s` seconds are converted to rounded milliseconds. Does nothing
 * when there is no `consumed` value.
 *
 * @param {string} cwd - Project root; usage is recorded under `<cwd>/.compose`.
 * @param {string} featureCode - Feature identifier.
 * @param {object} [budgetState] - Budget state, possibly absent.
 * @returns {void}
 */
function recordGsdUsageFromState(cwd, featureCode, budgetState) {
  const consumed = budgetState?.consumed;
  if (!consumed) return;

  const { tokens, dollars, dispatches, wall_s: wallSeconds } = consumed;
  const usage = {
    tokens: tokens ?? 0,
    costUsd: dollars ?? 0,
    dispatches: dispatches ?? 0,
    timeMs: Math.round((wallSeconds ?? 0) * 1000),
  };
  recordGsdUsage(join(cwd, '.compose'), featureCode, usage);
}
|
|
1221
|
+
|
|
1222
|
+
/**
 * COMP-GSD-4: write the refusal diagnostic for an already-spent cumulative
 * budget: `budget.json` (machine-readable) then `budget.md` (human-readable).
 * No pause.json is written by this function.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @param {object} chk - Ceiling check result; reads `reason` and `usage`
 *   (`usage.totalTokens`, `usage.totalCostUsd`).
 * @param {object} limits - Configured limits, stored verbatim in budget.json.
 * @returns {void}
 */
function writeCumulativeRefusal(cwd, featureCode, chk, limits) {
  const outDir = gsdDir(cwd, featureCode);
  mkdirSync(outDir, { recursive: true });
  const ts = new Date().toISOString();

  const diagnostic = {
    feature: featureCode,
    kind: 'budget',
    axis: 'cumulative',
    reason: chk.reason,
    usage: chk.usage,
    limits,
    ts,
  };
  writeFileSync(join(outDir, 'budget.json'), JSON.stringify(diagnostic, null, 2) + '\n');

  const fence = '```';
  const lines = [
    `# GSD budget refusal — ${featureCode}`,
    '',
    `**${chk.reason}**`,
    '',
    `Cumulative usage: ${chk.usage.totalTokens} tokens, $${(chk.usage.totalCostUsd ?? 0).toFixed(4)}.`,
    '',
    'This feature has already spent its cumulative `gsd.budget.cumulative.*` ceiling.',
    'Raise the cap in `.compose/compose.json`, or clear the ledger:',
    '',
    fence,
    `compose gsd ${featureCode} --reset-budget`,
    fence,
    '',
  ];
  writeFileSync(join(outDir, 'budget.md'), lines.join('\n'));
}
|
|
1250
|
+
|
|
1251
|
+
/**
 * Remove both resume artifacts for a feature: `pause.json` and the
 * `pause.lock` claim directory. Each removal is forced and independently
 * best-effort, so a failure on one does not prevent the other.
 *
 * @param {string} cwd - Project root.
 * @param {string} featureCode - Feature identifier.
 * @returns {void}
 */
function clearPauseFile(cwd, featureCode) {
  const featureDir = gsdDir(cwd, featureCode);
  const removals = [
    ['pause.json', { force: true }],
    ['pause.lock', { recursive: true, force: true }],
  ];
  for (const [name, options] of removals) {
    try {
      rmSync(join(featureDir, name), options);
    } catch {
      // Best-effort cleanup.
    }
  }
}
|