@tekyzinc/gsd-t 3.15.10 → 3.16.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,188 @@
1
+ /**
2
+ * gsd-t-unattended-heartbeat.cjs
3
+ *
4
+ * Liveness heartbeat watchdog for the unattended supervisor.
5
+ *
6
+ * Supersedes the pre-M43 `workerTimeoutMs` wall-clock guillotine as the
7
+ * PRIMARY stuck-worker detector. The guillotine remains as an absolute
8
+ * backstop (raised to 1 hour by default) for pathological cases where a
9
+ * child never writes a single event.
10
+ *
11
+ * How it works
12
+ * ────────────
13
+ * The supervisor polls `.gsd-t/events/YYYY-MM-DD.jsonl` mtime every 60 s
14
+ * during a worker iteration. If the mtime has not advanced for at least
15
+ * `staleHeartbeatMs` (default 300_000 = 5 min), the worker is considered
16
+ * stuck and SIGTERM'd. Healthy workers producing events run indefinitely
17
+ * under the 1-hour absolute cap.
18
+ *
19
+ * This module is pure and side-effect-free by default. `checkHeartbeat()`
20
+ * accepts injected `now` and `fsShim` so the entire watchdog can be
21
+ * unit-tested with a fake clock and fake filesystem.
22
+ *
23
+ * Zero external dependencies — Node built-ins only.
24
+ *
25
+ * Contract: .gsd-t/contracts/unattended-supervisor-contract.md v1.1.0
26
+ * §"Heartbeat Watchdog"
27
+ */
28
+
29
+ "use strict";
30
+
31
+ const fs = require("node:fs");
32
+ const path = require("node:path");
33
+
/**
 * Build the events JSONL path for a given date.
 *
 * The file name is derived from the UTC calendar date of `when`:
 * `<projectDir>/.gsd-t/events/YYYY-MM-DD.jsonl`.
 *
 * @param {string} projectDir
 * @param {Date|number} [when]  Date, or ms since epoch. Any finite number is
 *   honoured as a timestamp — including `0` (the epoch), which fake clocks
 *   commonly use. Defaults to now when omitted.
 * @returns {string}
 */
function eventsPathFor(projectDir, when) {
  let d;
  if (when instanceof Date) {
    d = when;
  } else if (typeof when === "number" && Number.isFinite(when)) {
    // Explicit numeric timestamp. Checked before the falsy fallback below so
    // that `0` is not mistaken for "omitted".
    d = new Date(when);
  } else {
    // Omitted (or otherwise unusable) value: fall back to the current time.
    d = new Date(when || Date.now());
  }
  const y = d.getUTCFullYear();
  const m = String(d.getUTCMonth() + 1).padStart(2, "0");
  const day = String(d.getUTCDate()).padStart(2, "0");
  return path.join(projectDir, ".gsd-t", "events", `${y}-${m}-${day}.jsonl`);
}
49
+
/**
 * Read a file's mtime (ms since epoch) through the fs shim.
 *
 * @private
 * @param {object} shim      { existsSync, statSync }
 * @param {string} filePath
 * @returns {number|null} mtime in ms, or null when the file is absent or
 *   cannot be stat'ed.
 */
function _heartbeatMtimeOrNull(shim, filePath) {
  try {
    if (!shim.existsSync(filePath)) return null;
    const st = shim.statSync(filePath);
    if (typeof st.mtimeMs === "number") return st.mtimeMs;
    if (st.mtime instanceof Date) return st.mtime.getTime();
    return null;
  } catch (_) {
    // Best-effort probe: an unreadable fallback file is treated as absent.
    return null;
  }
}

/**
 * Check whether the worker's event stream is stale.
 *
 * A worker is "stale" when the relevant events JSONL file's mtime has not
 * advanced within `staleHeartbeatMs` of the given `now`. The relevant file
 * is the one matching the UTC date of `now`.
 *
 * Fresh worker grace: if the events file does not exist yet AND
 * `(now - workerStartedAt) < staleHeartbeatMs`, the worker is considered
 * healthy (still booting).
 *
 * UTC day rollover: when today's file does not exist and the grace window
 * has passed, the previous UTC day's file is consulted before declaring the
 * worker stale. Without this, every worker older than `staleHeartbeatMs`
 * would be reported stale right after midnight UTC until it happens to emit
 * its first event of the new day. In that case the returned `eventsPath` is
 * the previous day's file (the one actually inspected).
 *
 * @param {object} params
 * @param {string} params.projectDir
 * @param {number} params.workerStartedAt  ms since epoch
 * @param {number} params.staleHeartbeatMs
 * @param {number} [params.now]            ms since epoch (defaults to Date.now())
 * @param {object} [params.fsShim]         { existsSync, statSync } — test hook
 * @returns {{stale: boolean, reason: string, lastEventMs: (number|null), ageMs: (number|null), eventsPath: string}}
 * @throws {Error} when projectDir, workerStartedAt or staleHeartbeatMs is invalid
 */
function checkHeartbeat({
  projectDir,
  workerStartedAt,
  staleHeartbeatMs,
  now,
  fsShim,
}) {
  if (typeof projectDir !== "string" || projectDir.length === 0) {
    throw new Error("checkHeartbeat: projectDir is required");
  }
  if (typeof workerStartedAt !== "number" || !Number.isFinite(workerStartedAt)) {
    throw new Error("checkHeartbeat: workerStartedAt must be a finite number");
  }
  if (
    typeof staleHeartbeatMs !== "number" ||
    !Number.isFinite(staleHeartbeatMs) ||
    staleHeartbeatMs <= 0
  ) {
    throw new Error("checkHeartbeat: staleHeartbeatMs must be a positive number");
  }
  // A non-finite `now` would make every age comparison false (never stale),
  // so it is treated as "not provided".
  const nowMs = typeof now === "number" && Number.isFinite(now) ? now : Date.now();
  const shim = fsShim || fs;
  const DAY_MS = 24 * 60 * 60 * 1000;

  // Pass a Date so an injected clock value of 0 is honoured as the epoch.
  const eventsPath = eventsPathFor(projectDir, new Date(nowMs));

  let exists = false;
  try {
    exists = !!shim.existsSync(eventsPath);
  } catch (_) {
    exists = false;
  }

  if (!exists) {
    const sinceStart = nowMs - workerStartedAt;
    if (sinceStart < staleHeartbeatMs) {
      return {
        stale: false,
        reason: `events file not yet created (grace: ${sinceStart}ms < ${staleHeartbeatMs}ms)`,
        lastEventMs: null,
        ageMs: null,
        eventsPath,
      };
    }

    // Past the grace window with no file for today: the worker may simply
    // have written its latest events before the UTC date changed.
    const prevPath = eventsPathFor(projectDir, new Date(nowMs - DAY_MS));
    const prevMtimeMs = _heartbeatMtimeOrNull(shim, prevPath);
    if (prevMtimeMs !== null) {
      const prevAgeMs = nowMs - prevMtimeMs;
      if (prevAgeMs < staleHeartbeatMs) {
        return {
          stale: false,
          reason: `fresh — last event ${prevAgeMs}ms ago (previous UTC day's file)`,
          lastEventMs: prevMtimeMs,
          ageMs: prevAgeMs,
          eventsPath: prevPath,
        };
      }
    }

    return {
      stale: true,
      reason: `events file ${eventsPath} absent for ${sinceStart}ms since worker start (threshold ${staleHeartbeatMs}ms)`,
      lastEventMs: null,
      ageMs: sinceStart,
      eventsPath,
    };
  }

  let stat;
  try {
    stat = shim.statSync(eventsPath);
  } catch (err) {
    // File existed at existsSync but stat failed — treat as stale only if
    // we are past the grace window. Under the grace window, assume transient.
    const sinceStart = nowMs - workerStartedAt;
    if (sinceStart < staleHeartbeatMs) {
      return {
        stale: false,
        reason: `events stat transient failure (grace): ${err.message}`,
        lastEventMs: null,
        ageMs: null,
        eventsPath,
      };
    }
    return {
      stale: true,
      reason: `events stat failed past grace: ${err.message}`,
      lastEventMs: null,
      ageMs: sinceStart,
      eventsPath,
    };
  }

  const mtimeMs =
    typeof stat.mtimeMs === "number"
      ? stat.mtimeMs
      : stat.mtime instanceof Date
        ? stat.mtime.getTime()
        : 0;

  // Reference point for staleness: max(mtime, workerStartedAt). This handles
  // the bootstrap case where the events file already existed from a prior
  // iteration — the worker gets at least staleHeartbeatMs from its own start
  // to produce the first event.
  const ref = Math.max(mtimeMs, workerStartedAt);
  const ageMs = nowMs - ref;

  if (ageMs >= staleHeartbeatMs) {
    return {
      stale: true,
      reason: `last event ${ageMs}ms ago (threshold ${staleHeartbeatMs}ms)`,
      lastEventMs: mtimeMs,
      ageMs,
      eventsPath,
    };
  }
  return {
    stale: false,
    reason: `fresh — last event ${ageMs}ms ago`,
    lastEventMs: mtimeMs,
    ageMs,
    eventsPath,
  };
}
180
+
181
+ module.exports = {
182
+ checkHeartbeat,
183
+ eventsPathFor,
184
+ // Default heartbeat poll cadence — exported so tests and the supervisor
185
+ // can reference a single source of truth.
186
+ DEFAULT_HEARTBEAT_POLL_MS: 60 * 1000,
187
+ DEFAULT_STALE_HEARTBEAT_MS: 5 * 60 * 1000,
188
+ };
@@ -14,7 +14,8 @@
14
14
  * Task 1 of m36-cross-platform delivers:
15
15
  * - resolveClaudePath()
16
16
  * - isAlive(pid)
17
- * - spawnWorker(projectDir, timeoutMs)
17
+ * - spawnWorker(projectDir, timeoutMs, opts)
18
+ * opts.onHeartbeatCheck? — async liveness watchdog (M43 D?)
18
19
  *
19
20
  * Cross-platform notes:
20
21
  * - darwin / linux paths are runtime-tested.
@@ -89,46 +90,85 @@ function isAlive(pid) {
89
90
  // ─── spawnWorker ─────────────────────────────────────────────────────────────
90
91
 
91
92
  /**
92
- * Spawn a synchronous `claude -p '/gsd-t-resume'` worker iteration for the
93
- * unattended supervisor.
93
+ * Spawn a `claude -p '/gsd-t-resume'` worker iteration for the unattended
94
+ * supervisor.
94
95
  *
95
96
  * Returns a normalized result object: `{ status, stdout, stderr, signal,
96
- * timedOut, error }`. Never throws — spawn errors are returned in `error`.
97
+ * timedOut, staleHeartbeat, error }`. Never throws — spawn errors are
98
+ * returned in `error`.
97
99
  *
98
- * Timeout semantics: when `spawnSync`'s `timeout` fires, the child is sent
99
- * SIGTERM (or the equivalent on win32), `status` is `null`, and `signal` is
100
- * non-null. We surface this as `timedOut: true` so callers can map to exit
101
- * code 3 per contract §5.
100
+ * Two kill paths:
101
+ * 1. Heartbeat watchdog (M43 primary) — when `opts.onHeartbeatCheck` is
102
+ * provided, the function polls every `opts.heartbeatPollMs` (default
103
+ * 60_000). If the callback returns `{stale: true, ...}`, the child is
104
+ * SIGTERM'd and `staleHeartbeat: true` is set on the result.
105
+ * 2. Wall-clock timeout (absolute backstop) — `timeoutMs` is the hard cap
106
+ * regardless of heartbeat. On expiry the child is SIGTERM'd and
107
+ * `timedOut: true`. Default raised to 1 h in supervisor-core so a
108
+ * healthy long-running worker is not cut.
102
109
  *
103
- * Spawn recipe (uniform across platforms):
104
- * - `shell: false` no shell quoting hazards
105
- * - `windowsHide: true`no flashed window on win32
106
- * - explicit `claude.cmd` filename on win32 (see resolveClaudePath JSDoc)
110
+ * Exactly one path is used per iteration:
111
+ * - opts.onHeartbeatCheck present → heartbeat path (async event loop)
112
+ * - opts.onHeartbeatCheck absent  → legacy spawnSync path (blocking)
113
+ * The legacy path is preserved so callers that have no meaningful liveness
114
+ * signal (test stubs, dry-run) keep the original semantics.
107
115
  *
108
- * @todo Spike C: verify `claude.cmd -p "/gsd-t-resume"` dispatches correctly
109
- * under PowerShell + cmd.exe + Git Bash. See
110
- * `docs/unattended-windows-caveats.md` (Task 3 of m36-cross-platform).
116
+ * Cross-platform:
117
+ * - darwin / linux: runtime-tested.
118
+ * - win32: implementation-complete; documented in
119
+ * `docs/unattended-windows-caveats.md` (Task 3).
111
120
  *
112
121
  * @param {string} projectDir Absolute path to the project directory (cwd).
113
- * @param {number} timeoutMs Wall-clock cap per worker iteration in ms.
122
+ * @param {number} timeoutMs Wall-clock backstop per worker iteration in ms.
114
123
  * @param {object} [opts] Optional overrides (test-mode hooks).
115
124
  * @param {string} [opts.bin] Override the resolved binary (test-mode only).
116
125
  * @param {string[]} [opts.args] Override args (defaults to `['-p', '/gsd-t-resume']`).
117
126
  * @param {object} [opts.env] Override env (defaults to `process.env`).
127
+ * @param {Function} [opts.onHeartbeatCheck] Called every heartbeatPollMs
128
+ * with no args; must return `{stale: boolean, reason?: string}` or a
129
+ * Promise thereof. When stale, the child is SIGTERM'd.
130
+ * @param {number} [opts.heartbeatPollMs] Poll cadence in ms. Default 60_000.
131
+ * @param {Function} [opts.onHeartbeatSample] Optional observer; receives the
132
+ * raw callback result each poll for logging.
133
+ * @param {Function} [opts.onStdoutLine] Optional live stdout line callback.
134
+ * When provided (heartbeat path only), invoked once per `\n`-terminated line
135
+ * as stdout streams in. Trailing partial line is flushed on close. Errors
136
+ * are swallowed (best-effort tee). Used by the supervisor to write each
137
+ * worker line into the M42 transcript file in real time, instead of
138
+ * appending the entire stdout buffer post-hoc at child exit.
118
139
  * @returns {{
119
140
  * status: number|null,
120
141
  * stdout: string,
121
142
  * stderr: string,
122
143
  * signal: string|null,
123
144
  * timedOut: boolean,
145
+ * staleHeartbeat: boolean,
146
+ * heartbeatReason: string|null,
124
147
  * error: Error|null
125
- * }}
148
+ * }|Promise<...>}
149
+ * Returns a Promise when `opts.onHeartbeatCheck` is provided; otherwise a
150
+ * synchronous result (legacy path).
126
151
  */
127
152
  function spawnWorker(projectDir, timeoutMs, opts = {}) {
128
153
  const bin = opts.bin || resolveClaudePath();
129
154
  const args = opts.args || ["-p", "/gsd-t-resume"];
130
155
  const env = opts.env || process.env;
131
156
 
157
+ if (typeof opts.onHeartbeatCheck === "function") {
158
+ return _spawnWorkerAsyncHeartbeat({
159
+ bin,
160
+ args,
161
+ env,
162
+ cwd: projectDir,
163
+ timeoutMs,
164
+ onHeartbeatCheck: opts.onHeartbeatCheck,
165
+ heartbeatPollMs: opts.heartbeatPollMs || 60 * 1000,
166
+ onHeartbeatSample: opts.onHeartbeatSample,
167
+ onStdoutLine: opts.onStdoutLine,
168
+ spawnImpl: opts._spawnImpl || spawn,
169
+ });
170
+ }
171
+
132
172
  const result = spawnSync(bin, args, {
133
173
  cwd: projectDir,
134
174
  encoding: "utf8",
@@ -139,19 +179,11 @@ function spawnWorker(projectDir, timeoutMs, opts = {}) {
139
179
  windowsHide: true,
140
180
  });
141
181
 
142
- // Normalize. spawnSync may return error if the binary cannot be launched
143
- // (ENOENT etc.) — surface it instead of throwing.
144
182
  const stdout = typeof result.stdout === "string" ? result.stdout : "";
145
183
  const stderr = typeof result.stderr === "string" ? result.stderr : "";
146
184
  const signal = result.signal || null;
147
185
  const status = typeof result.status === "number" ? result.status : null;
148
186
 
149
- // Timeout detection: when spawnSync's `timeout` option fires it sets
150
- // - status === null
151
- // - signal !== null (SIGTERM on POSIX, equivalent on win32)
152
- // - error.code === 'ETIMEDOUT' (Node surfaces it as a synthetic Error)
153
- // The ETIMEDOUT code is the authoritative signal — checking it
154
- // discriminates a genuine timeout from an ENOENT/spawn failure.
155
187
  const errCode = result.error && result.error.code;
156
188
  const timedOut =
157
189
  errCode === "ETIMEDOUT" || (status === null && signal !== null && !result.error);
@@ -162,12 +194,144 @@ function spawnWorker(projectDir, timeoutMs, opts = {}) {
162
194
  stderr,
163
195
  signal,
164
196
  timedOut,
165
- // Suppress the synthetic ETIMEDOUT error so callers can rely on
166
- // `timedOut` for the timeout case and `error` for genuine spawn failures.
197
+ staleHeartbeat: false,
198
+ heartbeatReason: null,
167
199
  error: errCode === "ETIMEDOUT" ? null : result.error || null,
168
200
  };
169
201
  }
170
202
 
/**
 * Async spawn path used when a heartbeat callback is provided.
 *
 * Returns a Promise resolving to the same result shape as spawnWorker's
 * legacy path, plus `staleHeartbeat` and `heartbeatReason`. The Promise
 * never rejects: a synchronous spawn failure resolves immediately with
 * `error` set; otherwise it resolves when the child emits `close`.
 *
 * Kill paths:
 *   - Heartbeat: every `heartbeatPollMs` the callback is awaited; a
 *     `{stale: true}` sample SIGTERMs the child. Checks never overlap — a
 *     poll tick is skipped while a previous check is still pending. A check
 *     that throws is reported as a non-stale sample and never kills.
 *   - Absolute cap: armed only when `timeoutMs` is a finite number > 0
 *     (matching the legacy path, where an absent timeout means "no cap").
 *
 * Kill precedence: heartbeat wins (it is the more specific signal). The
 * result never has both `timedOut` and `staleHeartbeat` set.
 *
 * NOTE: only SIGTERM is sent; there is no escalation to SIGKILL here.
 *
 * @private
 * @param {object}   p
 * @param {string}   p.bin                 Executable to spawn.
 * @param {string[]} p.args                Arguments for the executable.
 * @param {object}   p.env                 Environment for the child.
 * @param {string}   p.cwd                 Working directory for the child.
 * @param {number}   [p.timeoutMs]         Absolute wall-clock cap in ms.
 * @param {Function} p.onHeartbeatCheck    Returns `{stale, reason?}` or a Promise thereof.
 * @param {number}   [p.heartbeatPollMs]   Poll cadence in ms (falls back to 60_000 when invalid).
 * @param {Function} [p.onHeartbeatSample] Observer receiving each raw sample.
 * @param {Function} [p.onStdoutLine]      Receives each non-empty stdout line as it streams in.
 * @param {Function} p.spawnImpl           `child_process.spawn`-compatible function.
 * @returns {Promise<{status: (number|null), stdout: string, stderr: string,
 *   signal: (string|null), timedOut: boolean, staleHeartbeat: boolean,
 *   heartbeatReason: (string|null), error: (Error|null)}>}
 */
function _spawnWorkerAsyncHeartbeat({
  bin,
  args,
  env,
  cwd,
  timeoutMs,
  onHeartbeatCheck,
  heartbeatPollMs,
  onHeartbeatSample,
  onStdoutLine,
  spawnImpl,
}) {
  // Node timers only accept delays up to 2^31-1 ms; anything larger (or
  // non-numeric) is coerced to 1 ms, which would kill the worker instantly.
  const MAX_TIMER_MS = 2147483647;
  const isPositiveMs = (v) => typeof v === "number" && Number.isFinite(v) && v > 0;
  const capMs = isPositiveMs(timeoutMs) ? Math.min(timeoutMs, MAX_TIMER_MS) : null;
  const pollMs = isPositiveMs(heartbeatPollMs)
    ? Math.min(heartbeatPollMs, MAX_TIMER_MS)
    : 60 * 1000;
  const wantsLines = typeof onStdoutLine === "function";
  const toBuffer = (d) => (Buffer.isBuffer(d) ? d : Buffer.from(d));

  return new Promise((resolve) => {
    let child;
    try {
      child = spawnImpl(bin, args, {
        cwd,
        env,
        stdio: ["ignore", "pipe", "pipe"],
        shell: false,
        windowsHide: true,
      });
    } catch (err) {
      resolve({
        status: null,
        stdout: "",
        stderr: "",
        signal: null,
        timedOut: false,
        staleHeartbeat: false,
        heartbeatReason: null,
        error: err,
      });
      return;
    }

    const chunksOut = [];
    const chunksErr = [];
    // Pending bytes of the current (unterminated) stdout line. Kept as raw
    // bytes and split on 0x0A so a multi-byte UTF-8 character that straddles
    // two chunks is decoded intact (0x0A never occurs inside such a sequence).
    let lineBuf = Buffer.alloc(0);
    const emitLine = (line) => {
      if (!wantsLines || line.length === 0) return;
      try { onStdoutLine(line); } catch (_) { /* tee is best-effort */ }
    };
    if (child.stdout) child.stdout.on("data", (d) => {
      const chunk = toBuffer(d);
      chunksOut.push(chunk);
      if (!wantsLines) return;
      lineBuf = lineBuf.length === 0 ? chunk : Buffer.concat([lineBuf, chunk]);
      let nl;
      while ((nl = lineBuf.indexOf(0x0a)) >= 0) {
        emitLine(lineBuf.subarray(0, nl).toString("utf8"));
        lineBuf = lineBuf.subarray(nl + 1);
      }
    });
    if (child.stderr) child.stderr.on("data", (d) => chunksErr.push(toBuffer(d)));

    let resolved = false;
    let timedOut = false;
    let staleHeartbeat = false;
    let heartbeatReason = null;
    let spawnErr = null;
    let checking = false;

    const heartbeatTimer = setInterval(async () => {
      // Skip the tick when done, or when a slow check is still in flight.
      if (resolved || checking) return;
      checking = true;
      let sample;
      try {
        sample = await onHeartbeatCheck();
      } catch (err) {
        // Heartbeat check must not kill the worker on its own failures.
        // Surface via sample observer when provided and continue.
        const msg = err && err.message ? err.message : String(err);
        sample = { stale: false, reason: `heartbeat check threw: ${msg}` };
      } finally {
        checking = false;
      }
      if (typeof onHeartbeatSample === "function") {
        try { onHeartbeatSample(sample); } catch (_) { /* observer best-effort */ }
      }
      // The child may have closed while the check was awaited.
      if (resolved) return;
      if (sample && sample.stale) {
        staleHeartbeat = true;
        // Heartbeat takes precedence: suppress a timeout that fired just before.
        timedOut = false;
        heartbeatReason = sample.reason || "stale heartbeat";
        try { child.kill("SIGTERM"); } catch (_) { /* child may already be gone */ }
      }
    }, pollMs);
    if (typeof heartbeatTimer.unref === "function") heartbeatTimer.unref();

    let absoluteTimer = null;
    if (capMs !== null) {
      absoluteTimer = setTimeout(() => {
        if (resolved || staleHeartbeat) return;
        timedOut = true;
        try { child.kill("SIGTERM"); } catch (_) { /* child may already be gone */ }
      }, capMs);
      if (typeof absoluteTimer.unref === "function") absoluteTimer.unref();
    }

    // Recorded only; the result is produced by the `close` handler below.
    child.on("error", (err) => {
      spawnErr = err;
    });

    child.on("close", (status, signal) => {
      if (resolved) return;
      resolved = true;
      clearInterval(heartbeatTimer);
      if (absoluteTimer !== null) clearTimeout(absoluteTimer);

      // Flush a trailing partial line (no terminating newline).
      if (lineBuf.length > 0) {
        emitLine(lineBuf.toString("utf8"));
        lineBuf = Buffer.alloc(0);
      }

      const stdout = Buffer.concat(chunksOut).toString("utf8");
      const stderr = Buffer.concat(chunksErr).toString("utf8");

      resolve({
        status: typeof status === "number" ? status : null,
        stdout,
        stderr,
        signal: signal || null,
        timedOut,
        staleHeartbeat,
        heartbeatReason,
        error: spawnErr,
      });
    });
  });
}
334
+
171
335
  // ─── spawnSupervisor ─────────────────────────────────────────────────────────
172
336
 
173
337
  /**
@@ -55,7 +55,13 @@ const DEFAULTS = Object.freeze({
55
55
  maxIterations: 200,
56
56
  hours: 24,
57
57
  gutterNoProgressIters: 5,
58
- workerTimeoutMs: 270000,
58
+ // Absolute backstop — raised from 270_000 to 3_600_000 in contract v1.1.0
59
+ // now that the heartbeat watchdog is the primary stuck-worker detector.
60
+ workerTimeoutMs: 3600000,
61
+ // Heartbeat watchdog threshold — events JSONL mtime must advance within
62
+ // this many ms or the worker is SIGTERM'd. Contract v1.1.0 §"Heartbeat
63
+ // Watchdog".
64
+ staleHeartbeatMs: 300000,
59
65
  });
60
66
 
61
67
  // ── Glob → regex helper ─────────────────────────────────────────────────────
@@ -110,6 +116,7 @@ function cloneDefaults() {
110
116
  hours: DEFAULTS.hours,
111
117
  gutterNoProgressIters: DEFAULTS.gutterNoProgressIters,
112
118
  workerTimeoutMs: DEFAULTS.workerTimeoutMs,
119
+ staleHeartbeatMs: DEFAULTS.staleHeartbeatMs,
113
120
  };
114
121
  }
115
122