@tekyzinc/gsd-t 3.15.10 → 3.16.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/gsd-t-orchestrator-worker.cjs +35 -3
- package/bin/gsd-t-token-capture.cjs +24 -3
- package/bin/gsd-t-token-regenerate-log.cjs +129 -0
- package/bin/gsd-t-transcript-tee.cjs +246 -0
- package/bin/gsd-t-unattended-heartbeat.cjs +188 -0
- package/bin/gsd-t-unattended-platform.cjs +191 -27
- package/bin/gsd-t-unattended-safety.cjs +8 -1
- package/bin/gsd-t-unattended.cjs +218 -38
- package/bin/gsd-t.js +15 -1
- package/bin/supervisor-pid-fingerprint.cjs +126 -0
- package/commands/gsd-t-resume.md +18 -4
- package/docs/architecture.md +16 -0
- package/package.json +1 -1
- package/scripts/gsd-t-dashboard-server.js +291 -4
- package/scripts/gsd-t-dashboard.html +31 -1
- package/scripts/gsd-t-transcript.html +422 -0
- package/scripts/hooks/gsd-t-in-session-probe.js +62 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* gsd-t-unattended-heartbeat.cjs
|
|
3
|
+
*
|
|
4
|
+
* Liveness heartbeat watchdog for the unattended supervisor.
|
|
5
|
+
*
|
|
6
|
+
* Supersedes the pre-M43 `workerTimeoutMs` wall-clock guillotine as the
|
|
7
|
+
* PRIMARY stuck-worker detector. The guillotine remains as an absolute
|
|
8
|
+
* backstop (raised to 1 hour by default) for pathological cases where a
|
|
9
|
+
* child never writes a single event.
|
|
10
|
+
*
|
|
11
|
+
* How it works
|
|
12
|
+
* ────────────
|
|
13
|
+
* The supervisor polls `.gsd-t/events/YYYY-MM-DD.jsonl` mtime every 60 s
|
|
14
|
+
* during a worker iteration. If the mtime has not advanced for at least
|
|
15
|
+
* `staleHeartbeatMs` (default 300_000 = 5 min), the worker is considered
|
|
16
|
+
* stuck and SIGTERM'd. Healthy workers producing events run indefinitely
|
|
17
|
+
* under the 1-hour absolute cap.
|
|
18
|
+
*
|
|
19
|
+
* This module is pure and side-effect-free by default. `checkHeartbeat()`
|
|
20
|
+
* accepts injected `now` and `fsShim` so the entire watchdog can be
|
|
21
|
+
* unit-tested with a fake clock and fake filesystem.
|
|
22
|
+
*
|
|
23
|
+
* Zero external dependencies — Node built-ins only.
|
|
24
|
+
*
|
|
25
|
+
* Contract: .gsd-t/contracts/unattended-supervisor-contract.md v1.1.0
|
|
26
|
+
* §"Heartbeat Watchdog"
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
"use strict";
|
|
30
|
+
|
|
31
|
+
const fs = require("node:fs");
|
|
32
|
+
const path = require("node:path");
|
|
33
|
+
|
|
34
|
+
/**
 * Build the events JSONL path for a given date.
 *
 * The file name is derived from the UTC calendar date of `when`
 * (`getUTCFullYear` / `getUTCMonth` / `getUTCDate`), so the result does not
 * depend on the host time zone.
 *
 * @param {string} projectDir  Directory that contains `.gsd-t/`.
 * @param {Date|number} [when]  Date, or ms since epoch. Falls back to the
 *   current time when omitted. An explicit `0` is honoured as the Unix epoch
 *   (1970-01-01) instead of being mistaken for "missing", which matters for
 *   tests that drive the watchdog with a fake clock starting at zero.
 * @returns {string} `<projectDir>/.gsd-t/events/YYYY-MM-DD.jsonl`
 */
function eventsPathFor(projectDir, when) {
  let d;
  if (when instanceof Date) {
    d = when;
  } else if (when === 0) {
    // `0 || Date.now()` would silently substitute the wall clock here.
    d = new Date(0);
  } else {
    // Any other falsy value (undefined, null, NaN, "") means "now".
    d = new Date(when || Date.now());
  }
  const y = d.getUTCFullYear();
  const m = String(d.getUTCMonth() + 1).padStart(2, "0");
  const day = String(d.getUTCDate()).padStart(2, "0");
  return path.join(projectDir, ".gsd-t", "events", `${y}-${m}-${day}.jsonl`);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Best-effort lookup of the previous UTC day's events-file mtime.
 *
 * Only consulted when the stale window `[now - staleHeartbeatMs, now]`
 * reaches back across UTC midnight; otherwise the previous day's file cannot
 * hold a recent-enough event and `null` is returned without touching the
 * filesystem. Any filesystem error is deliberately reported as `null`
 * ("no evidence") so that it can never make a worker look healthier or
 * sicker than the current-day check alone would.
 *
 * @private
 * @param {object} shim  `{ existsSync, statSync }`
 * @param {string} projectDir
 * @param {number} nowMs
 * @param {number} staleHeartbeatMs
 * @returns {number|null} mtime in ms since epoch, or null when unavailable
 */
function _previousDayEventMs(shim, projectDir, nowMs, staleHeartbeatMs) {
  const MS_PER_UTC_DAY = 24 * 60 * 60 * 1000;
  const dayStartMs = Math.floor(nowMs / MS_PER_UTC_DAY) * MS_PER_UTC_DAY;
  if (nowMs - staleHeartbeatMs >= dayStartMs) return null;
  const prevPath = eventsPathFor(projectDir, dayStartMs - 1);
  try {
    if (!shim.existsSync(prevPath)) return null;
    const st = shim.statSync(prevPath);
    const ms =
      typeof st.mtimeMs === "number"
        ? st.mtimeMs
        : st.mtime instanceof Date
          ? st.mtime.getTime()
          : null;
    return Number.isFinite(ms) ? ms : null;
  } catch (_) {
    return null;
  }
}

/**
 * Check whether the worker's event stream is stale.
 *
 * A worker is "stale" when the relevant events JSONL file's mtime has not
 * advanced within `staleHeartbeatMs` of the given `now`. The relevant file
 * is the one matching the UTC date of `now`.
 *
 * Fresh worker grace: if the events file does not exist yet (or cannot be
 * stat'ed) AND `(now - workerStartedAt) < staleHeartbeatMs`, the worker is
 * considered healthy (still booting). After the grace window with no file,
 * the worker is stale.
 *
 * UTC day rollover: right after midnight the new day's file may not exist
 * yet even though the worker emitted an event seconds earlier into the
 * previous day's file. When the current-day file is absent/unreadable and
 * the stale window reaches back across midnight, the previous day's file
 * mtime is accepted as proof of life so a healthy worker is not killed at
 * the date boundary. `eventsPath` in the result always names the
 * current-day file.
 *
 * @param {object} params
 * @param {string} params.projectDir
 * @param {number} params.workerStartedAt  ms since epoch
 * @param {number} params.staleHeartbeatMs
 * @param {number} [params.now]            ms since epoch (defaults to
 *   Date.now(); a non-finite number is also treated as "not provided")
 * @param {object} [params.fsShim]         { existsSync, statSync } — test hook
 * @returns {{stale: boolean, reason: string, lastEventMs: (number|null), ageMs: (number|null), eventsPath: string}}
 * @throws {Error} when projectDir, workerStartedAt or staleHeartbeatMs is invalid
 */
function checkHeartbeat({
  projectDir,
  workerStartedAt,
  staleHeartbeatMs,
  now,
  fsShim,
}) {
  if (typeof projectDir !== "string" || projectDir.length === 0) {
    throw new Error("checkHeartbeat: projectDir is required");
  }
  if (typeof workerStartedAt !== "number" || !Number.isFinite(workerStartedAt)) {
    throw new Error("checkHeartbeat: workerStartedAt must be a finite number");
  }
  if (
    typeof staleHeartbeatMs !== "number" ||
    !Number.isFinite(staleHeartbeatMs) ||
    staleHeartbeatMs <= 0
  ) {
    throw new Error("checkHeartbeat: staleHeartbeatMs must be a positive number");
  }
  // NaN/Infinity would yield a "NaN-NaN-NaN.jsonl" path and a bogus stale
  // verdict, so only a finite injected clock is honoured.
  const nowMs = typeof now === "number" && Number.isFinite(now) ? now : Date.now();
  const shim = fsShim || fs;

  const eventsPath = eventsPathFor(projectDir, nowMs);

  // Returns a "fresh" result backed by the previous UTC day's file, or null
  // when that file provides no recent-enough evidence. Only called once the
  // start-up grace has already expired.
  const freshFromPreviousDay = () => {
    const prevMs = _previousDayEventMs(shim, projectDir, nowMs, staleHeartbeatMs);
    if (prevMs === null) return null;
    const age = nowMs - prevMs;
    if (age >= staleHeartbeatMs) return null;
    return {
      stale: false,
      reason: `fresh — last event ${age}ms ago (previous UTC day's events file)`,
      lastEventMs: prevMs,
      ageMs: age,
      eventsPath,
    };
  };

  let exists = false;
  try {
    exists = !!shim.existsSync(eventsPath);
  } catch (_) {
    // An existence probe that throws is treated the same as "absent".
    exists = false;
  }

  if (!exists) {
    const sinceStart = nowMs - workerStartedAt;
    if (sinceStart < staleHeartbeatMs) {
      return {
        stale: false,
        reason: `events file not yet created (grace: ${sinceStart}ms < ${staleHeartbeatMs}ms)`,
        lastEventMs: null,
        ageMs: null,
        eventsPath,
      };
    }
    const carried = freshFromPreviousDay();
    if (carried !== null) return carried;
    return {
      stale: true,
      reason: `events file ${eventsPath} absent for ${sinceStart}ms since worker start (threshold ${staleHeartbeatMs}ms)`,
      lastEventMs: null,
      ageMs: sinceStart,
      eventsPath,
    };
  }

  let stat;
  try {
    stat = shim.statSync(eventsPath);
  } catch (err) {
    // File existed at existsSync but stat failed — treat as stale only if
    // we are past the grace window. Under the grace window, assume transient.
    const sinceStart = nowMs - workerStartedAt;
    if (sinceStart < staleHeartbeatMs) {
      return {
        stale: false,
        reason: `events stat transient failure (grace): ${err.message}`,
        lastEventMs: null,
        ageMs: null,
        eventsPath,
      };
    }
    const carried = freshFromPreviousDay();
    if (carried !== null) return carried;
    return {
      stale: true,
      reason: `events stat failed past grace: ${err.message}`,
      lastEventMs: null,
      ageMs: sinceStart,
      eventsPath,
    };
  }

  // Prefer the numeric mtimeMs; fall back to the Date form, then to 0 so a
  // shim without either field is treated as "never written".
  const mtimeMs =
    typeof stat.mtimeMs === "number"
      ? stat.mtimeMs
      : stat.mtime instanceof Date
        ? stat.mtime.getTime()
        : 0;

  // Reference point for staleness: max(mtime, workerStartedAt). This handles
  // the bootstrap case where the events file already existed from a prior
  // iteration — we don't want to kill the worker on its first poll just
  // because it hasn't emitted yet. The worker gets at least staleHeartbeatMs
  // from its own start to produce the first event.
  const ref = Math.max(mtimeMs, workerStartedAt);
  const ageMs = nowMs - ref;

  if (ageMs >= staleHeartbeatMs) {
    return {
      stale: true,
      reason: `last event ${ageMs}ms ago (threshold ${staleHeartbeatMs}ms)`,
      lastEventMs: mtimeMs,
      ageMs,
      eventsPath,
    };
  }
  return {
    stale: false,
    reason: `fresh — last event ${ageMs}ms ago`,
    lastEventMs: mtimeMs,
    ageMs,
    eventsPath,
  };
}
|
|
180
|
+
|
|
181
|
+
module.exports = {
|
|
182
|
+
checkHeartbeat,
|
|
183
|
+
eventsPathFor,
|
|
184
|
+
// Default heartbeat poll cadence — exported so tests and the supervisor
|
|
185
|
+
// can reference a single source of truth.
|
|
186
|
+
DEFAULT_HEARTBEAT_POLL_MS: 60 * 1000,
|
|
187
|
+
DEFAULT_STALE_HEARTBEAT_MS: 5 * 60 * 1000,
|
|
188
|
+
};
|
|
@@ -14,7 +14,8 @@
|
|
|
14
14
|
* Task 1 of m36-cross-platform delivers:
|
|
15
15
|
* - resolveClaudePath()
|
|
16
16
|
* - isAlive(pid)
|
|
17
|
-
* - spawnWorker(projectDir, timeoutMs)
|
|
17
|
+
* - spawnWorker(projectDir, timeoutMs, opts)
|
|
18
|
+
* opts.onHeartbeatCheck? — async liveness watchdog (M43 D?)
|
|
18
19
|
*
|
|
19
20
|
* Cross-platform notes:
|
|
20
21
|
* - darwin / linux paths are runtime-tested.
|
|
@@ -89,46 +90,85 @@ function isAlive(pid) {
|
|
|
89
90
|
// ─── spawnWorker ─────────────────────────────────────────────────────────────
|
|
90
91
|
|
|
91
92
|
/**
|
|
92
|
-
* Spawn a
|
|
93
|
-
*
|
|
93
|
+
* Spawn a `claude -p '/gsd-t-resume'` worker iteration for the unattended
|
|
94
|
+
* supervisor.
|
|
94
95
|
*
|
|
95
96
|
* Returns a normalized result object: `{ status, stdout, stderr, signal,
|
|
96
|
-
* timedOut, error }`. Never throws — spawn errors are
|
|
97
|
+
* timedOut, staleHeartbeat, error }`. Never throws — spawn errors are
|
|
98
|
+
* returned in `error`.
|
|
97
99
|
*
|
|
98
|
-
*
|
|
99
|
-
*
|
|
100
|
-
*
|
|
101
|
-
*
|
|
100
|
+
* Two kill paths:
|
|
101
|
+
* 1. Heartbeat watchdog (M43 primary) — when `opts.onHeartbeatCheck` is
|
|
102
|
+
* provided, the function polls every `opts.heartbeatPollMs` (default
|
|
103
|
+
* 60_000). If the callback returns `{stale: true, ...}`, the child is
|
|
104
|
+
* SIGTERM'd and `staleHeartbeat: true` is set on the result.
|
|
105
|
+
* 2. Wall-clock timeout (absolute backstop) — `timeoutMs` is the hard cap
|
|
106
|
+
* regardless of heartbeat. On expiry the child is SIGTERM'd and
|
|
107
|
+
* `timedOut: true`. Default raised to 1 h in supervisor-core so a
|
|
108
|
+
* healthy long-running worker is not cut.
|
|
102
109
|
*
|
|
103
|
-
*
|
|
104
|
-
* -
|
|
105
|
-
* -
|
|
106
|
-
*
|
|
110
|
+
* Exactly one path is used per iteration:
|
|
111
|
+
* - opts.onHeartbeatCheck present → heartbeat path (async event loop)
|
|
112
|
+
* - opts.onHeartbeatCheck absent → legacy spawnSync path (blocking)
|
|
113
|
+
* The legacy path is preserved so callers that have no meaningful liveness
|
|
114
|
+
* signal (test stubs, dry-run) keep the original semantics.
|
|
107
115
|
*
|
|
108
|
-
*
|
|
109
|
-
*
|
|
110
|
-
*
|
|
116
|
+
* Cross-platform:
|
|
117
|
+
* - darwin / linux: runtime-tested.
|
|
118
|
+
* - win32: implementation-complete; documented in
|
|
119
|
+
* `docs/unattended-windows-caveats.md` (Task 3).
|
|
111
120
|
*
|
|
112
121
|
* @param {string} projectDir Absolute path to the project directory (cwd).
|
|
113
|
-
* @param {number} timeoutMs Wall-clock
|
|
122
|
+
* @param {number} timeoutMs Wall-clock backstop per worker iteration in ms.
|
|
114
123
|
* @param {object} [opts] Optional overrides (test-mode hooks).
|
|
115
124
|
* @param {string} [opts.bin] Override the resolved binary (test-mode only).
|
|
116
125
|
* @param {string[]} [opts.args] Override args (defaults to `['-p', '/gsd-t-resume']`).
|
|
117
126
|
* @param {object} [opts.env] Override env (defaults to `process.env`).
|
|
127
|
+
* @param {Function} [opts.onHeartbeatCheck] Called every heartbeatPollMs
|
|
128
|
+
* with no args; must return `{stale: boolean, reason?: string}` or a
|
|
129
|
+
* Promise thereof. When stale, the child is SIGTERM'd.
|
|
130
|
+
* @param {number} [opts.heartbeatPollMs] Poll cadence in ms. Default 60_000.
|
|
131
|
+
* @param {Function} [opts.onHeartbeatSample] Optional observer; receives the
|
|
132
|
+
* raw callback result each poll for logging.
|
|
133
|
+
* @param {Function} [opts.onStdoutLine] Optional live stdout line callback.
|
|
134
|
+
* When provided (heartbeat path only), invoked once per `\n`-terminated line
|
|
135
|
+
* as stdout streams in. Trailing partial line is flushed on close. Errors
|
|
136
|
+
* are swallowed (best-effort tee). Used by the supervisor to write each
|
|
137
|
+
* worker line into the M42 transcript file in real time, instead of
|
|
138
|
+
* appending the entire stdout buffer post-hoc at child exit.
|
|
118
139
|
* @returns {{
|
|
119
140
|
* status: number|null,
|
|
120
141
|
* stdout: string,
|
|
121
142
|
* stderr: string,
|
|
122
143
|
* signal: string|null,
|
|
123
144
|
* timedOut: boolean,
|
|
145
|
+
* staleHeartbeat: boolean,
|
|
146
|
+
* heartbeatReason: string|null,
|
|
124
147
|
* error: Error|null
|
|
125
|
-
* }}
|
|
148
|
+
* }|Promise<...>}
|
|
149
|
+
* Returns a Promise when `opts.onHeartbeatCheck` is provided; otherwise a
|
|
150
|
+
* synchronous result (legacy path).
|
|
126
151
|
*/
|
|
127
152
|
function spawnWorker(projectDir, timeoutMs, opts = {}) {
|
|
128
153
|
const bin = opts.bin || resolveClaudePath();
|
|
129
154
|
const args = opts.args || ["-p", "/gsd-t-resume"];
|
|
130
155
|
const env = opts.env || process.env;
|
|
131
156
|
|
|
157
|
+
if (typeof opts.onHeartbeatCheck === "function") {
|
|
158
|
+
return _spawnWorkerAsyncHeartbeat({
|
|
159
|
+
bin,
|
|
160
|
+
args,
|
|
161
|
+
env,
|
|
162
|
+
cwd: projectDir,
|
|
163
|
+
timeoutMs,
|
|
164
|
+
onHeartbeatCheck: opts.onHeartbeatCheck,
|
|
165
|
+
heartbeatPollMs: opts.heartbeatPollMs || 60 * 1000,
|
|
166
|
+
onHeartbeatSample: opts.onHeartbeatSample,
|
|
167
|
+
onStdoutLine: opts.onStdoutLine,
|
|
168
|
+
spawnImpl: opts._spawnImpl || spawn,
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
|
|
132
172
|
const result = spawnSync(bin, args, {
|
|
133
173
|
cwd: projectDir,
|
|
134
174
|
encoding: "utf8",
|
|
@@ -139,19 +179,11 @@ function spawnWorker(projectDir, timeoutMs, opts = {}) {
|
|
|
139
179
|
windowsHide: true,
|
|
140
180
|
});
|
|
141
181
|
|
|
142
|
-
// Normalize. spawnSync may return error if the binary cannot be launched
|
|
143
|
-
// (ENOENT etc.) — surface it instead of throwing.
|
|
144
182
|
const stdout = typeof result.stdout === "string" ? result.stdout : "";
|
|
145
183
|
const stderr = typeof result.stderr === "string" ? result.stderr : "";
|
|
146
184
|
const signal = result.signal || null;
|
|
147
185
|
const status = typeof result.status === "number" ? result.status : null;
|
|
148
186
|
|
|
149
|
-
// Timeout detection: when spawnSync's `timeout` option fires it sets
|
|
150
|
-
// - status === null
|
|
151
|
-
// - signal !== null (SIGTERM on POSIX, equivalent on win32)
|
|
152
|
-
// - error.code === 'ETIMEDOUT' (Node surfaces it as a synthetic Error)
|
|
153
|
-
// The ETIMEDOUT code is the authoritative signal — checking it
|
|
154
|
-
// discriminates a genuine timeout from an ENOENT/spawn failure.
|
|
155
187
|
const errCode = result.error && result.error.code;
|
|
156
188
|
const timedOut =
|
|
157
189
|
errCode === "ETIMEDOUT" || (status === null && signal !== null && !result.error);
|
|
@@ -162,12 +194,144 @@ function spawnWorker(projectDir, timeoutMs, opts = {}) {
|
|
|
162
194
|
stderr,
|
|
163
195
|
signal,
|
|
164
196
|
timedOut,
|
|
165
|
-
|
|
166
|
-
|
|
197
|
+
staleHeartbeat: false,
|
|
198
|
+
heartbeatReason: null,
|
|
167
199
|
error: errCode === "ETIMEDOUT" ? null : result.error || null,
|
|
168
200
|
};
|
|
169
201
|
}
|
|
170
202
|
|
|
203
|
+
/**
 * Async spawn path used when a heartbeat callback is provided.
 *
 * Spawns the child via `spawnImpl`, buffers stdout/stderr, and arms two
 * independent kill paths: a periodic heartbeat poll (`onHeartbeatCheck`
 * every `heartbeatPollMs`) and an absolute wall-clock timer (`timeoutMs`).
 * Both send SIGTERM; the Promise resolves when the child emits `close`.
 *
 * Returns a Promise resolving to the same result shape as spawnWorker's
 * legacy path, plus `staleHeartbeat` and `heartbeatReason`. The Promise
 * never rejects — a synchronous throw from `spawnImpl` or an `error` event
 * on the child is reported through the `error` field.
 *
 * Kill precedence: heartbeat wins (it is the more specific signal). The
 * loser is suppressed on the result, so `timedOut` and `staleHeartbeat` are
 * never both true.
 *
 * The absolute timer is only armed for a finite, positive `timeoutMs`
 * (matching the legacy spawnSync path, where an absent timeout means "no
 * timeout") and is clamped to the largest delay setTimeout supports.
 *
 * @private
 * @param {object} p
 * @param {string} p.bin
 * @param {string[]} p.args
 * @param {object} p.env
 * @param {string} p.cwd
 * @param {number} p.timeoutMs               Absolute backstop in ms.
 * @param {Function} p.onHeartbeatCheck      Returns `{stale, reason?}` or a Promise thereof.
 * @param {number} p.heartbeatPollMs         Poll cadence in ms.
 * @param {Function} [p.onHeartbeatSample]   Observer for each poll result (best-effort).
 * @param {Function} [p.onStdoutLine]        Called once per non-empty stdout line (best-effort).
 * @param {Function} p.spawnImpl             `child_process.spawn`-compatible function.
 * @returns {Promise<{status: (number|null), stdout: string, stderr: string,
 *   signal: (string|null), timedOut: boolean, staleHeartbeat: boolean,
 *   heartbeatReason: (string|null), error: (Error|null)}>}
 */
function _spawnWorkerAsyncHeartbeat({
  bin,
  args,
  env,
  cwd,
  timeoutMs,
  onHeartbeatCheck,
  heartbeatPollMs,
  onHeartbeatSample,
  onStdoutLine,
  spawnImpl,
}) {
  // setTimeout stores the delay as a signed 32-bit int; anything larger
  // overflows and fires after ~1 ms.
  const MAX_TIMER_DELAY_MS = 0x7fffffff;

  return new Promise((resolve) => {
    let child;
    try {
      child = spawnImpl(bin, args, {
        cwd,
        env,
        stdio: ["ignore", "pipe", "pipe"],
        shell: false,
        windowsHide: true,
      });
    } catch (err) {
      resolve({
        status: null,
        stdout: "",
        stderr: "",
        signal: null,
        timedOut: false,
        staleHeartbeat: false,
        heartbeatReason: null,
        error: err,
      });
      return;
    }

    const chunksOut = [];
    const chunksErr = [];
    const wantsLines = typeof onStdoutLine === "function";
    // Undelivered stdout bytes are kept as a Buffer, not a string: decoding
    // each chunk separately would corrupt a multi-byte UTF-8 character that
    // straddles a chunk boundary. Byte 0x0A never occurs inside a multi-byte
    // sequence, so splitting on it before decoding is safe.
    let pendingLine = Buffer.alloc(0);
    const emitLine = (line) => {
      if (!wantsLines || line.length === 0) return;
      try { onStdoutLine(line); } catch (_) { /* tee is best-effort */ }
    };
    if (child.stdout) child.stdout.on("data", (d) => {
      chunksOut.push(d);
      if (!wantsLines) return;
      const bytes = Buffer.isBuffer(d) ? d : Buffer.from(d);
      pendingLine = pendingLine.length > 0 ? Buffer.concat([pendingLine, bytes]) : bytes;
      let nl;
      while ((nl = pendingLine.indexOf(0x0a)) >= 0) {
        emitLine(pendingLine.toString("utf8", 0, nl));
        pendingLine = pendingLine.subarray(nl + 1);
      }
    });
    if (child.stderr) child.stderr.on("data", (d) => chunksErr.push(d));

    let resolved = false;
    let timedOut = false;
    let staleHeartbeat = false;
    let heartbeatReason = null;
    let spawnErr = null;

    const heartbeatTimer = setInterval(async () => {
      if (resolved) return;
      let sample;
      try {
        sample = await onHeartbeatCheck();
      } catch (err) {
        // Heartbeat check must not kill the worker on its own failures.
        // Surface via sample observer when provided and continue.
        sample = { stale: false, reason: `heartbeat check threw: ${err.message}` };
      }
      if (typeof onHeartbeatSample === "function") {
        try { onHeartbeatSample(sample); } catch (_) { /* observer best-effort */ }
      }
      // The child may have closed while the check was awaited; the result
      // is already delivered, so there is nothing left to flag or kill.
      if (resolved) return;
      if (sample && sample.stale) {
        staleHeartbeat = true;
        heartbeatReason = sample.reason || "stale heartbeat";
        // Heartbeat wins: suppress a backstop that fired earlier but whose
        // SIGTERM has not yet produced a `close`.
        timedOut = false;
        try { child.kill("SIGTERM"); } catch (_) { /* child may already be gone */ }
      }
    }, heartbeatPollMs);
    if (typeof heartbeatTimer.unref === "function") heartbeatTimer.unref();

    let absoluteTimer = null;
    if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs) && timeoutMs > 0) {
      absoluteTimer = setTimeout(() => {
        if (resolved || staleHeartbeat) return;
        timedOut = true;
        try { child.kill("SIGTERM"); } catch (_) { /* child may already be gone */ }
      }, Math.min(timeoutMs, MAX_TIMER_DELAY_MS));
      if (typeof absoluteTimer.unref === "function") absoluteTimer.unref();
    }

    // Recorded only; the result is delivered from the `close` handler.
    child.on("error", (err) => {
      spawnErr = err;
    });

    child.on("close", (status, signal) => {
      if (resolved) return;
      resolved = true;
      clearInterval(heartbeatTimer);
      if (absoluteTimer !== null) clearTimeout(absoluteTimer);

      // Flush a trailing partial line (no terminating "\n").
      if (pendingLine.length > 0) {
        emitLine(pendingLine.toString("utf8"));
        pendingLine = Buffer.alloc(0);
      }

      const stdout = Buffer.concat(chunksOut).toString("utf8");
      const stderr = Buffer.concat(chunksErr).toString("utf8");

      resolve({
        status: typeof status === "number" ? status : null,
        stdout,
        stderr,
        signal: signal || null,
        timedOut,
        staleHeartbeat,
        heartbeatReason,
        error: spawnErr,
      });
    });
  });
}
|
|
334
|
+
|
|
171
335
|
// ─── spawnSupervisor ─────────────────────────────────────────────────────────
|
|
172
336
|
|
|
173
337
|
/**
|
|
@@ -55,7 +55,13 @@ const DEFAULTS = Object.freeze({
|
|
|
55
55
|
maxIterations: 200,
|
|
56
56
|
hours: 24,
|
|
57
57
|
gutterNoProgressIters: 5,
|
|
58
|
-
|
|
58
|
+
// Absolute backstop — raised from 270_000 to 3_600_000 in contract v1.1.0
|
|
59
|
+
// now that the heartbeat watchdog is the primary stuck-worker detector.
|
|
60
|
+
workerTimeoutMs: 3600000,
|
|
61
|
+
// Heartbeat watchdog threshold — events JSONL mtime must advance within
|
|
62
|
+
// this many ms or the worker is SIGTERM'd. Contract v1.1.0 §"Heartbeat
|
|
63
|
+
// Watchdog".
|
|
64
|
+
staleHeartbeatMs: 300000,
|
|
59
65
|
});
|
|
60
66
|
|
|
61
67
|
// ── Glob → regex helper ─────────────────────────────────────────────────────
|
|
@@ -110,6 +116,7 @@ function cloneDefaults() {
|
|
|
110
116
|
hours: DEFAULTS.hours,
|
|
111
117
|
gutterNoProgressIters: DEFAULTS.gutterNoProgressIters,
|
|
112
118
|
workerTimeoutMs: DEFAULTS.workerTimeoutMs,
|
|
119
|
+
staleHeartbeatMs: DEFAULTS.staleHeartbeatMs,
|
|
113
120
|
};
|
|
114
121
|
}
|
|
115
122
|
|