@openparachute/hub 0.6.2 → 0.6.3-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -35
- package/package.json +1 -1
- package/src/__tests__/api-hub-upgrade.test.ts +690 -0
- package/src/__tests__/api-modules-ops.test.ts +359 -3
- package/src/__tests__/api-modules.test.ts +54 -0
- package/src/__tests__/expose-cloudflare.test.ts +163 -72
- package/src/__tests__/expose-off-auto.test.ts +26 -1
- package/src/__tests__/expose.test.ts +260 -240
- package/src/__tests__/hub-control.test.ts +1 -242
- package/src/__tests__/hub-server.test.ts +64 -0
- package/src/__tests__/hub-unit.test.ts +574 -0
- package/src/__tests__/init.test.ts +219 -2
- package/src/__tests__/lifecycle.test.ts +416 -1448
- package/src/__tests__/managed-unit.test.ts +575 -0
- package/src/__tests__/migrate-cutover.test.ts +840 -0
- package/src/__tests__/migrate-offer.test.ts +240 -0
- package/src/__tests__/migrate.test.ts +132 -0
- package/src/__tests__/module-ops-client.test.ts +556 -0
- package/src/__tests__/port-probe.test.ts +23 -0
- package/src/__tests__/setup-wizard.test.ts +130 -0
- package/src/__tests__/status-supervisor.test.ts +504 -0
- package/src/__tests__/status.test.ts +157 -708
- package/src/__tests__/supervisor.test.ts +471 -6
- package/src/__tests__/upgrade.test.ts +351 -5
- package/src/api-hub-upgrade.ts +384 -0
- package/src/api-hub.ts +2 -1
- package/src/api-modules-ops.ts +221 -0
- package/src/api-modules.ts +18 -2
- package/src/cli.ts +97 -12
- package/src/cloudflare/connector-service.ts +117 -322
- package/src/commands/expose-cloudflare.ts +63 -71
- package/src/commands/expose-supervisor.ts +247 -0
- package/src/commands/expose.ts +59 -48
- package/src/commands/init.ts +225 -12
- package/src/commands/lifecycle.ts +455 -816
- package/src/commands/migrate-cutover.ts +837 -0
- package/src/commands/migrate.ts +71 -2
- package/src/commands/serve-boot.ts +71 -25
- package/src/commands/status.ts +535 -235
- package/src/commands/upgrade.ts +100 -2
- package/src/help.ts +128 -68
- package/src/hub-control.ts +23 -162
- package/src/hub-server.ts +39 -0
- package/src/hub-unit.ts +735 -0
- package/src/hub-upgrade-helper.ts +306 -0
- package/src/hub-upgrade-mode.ts +209 -0
- package/src/hub-upgrade-status.ts +150 -0
- package/src/managed-unit.ts +692 -0
- package/src/migrate-offer.ts +186 -0
- package/src/module-ops-client.ts +457 -0
- package/src/port-probe.ts +50 -0
- package/src/process-state.ts +19 -3
- package/src/setup-wizard.ts +80 -1
- package/src/supervisor.ts +389 -38
- package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
- package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
- package/web/ui/dist/index.html +2 -2
- package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
package/src/supervisor.ts
CHANGED
|
@@ -1,36 +1,70 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* The hub's in-process module supervisor — the single runtime everywhere.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* As of Phase 5b there is ONE process model: `parachute serve` runs the hub
|
|
5
|
+
* in the foreground with this Supervisor, and the platform's process manager
|
|
6
|
+
* (launchd on a Mac, systemd on a Linux VM, the container runtime on Render /
|
|
7
|
+
* Fly) keeps that `serve` process alive across crashes and reboots. The old
|
|
8
|
+
* manager-less detached-daemon model (per-module `detached + unref()` spawns
|
|
9
|
+
* tracked by pidfiles) is retired — the on-box `parachute start/stop/restart
|
|
10
|
+
* <svc>` verbs are now clients of THIS supervisor, driving it over the
|
|
11
|
+
* loopback module-ops API (`api-modules-ops.ts` → `commands/lifecycle.ts`).
|
|
9
12
|
*
|
|
10
|
-
*
|
|
11
|
-
* crashes, nothing brings it back.
|
|
12
|
-
* - Render's log viewer only surfaces hub's stdout. A detached child
|
|
13
|
-
* whose stdout goes to `~/.parachute/<svc>/logs/<svc>.log` is
|
|
14
|
-
* invisible to the operator clicking through the dashboard.
|
|
13
|
+
* What this supervisor does:
|
|
15
14
|
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
* `
|
|
22
|
-
*
|
|
23
|
-
*
|
|
15
|
+
* - Spawns each module as an attached child in its own process group
|
|
16
|
+
* (`detached: true` for group-signalling only; stdio stays piped — see
|
|
17
|
+
* `defaultSpawnFn` / `defaultKillGroup`), so a wrapped startCmd's
|
|
18
|
+
* grandchildren are reaped on stop/restart (no EADDRINUSE-on-restart).
|
|
19
|
+
* - Pipes each child's stdout/stderr through a line-prefixing tap into the
|
|
20
|
+
* hub's own stdout (`[vault] …`, `[scribe] …`) and a bounded per-module
|
|
21
|
+
* ring buffer, so the operator sees module output in the hub log /
|
|
22
|
+
* platform log viewer and `parachute logs <svc>` can replay recent lines.
|
|
23
|
+
* - Gates a freshly-spawned module to `running` only once its port binds
|
|
24
|
+
* (port-readiness), and records a structured start-error on preflight
|
|
25
|
+
* failure, so `status` / the SPA keep the friendly missing-dependency
|
|
26
|
+
* surface.
|
|
27
|
+
* - Watches `proc.exited` and crash-restarts children up to a small budget
|
|
28
|
+
* before marking the module `crashed`. The budget keeps a wedged-on-boot
|
|
29
|
+
* module from chewing forever; the hub unit's own StartLimit / Throttle
|
|
30
|
+
* bounds the outer keeper.
|
|
24
31
|
*
|
|
25
|
-
* Out of scope
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
* persisting child state to disk (transient — re-derived from
|
|
29
|
-
* services.json on boot).
|
|
32
|
+
* Out of scope: supervising the hub HTTP server itself (that's `Bun.serve` in
|
|
33
|
+
* this same process — the platform manager is the hub's keeper) and persisting
|
|
34
|
+
* child state to disk (transient — re-derived from services.json on every boot).
|
|
30
35
|
*/
|
|
31
36
|
|
|
37
|
+
import {
|
|
38
|
+
MissingDependencyError,
|
|
39
|
+
type MissingDependencyWire,
|
|
40
|
+
ensureExecutable,
|
|
41
|
+
rethrowIfMissing,
|
|
42
|
+
} from "@openparachute/depcheck";
|
|
43
|
+
import { type PortListeningFn, defaultPortListening } from "./port-probe.ts";
|
|
44
|
+
|
|
32
45
|
/** Lifecycle state the supervisor reports for a module. */
export type ModuleStatus =
  | "starting"
  | "running"
  | "stopped"
  | "crashed"
  | "restarting";
|
|
33
46
|
|
|
47
|
+
/**
 * Why a module did not start cleanly, as recorded on `ModuleState` (§6.5).
 *
 * The shape follows depcheck's `MissingDependencyWire` for the
 * missing-dependency case and the services.json `ServiceEntryStartError` row
 * that `commands/lifecycle.ts` records, so `status` and the SPA can render the
 * same friendly missing-dependency surface regardless of which path started
 * the module. `error_type` is a plain string so other failure kinds can be
 * added later.
 */
export interface ModuleStartError {
  /** Machine-readable failure kind, e.g. `"missing_dependency"`. */
  readonly error_type: string;
  /** Human-readable explanation of the failure. */
  readonly error_description: string;
  /** Name of the missing binary. Present for `error_type: "missing_dependency"`. */
  readonly binary?: string;
  readonly why?: string | null;
  readonly docs_url?: string | null;
  /** Per-platform install hints. */
  readonly install?: {
    darwin?: string;
    linux?: string;
    generic?: string;
  };
  readonly sysadmin_hint?: string;
  /** ISO timestamp of when the failure was recorded. */
  readonly at: string;
}
|
|
67
|
+
|
|
34
68
|
export interface ModuleState {
|
|
35
69
|
/** Short name (vault / notes / scribe / …). */
|
|
36
70
|
readonly short: string;
|
|
@@ -46,6 +80,15 @@ export interface ModuleState {
|
|
|
46
80
|
readonly lastCrashAt?: string;
|
|
47
81
|
/** Exit code of the most recent crash. */
|
|
48
82
|
readonly lastExitCode?: number | null;
|
|
83
|
+
/**
|
|
84
|
+
* Structured start-failure detail (§6.5). Set when a preflight
|
|
85
|
+
* `MissingDependencyError` aborts the spawn, OR when a spawned child stays
|
|
86
|
+
* alive but never binds its port within the readiness window
|
|
87
|
+
* (started-but-unbound, hub#487). Cleared on a clean, port-confirmed start.
|
|
88
|
+
* The `status` enum is intentionally NOT extended (proxy-state Mode-1 + the
|
|
89
|
+
* SPA read `running`); this field carries the friendly diagnostic instead.
|
|
90
|
+
*/
|
|
91
|
+
readonly startError?: ModuleStartError;
|
|
49
92
|
}
|
|
50
93
|
|
|
51
94
|
export interface SpawnRequest {
|
|
@@ -97,10 +140,31 @@ export interface SupervisorOpts {
|
|
|
97
140
|
* stream without spelunking stdout.
|
|
98
141
|
*/
|
|
99
142
|
readonly output?: (line: string) => void;
|
|
143
|
+
/**
|
|
144
|
+
* Cap, in bytes, of the per-module log ring buffer (§6.5). The supervisor
|
|
145
|
+
* keeps the most-recent ~`logBufferBytes` of each child's output so a
|
|
146
|
+
* `GET /api/modules/:short/logs` tap can replay the boot/crash lines that
|
|
147
|
+
* happened *before* the reader connected — the detached path got this for
|
|
148
|
+
* free via the per-service logfile; the supervisor streams-and-discards, so
|
|
149
|
+
* without a buffer the crash cause (the most important line) is lost. The
|
|
150
|
+
* oldest whole lines are dropped once the cap is exceeded. Default 64 KiB.
|
|
151
|
+
*/
|
|
152
|
+
readonly logBufferBytes?: number;
|
|
100
153
|
/**
|
|
101
154
|
* Test seam over `Bun.spawn`. Returns a Subprocess-shaped handle.
|
|
102
155
|
*/
|
|
103
156
|
readonly spawnFn?: SpawnFn;
|
|
157
|
+
/**
|
|
158
|
+
* Group-aware kill seam (hub#88). Production sends the signal to the child's
|
|
159
|
+
* whole process group (`process.kill(-pid, signal)`) so wrapped startCmds
|
|
160
|
+
* like `pnpm exec tsx server.ts` reap the tsx grandchild — not just the
|
|
161
|
+
* wrapper that would otherwise leave the grandchild bound to the port →
|
|
162
|
+
* EADDRINUSE on restart. Pairs with `defaultSpawnFn`'s `detached: true`
|
|
163
|
+
* (each child is its own process-group leader, `pid === pgid`). Defaults to
|
|
164
|
+
* {@link defaultKillGroup}; tests inject a stub so they stay deterministic
|
|
165
|
+
* (no real signals) and can assert the negative pid (group send) was used.
|
|
166
|
+
*/
|
|
167
|
+
readonly killFn?: KillFn;
|
|
104
168
|
/**
|
|
105
169
|
* Test seam over wall-clock. Production passes `Date.now`.
|
|
106
170
|
*/
|
|
@@ -110,6 +174,40 @@ export interface SupervisorOpts {
|
|
|
110
174
|
* with `setTimeout`. Tests stub to advance time deterministically.
|
|
111
175
|
*/
|
|
112
176
|
readonly sleep?: (ms: number) => Promise<void>;
|
|
177
|
+
/**
|
|
178
|
+
* Port-readiness probe (§6.5). After a child spawns, the supervisor polls
|
|
179
|
+
* this until the module's port (from `req.env.PORT`) binds, to catch the
|
|
180
|
+
* alive-but-never-bound shape (hub#487). Defaults to `defaultPortListening`
|
|
181
|
+
* (a loopback TCP connect). Tests inject a deterministic stub.
|
|
182
|
+
*
|
|
183
|
+
* Defaulting policy (mirrors `commands/lifecycle.ts`): the readiness gate is
|
|
184
|
+
* SKIPPED unless this is the production path (no `spawnFn` override) OR a
|
|
185
|
+
* test explicitly opts in by injecting `portListening` / `startReadyMs`.
|
|
186
|
+
* Without that guard, every existing stub-spawner test (fake procs that
|
|
187
|
+
* never bind a real port) would block the full readiness window.
|
|
188
|
+
*/
|
|
189
|
+
readonly portListening?: PortListeningFn;
|
|
190
|
+
/**
|
|
191
|
+
* How long the post-spawn port-readiness gate polls before recording a
|
|
192
|
+
* `started-but-unbound` start-error, in ms. Default 4000 on the production
|
|
193
|
+
* path; 0 (skipped) on the stub-spawner test path unless `portListening` /
|
|
194
|
+
* `startReadyMs` is set explicitly.
|
|
195
|
+
*/
|
|
196
|
+
readonly startReadyMs?: number;
|
|
197
|
+
/** Poll interval while waiting for the port to bind, in ms. Default 200. */
|
|
198
|
+
readonly startReadyPollMs?: number;
|
|
199
|
+
/**
|
|
200
|
+
* PATH-resolution seam for the pre-spawn `ensureExecutable` preflight
|
|
201
|
+
* (`@openparachute/depcheck`). Production uses the real `Bun.which`; a
|
|
202
|
+
* missing startCmd binary then aborts the spawn with a structured
|
|
203
|
+
* `MissingDependencyError` recorded onto `ModuleState.startError`.
|
|
204
|
+
*
|
|
205
|
+
* Defaulting policy mirrors the readiness gate: a stub `spawnFn` (test path)
|
|
206
|
+
* gets a permissive resolver so the preflight doesn't trip on binaries
|
|
207
|
+
* absent from the test host's PATH; production gets the real `Bun.which`.
|
|
208
|
+
* Tests exercising the missing-binary branch inject `which: () => null`.
|
|
209
|
+
*/
|
|
210
|
+
readonly which?: (cmd: string) => string | null;
|
|
113
211
|
}
|
|
114
212
|
|
|
115
213
|
/**
|
|
@@ -119,6 +217,12 @@ export interface SupervisorOpts {
|
|
|
119
217
|
*/
|
|
120
218
|
export type SpawnFn = (req: SpawnRequest) => SupervisedProc;
|
|
121
219
|
|
|
220
|
+
/**
 * Signature of the group-aware kill seam: deliver `signal` to the process
 * group whose leader is `pid`. Production wires in {@link defaultKillGroup};
 * tests substitute a stub.
 */
export type KillFn = (
  pid: number,
  signal: NodeJS.Signals | number,
) => void;
|
|
225
|
+
|
|
122
226
|
/**
|
|
123
227
|
* The minimal Subprocess shape the supervisor depends on. Bun's real
|
|
124
228
|
* `Subprocess` matches this; the test fake mirrors it.
|
|
@@ -135,6 +239,46 @@ const DEFAULT_MAX_RESTARTS = 3;
|
|
|
135
239
|
/** Fallback for `SupervisorOpts.restartWindowMs` (one minute). */
const DEFAULT_RESTART_WINDOW_MS = 60 * 1_000;
/** Fallback for `SupervisorOpts.restartDelayMs`. */
const DEFAULT_RESTART_DELAY_MS = 500;
/** Fallback for `SupervisorOpts.killTimeoutMs` — the wait after SIGTERM before SIGKILL. */
const DEFAULT_KILL_TIMEOUT_MS = 5 * 1_000;
/** Fallback for `SupervisorOpts.logBufferBytes` (64 KiB per module). */
const DEFAULT_LOG_BUFFER_BYTES = 1024 * 64;
/** Fallback for `SupervisorOpts.startReadyMs` on the production path. */
const DEFAULT_START_READY_MS = 4 * 1_000;
/** Fallback for `SupervisorOpts.startReadyPollMs`. */
const DEFAULT_START_READY_POLL_MS = 200;
|
|
245
|
+
|
|
246
|
+
/**
 * Bounded, line-oriented ring buffer (§6.5).
 *
 * Holds the most recent lines of a module's output, capped at `maxBytes` of
 * UTF-8. When a push takes the total over the cap, whole lines are evicted
 * from the front (oldest first) until the content fits again. The bound is in
 * bytes rather than line count so that long lines cannot pin unbounded memory.
 *
 * Callers push one complete line per call, including its trailing newline;
 * the buffer never splits or joins lines itself.
 *
 * Edge cases:
 * - A single line larger than the cap is retained on its own, because
 *   dropping it would lose exactly the long stack trace a reader wants.
 * - A `NaN` cap is treated as 0 (only the newest line is kept). `NaN`
 *   compares false against everything, so without this the eviction loop
 *   would never run and the buffer would grow without bound.
 * - `Infinity` is honoured as an explicit "no cap".
 */
export class LogRingBuffer {
  private readonly lines: string[] = [];
  /** UTF-8 byte length of each entry in `lines`, index-aligned. */
  private readonly sizes: number[] = [];
  private bytes = 0;
  private readonly maxBytes: number;

  /**
   * @param maxBytes Upper bound, in UTF-8 bytes, on the retained output.
   */
  constructor(maxBytes: number) {
    this.maxBytes = Number.isNaN(maxBytes) ? 0 : maxBytes;
  }

  /**
   * Append one line and evict the oldest lines if the cap is now exceeded.
   *
   * @param line A complete line of output, including its trailing newline.
   */
  push(line: string): void {
    const size = Buffer.byteLength(line);
    this.lines.push(line);
    this.sizes.push(size);
    this.bytes += size;

    // Count how many leading lines must go, always leaving the newest one,
    // then remove them in a single splice instead of repeated shift() calls.
    const newest = this.lines.length - 1;
    let evict = 0;
    while (this.bytes > this.maxBytes && evict < newest) {
      this.bytes -= this.sizes[evict] ?? 0;
      evict += 1;
    }
    if (evict > 0) {
      this.lines.splice(0, evict);
      this.sizes.splice(0, evict);
    }
  }

  /** Copy of the buffered lines, oldest first. Mutating it does not affect the buffer. */
  snapshot(): string[] {
    return [...this.lines];
  }

  /** Buffered lines concatenated into one string (the wire/tail shape). */
  text(): string {
    return this.lines.join("");
  }
}
|
|
138
282
|
|
|
139
283
|
/**
|
|
140
284
|
* Per-module supervisor. Owns the spawn → watch → restart loop.
|
|
@@ -151,15 +295,30 @@ export class Supervisor {
|
|
|
151
295
|
private readonly modules = new Map<string, ModuleEntry>();
|
|
152
296
|
|
|
153
297
|
constructor(opts: SupervisorOpts = {}) {
  // How the readiness gate and the PATH preflight are defaulted (§6.5, same
  // policy as `commands/lifecycle.ts`):
  //  - no `spawnFn` override means the production path: real 4s readiness
  //    window and the real `Bun.which`;
  //  - a stubbed `spawnFn` means the test path: gate skipped (0ms) and a
  //    permissive `which`, unless the test opts in by passing
  //    `portListening` / `startReadyMs` / `which` itself.
  // This keeps fake-proc tests, which never bind a real port, from blocking.
  const usesRealSpawn = opts.spawnFn === undefined;
  const gateRequested = opts.portListening !== undefined || opts.startReadyMs !== undefined;
  const fallbackReadyMs = usesRealSpawn || gateRequested ? DEFAULT_START_READY_MS : 0;

  this.opts = {
    maxRestarts: opts.maxRestarts ?? DEFAULT_MAX_RESTARTS,
    restartWindowMs: opts.restartWindowMs ?? DEFAULT_RESTART_WINDOW_MS,
    restartDelayMs: opts.restartDelayMs ?? DEFAULT_RESTART_DELAY_MS,
    killTimeoutMs: opts.killTimeoutMs ?? DEFAULT_KILL_TIMEOUT_MS,
    output: opts.output ?? ((line) => process.stdout.write(line)),
    logBufferBytes: opts.logBufferBytes ?? DEFAULT_LOG_BUFFER_BYTES,
    spawnFn: opts.spawnFn ?? defaultSpawnFn,
    killFn: opts.killFn ?? defaultKillGroup,
    now: opts.now ?? Date.now,
    sleep: opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms))),
    portListening: opts.portListening ?? defaultPortListening,
    startReadyMs: opts.startReadyMs ?? fallbackReadyMs,
    startReadyPollMs: opts.startReadyPollMs ?? DEFAULT_START_READY_POLL_MS,
    // `Bun.which` is only looked up when no resolver was injected.
    which: opts.which ?? (usesRealSpawn ? Bun.which : () => "/stub/bin/preflight-skipped"),
  };
}
|
|
165
324
|
|
|
@@ -175,6 +334,9 @@ export class Supervisor {
|
|
|
175
334
|
return existing.state;
|
|
176
335
|
}
|
|
177
336
|
// Crashed → operator intent is "try again." Wipe the budget.
|
|
337
|
+
// A fresh ring buffer per entry — `start` is a clean spawn (the crash-
|
|
338
|
+
// respawn path in `handleExit` reuses the existing entry + buffer, so a
|
|
339
|
+
// crashed module's boot/crash lines survive into the restart for replay).
|
|
178
340
|
const entry: ModuleEntry = {
|
|
179
341
|
req,
|
|
180
342
|
state: {
|
|
@@ -183,12 +345,117 @@ export class Supervisor {
|
|
|
183
345
|
restartsInWindow: 0,
|
|
184
346
|
},
|
|
185
347
|
crashStamps: [],
|
|
348
|
+
logs: new LogRingBuffer(this.opts.logBufferBytes),
|
|
186
349
|
};
|
|
187
350
|
this.modules.set(req.short, entry);
|
|
188
|
-
|
|
351
|
+
|
|
352
|
+
// Pre-spawn preflight (§6.5): resolve the startCmd binary on PATH before
|
|
353
|
+
// spawning a doomed child. A missing binary records a structured
|
|
354
|
+
// `MissingDependencyError` onto state (the same friendly missing-dependency
|
|
355
|
+
// surface `commands/lifecycle.ts` records) and aborts — no spawn. Mirrors
|
|
356
|
+
// `lifecycle.start`'s `ensureExecutable` preflight.
|
|
357
|
+
const startBinary = req.cmd[0];
|
|
358
|
+
if (startBinary) {
|
|
359
|
+
try {
|
|
360
|
+
ensureExecutable(startBinary, { which: this.opts.which });
|
|
361
|
+
} catch (err) {
|
|
362
|
+
if (err instanceof MissingDependencyError) {
|
|
363
|
+
entry.state = {
|
|
364
|
+
...entry.state,
|
|
365
|
+
status: "crashed",
|
|
366
|
+
pid: undefined,
|
|
367
|
+
startError: startErrorFromWire(err.toWire(), this.opts.now),
|
|
368
|
+
};
|
|
369
|
+
return entry.state;
|
|
370
|
+
}
|
|
371
|
+
throw err;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
// Belt-and-suspenders for a spawn that slips past the preflight (binary
|
|
376
|
+
// removed between check + spawn, or a path that didn't preflight): a
|
|
377
|
+
// not-found spawn throw becomes the same structured MissingDependencyError
|
|
378
|
+
// recorded onto state, not a throw out of `start`. Mirrors
|
|
379
|
+
// `lifecycle.start`'s `rethrowIfMissing` catch.
|
|
380
|
+
try {
|
|
381
|
+
this.spawnAndWatch(entry);
|
|
382
|
+
} catch (err) {
|
|
383
|
+
if (startBinary) {
|
|
384
|
+
try {
|
|
385
|
+
rethrowIfMissing(err, startBinary);
|
|
386
|
+
} catch (missing) {
|
|
387
|
+
if (missing instanceof MissingDependencyError) {
|
|
388
|
+
entry.state = {
|
|
389
|
+
...entry.state,
|
|
390
|
+
status: "crashed",
|
|
391
|
+
pid: undefined,
|
|
392
|
+
startError: startErrorFromWire(missing.toWire(), this.opts.now),
|
|
393
|
+
};
|
|
394
|
+
return entry.state;
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
throw err;
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
// Post-spawn port-readiness gate (§6.5, hub#487). A returned pid only
|
|
402
|
+
// proves the kernel forked the process; it says nothing about whether the
|
|
403
|
+
// module bound its port. Poll the port (from req.env.PORT) up to
|
|
404
|
+
// `startReadyMs`. On success: clear any prior startError. On timeout while
|
|
405
|
+
// the child is still alive: record a `started-but-unbound` structured
|
|
406
|
+
// start-error WITHOUT touching the `running` status enum (proxy-state
|
|
407
|
+
// Mode-1 + the SPA read `running`) — the friendly diagnostic rides the
|
|
408
|
+
// startError field. A child that died during the window is left to the
|
|
409
|
+
// crash watcher (`handleExit`), which owns the restart budget.
|
|
410
|
+
await this.awaitPortReadiness(entry);
|
|
189
411
|
return entry.state;
|
|
190
412
|
}
|
|
191
413
|
|
|
414
|
+
/**
 * Wait for a freshly spawned module to start listening on its port (§6.5).
 *
 * Returns immediately when the gate is disabled (`startReadyMs <= 0`) or the
 * spawn request has no usable `PORT` in its env. Otherwise probes the port
 * every `startReadyPollMs` until it binds or `startReadyMs` has elapsed.
 *
 * - Port bound: any recorded `startError` is removed.
 * - Window elapsed with the module still `running`: a `started_but_unbound`
 *   `startError` is recorded.
 * - Stop requested, or status no longer `running`: probing stops and nothing
 *   is recorded, so the exit handler's verdict is not overwritten.
 *
 * `state.status` is never changed here.
 */
private async awaitPortReadiness(entry: ModuleEntry): Promise<void> {
  const cfg = this.opts;
  if (cfg.startReadyMs <= 0) return;

  const rawPort = entry.req.env?.PORT;
  const port = rawPort ? Number(rawPort) : Number.NaN;
  if (!(Number.isFinite(port) && port > 0)) return; // Nothing to probe.

  const stillRunning = (): boolean => !entry.stopRequested && entry.state.status === "running";

  const giveUpAt = cfg.now() + cfg.startReadyMs;
  for (let t = cfg.now(); t < giveUpAt; t = cfg.now()) {
    // A crash or stop during the window belongs to the exit handler.
    if (!stillRunning()) return;

    const bound = await cfg.portListening(port);
    if (bound) {
      // Healthy: drop any leftover started-but-unbound diagnostic.
      if (entry.state.startError) {
        const { startError: _stale, ...withoutError } = entry.state;
        entry.state = withoutError;
      }
      return;
    }

    await cfg.sleep(cfg.startReadyPollMs);
  }

  // Timed out. The process is up, so the status stays `running`; the reason
  // is carried in `startError` instead.
  if (!stillRunning()) return;
  entry.state = {
    ...entry.state,
    startError: {
      error_type: "started_but_unbound",
      error_description: `${entry.req.short} started (pid ${entry.state.pid}) but is not listening on port ${port} after ${cfg.startReadyMs}ms — it may still be coming up, or the port is held by another process.`,
      at: new Date(cfg.now()).toISOString(),
    },
  };
}
|
|
458
|
+
|
|
192
459
|
/**
|
|
193
460
|
* Stop a supervised module. Sends SIGTERM, awaits the child's exit
|
|
194
461
|
* (so the log-pump drains the final flush before our stdout closes),
|
|
@@ -216,7 +483,13 @@ export class Supervisor {
|
|
|
216
483
|
const proc = entry.proc;
|
|
217
484
|
if (proc) {
|
|
218
485
|
try {
|
|
219
|
-
|
|
486
|
+
// Group-aware kill (hub#88): signal the child's whole process group
|
|
487
|
+
// via `killFn` (default `defaultKillGroup` → `process.kill(-pid)`) so
|
|
488
|
+
// a wrapped startCmd's grandchild is reaped too, not just the wrapper.
|
|
489
|
+
// Mirrors `commands/lifecycle.ts`'s `defaultKill` repointing of
|
|
490
|
+
// `defaultSpawner`'s detached children. Without it, the grandchild
|
|
491
|
+
// stays bound to the port → restart hits EADDRINUSE.
|
|
492
|
+
this.opts.killFn(proc.pid, "SIGTERM");
|
|
220
493
|
} catch {
|
|
221
494
|
// Process may already be dead — fall through.
|
|
222
495
|
}
|
|
@@ -234,7 +507,9 @@ export class Supervisor {
|
|
|
234
507
|
`[supervisor] ${entry.req.short} did not exit ${this.opts.killTimeoutMs}ms after SIGTERM — escalating to SIGKILL.\n`,
|
|
235
508
|
);
|
|
236
509
|
try {
|
|
237
|
-
|
|
510
|
+
// Group-aware SIGKILL escalation — same `killFn` seam as the
|
|
511
|
+
// SIGTERM above so the whole group is reaped, not just the leader.
|
|
512
|
+
this.opts.killFn(proc.pid, "SIGKILL");
|
|
238
513
|
} catch {
|
|
239
514
|
// Process may already be dead between the timeout firing
|
|
240
515
|
// and us reaching kill() — fall through to the await.
|
|
@@ -287,13 +562,17 @@ export class Supervisor {
|
|
|
287
562
|
private spawnAndWatch(entry: ModuleEntry): void {
  const child = this.opts.spawnFn(entry.req);
  entry.proc = child;

  // A new pid supersedes whatever start failure was recorded before. If this
  // attempt also fails to bind, the readiness gate records it again.
  const { startError: _stale, ...carried } = entry.state;
  entry.state = {
    ...carried,
    status: "running",
    pid: child.pid,
    startedAt: new Date(this.opts.now()).toISOString(),
  };

  // Start forwarding output, then hand the eventual exit to the exit handler.
  this.pipeOutput(entry, child);
  void child.exited.then((exitCode) => this.handleExit(entry, exitCode));
}
|
|
299
578
|
|
|
@@ -348,16 +627,34 @@ export class Supervisor {
|
|
|
348
627
|
}
|
|
349
628
|
|
|
350
629
|
/**
 * Recently buffered output of a supervised module (§6.5).
 *
 * @param short The module's short name.
 * @returns The buffered, prefixed lines oldest-first; an empty array when the
 *   module is supervised but has produced nothing; `undefined` when there is
 *   no entry for `short`. This lets a `GET /api/modules/:short/logs` handler
 *   tell "not supervised" (404) apart from "supervised but quiet".
 */
logs(short: string): string[] | undefined {
  const entry = this.modules.get(short);
  if (entry === undefined) return undefined;
  return entry.logs.snapshot();
}
|
|
640
|
+
|
|
641
|
+
/**
 * Forward a child's stdout and stderr, line by line, to two destinations:
 * the supervisor's `output` callback (the hub's stdout by default) and the
 * module's ring buffer (§6.5). Every line carries a `[short] ` prefix.
 *
 * Both destinations receive the same prefixed lines, so a later `/logs`
 * replay matches what the live stream showed. Splitting into lines is
 * delegated to `pumpLines`.
 */
private pipeOutput(entry: ModuleEntry, proc: SupervisedProc): void {
  const prefix = `[${entry.req.short}] `;

  const deliver = (line: string): void => {
    this.opts.output(line);
    entry.logs.push(line);
  };

  // stdout first, then stderr; a stream the child does not expose is skipped.
  for (const stream of [proc.stdout, proc.stderr]) {
    if (stream) void pumpLines(stream, prefix, deliver);
  }
}
|
|
362
659
|
}
|
|
363
660
|
|
|
@@ -367,6 +664,8 @@ interface ModuleEntry {
|
|
|
367
664
|
proc?: SupervisedProc;
|
|
368
665
|
crashStamps: number[];
|
|
369
666
|
stopRequested?: boolean;
|
|
667
|
+
/** Bounded ring buffer of recent prefixed output lines (§6.5). */
|
|
668
|
+
logs: LogRingBuffer;
|
|
370
669
|
}
|
|
371
670
|
|
|
372
671
|
async function pumpLines(
|
|
@@ -402,7 +701,20 @@ async function pumpLines(
|
|
|
402
701
|
|
|
403
702
|
const defaultSpawnFn: SpawnFn = (req) => {
|
|
404
703
|
const spawnOpts: Parameters<typeof Bun.spawn>[1] = {
|
|
704
|
+
// Keep stdout/stderr explicitly piped — the supervisor pumps child output
|
|
705
|
+
// into hub's log (`pipeOutput`/`pumpLines`) + the per-module ring buffer.
|
|
706
|
+
// `detached: true` does NOT detach explicitly-piped stdio, so these stay
|
|
707
|
+
// wired even though the child gets its own process group below.
|
|
405
708
|
stdio: ["ignore", "pipe", "pipe"],
|
|
709
|
+
// Spawn in a fresh process group (pid == pgid) so `killFn` (→
|
|
710
|
+
// `process.kill(-pid, sig)`) reaches every descendant, not just the
|
|
711
|
+
// wrapper. Without this, wrapped startCmds like `pnpm exec tsx server.ts`
|
|
712
|
+
// leave the tsx grandchild bound to the port after stop → restart hits
|
|
713
|
+
// EADDRINUSE (hub#88). Mirrors `commands/lifecycle.ts`'s `defaultSpawner`,
|
|
714
|
+
// which set `detached: true` for exactly this reason. We do NOT `unref()`:
|
|
715
|
+
// the supervisor must stay attached for the lifecycle (watch `exited`,
|
|
716
|
+
// pump output, reap on stop).
|
|
717
|
+
detached: true,
|
|
406
718
|
// Inherit env so supervised module sees PATH, HOME, PARACHUTE_HOME, etc.
|
|
407
719
|
// Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
|
|
408
720
|
// Per-call `req.env` overrides merge on top below.
|
|
@@ -413,3 +725,42 @@ const defaultSpawnFn: SpawnFn = (req) => {
|
|
|
413
725
|
const proc = Bun.spawn([...req.cmd], spawnOpts);
|
|
414
726
|
return proc as unknown as SupervisedProc;
|
|
415
727
|
};
|
|
728
|
+
|
|
729
|
+
/**
 * Convert a depcheck `MissingDependencyWire` into the `ModuleStartError`
 * stored on `ModuleState` (§6.5).
 *
 * The wire's field names already match, so the known fields are copied over
 * unchanged and only `at` is added. Fields not listed here are not carried
 * across.
 *
 * @param wire The wire-format missing-dependency description.
 * @param now Clock returning epoch milliseconds; used to stamp `at`.
 * @returns The start-error record, with `at` as an ISO-8601 timestamp.
 */
function startErrorFromWire(wire: MissingDependencyWire, now: () => number): ModuleStartError {
  const {
    error_type,
    error_description,
    binary,
    why,
    docs_url,
    install,
    sysadmin_hint,
  } = wire;
  const at = new Date(now()).toISOString();
  return { error_type, error_description, binary, why, docs_url, install, sysadmin_hint, at };
}
|
|
749
|
+
|
|
750
|
+
/**
 * Production group-aware kill (hub#88).
 *
 * Sends `signal` to the whole process group led by `pid`, using the
 * negative-pid form of `process.kill`, so a wrapped startCmd's grandchildren
 * are signalled along with the wrapper. Mirrors `commands/lifecycle.ts`'s
 * `defaultKill`.
 *
 * If the group send fails with `ESRCH` (the group is already gone, or the
 * child was spawned without its own group), the signal is retried against the
 * bare pid so the caller's intent still lands when that process exists.
 *
 * @param pid Process-group leader to signal. Must be an integer greater than 1.
 * @param signal Signal name or number.
 * @throws RangeError when `pid` is not an integer greater than 1. Negating 0
 *   or 1 would turn the call into `kill(0)` (the hub's own process group) or
 *   `kill(-1)` (every process the user may signal), so those are refused
 *   before any signal is sent.
 * @throws Any non-`ESRCH` error from the group send, and any error from the
 *   bare-pid retry.
 */
export const defaultKillGroup: KillFn = (pid, signal) => {
  if (!Number.isInteger(pid) || pid <= 1) {
    throw new RangeError(`defaultKillGroup: refusing to signal a process group for pid ${pid}`);
  }
  try {
    process.kill(-pid, signal);
  } catch (err) {
    if ((err as NodeJS.ErrnoException).code !== "ESRCH") throw err;
    process.kill(pid, signal);
  }
};
|