@openparachute/hub 0.6.2 → 0.6.3-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +87 -35
  2. package/package.json +1 -1
  3. package/src/__tests__/api-hub-upgrade.test.ts +690 -0
  4. package/src/__tests__/api-modules-ops.test.ts +359 -3
  5. package/src/__tests__/api-modules.test.ts +54 -0
  6. package/src/__tests__/expose-cloudflare.test.ts +163 -72
  7. package/src/__tests__/expose-off-auto.test.ts +26 -1
  8. package/src/__tests__/expose.test.ts +260 -240
  9. package/src/__tests__/hub-control.test.ts +1 -242
  10. package/src/__tests__/hub-server.test.ts +64 -0
  11. package/src/__tests__/hub-unit.test.ts +574 -0
  12. package/src/__tests__/init.test.ts +219 -2
  13. package/src/__tests__/lifecycle.test.ts +416 -1448
  14. package/src/__tests__/managed-unit.test.ts +575 -0
  15. package/src/__tests__/migrate-cutover.test.ts +840 -0
  16. package/src/__tests__/migrate-offer.test.ts +240 -0
  17. package/src/__tests__/migrate.test.ts +132 -0
  18. package/src/__tests__/module-ops-client.test.ts +556 -0
  19. package/src/__tests__/port-probe.test.ts +23 -0
  20. package/src/__tests__/setup-wizard.test.ts +130 -0
  21. package/src/__tests__/status-supervisor.test.ts +504 -0
  22. package/src/__tests__/status.test.ts +157 -708
  23. package/src/__tests__/supervisor.test.ts +471 -6
  24. package/src/__tests__/upgrade.test.ts +351 -5
  25. package/src/api-hub-upgrade.ts +384 -0
  26. package/src/api-hub.ts +2 -1
  27. package/src/api-modules-ops.ts +221 -0
  28. package/src/api-modules.ts +18 -2
  29. package/src/cli.ts +97 -12
  30. package/src/cloudflare/connector-service.ts +117 -322
  31. package/src/commands/expose-cloudflare.ts +63 -71
  32. package/src/commands/expose-supervisor.ts +247 -0
  33. package/src/commands/expose.ts +59 -48
  34. package/src/commands/init.ts +225 -12
  35. package/src/commands/lifecycle.ts +455 -816
  36. package/src/commands/migrate-cutover.ts +837 -0
  37. package/src/commands/migrate.ts +71 -2
  38. package/src/commands/serve-boot.ts +71 -25
  39. package/src/commands/status.ts +535 -235
  40. package/src/commands/upgrade.ts +100 -2
  41. package/src/help.ts +128 -68
  42. package/src/hub-control.ts +23 -162
  43. package/src/hub-server.ts +39 -0
  44. package/src/hub-unit.ts +735 -0
  45. package/src/hub-upgrade-helper.ts +306 -0
  46. package/src/hub-upgrade-mode.ts +209 -0
  47. package/src/hub-upgrade-status.ts +150 -0
  48. package/src/managed-unit.ts +692 -0
  49. package/src/migrate-offer.ts +186 -0
  50. package/src/module-ops-client.ts +457 -0
  51. package/src/port-probe.ts +50 -0
  52. package/src/process-state.ts +19 -3
  53. package/src/setup-wizard.ts +80 -1
  54. package/src/supervisor.ts +389 -38
  55. package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
  56. package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
  57. package/web/ui/dist/index.html +2 -2
  58. package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
package/src/supervisor.ts CHANGED
@@ -1,36 +1,70 @@
1
1
  /**
2
- * Per-module child supervisor for container-mode hub.
2
+ * The hub's in-process module supervisor — the single runtime everywhere.
3
3
  *
4
- * The on-box flow (`parachute start <svc>`) spawns module daemons
5
- * detached + unref'd, writes a pidfile, and walks away — process
6
- * lifecycle becomes the operator's problem (launchd, systemd, or a
7
- * follow-up `parachute restart`). That shape doesn't work in a
8
- * container:
4
+ * As of Phase 5b there is ONE process model: `parachute serve` runs the hub
5
+ * in the foreground with this Supervisor, and the platform's process manager
6
+ * (launchd on a Mac, systemd on a Linux VM, the container runtime on Render /
7
+ * Fly) keeps that `serve` process alive across crashes and reboots. The old
8
+ * manager-less detached-daemon model (per-module `detached + unref()` spawns
9
+ * tracked by pidfiles) is retired — the on-box `parachute start/stop/restart
10
+ * <svc>` verbs are now clients of THIS supervisor, driving it over the
11
+ * loopback module-ops API (`api-modules-ops.ts` → `commands/lifecycle.ts`).
9
12
  *
10
- * - There's no external supervisor watching the children. If vault
11
- * crashes, nothing brings it back.
12
- * - Render's log viewer only surfaces hub's stdout. A detached child
13
- * whose stdout goes to `~/.parachute/<svc>/logs/<svc>.log` is
14
- * invisible to the operator clicking through the dashboard.
13
+ * What this supervisor does:
15
14
  *
16
- * This supervisor solves both. It spawns each module attached (no
17
- * `detached: true`, no `unref()`), pipes their stdout/stderr through a
18
- * line-prefixing tap into hub's own stdout (`[vault] …`,
19
- * `[scribe] …`), watches `proc.exited`, and restarts crashed children
20
- * up to a small budget before giving up + marking the module
21
- * `crashed`. The budget keeps a wedged-on-boot module from chewing
22
- * forever; once it's exhausted the operator sees the crash via /api/modules
23
- * (or, post-1B, the per-module log view).
15
+ * - Spawns each module as an attached child in its own process group
16
+ * (`detached: true` for group-signalling only; stdio stays piped — see
17
+ * `defaultSpawnFn` / `defaultKillGroup`), so a wrapped startCmd's
18
+ * grandchildren are reaped on stop/restart (no EADDRINUSE-on-restart).
19
+ * - Pipes each child's stdout/stderr through a line-prefixing tap into the
20
+ * hub's own stdout (`[vault] …`, `[scribe] …`) and a bounded per-module
21
+ * ring buffer, so the operator sees module output in the hub log /
22
+ * platform log viewer and `parachute logs <svc>` can replay recent lines.
23
+ * - Gates a freshly-spawned module to `running` only once its port binds
24
+ * (port-readiness), and records a structured start-error on preflight
25
+ * failure, so `status` / the SPA keep the friendly missing-dependency
26
+ * surface.
27
+ * - Watches `proc.exited` and crash-restarts children up to a small budget
28
+ * before marking the module `crashed`. The budget keeps a wedged-on-boot
29
+ * module from chewing forever; the hub unit's own StartLimit / Throttle
30
+ * bounds the outer keeper.
24
31
  *
25
- * Out of scope for this module: spawning the hub HTTP server itself
26
- * (that's `Bun.serve` in the same process), driving the on-box
27
- * `parachute start <svc>` path (still uses `commands/lifecycle.ts`),
28
- * persisting child state to disk (transient — re-derived from
29
- * services.json on boot).
32
+ * Out of scope: supervising the hub HTTP server itself (that's `Bun.serve` in
33
+ * this same process — the platform manager is the hub's keeper) and persisting
34
+ * child state to disk (transient — re-derived from services.json on every boot).
30
35
  */
31
36
 
37
+ import {
38
+ MissingDependencyError,
39
+ type MissingDependencyWire,
40
+ ensureExecutable,
41
+ rethrowIfMissing,
42
+ } from "@openparachute/depcheck";
43
+ import { type PortListeningFn, defaultPortListening } from "./port-probe.ts";
44
+
32
45
  export type ModuleStatus = "starting" | "running" | "stopped" | "crashed" | "restarting";
33
46
 
47
+ /**
48
+ * Structured start-failure detail recorded onto `ModuleState` (§6.5). Mirrors
49
+ * depcheck's `MissingDependencyWire` for the missing-dependency case and the
50
+ * services.json-row `ServiceEntryStartError` shape `commands/lifecycle.ts`
51
+ * records, so `status` / the SPA keep the SAME friendly missing-dependency
52
+ * surface whether a module was started via the detached path or the
53
+ * supervisor. `error_type` is left open for a future non-dependency failure.
54
+ */
55
+ export interface ModuleStartError {
56
+ readonly error_type: string;
57
+ readonly error_description: string;
58
+ /** Present for `error_type: "missing_dependency"`. */
59
+ readonly binary?: string;
60
+ readonly why?: string | null;
61
+ readonly docs_url?: string | null;
62
+ readonly install?: { darwin?: string; linux?: string; generic?: string };
63
+ readonly sysadmin_hint?: string;
64
+ /** ISO timestamp of when the failure was recorded. */
65
+ readonly at: string;
66
+ }
67
+
34
68
  export interface ModuleState {
35
69
  /** Short name (vault / notes / scribe / …). */
36
70
  readonly short: string;
@@ -46,6 +80,15 @@ export interface ModuleState {
46
80
  readonly lastCrashAt?: string;
47
81
  /** Exit code of the most recent crash. */
48
82
  readonly lastExitCode?: number | null;
83
+ /**
84
+ * Structured start-failure detail (§6.5). Set when a preflight
85
+ * `MissingDependencyError` aborts the spawn, OR when a spawned child stays
86
+ * alive but never binds its port within the readiness window
87
+ * (started-but-unbound, hub#487). Cleared on a clean, port-confirmed start.
88
+ * The `status` enum is intentionally NOT extended (proxy-state Mode-1 + the
89
+ * SPA read `running`); this field carries the friendly diagnostic instead.
90
+ */
91
+ readonly startError?: ModuleStartError;
49
92
  }
50
93
 
51
94
  export interface SpawnRequest {
@@ -97,10 +140,31 @@ export interface SupervisorOpts {
97
140
  * stream without spelunking stdout.
98
141
  */
99
142
  readonly output?: (line: string) => void;
143
+ /**
144
+ * Cap, in bytes, of the per-module log ring buffer (§6.5). The supervisor
145
+ * keeps the most-recent ~`logBufferBytes` of each child's output so a
146
+ * `GET /api/modules/:short/logs` tap can replay the boot/crash lines that
147
+ * happened *before* the reader connected — the detached path got this for
148
+ * free via the per-service logfile; the supervisor streams-and-discards, so
149
+ * without a buffer the crash cause (the most important line) is lost. The
150
+ * oldest whole lines are dropped once the cap is exceeded. Default 64 KiB.
151
+ */
152
+ readonly logBufferBytes?: number;
100
153
  /**
101
154
  * Test seam over `Bun.spawn`. Returns a Subprocess-shaped handle.
102
155
  */
103
156
  readonly spawnFn?: SpawnFn;
157
+ /**
158
+ * Group-aware kill seam (hub#88). Production sends the signal to the child's
159
+ * whole process group (`process.kill(-pid, signal)`) so wrapped startCmds
160
+ * like `pnpm exec tsx server.ts` reap the tsx grandchild — not just the
161
+ * wrapper that would otherwise leave the grandchild bound to the port →
162
+ * EADDRINUSE on restart. Pairs with `defaultSpawnFn`'s `detached: true`
163
+ * (each child is its own process-group leader, `pid === pgid`). Defaults to
164
+ * {@link defaultKillGroup}; tests inject a stub so they stay deterministic
165
+ * (no real signals) and can assert the negative pid (group send) was used.
166
+ */
167
+ readonly killFn?: KillFn;
104
168
  /**
105
169
  * Test seam over wall-clock. Production passes `Date.now`.
106
170
  */
@@ -110,6 +174,40 @@ export interface SupervisorOpts {
110
174
  * with `setTimeout`. Tests stub to advance time deterministically.
111
175
  */
112
176
  readonly sleep?: (ms: number) => Promise<void>;
177
+ /**
178
+ * Port-readiness probe (§6.5). After a child spawns, the supervisor polls
179
+ * this until the module's port (from `req.env.PORT`) binds, to catch the
180
+ * alive-but-never-bound shape (hub#487). Defaults to `defaultPortListening`
181
+ * (a loopback TCP connect). Tests inject a deterministic stub.
182
+ *
183
+ * Defaulting policy (mirrors `commands/lifecycle.ts`): the readiness gate is
184
+ * SKIPPED unless this is the production path (no `spawnFn` override) OR a
185
+ * test explicitly opts in by injecting `portListening` / `startReadyMs`.
186
+ * Without that guard, every existing stub-spawner test (fake procs that
187
+ * never bind a real port) would block the full readiness window.
188
+ */
189
+ readonly portListening?: PortListeningFn;
190
+ /**
191
+ * How long the post-spawn port-readiness gate polls before recording a
192
+ * `started-but-unbound` start-error, in ms. Default 4000 on the production
193
+ * path; 0 (skipped) on the stub-spawner test path unless `portListening` /
194
+ * `startReadyMs` is set explicitly.
195
+ */
196
+ readonly startReadyMs?: number;
197
+ /** Poll interval while waiting for the port to bind, in ms. Default 200. */
198
+ readonly startReadyPollMs?: number;
199
+ /**
200
+ * PATH-resolution seam for the pre-spawn `ensureExecutable` preflight
201
+ * (`@openparachute/depcheck`). Production uses the real `Bun.which`; a
202
+ * missing startCmd binary then aborts the spawn with a structured
203
+ * `MissingDependencyError` recorded onto `ModuleState.startError`.
204
+ *
205
+ * Defaulting policy mirrors the readiness gate: a stub `spawnFn` (test path)
206
+ * gets a permissive resolver so the preflight doesn't trip on binaries
207
+ * absent from the test host's PATH; production gets the real `Bun.which`.
208
+ * Tests exercising the missing-binary branch inject `which: () => null`.
209
+ */
210
+ readonly which?: (cmd: string) => string | null;
113
211
  }
114
212
 
115
213
  /**
@@ -119,6 +217,12 @@ export interface SupervisorOpts {
119
217
  */
120
218
  export type SpawnFn = (req: SpawnRequest) => SupervisedProc;
121
219
 
220
+ /**
221
+ * Group-aware kill seam. Sends `signal` to the process group rooted at `pid`.
222
+ * Production uses {@link defaultKillGroup}; tests inject a stub.
223
+ */
224
+ export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
225
+
122
226
  /**
123
227
  * The minimal Subprocess shape the supervisor depends on. Bun's real
124
228
  * `Subprocess` matches this; the test fake mirrors it.
@@ -135,6 +239,46 @@ const DEFAULT_MAX_RESTARTS = 3;
135
239
  const DEFAULT_RESTART_WINDOW_MS = 60_000;
136
240
  const DEFAULT_RESTART_DELAY_MS = 500;
137
241
  const DEFAULT_KILL_TIMEOUT_MS = 5_000;
242
+ const DEFAULT_LOG_BUFFER_BYTES = 64 * 1024;
243
+ const DEFAULT_START_READY_MS = 4_000;
244
+ const DEFAULT_START_READY_POLL_MS = 200;
245
+
246
+ /**
247
+ * Bounded, line-oriented ring buffer (§6.5). Holds the most-recent lines of a
248
+ * module's output up to `maxBytes`; pushing past the cap drops whole lines
249
+ * from the front (oldest-first) until it fits. Bounding by bytes (not line
250
+ * count) keeps a chatty module from pinning unbounded memory regardless of
251
+ * line length. Each pushed string is already a single prefixed line from
252
+ * `pumpLines` (it includes its trailing newline).
253
+ */
254
+ export class LogRingBuffer {
255
+ private readonly lines: string[] = [];
256
+ private bytes = 0;
257
+
258
+ constructor(private readonly maxBytes: number) {}
259
+
260
+ push(line: string): void {
261
+ this.lines.push(line);
262
+ this.bytes += Buffer.byteLength(line);
263
+ // Drop oldest whole lines until we're back under the cap. A single line
264
+ // larger than the cap is kept (we never split a line) — the alternative
265
+ // (dropping it) would lose exactly the long stack-trace we most want.
266
+ while (this.bytes > this.maxBytes && this.lines.length > 1) {
267
+ const dropped = this.lines.shift();
268
+ if (dropped !== undefined) this.bytes -= Buffer.byteLength(dropped);
269
+ }
270
+ }
271
+
272
+ /** Snapshot of the buffered lines, oldest-first. */
273
+ snapshot(): string[] {
274
+ return [...this.lines];
275
+ }
276
+
277
+ /** Buffered lines joined into a single string (the wire/tail shape). */
278
+ text(): string {
279
+ return this.lines.join("");
280
+ }
281
+ }
138
282
 
139
283
  /**
140
284
  * Per-module supervisor. Owns the spawn → watch → restart loop.
@@ -151,15 +295,30 @@ export class Supervisor {
151
295
  private readonly modules = new Map<string, ModuleEntry>();
152
296
 
153
297
  constructor(opts: SupervisorOpts = {}) {
298
+ // Defaulting policy for the port-readiness gate + preflight (§6.5),
299
+ // mirroring `commands/lifecycle.ts`: production (no `spawnFn` override) gets
300
+ // the real 4s readiness window + `Bun.which` preflight. The stub-spawner
301
+ // test path gets 0 (skipped) + a permissive `which` UNLESS a test opts in
302
+ // explicitly (injecting `portListening` / `startReadyMs` / `which`) — so
303
+ // existing fake-proc tests (which never bind a real port) don't block.
304
+ const isProductionPath = opts.spawnFn === undefined;
305
+ const readinessOptedIn = opts.portListening !== undefined || opts.startReadyMs !== undefined;
154
306
  this.opts = {
155
307
  maxRestarts: opts.maxRestarts ?? DEFAULT_MAX_RESTARTS,
156
308
  restartWindowMs: opts.restartWindowMs ?? DEFAULT_RESTART_WINDOW_MS,
157
309
  restartDelayMs: opts.restartDelayMs ?? DEFAULT_RESTART_DELAY_MS,
158
310
  killTimeoutMs: opts.killTimeoutMs ?? DEFAULT_KILL_TIMEOUT_MS,
159
311
  output: opts.output ?? ((line) => process.stdout.write(line)),
312
+ logBufferBytes: opts.logBufferBytes ?? DEFAULT_LOG_BUFFER_BYTES,
160
313
  spawnFn: opts.spawnFn ?? defaultSpawnFn,
314
+ killFn: opts.killFn ?? defaultKillGroup,
161
315
  now: opts.now ?? Date.now,
162
316
  sleep: opts.sleep ?? ((ms) => new Promise((r) => setTimeout(r, ms))),
317
+ portListening: opts.portListening ?? defaultPortListening,
318
+ startReadyMs:
319
+ opts.startReadyMs ?? (isProductionPath || readinessOptedIn ? DEFAULT_START_READY_MS : 0),
320
+ startReadyPollMs: opts.startReadyPollMs ?? DEFAULT_START_READY_POLL_MS,
321
+ which: opts.which ?? (isProductionPath ? Bun.which : () => "/stub/bin/preflight-skipped"),
163
322
  };
164
323
  }
165
324
 
@@ -175,6 +334,9 @@ export class Supervisor {
175
334
  return existing.state;
176
335
  }
177
336
  // Crashed → operator intent is "try again." Wipe the budget.
337
+ // A fresh ring buffer per entry — `start` is a clean spawn (the crash-
338
+ // respawn path in `handleExit` reuses the existing entry + buffer, so a
339
+ // crashed module's boot/crash lines survive into the restart for replay).
178
340
  const entry: ModuleEntry = {
179
341
  req,
180
342
  state: {
@@ -183,12 +345,117 @@ export class Supervisor {
183
345
  restartsInWindow: 0,
184
346
  },
185
347
  crashStamps: [],
348
+ logs: new LogRingBuffer(this.opts.logBufferBytes),
186
349
  };
187
350
  this.modules.set(req.short, entry);
188
- this.spawnAndWatch(entry);
351
+
352
+ // Pre-spawn preflight (§6.5): resolve the startCmd binary on PATH before
353
+ // spawning a doomed child. A missing binary records a structured
354
+ // `MissingDependencyError` onto state (the same friendly missing-dependency
355
+ // surface `commands/lifecycle.ts` records) and aborts — no spawn. Mirrors
356
+ // `lifecycle.start`'s `ensureExecutable` preflight.
357
+ const startBinary = req.cmd[0];
358
+ if (startBinary) {
359
+ try {
360
+ ensureExecutable(startBinary, { which: this.opts.which });
361
+ } catch (err) {
362
+ if (err instanceof MissingDependencyError) {
363
+ entry.state = {
364
+ ...entry.state,
365
+ status: "crashed",
366
+ pid: undefined,
367
+ startError: startErrorFromWire(err.toWire(), this.opts.now),
368
+ };
369
+ return entry.state;
370
+ }
371
+ throw err;
372
+ }
373
+ }
374
+
375
+ // Belt-and-suspenders for a spawn that slips past the preflight (binary
376
+ // removed between check + spawn, or a path that didn't preflight): a
377
+ // not-found spawn throw becomes the same structured MissingDependencyError
378
+ // recorded onto state, not a throw out of `start`. Mirrors
379
+ // `lifecycle.start`'s `rethrowIfMissing` catch.
380
+ try {
381
+ this.spawnAndWatch(entry);
382
+ } catch (err) {
383
+ if (startBinary) {
384
+ try {
385
+ rethrowIfMissing(err, startBinary);
386
+ } catch (missing) {
387
+ if (missing instanceof MissingDependencyError) {
388
+ entry.state = {
389
+ ...entry.state,
390
+ status: "crashed",
391
+ pid: undefined,
392
+ startError: startErrorFromWire(missing.toWire(), this.opts.now),
393
+ };
394
+ return entry.state;
395
+ }
396
+ }
397
+ }
398
+ throw err;
399
+ }
400
+
401
+ // Post-spawn port-readiness gate (§6.5, hub#487). A returned pid only
402
+ // proves the kernel forked the process; it says nothing about whether the
403
+ // module bound its port. Poll the port (from req.env.PORT) up to
404
+ // `startReadyMs`. On success: clear any prior startError. On timeout while
405
+ // the child is still alive: record a `started-but-unbound` structured
406
+ // start-error WITHOUT touching the `running` status enum (proxy-state
407
+ // Mode-1 + the SPA read `running`) — the friendly diagnostic rides the
408
+ // startError field. A child that died during the window is left to the
409
+ // crash watcher (`handleExit`), which owns the restart budget.
410
+ await this.awaitPortReadiness(entry);
189
411
  return entry.state;
190
412
  }
191
413
 
414
+ /**
415
+ * Poll the module's port until it binds or `startReadyMs` elapses (§6.5).
416
+ * Skipped when the gate is disabled (stub-spawner test path) or the request
417
+ * carries no `PORT`. Records / clears `state.startError` accordingly; never
418
+ * mutates `state.status` (see `start`).
419
+ */
420
+ private async awaitPortReadiness(entry: ModuleEntry): Promise<void> {
421
+ if (this.opts.startReadyMs <= 0) return;
422
+ const portStr = entry.req.env?.PORT;
423
+ const port = portStr ? Number(portStr) : Number.NaN;
424
+ if (!Number.isFinite(port) || port <= 0) return; // No port to probe.
425
+
426
+ const deadline = this.opts.now() + this.opts.startReadyMs;
427
+ while (this.opts.now() < deadline) {
428
+ // The child may have crashed during the window — `handleExit` owns that
429
+ // (budget / restart). Stop probing; don't overwrite a crash with a
430
+ // port-readiness verdict.
431
+ if (entry.stopRequested || entry.state.status !== "running") return;
432
+ if (await this.opts.portListening(port)) {
433
+ // Bound → healthy. Clear any stale started-but-unbound error.
434
+ if (entry.state.startError) {
435
+ const { startError: _drop, ...rest } = entry.state;
436
+ entry.state = rest;
437
+ }
438
+ return;
439
+ }
440
+ await this.opts.sleep(this.opts.startReadyPollMs);
441
+ }
442
+
443
+ // Window elapsed, still alive but never bound — record the structured
444
+ // started-but-unbound error so `status` / the SPA show why, not a silently
445
+ // healthy `running`. Keep `running` (the process IS up); the diagnostic is
446
+ // the startError field.
447
+ if (entry.state.status === "running" && !entry.stopRequested) {
448
+ entry.state = {
449
+ ...entry.state,
450
+ startError: {
451
+ error_type: "started_but_unbound",
452
+ error_description: `${entry.req.short} started (pid ${entry.state.pid}) but is not listening on port ${port} after ${this.opts.startReadyMs}ms — it may still be coming up, or the port is held by another process.`,
453
+ at: new Date(this.opts.now()).toISOString(),
454
+ },
455
+ };
456
+ }
457
+ }
458
+
192
459
  /**
193
460
  * Stop a supervised module. Sends SIGTERM, awaits the child's exit
194
461
  * (so the log-pump drains the final flush before our stdout closes),
@@ -216,7 +483,13 @@ export class Supervisor {
216
483
  const proc = entry.proc;
217
484
  if (proc) {
218
485
  try {
219
- proc.kill("SIGTERM");
486
+ // Group-aware kill (hub#88): signal the child's whole process group
487
+ // via `killFn` (default `defaultKillGroup` → `process.kill(-pid)`) so
488
+ // a wrapped startCmd's grandchild is reaped too, not just the wrapper.
489
+ // Mirrors `commands/lifecycle.ts`'s `defaultKill` repointing of
490
+ // `defaultSpawner`'s detached children. Without it, the grandchild
491
+ // stays bound to the port → restart hits EADDRINUSE.
492
+ this.opts.killFn(proc.pid, "SIGTERM");
220
493
  } catch {
221
494
  // Process may already be dead — fall through.
222
495
  }
@@ -234,7 +507,9 @@ export class Supervisor {
234
507
  `[supervisor] ${entry.req.short} did not exit ${this.opts.killTimeoutMs}ms after SIGTERM — escalating to SIGKILL.\n`,
235
508
  );
236
509
  try {
237
- proc.kill("SIGKILL");
510
+ // Group-aware SIGKILL escalation — same `killFn` seam as the
511
+ // SIGTERM above so the whole group is reaped, not just the leader.
512
+ this.opts.killFn(proc.pid, "SIGKILL");
238
513
  } catch {
239
514
  // Process may already be dead between the timeout firing
240
515
  // and us reaching kill() — fall through to the await.
@@ -287,13 +562,17 @@ export class Supervisor {
287
562
  private spawnAndWatch(entry: ModuleEntry): void {
288
563
  const proc = this.opts.spawnFn(entry.req);
289
564
  entry.proc = proc;
565
+ // Clear any stale startError from a prior attempt — a fresh running pid is
566
+ // the new ground truth; the readiness gate re-records if it still doesn't
567
+ // bind.
568
+ const { startError: _drop, ...prev } = entry.state;
290
569
  entry.state = {
291
- ...entry.state,
570
+ ...prev,
292
571
  status: "running",
293
572
  pid: proc.pid,
294
573
  startedAt: new Date(this.opts.now()).toISOString(),
295
574
  };
296
- this.pipeOutput(entry.req.short, proc);
575
+ this.pipeOutput(entry, proc);
297
576
  void proc.exited.then((exitCode) => this.handleExit(entry, exitCode));
298
577
  }
299
578
 
@@ -348,16 +627,34 @@ export class Supervisor {
348
627
  }
349
628
 
350
629
  /**
351
- * Tap a child's stdout + stderr into the supervisor's `output`
352
- * callback (hub's stdout by default), prefixing each line with the
353
- * module's short name. Line-buffered: partial chunks accumulate
354
- * until a newline arrives so multi-byte log lines don't get
355
- * scrambled across modules.
630
+ * Recent buffered output for a supervised module (§6.5), oldest-first, each
631
+ * element a prefixed line. Returns `undefined` for a module that isn't
632
+ * supervised (no entry) so a `GET /api/modules/:short/logs` handler can
633
+ * distinguish "not supervised" (404) from "supervised but quiet" (empty
634
+ * array). Survives a crash-respawn (same entry/buffer), so the boot/crash
635
+ * lines that preceded the reader connecting are replayable — the whole point.
636
+ */
637
+ logs(short: string): string[] | undefined {
638
+ return this.modules.get(short)?.logs.snapshot();
639
+ }
640
+
641
+ /**
642
+ * Tap a child's stdout + stderr into the supervisor's `output` callback
643
+ * (hub's stdout by default) AND the per-module ring buffer (§6.5),
644
+ * prefixing each line with the module's short name. Line-buffered: partial
645
+ * chunks accumulate until a newline arrives so multi-byte log lines don't
646
+ * get scrambled across modules. The buffer is fed the same prefixed lines
647
+ * the live stream gets, so a later `/logs` tap replays exactly what hub's
648
+ * stdout already showed.
356
649
  */
357
- private pipeOutput(short: string, proc: SupervisedProc): void {
358
- const prefix = `[${short}] `;
359
- if (proc.stdout) void pumpLines(proc.stdout, prefix, this.opts.output);
360
- if (proc.stderr) void pumpLines(proc.stderr, prefix, this.opts.output);
650
+ private pipeOutput(entry: ModuleEntry, proc: SupervisedProc): void {
651
+ const prefix = `[${entry.req.short}] `;
652
+ const sink = (line: string): void => {
653
+ this.opts.output(line);
654
+ entry.logs.push(line);
655
+ };
656
+ if (proc.stdout) void pumpLines(proc.stdout, prefix, sink);
657
+ if (proc.stderr) void pumpLines(proc.stderr, prefix, sink);
361
658
  }
362
659
  }
363
660
 
@@ -367,6 +664,8 @@ interface ModuleEntry {
367
664
  proc?: SupervisedProc;
368
665
  crashStamps: number[];
369
666
  stopRequested?: boolean;
667
+ /** Bounded ring buffer of recent prefixed output lines (§6.5). */
668
+ logs: LogRingBuffer;
370
669
  }
371
670
 
372
671
  async function pumpLines(
@@ -402,7 +701,20 @@ async function pumpLines(
402
701
 
403
702
  const defaultSpawnFn: SpawnFn = (req) => {
404
703
  const spawnOpts: Parameters<typeof Bun.spawn>[1] = {
704
+ // Keep stdout/stderr explicitly piped — the supervisor pumps child output
705
+ // into hub's log (`pipeOutput`/`pumpLines`) + the per-module ring buffer.
706
+ // `detached: true` does NOT detach explicitly-piped stdio, so these stay
707
+ // wired even though the child gets its own process group below.
405
708
  stdio: ["ignore", "pipe", "pipe"],
709
+ // Spawn in a fresh process group (pid == pgid) so `killFn` (→
710
+ // `process.kill(-pid, sig)`) reaches every descendant, not just the
711
+ // wrapper. Without this, wrapped startCmds like `pnpm exec tsx server.ts`
712
+ // leave the tsx grandchild bound to the port after stop → restart hits
713
+ // EADDRINUSE (hub#88). Mirrors `commands/lifecycle.ts`'s `defaultSpawner`,
714
+ // which set `detached: true` for exactly this reason. We do NOT `unref()`:
715
+ // the supervisor must stay attached for the lifecycle (watch `exited`,
716
+ // pump output, reap on stop).
717
+ detached: true,
406
718
  // Inherit env so supervised module sees PATH, HOME, PARACHUTE_HOME, etc.
407
719
  // Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
408
720
  // Per-call `req.env` overrides merge on top below.
@@ -413,3 +725,42 @@ const defaultSpawnFn: SpawnFn = (req) => {
413
725
  const proc = Bun.spawn([...req.cmd], spawnOpts);
414
726
  return proc as unknown as SupervisedProc;
415
727
  };
728
+
729
+ /**
730
+ * Map a depcheck `MissingDependencyWire` onto the `ModuleStartError` shape
731
+ * recorded on `ModuleState` (§6.5), stamping `at`. The wire's field names
732
+ * already match (binary / why / docs_url / install / sysadmin_hint), so this
733
+ * is a stamp + passthrough — keeping the supervisor's start-error surface
734
+ * identical to the services.json `ServiceEntryStartError` the detached path
735
+ * records, so the SPA renders the same install card from either source.
736
+ */
737
+ function startErrorFromWire(wire: MissingDependencyWire, now: () => number): ModuleStartError {
738
+ return {
739
+ error_type: wire.error_type,
740
+ error_description: wire.error_description,
741
+ binary: wire.binary,
742
+ why: wire.why,
743
+ docs_url: wire.docs_url,
744
+ install: wire.install,
745
+ sysadmin_hint: wire.sysadmin_hint,
746
+ at: new Date(now()).toISOString(),
747
+ };
748
+ }
749
+
750
+ /**
751
+ * Production group-aware kill (hub#88). Sends `signal` to the entire process
752
+ * group rooted at `pid` (the negative-pid syscall) so a wrapped startCmd's
753
+ * grandchildren are reaped alongside the wrapper. Mirrors
754
+ * `commands/lifecycle.ts`'s `defaultKill`: on ESRCH the group is already gone
755
+ * (or the child predates the detached-spawn change and has no group with that
756
+ * pgid) — fall back to a bare-pid signal so the caller's intent still lands
757
+ * when there's a positive-pid process to receive it.
758
+ */
759
+ export const defaultKillGroup: KillFn = (pid, signal) => {
760
+ try {
761
+ process.kill(-pid, signal);
762
+ } catch (err) {
763
+ if ((err as NodeJS.ErrnoException).code !== "ESRCH") throw err;
764
+ process.kill(pid, signal);
765
+ }
766
+ };