@openparachute/hub 0.6.2 → 0.6.3-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,4 @@
1
1
  import { existsSync, openSync, readFileSync } from "node:fs";
2
- import { Socket } from "node:net";
3
2
  import { join } from "node:path";
4
3
  import {
5
4
  MissingDependencyError,
@@ -20,8 +19,30 @@ import {
20
19
  } from "../hub-control.ts";
21
20
  import { hubDbPath, openHubDb } from "../hub-db.ts";
22
21
  import { HUB_ORIGIN_ENV, deriveHubOrigin } from "../hub-origin.ts";
22
+ import {
23
+ type EnsureHubUnitOpts,
24
+ type EnsureHubUnitResult,
25
+ HUB_UNIT_DEFAULT_PORT,
26
+ type HubUnitDeps,
27
+ type HubUnitManagerOpResult,
28
+ defaultHubUnitDeps,
29
+ ensureHubUnit as ensureHubUnitImpl,
30
+ isHubUnitInstalled,
31
+ restartHubUnit as restartHubUnitImpl,
32
+ stopHubUnit as stopHubUnitImpl,
33
+ } from "../hub-unit.ts";
23
34
  import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
35
+ import {
36
+ type DriveModuleOpDeps,
37
+ type ModuleOp,
38
+ ModuleOpHttpError,
39
+ type ModuleOpResult,
40
+ NoOperatorTokenError,
41
+ OperatorTokenExpiredError,
42
+ driveModuleOp as driveModuleOpImpl,
43
+ } from "../module-ops-client.ts";
24
44
  import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
45
+ import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
25
46
  import {
26
47
  type AliveFn,
27
48
  clearPid,
@@ -98,42 +119,16 @@ export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
98
119
  export type SleepFn = (ms: number) => Promise<void>;
99
120
 
100
121
  /**
101
- * "Is something listening on this TCP port on loopback?" seam. Pairs with the
102
- * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
103
- * shape (hub#487): a service that lives long enough to clear the liveness
104
- * check but never binds its port because the port is already held (EADDRINUSE
105
- * from an orphan). The recorded pid stays alive (vault's process supervisor
106
- * retries / lingers) so `alive(pid)` says "running" while `parachute status`
107
- * shows it inactive because nothing answers on the port.
108
- *
109
- * Tests inject a deterministic stub; production uses `defaultPortListening`.
122
+ * Port-readiness probe seam + its production impl now live in `port-probe.ts`
123
+ * (design 2026-06-01 §6.5) so the supervisor can share the exact same TCP
124
+ * connect-probe without dragging lifecycle's heavy import graph. Re-exported
125
+ * here so this module's public API (and its tests) are unchanged. Pairs with
126
+ * the spawn-then-die settle (hub#194) to catch the alive-but-never-bound shape
127
+ * (hub#487): a service that clears the liveness check but never binds its port
128
+ * because it's already held, so `alive(pid)` says "running" while `status` shows
129
+ * it inactive because nothing answers on the port.
110
130
  */
111
- export type PortListeningFn = (port: number) => Promise<boolean>;
112
-
113
- /**
114
- * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
115
- * accepted. A successful connect means *something* is listening; we close
116
- * immediately. Connection refused / timeout means nothing is bound yet.
117
- * `node:net` rather than `Bun.connect` because the latter has no clean
118
- * "connection refused → false" without a custom socket handler, and the net
119
- * Socket's `error`/`connect` events map directly onto the boolean we want.
120
- */
121
- export const defaultPortListening: PortListeningFn = (port) =>
122
- new Promise((resolve) => {
123
- const socket = new Socket();
124
- let settled = false;
125
- const done = (listening: boolean) => {
126
- if (settled) return;
127
- settled = true;
128
- socket.destroy();
129
- resolve(listening);
130
- };
131
- socket.setTimeout(1000);
132
- socket.once("connect", () => done(true));
133
- socket.once("timeout", () => done(false));
134
- socket.once("error", () => done(false));
135
- socket.connect(port, "127.0.0.1");
136
- });
131
+ export { type PortListeningFn, defaultPortListening };
137
132
 
138
133
  /**
139
134
  * Group-aware liveness: returns true if the process group (pgid == pid)
@@ -311,6 +306,58 @@ export interface LifecycleOpts {
311
306
  log: (line: string) => void;
312
307
  }) => Promise<OperatorIssuerHealStatus>;
313
308
  };
309
+ /**
310
+ * Phase 3b supervisor-path seams (design §3.3). When a hub UNIT is installed
311
+ * (launchd/systemd/container — detected via {@link isHubUnitInstalled}),
312
+ * `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
313
+ * loopback module-ops API instead of spawning detached pidfile daemons. The
314
+ * detached arm (`spawner`/`hub.ensureRunning`/`hub.stop`) remains the no-unit
315
+ * fallback until Phase 5 retires it.
316
+ *
317
+ * Everything here is injectable so tests can (a) force the unit-installed
318
+ * branch without a real launchd/systemd, and (b) assert the module-ops /
319
+ * manager calls without a live hub. Production wires the real
320
+ * {@link driveModuleOp} / {@link ensureHubUnit} / {@link stopHubUnit} /
321
+ * {@link restartHubUnit} against an opened hub.db + the resolved hub origin.
322
+ */
323
+ supervisor?: {
324
+ /**
325
+ * Is a hub unit installed (the dual-dispatch discriminant)? Production
326
+ * uses `isHubUnitInstalled(hubUnitDeps)`. Tests set this `true`/`false`
327
+ * directly to pick the branch deterministically. When set, it wins over
328
+ * the `hubUnitDeps`-derived detection.
329
+ */
330
+ unitInstalled?: boolean;
331
+ /** Deps for the real `isHubUnitInstalled` probe + the hub-unit manager ops. */
332
+ hubUnitDeps?: HubUnitDeps;
333
+ /** Drive a per-module op against the running hub (reads operator.token). */
334
+ driveModuleOp?: (
335
+ short: string,
336
+ op: ModuleOp,
337
+ deps: DriveModuleOpDeps,
338
+ ) => Promise<ModuleOpResult>;
339
+ /** Ensure the hub unit is up before a module op (§3.2). */
340
+ ensureHubUnit?: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
341
+ /** Stop the hub unit via the platform manager (NEVER a PID signal, §3.3). */
342
+ stopHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
343
+ /** Restart the hub unit via the platform manager (NEVER a PID signal, §3.3). */
344
+ restartHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
345
+ /**
346
+ * Probe whether the loopback hub answers `/health`. Used by `stop <svc>`:
347
+ * if the hub is down, the supervised module is already down (children die
348
+ * with the hub) → report "already stopped" WITHOUT starting the hub.
349
+ * Production reuses the hub-unit deps' `probeHealth`.
350
+ */
351
+ probeHubHealth?: (port: number) => Promise<boolean>;
352
+ /**
353
+ * Open the hub DB used to validate/auto-rotate the operator token in
354
+ * `driveModuleOp`. Production opens `<configDir>/hub.db`; tests inject an
355
+ * in-memory/seeded db. Returns a handle the caller closes.
356
+ */
357
+ openDb?: (configDir: string) => import("bun:sqlite").Database;
358
+ /** Loopback hub base URL override (default derives from the hub port). */
359
+ baseUrl?: string;
360
+ };
314
361
  }
315
362
 
316
363
  interface Resolved {
@@ -337,6 +384,21 @@ interface Resolved {
337
384
  configDir: string;
338
385
  log: (line: string) => void;
339
386
  }) => Promise<OperatorIssuerHealStatus>;
387
+ sup: ResolvedSupervisor;
388
+ }
389
+
390
+ /** Resolved Phase 3b supervisor-path seams (see `LifecycleOpts.supervisor`). */
391
+ interface ResolvedSupervisor {
392
+ /** Whether a hub unit is installed — the dual-dispatch discriminant. */
393
+ unitInstalled: boolean;
394
+ hubUnitDeps: HubUnitDeps;
395
+ driveModuleOp: (short: string, op: ModuleOp, deps: DriveModuleOpDeps) => Promise<ModuleOpResult>;
396
+ ensureHubUnit: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
397
+ stopHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
398
+ restartHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
399
+ probeHubHealth: (port: number) => Promise<boolean>;
400
+ openDb: (configDir: string) => import("bun:sqlite").Database;
401
+ baseUrl: string | undefined;
340
402
  }
341
403
 
342
404
  /**
@@ -404,9 +466,64 @@ function resolve(opts: LifecycleOpts): Resolved {
404
466
  ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
405
467
  stopHubFn: opts.hub?.stop ?? stopHub,
406
468
  selfHealOperatorTokenFn: opts.hub?.selfHealOperatorToken ?? defaultSelfHealOperatorToken,
469
+ sup: resolveSupervisor(opts.supervisor),
470
+ };
471
+ }
472
+
473
+ /**
474
+ * Resolve the Phase 3b supervisor-path seams (the dual-dispatch arm).
475
+ *
476
+ * The discriminant `unitInstalled` decides which arm a verb takes:
477
+ * - When the caller PROVIDES a `supervisor` block (even `{}`, which the
478
+ * production CLI dispatch passes), `unitInstalled` is the explicit override
479
+ * if set, else the real `isHubUnitInstalled` probe over the hub-unit deps —
480
+ * so on a box with a launchd/systemd hub unit the verbs drive the running
481
+ * supervisor, and on a legacy detached box they take the detached arm.
482
+ * - When the caller OMITS `supervisor` entirely (the shape of every existing
483
+ * lifecycle test, which never opts into the new path), `unitInstalled`
484
+ * defaults to `false` → the detached arm. This keeps those tests
485
+ * DETERMINISTIC regardless of whether the test host happens to have a real
486
+ * hub unit installed. New Phase 3b tests opt into the supervisor arm by
487
+ * passing `supervisor: { unitInstalled: true, … }`.
488
+ */
489
+ function resolveSupervisor(opts: LifecycleOpts["supervisor"]): ResolvedSupervisor {
490
+ const hubUnitDeps = opts?.hubUnitDeps ?? defaultHubUnitDeps;
491
+ // No `supervisor` block at all → detached arm, deterministically. Only probe
492
+ // the real filesystem when the caller opted into the new path (production CLI
493
+ // passes `supervisor: {}`; tests pass the seams they want to assert).
494
+ const unitInstalled =
495
+ opts === undefined ? false : (opts.unitInstalled ?? isHubUnitInstalled(hubUnitDeps));
496
+ return {
497
+ unitInstalled,
498
+ hubUnitDeps,
499
+ driveModuleOp: opts?.driveModuleOp ?? driveModuleOpImpl,
500
+ ensureHubUnit: opts?.ensureHubUnit ?? ensureHubUnitImpl,
501
+ stopHubUnit: opts?.stopHubUnit ?? stopHubUnitImpl,
502
+ restartHubUnit: opts?.restartHubUnit ?? restartHubUnitImpl,
503
+ probeHubHealth: opts?.probeHubHealth ?? hubUnitDeps.probeHealth,
504
+ openDb: opts?.openDb ?? ((configDir) => openHubDb(hubDbPath(configDir))),
505
+ baseUrl: opts?.baseUrl,
407
506
  };
408
507
  }
409
508
 
509
+ /**
510
+ * Resolve the hub origin used as the operator token's `iss` validator in the
511
+ * supervisor path. Unlike {@link resolveHubOrigin} (which returns `undefined`
512
+ * for pure loopback so the spawn env omits PARACHUTE_HUB_ORIGIN), the operator
513
+ * token ALWAYS carries an `iss`, so this falls back to the canonical loopback
514
+ * origin. Mirrors `commands/auth.ts`'s `resolveHubIssuer` so the issuer the CLI
515
+ * validates the token against matches what `auth rotate-operator` minted under.
516
+ * The fallback differs cosmetically — here `readHubPort(configDir) ??
517
+ * HUB_UNIT_DEFAULT_PORT`, in auth.ts `127.0.0.1:${HUB_DEFAULT_PORT}` — but both
518
+ * resolve to 1939 under canonical-ports today, so they agree in practice.
519
+ * TODO: consolidate with auth.ts:resolveHubIssuer to prevent drift.
520
+ */
521
+ function resolveOperatorTokenIssuer(hubOrigin: string | undefined, configDir: string): string {
522
+ if (hubOrigin) return hubOrigin;
523
+ const port = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;
524
+ return `http://127.0.0.1:${port}`;
525
+ }
526
+
410
527
  /**
411
528
  * Source of truth order for `PARACHUTE_HUB_ORIGIN`:
412
529
  * 1. explicit override (flag / opt)
@@ -574,6 +691,12 @@ async function resolveTargets(
574
691
 
575
692
  export async function start(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
576
693
  const r = resolve(opts);
694
+ // Phase 3b dual-dispatch (design §3.3). On a box with a hub unit installed,
695
+ // drive the RUNNING supervisor; otherwise fall through to the unchanged
696
+ // detached arm below. Phase 5 deletes the else-arm — keep this a clean
697
+ // top-level branch so that deletion is a one-liner.
698
+ if (r.sup.unitInstalled) return startViaSupervisor(svc, r);
699
+ // --- no-unit detached fallback (unchanged; preserved until Phase 5) ---
577
700
  if (svc === HUB_SVC) return startHubSvc(r);
578
701
  const picked = await resolveTargets(svc, r.manifestPath);
579
702
  if ("error" in picked) {
@@ -815,6 +938,10 @@ function persistVaultHubOriginForStart(r: Resolved): void {
815
938
 
816
939
  export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
817
940
  const r = resolve(opts);
941
+ // Phase 3b dual-dispatch (design §3.3). Unit-installed → drive the supervisor
942
+ // / platform manager; else the unchanged detached arm below.
943
+ if (r.sup.unitInstalled) return stopViaSupervisor(svc, r);
944
+ // --- no-unit detached fallback (unchanged; preserved until Phase 5) ---
818
945
  if (svc === HUB_SVC) return stopHubSvc(r);
819
946
  const picked = await resolveTargets(svc, r.manifestPath);
820
947
  if ("error" in picked) {
@@ -866,9 +993,210 @@ export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): P
866
993
  }
867
994
 
868
995
  export async function restart(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
869
- const stopCode = await stop(svc, opts);
996
+ const r = resolve(opts);
997
+ // Phase 3b dual-dispatch (design §3.3). Unit-installed → drive the supervisor
998
+ // / platform manager (with the 404-fallthrough for modules, §6.2); else the
999
+ // unchanged detached stop-then-start below.
1000
+ if (r.sup.unitInstalled) return restartViaSupervisor(svc, r);
1001
+ // --- no-unit detached fallback (unchanged; preserved until Phase 5) ---
1002
+ // Pass `supervisor: undefined` to the inner stop/start so their own
1003
+ // `resolveSupervisor` short-circuits to `unitInstalled: false` without
1004
+ // re-probing `isHubUnitInstalled` (two redundant `stat`s per call) — we
1005
+ // already resolved no-unit above, so both inner calls would re-take this
1006
+ // same detached arm regardless. Behavior-preserving; just drops the probes.
1007
+ const detachedOpts = { ...opts, supervisor: undefined };
1008
+ const stopCode = await stop(svc, detachedOpts);
870
1009
  if (stopCode !== 0) return stopCode;
871
- return await start(svc, opts);
1010
+ return await start(svc, detachedOpts);
1011
+ }
1012
+
1013
+ // ---------------------------------------------------------------------------
1014
+ // Phase 3b supervisor-path verb dispatch (design §3.3).
1015
+ //
1016
+ // These are the NEW arm of the dual-dispatch: when a hub unit is installed,
1017
+ // `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
1018
+ // loopback module-ops API (per-module verbs) or the platform manager (hub
1019
+ // verbs / no-svc). The detached arm above is untouched and Phase 5 deletes it
1020
+ // + this comment block's `unitInstalled` guard, collapsing to this path only.
1021
+ // ---------------------------------------------------------------------------
1022
+
1023
+ /**
1024
+ * Drive a single module-op against the running hub, mapping the module-ops
1025
+ * client's errors to actionable CLI output (§3.1). Opens hub.db (to validate /
1026
+ * auto-rotate the operator token), resolves the issuer the token was minted
1027
+ * under, and closes the db afterward. Returns the result on success; on a
1028
+ * surfaced error returns `failed: true` so the caller can decide (e.g. the restart
1029
+ * 404-fallthrough). Re-throws nothing the caller can't handle: the operator-
1030
+ * token errors are caught and printed here; HTTP errors are returned as `httpError`.
1031
+ */
1032
+ async function driveSupervisorOp(
1033
+ short: string,
1034
+ op: ModuleOp,
1035
+ r: Resolved,
1036
+ ): Promise<{ result?: ModuleOpResult; httpError?: ModuleOpHttpError; failed: boolean }> {
1037
+ const issuer = resolveOperatorTokenIssuer(r.hubOrigin, r.configDir);
1038
+ const db = r.sup.openDb(r.configDir);
1039
+ try {
1040
+ const deps: DriveModuleOpDeps = {
1041
+ db,
1042
+ issuer,
1043
+ configDir: r.configDir,
1044
+ ...(r.sup.baseUrl !== undefined ? { baseUrl: r.sup.baseUrl } : {}),
1045
+ };
1046
+ const result = await r.sup.driveModuleOp(short, op, deps);
1047
+ return { result, failed: false };
1048
+ } catch (err) {
1049
+ if (err instanceof NoOperatorTokenError || err instanceof OperatorTokenExpiredError) {
1050
+ // Surface the already-actionable message (don't raw-throw a 401, §3.1).
1051
+ r.log(`✗ ${short}: ${err.message}`);
1052
+ return { failed: true };
1053
+ }
1054
+ if (err instanceof ModuleOpHttpError) {
1055
+ // Return the typed HTTP error so the caller can branch (404-fallthrough,
1056
+ // not_installed hint). Callers that don't branch print it via
1057
+ // `surfaceModuleOpHttpError`.
1058
+ return { httpError: err, failed: true };
1059
+ }
1060
+ // Unknown error — surface its message rather than crashing the CLI.
1061
+ r.log(`✗ ${short}: ${err instanceof Error ? err.message : String(err)}`);
1062
+ return { failed: true };
1063
+ } finally {
1064
+ db.close();
1065
+ }
1066
+ }
1067
+
1068
+ /** Print a module-ops HTTP error with an actionable hint for the known codes. */
1069
+ function surfaceModuleOpHttpError(short: string, err: ModuleOpHttpError, r: Resolved): void {
1070
+ if (err.status === 400 && err.code === "not_installed") {
1071
+ r.log(
1072
+ `✗ ${short} is not installed — run \`parachute install ${short}\` first, then \`parachute start ${short}\`.`,
1073
+ );
1074
+ return;
1075
+ }
1076
+ r.log(`✗ ${short}: ${err.message}`);
1077
+ }
1078
+
1079
+ /**
1080
+ * Ensure the hub unit is up, mapping `ensureHubUnit`'s structured outcome to a
1081
+ * CLI exit signal. Returns true when the hub is up (already-up / started),
1082
+ * false when it isn't (and the messages were surfaced). The `no-unit` outcome
1083
+ * shouldn't reach here under the dual-dispatch (we only take the supervisor arm
1084
+ * when a unit IS installed), but it's handled defensively.
1085
+ */
1086
+ async function ensureHubForOp(r: Resolved, port: number): Promise<boolean> {
1087
+ const ensured = await r.sup.ensureHubUnit({
1088
+ port,
1089
+ deps: r.sup.hubUnitDeps,
1090
+ log: r.log,
1091
+ });
1092
+ if (ensured.outcome === "already-up" || ensured.outcome === "started") return true;
1093
+ // Defensive / unreachable under dual-dispatch: this arm catches the `no-unit`
1094
+ // outcome (and any other non-up outcome), but we only reach `ensureHubForOp`
1095
+ // on the supervisor path, which is gated on `unitInstalled === true` — the
1096
+ // same `isHubUnitInstalled` probe that makes `ensureHubUnit` return `no-unit`
1097
+ // only when it's false. So `no-unit` can't surface here in production; it's
1098
+ // harmless surface. Candidate for removal in the Phase 5 bridge-collapse —
1099
+ // the deletion sweep should not overlook this branch.
1100
+ for (const m of ensured.messages) r.log(m);
1101
+ return false;
1102
+ }
1103
+
1104
+ /** `start <svc>` / `start` (no svc) over the supervisor (§3.3). */
1105
+ async function startViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
1106
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
1107
+ // `start hub` / `start` (no svc): ensure the hub unit is up — it transitively
1108
+ // boots every installed module from services.json via bootSupervisedModules.
1109
+ if (svc === HUB_SVC || svc === undefined) {
1110
+ const up = await ensureHubForOp(r, port);
1111
+ if (!up) return 1;
1112
+ r.log(svc === HUB_SVC ? "✓ hub is up." : "✓ hub is up (all installed modules booted).");
1113
+ return 0;
1114
+ }
1115
+ // `start <svc>`: ensure the hub is up first (chicken-and-egg §3.2), then drive
1116
+ // a pure supervisor.start of the already-installed module.
1117
+ if (!(await ensureHubForOp(r, port))) return 1;
1118
+ const { result, httpError, failed } = await driveSupervisorOp(svc, "start", r);
1119
+ if (httpError) {
1120
+ surfaceModuleOpHttpError(svc, httpError, r);
1121
+ return 1;
1122
+ }
1123
+ if (failed || !result) return 1;
1124
+ r.log(`✓ ${svc} started.`);
1125
+ return 0;
1126
+ }
1127
+
1128
+ /** `stop <svc>` / `stop` (no svc) over the supervisor / platform manager (§3.3). */
1129
+ async function stopViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
1130
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
1131
+ // `stop hub` / `stop` (no svc): stop the hub UNIT via the platform manager.
1132
+ // MUST go through the manager — a PID signal would be undone by launchd
1133
+ // KeepAlive / systemd Restart=always (R17). Children die with the hub.
1134
+ if (svc === HUB_SVC || svc === undefined) {
1135
+ const res = r.sup.stopHubUnit(r.sup.hubUnitDeps);
1136
+ for (const m of res.messages) r.log(m);
1137
+ if (res.outcome === "ok") {
1138
+ r.log("✓ hub stopped (all supervised modules stopped with it).");
1139
+ return 0;
1140
+ }
1141
+ return 1;
1142
+ }
1143
+ // `stop <svc>`: a supervised module dies WITH the hub. If the hub isn't
1144
+ // reachable, the module is already down — report success WITHOUT starting the
1145
+ // hub (do NOT ensureHubUnit just to stop one module). Only when the hub is up
1146
+ // do we drive the supervisor's stop.
1147
+ if (!(await r.sup.probeHubHealth(port))) {
1148
+ r.log(`${svc} already stopped (the hub isn't running, so its modules are down).`);
1149
+ return 0;
1150
+ }
1151
+ const { httpError, failed, result } = await driveSupervisorOp(svc, "stop", r);
1152
+ if (httpError) {
1153
+ surfaceModuleOpHttpError(svc, httpError, r);
1154
+ return 1;
1155
+ }
1156
+ if (failed || !result) return 1;
1157
+ r.log(`✓ ${svc} stopped.`);
1158
+ return 0;
1159
+ }
1160
+
1161
+ /** `restart <svc>` / `restart` (no svc) over the supervisor / manager (§3.3). */
1162
+ async function restartViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
1163
+ // `restart hub` / `restart` (no svc): restart the hub UNIT via the platform
1164
+ // manager. NOT a per-module fan-out — restarting the hub re-boots all modules
1165
+ // anyway. MUST go through the manager (never a PID signal, R17).
1166
+ if (svc === HUB_SVC || svc === undefined) {
1167
+ const res = r.sup.restartHubUnit(r.sup.hubUnitDeps);
1168
+ for (const m of res.messages) r.log(m);
1169
+ if (res.outcome === "ok") {
1170
+ r.log("✓ hub restarted (all modules re-booted).");
1171
+ return 0;
1172
+ }
1173
+ return 1;
1174
+ }
1175
+ // `restart <svc>`: ensure the hub is up, then drive supervisor.restart.
1176
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
1177
+ if (!(await ensureHubForOp(r, port))) return 1;
1178
+ const restartRes = await driveSupervisorOp(svc, "restart", r);
1179
+ if (restartRes.httpError) {
1180
+ // 404-fallthrough (§6.2): a module that isn't currently supervised (crashed
1181
+ // out of budget, skipped at boot, installed out-of-band) returns 404
1182
+ // `not_supervised`. `restart` must be total over module state (matching the
1183
+ // detached stop+start), so fall through to a pure `start`.
1184
+ if (restartRes.httpError.status === 404 && restartRes.httpError.code === "not_supervised") {
1185
+ const startRes = await driveSupervisorOp(svc, "start", r);
1186
+ if (startRes.httpError) {
1187
+ surfaceModuleOpHttpError(svc, startRes.httpError, r);
1188
+ return 1;
1189
+ }
1190
+ if (startRes.failed || !startRes.result) return 1;
1191
+ r.log(`✓ ${svc} started.`);
1192
+ return 0;
1193
+ }
1194
+ surfaceModuleOpHttpError(svc, restartRes.httpError, r);
1195
+ return 1;
1196
+ }
1197
+ if (restartRes.failed || !restartRes.result) return 1;
1198
+ r.log(`✓ ${svc} restarted.`);
1199
+ return 0;
872
1200
  }
873
1201
 
874
1202
  /**
@@ -47,6 +47,62 @@ export interface BootedModule {
47
47
  readonly reason?: string;
48
48
  }
49
49
 
50
+ export interface SpawnReqShape {
51
+ short: string;
52
+ cmd: readonly string[];
53
+ cwd?: string;
54
+ env?: Record<string, string>;
55
+ }
56
+
57
+ export interface BuildSpawnRequestOpts {
58
+ /** Config dir ($PARACHUTE_HOME). Used to read the module's per-service `.env`. */
59
+ readonly configDir: string;
60
+ /** Canonical hub origin → child env `PARACHUTE_HUB_ORIGIN`. Skipped when absent. */
61
+ readonly hubOrigin?: string;
62
+ /**
63
+ * Extra env merged on top of the derived env (PORT / .env / HUB_ORIGIN).
64
+ * Wins over all of them. Used by the API `start` handler's test seam +
65
+ * first-boot vault-name pass-through (`spawnEnv`). Empty/absent on the
66
+ * boot path.
67
+ */
68
+ readonly extraEnv?: Record<string, string>;
69
+ }
70
+
71
+ /**
72
+ * Build the `Supervisor.start` request for a single module, identically on
73
+ * both the serve-boot path and the `POST /api/modules/:short/start` handler.
74
+ *
75
+ * Env layering (later wins):
76
+ * 1. `PORT` from the services.json `entry.port` — overrides hub's own PORT
77
+ * so supervised children honor their canonical port assignment
78
+ * (hub#356/#357).
79
+ * 2. per-service `.env` at `<configDir>/<short>/.env` — operator-configured
80
+ * values (e.g. scribe provider keys) override the bare PORT.
81
+ * 3. `PARACHUTE_HUB_ORIGIN` = `opts.hubOrigin` — anchors the child's `iss`
82
+ * expectation to the value hub mints with (hub#365).
83
+ * 4. `opts.extraEnv` — test seam / first-boot pass-through; wins last.
84
+ *
85
+ * `cwd` is set to `entry.installDir` when present (third-party modules ship
86
+ * relative startCmds that need it; first-party fallbacks use absolute / PATH
87
+ * binaries so cwd is a no-op there).
88
+ */
89
+ export function buildModuleSpawnRequest(
90
+ short: string,
91
+ entry: ServiceEntry,
92
+ cmd: readonly string[],
93
+ opts: BuildSpawnRequestOpts,
94
+ ): SpawnReqShape {
95
+ const fileEnv = readEnvFileValues(join(opts.configDir, short, ".env"));
96
+ const env: Record<string, string> = { PORT: String(entry.port), ...fileEnv };
97
+ if (opts.hubOrigin) env[HUB_ORIGIN_ENV] = opts.hubOrigin;
98
+ if (opts.extraEnv) Object.assign(env, opts.extraEnv);
99
+
100
+ const req: SpawnReqShape = { short, cmd };
101
+ if (entry.installDir) req.cwd = entry.installDir;
102
+ if (Object.keys(env).length > 0) req.env = env;
103
+ return req;
104
+ }
105
+
50
106
  /**
51
107
  * Walk services.json, spawn every manageable module via the
52
108
  * supervisor. Returns a per-module decision log so the caller can
@@ -92,32 +148,22 @@ export async function bootSupervisedModules(
92
148
  continue;
93
149
  }
94
150
 
95
- // PORT override (hub#357 — third spawn site missed by hub#356).
96
- // Without this, modules that read process.env.PORT (vault, scribe)
97
- // inherit hub's PORT from Bun.spawn's env: process.env default and
98
- // crash EADDRINUSE on hub's port. Container deploy was still broken
99
- // after #356 because this BOOT path runs on hub startup before the
100
- // supervisor's other spawn paths see any traffic. fileEnv wins on
101
- // collision so per-service .env can still override.
102
- const fileEnv = readEnvFileValues(join(opts.configDir, short, ".env"));
103
- const env: Record<string, string> = { PORT: String(entry.port), ...fileEnv };
104
- if (opts.hubOrigin) env[HUB_ORIGIN_ENV] = opts.hubOrigin;
105
-
106
- const req: {
107
- short: string;
108
- cmd: readonly string[];
109
- cwd?: string;
110
- env?: Record<string, string>;
111
- } = {
112
- short,
113
- cmd,
114
- };
115
- // Third-party modules ship clean relative startCmds — cwd:
116
- // installDir makes them resolve. First-party fallbacks use
117
- // absolute / PATH binaries so cwd is a no-op there.
118
- if (entry.installDir) req.cwd = entry.installDir;
119
- if (Object.keys(env).length > 0) req.env = env;
151
+ // PORT override (hub#357 — third spawn site missed by hub#356), per-service
152
+ // .env merge, and PARACHUTE_HUB_ORIGIN propagation (hub#365) all live in the
153
+ // shared `buildModuleSpawnRequest` so the `POST /api/modules/:short/start`
154
+ // handler builds an identical request (design 2026-06-01 §3.3).
155
+ const req = buildModuleSpawnRequest(short, entry, cmd, {
156
+ configDir: opts.configDir,
157
+ ...(opts.hubOrigin !== undefined ? { hubOrigin: opts.hubOrigin } : {}),
158
+ });
120
159
 
160
+ // Serial await, not Promise.all: `supervisor.start` now carries a bounded
161
+ // post-spawn port-readiness gate (DEFAULT_START_READY_MS), so boot latency
162
+ // is the SUM of each slow-binding module's gate wait before `Bun.serve`
163
+ // comes up. Intentional — sequential boot keeps the start-error/install-card
164
+ // surface ordered and avoids a thundering-herd of port probes. Don't switch
165
+ // to `Promise.all` without accounting for the gate (it'd overlap the waits
166
+ // but also fire N concurrent readiness probes mid-boot).
121
167
  await supervisor.start(req);
122
168
  log(`[supervisor] ${short}: started (cmd=${cmd.join(" ")}).`);
123
169
  results.push({ short, entryName: entry.name, status: "started" });