@openparachute/hub 0.6.2 → 0.6.3-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +87 -35
  2. package/package.json +1 -1
  3. package/src/__tests__/api-hub-upgrade.test.ts +690 -0
  4. package/src/__tests__/api-modules-ops.test.ts +359 -3
  5. package/src/__tests__/api-modules.test.ts +54 -0
  6. package/src/__tests__/expose-cloudflare.test.ts +163 -72
  7. package/src/__tests__/expose-off-auto.test.ts +26 -1
  8. package/src/__tests__/expose.test.ts +260 -240
  9. package/src/__tests__/hub-control.test.ts +1 -242
  10. package/src/__tests__/hub-server.test.ts +64 -0
  11. package/src/__tests__/hub-unit.test.ts +574 -0
  12. package/src/__tests__/init.test.ts +219 -2
  13. package/src/__tests__/lifecycle.test.ts +416 -1448
  14. package/src/__tests__/managed-unit.test.ts +575 -0
  15. package/src/__tests__/migrate-cutover.test.ts +840 -0
  16. package/src/__tests__/migrate-offer.test.ts +240 -0
  17. package/src/__tests__/migrate.test.ts +132 -0
  18. package/src/__tests__/module-ops-client.test.ts +556 -0
  19. package/src/__tests__/port-probe.test.ts +23 -0
  20. package/src/__tests__/setup-wizard.test.ts +130 -0
  21. package/src/__tests__/status-supervisor.test.ts +504 -0
  22. package/src/__tests__/status.test.ts +157 -708
  23. package/src/__tests__/supervisor.test.ts +471 -6
  24. package/src/__tests__/upgrade.test.ts +351 -5
  25. package/src/api-hub-upgrade.ts +384 -0
  26. package/src/api-hub.ts +2 -1
  27. package/src/api-modules-ops.ts +221 -0
  28. package/src/api-modules.ts +18 -2
  29. package/src/cli.ts +97 -12
  30. package/src/cloudflare/connector-service.ts +117 -322
  31. package/src/commands/expose-cloudflare.ts +63 -71
  32. package/src/commands/expose-supervisor.ts +247 -0
  33. package/src/commands/expose.ts +59 -48
  34. package/src/commands/init.ts +225 -12
  35. package/src/commands/lifecycle.ts +455 -816
  36. package/src/commands/migrate-cutover.ts +837 -0
  37. package/src/commands/migrate.ts +71 -2
  38. package/src/commands/serve-boot.ts +71 -25
  39. package/src/commands/status.ts +535 -235
  40. package/src/commands/upgrade.ts +100 -2
  41. package/src/help.ts +128 -68
  42. package/src/hub-control.ts +23 -162
  43. package/src/hub-server.ts +39 -0
  44. package/src/hub-unit.ts +735 -0
  45. package/src/hub-upgrade-helper.ts +306 -0
  46. package/src/hub-upgrade-mode.ts +209 -0
  47. package/src/hub-upgrade-status.ts +150 -0
  48. package/src/managed-unit.ts +692 -0
  49. package/src/migrate-offer.ts +186 -0
  50. package/src/module-ops-client.ts +457 -0
  51. package/src/port-probe.ts +50 -0
  52. package/src/process-state.ts +19 -3
  53. package/src/setup-wizard.ts +80 -1
  54. package/src/supervisor.ts +389 -38
  55. package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
  56. package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
  57. package/web/ui/dist/index.html +2 -2
  58. package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
@@ -1,66 +1,49 @@
1
- import { existsSync, openSync, readFileSync } from "node:fs";
2
- import { Socket } from "node:net";
1
+ import { existsSync } from "node:fs";
3
2
  import { join } from "node:path";
4
- import {
5
- MissingDependencyError,
6
- ensureExecutable,
7
- rethrowIfMissing,
8
- } from "@openparachute/depcheck";
3
+ import { rethrowIfMissing } from "@openparachute/depcheck";
9
4
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
10
- import { readEnvFileValues } from "../env-file.ts";
11
5
  import { readExposeState } from "../expose-state.ts";
12
- import {
13
- type EnsureHubOpts,
14
- type EnsureHubResult,
15
- HUB_SVC,
16
- type StopHubOpts,
17
- ensureHubRunning,
18
- readHubPort,
19
- stopHub,
20
- } from "../hub-control.ts";
6
+ import { HUB_SVC, readHubPort } from "../hub-control.ts";
21
7
  import { hubDbPath, openHubDb } from "../hub-db.ts";
22
- import { HUB_ORIGIN_ENV, deriveHubOrigin } from "../hub-origin.ts";
23
- import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
24
- import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
8
+ import { deriveHubOrigin } from "../hub-origin.ts";
25
9
  import {
26
- type AliveFn,
27
- clearPid,
28
- ensureLogPath,
29
- logPath as logPathFor,
30
- processState,
31
- readPid,
32
- writePid,
33
- } from "../process-state.ts";
10
+ type EnsureHubUnitOpts,
11
+ type EnsureHubUnitResult,
12
+ HUB_UNIT_DEFAULT_PORT,
13
+ type HubUnitDeps,
14
+ type HubUnitManagerOpResult,
15
+ defaultHubUnitDeps,
16
+ ensureHubUnit as ensureHubUnitImpl,
17
+ isHubUnitInstalled,
18
+ restartHubUnit as restartHubUnitImpl,
19
+ stopHubUnit as stopHubUnitImpl,
20
+ } from "../hub-unit.ts";
34
21
  import {
35
- KNOWN_MODULES,
36
- type ServiceSpec,
37
- composeKnownModuleSpec,
38
- getSpec,
39
- getSpecFromInstallDir,
40
- knownServices,
41
- shortNameForManifest,
42
- } from "../service-spec.ts";
22
+ type MigrateOfferOpts,
23
+ type MigrateOfferResult,
24
+ offerMigrateToSupervised,
25
+ } from "../migrate-offer.ts";
43
26
  import {
44
- type ServiceEntry,
45
- clearStartError,
46
- readManifest,
47
- recordStartError,
48
- } from "../services-manifest.ts";
49
- import { persistVaultHubOrigin, selfHealVaultHubOrigin } from "../vault-hub-origin-env.ts";
27
+ type DriveModuleOpDeps,
28
+ type ModuleOp,
29
+ ModuleOpHttpError,
30
+ type ModuleOpResult,
31
+ NoOperatorTokenError,
32
+ OperatorTokenExpiredError,
33
+ driveModuleOp as driveModuleOpImpl,
34
+ } from "../module-ops-client.ts";
35
+ import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
36
+ import { type AliveFn, logPath as logPathFor, processState } from "../process-state.ts";
37
+ import { getSpec, knownServices } from "../service-spec.ts";
38
+ import { readManifest } from "../services-manifest.ts";
50
39
 
51
40
  /**
52
- * Tiny seam over `Bun.spawn` for lifecycle tests. The real spawner opens the
53
- * log file, appends stdout+stderr to it, and `unref()`s the child so parent
54
- * exit doesn't bring it down.
55
- *
56
- * `env`, when provided, is merged into the child's environment on top of the
57
- * parent's — today's only caller is `start`, which injects
58
- * PARACHUTE_HUB_ORIGIN so vault's OAuth issuer matches the hub URL.
59
- *
60
- * `cwd`, when provided, is the child's working directory. Set to the
61
- * service's installDir for third-party modules so manifest-declared
62
- * relative startCmds (e.g. `["bun", "web/server/src/server.ts"]`) resolve
63
- * against the package root.
41
+ * Tiny seam over `Bun.spawn`, retained for the `parachute logs <svc> --follow`
42
+ * tail (`LogsOpts.tailSpawner`). The detached MODULE spawner (`defaultSpawner`)
43
+ * was retired in Phase 5b — modules are spawned by the supervisor under `serve`,
44
+ * not by a detached pidfile daemon. `logs` is the last consumer of this seam, and
45
+ * its tail only needs `cmd` (the `opts` is unused there but kept on the interface
46
+ * for a future caller).
64
47
  */
65
48
  export interface SpawnerOptions {
66
49
  env?: Record<string, string>;
@@ -71,81 +54,32 @@ export interface Spawner {
71
54
  spawn(cmd: readonly string[], logFile: string, opts?: SpawnerOptions): number;
72
55
  }
73
56
 
74
- export const defaultSpawner: Spawner = {
75
- spawn(cmd, logFile, opts) {
76
- const fd = openSync(logFile, "a");
77
- const spawnOpts: Parameters<typeof Bun.spawn>[1] = {
78
- stdio: ["ignore", fd, fd],
79
- // Spawn in a fresh process group (pid == pgid) so kill(-pid, sig)
80
- // reaches every descendant, not just the wrapper. Without this,
81
- // wrapped startCmds like `pnpm exec tsx server.ts` leave the tsx
82
- // grandchild bound to the port after stop → restart hits EADDRINUSE.
83
- detached: true,
84
- // Inherit env so child sees PATH, HOME, PARACHUTE_HOME, etc.
85
- // Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
86
- // Per-call `opts.env` overrides merge on top below.
87
- env: process.env,
88
- };
89
- if (opts?.env) spawnOpts.env = { ...process.env, ...opts.env };
90
- if (opts?.cwd) spawnOpts.cwd = opts.cwd;
91
- const proc = Bun.spawn([...cmd], spawnOpts);
92
- proc.unref();
93
- return proc.pid;
94
- },
95
- };
96
-
97
57
  export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
98
58
  export type SleepFn = (ms: number) => Promise<void>;
99
59
 
100
60
  /**
101
- * "Is something listening on this TCP port on loopback?" seam. Pairs with the
102
- * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
103
- * shape (hub#487): a service that lives long enough to clear the liveness
104
- * check but never binds its port because the port is already held (EADDRINUSE
105
- * from an orphan). The recorded pid stays alive (vault's process supervisor
106
- * retries / lingers) so `alive(pid)` says "running" while `parachute status`
107
- * shows it inactive because nothing answers on the port.
108
- *
109
- * Tests inject a deterministic stub; production uses `defaultPortListening`.
61
+ * Port-readiness probe seam + its production impl now live in `port-probe.ts`
62
+ * (design 2026-06-01 §6.5) so the supervisor can share the exact same TCP
63
+ * connect-probe without dragging lifecycle's heavy import graph. Re-exported
64
+ * here so this module's public API (and its tests) are unchanged. Pairs with
65
+ * the spawn-then-die settle (hub#194) to catch the alive-but-never-bound shape
66
+ * (hub#487): a service that clears the liveness check but never binds its port
67
+ * because it's already held — `alive(pid)` says "running" while `status` shows
68
+ * it inactive because nothing answers on the port.
110
69
  */
111
- export type PortListeningFn = (port: number) => Promise<boolean>;
112
-
113
- /**
114
- * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
115
- * accepted. A successful connect means *something* is listening; we close
116
- * immediately. Connection refused / timeout means nothing is bound yet.
117
- * `node:net` rather than `Bun.connect` because the latter has no clean
118
- * "connection refused → false" without a custom socket handler, and the net
119
- * Socket's `error`/`connect` events map directly onto the boolean we want.
120
- */
121
- export const defaultPortListening: PortListeningFn = (port) =>
122
- new Promise((resolve) => {
123
- const socket = new Socket();
124
- let settled = false;
125
- const done = (listening: boolean) => {
126
- if (settled) return;
127
- settled = true;
128
- socket.destroy();
129
- resolve(listening);
130
- };
131
- socket.setTimeout(1000);
132
- socket.once("connect", () => done(true));
133
- socket.once("timeout", () => done(false));
134
- socket.once("error", () => done(false));
135
- socket.connect(port, "127.0.0.1");
136
- });
70
+ export { type PortListeningFn, defaultPortListening };
137
71
 
138
72
  /**
139
73
  * Group-aware liveness: returns true if the process group (pgid == pid)
140
- * still has any member. Pairs with `defaultSpawner`'s `detached: true`
141
- * the recorded pid is the pgid we created, so the group's existence is
142
- * the right "is the service still up?" signal (catches the wrapper-dead-
143
- * but-grandchild-listening case that causes EADDRINUSE on restart).
74
+ * still has any member. The detached module spawner that created these process
75
+ * groups is retired (Phase 5b — the supervisor under `serve` owns module
76
+ * spawning now, with its own group-spawn + `defaultKillGroup` in `supervisor.ts`),
77
+ * but this stays as the liveness primitive for `parachute logs`'s
78
+ * "running-but-no-logfile" diagnostic over any pidfile still on disk (the readers
79
+ * §7.5 keeps for one release).
144
80
  *
145
- * Falls back to a single-pid check for legacy pidfiles written before
146
- * detached-spawn landed: `kill(-pid, 0)` returns ESRCH because no group
147
- * with that pgid exists, and we still want to honor the bare-pid alive
148
- * signal so a follow-up `stop` runs.
81
+ * Falls back to a single-pid check when no group with that pgid exists:
82
+ * `kill(-pid, 0)` returns ESRCH, and we still honor the bare-pid alive signal.
149
83
  */
150
84
  export const defaultAlive: AliveFn = (pid) => {
151
85
  try {
@@ -163,12 +97,13 @@ export const defaultAlive: AliveFn = (pid) => {
163
97
  };
164
98
 
165
99
  /**
166
- * Sends `signal` to the entire process group rooted at `pid`. With
167
- * `defaultSpawner` putting the child in its own group, this reaches the
168
- * wrapper and any grandchildren in one syscall. ESRCH on the group send
169
- * means the pgid is gone (legacy pidfile, or the leader exited and the
170
- * group emptied) fall back to a bare-pid signal so the caller's intent
171
- * still lands when there's a positive-pid process to receive it.
100
+ * Sends `signal` to the entire process group rooted at `pid`. Reaches a wrapper
101
+ * and any grandchildren in one syscall when the pid is a group leader. ESRCH on
102
+ * the group send means the pgid is gone (the leader exited and the group emptied,
103
+ * or a non-group pid) — fall back to a bare-pid signal so the caller's intent
104
+ * still lands. The supervisor's `defaultKillGroup` (supervisor.ts) is the
105
+ * production reaper now; this export survives for the group-aware test coverage
106
+ * + any future on-box use.
172
107
  */
173
108
  export const defaultKill: KillFn = (pid, signal) => {
174
109
  try {
@@ -181,232 +116,258 @@ export const defaultKill: KillFn = (pid, signal) => {
181
116
 
182
117
  export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
183
118
 
184
- /**
185
- * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
186
- * real boot error when a start fails — operators shouldn't have to manually
187
- * `tail` the log to learn *why* the daemon died. Returns [] on any read
188
- * error (missing file, permissions) so the caller falls back to the generic
189
- * "tail the log" hint without throwing.
190
- */
191
- function readLogTail(logFile: string, n: number): string[] {
192
- try {
193
- const content = readFileSync(logFile, "utf8");
194
- const trimmed = content.replace(/\n$/, "");
195
- if (trimmed === "") return [];
196
- return trimmed.split("\n").slice(-n);
197
- } catch {
198
- return [];
199
- }
200
- }
201
-
202
- /**
203
- * Heuristic EADDRINUSE detector over a logfile tail. cloudflared, Bun, and
204
- * Node all surface port collisions with recognizable phrases; we match the
205
- * common ones rather than parse a structured error (there isn't one across
206
- * runtimes). False positives are harmless — the worst case is we *also* print
207
- * the port-in-use remedy on an unrelated failure, which is still actionable.
208
- */
209
- function detectAddrInUse(logTail: readonly string[]): boolean {
210
- return logTail.some((line) => /EADDRINUSE|address already in use|port .* in use/i.test(line));
211
- }
212
-
213
119
  export interface LifecycleOpts {
214
- spawner?: Spawner;
215
- kill?: KillFn;
216
- alive?: AliveFn;
217
- sleep?: SleepFn;
218
- now?: () => number;
219
120
  manifestPath?: string;
220
121
  configDir?: string;
221
122
  log?: (line: string) => void;
222
- /** How long stop waits for SIGTERM before escalating to SIGKILL. */
223
- killWaitMs?: number;
224
- /** Poll interval while waiting for SIGTERM to land. */
225
- pollIntervalMs?: number;
226
123
  /**
227
- * How long `start` sleeps before re-checking `alive(pid)` to catch the
228
- * spawn-then-immediately-die failure shape (hub#194: notes-serve crashed
229
- * 50ms in on Bun.resolveSync, but `start` reported success because the
230
- * spawn returned a pid). 250ms is the default in production — long
231
- * enough to catch real silent-crashes (resolve failures, port
232
- * collisions, missing args) without making `parachute start` feel
233
- * laggy.
234
- *
235
- * Defaulting policy: if `alive` is not overridden, the settle defaults
236
- * to 0 (skipped). Stub spawners hand back fake pids that the real
237
- * `defaultAlive` would mark as dead, which would make every existing
238
- * stub-spawner test fail spuriously. Tests that want to exercise the
239
- * settle path inject both `alive` and `startSettleMs` explicitly.
240
- * Production paths use the real `defaultAlive` and get the real 250ms
241
- * settle.
242
- */
243
- startSettleMs?: number;
244
- /**
245
- * Probe whether the service's port is listening, post-spawn. Pairs with the
246
- * settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
247
- * process survives the liveness window (vault lingers / retries) but never
248
- * binds because the port is already held, so `start` would otherwise report
249
- * "✓ started" while `status` shows it inactive. Tests inject a stub;
250
- * production uses `defaultPortListening` (a loopback TCP connect probe).
251
- */
252
- portListening?: PortListeningFn;
253
- /**
254
- * How long `start` polls for the service to bind its port after the
255
- * liveness settle passes. Default 4000ms in production — long enough to
256
- * cover vault/scribe cold-boot (DB open, route registration) without making
257
- * a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
258
- * first time the port answers we declare success. If the window elapses
259
- * with the process still alive but the port silent, we print a non-fatal
260
- * warning (the daemon may still be coming up) rather than failing — only a
261
- * *dead* process is a hard failure. Defaulting policy mirrors
262
- * `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
263
- * production path (no spawner override) is active.
264
- */
265
- startReadyMs?: number;
266
- /** Poll interval while waiting for the port to come up. Default 200ms. */
267
- startReadyPollMs?: number;
268
- /**
269
- * Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
270
- * unset, `start` derives it from `expose-state.json` (when exposed) or
271
- * the hub.port file (local dev). Undefined → no env var is set at all,
272
- * and the service advertises its own default issuer.
124
+ * Override the hub origin used as the operator token's `iss` validator on the
125
+ * loopback module-ops call. If unset, derived from `expose-state.json` (when
126
+ * exposed) or the hub.port file (local dev).
273
127
  */
274
128
  hubOrigin?: string;
275
129
  /**
276
- * Hub-lifecycle seams for `parachute start|stop|restart hub`. The hub
277
- * doesn't go through the generic services-manifest path because its
278
- * start has special semantics (port-fallback probe, port-file write,
279
- * --issuer flag) — `lifecycle.start("hub")` dispatches to
280
- * `ensureHubRunning` and `lifecycle.stop("hub")` dispatches to
281
- * `stopHub`. Tests inject stubs to avoid spawning real bun processes.
282
- */
283
- /**
284
- * PATH-resolution seam for the start preflight (`@openparachute/depcheck`
285
- * `ensureExecutable`). Production uses the real `Bun.which`; a missing
286
- * startCmd binary then surfaces the friendly missing-dependency UX +
287
- * persists it to services.json.
130
+ * Supervisor-path seams (design §3.3) — the ONLY runtime as of Phase 5b.
131
+ * `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
132
+ * loopback module-ops API (per-module verbs) or the platform manager (hub
133
+ * verbs / no-svc). The detached spawners are retired; a box with no hub unit
134
+ * goes through the §7.5 auto-offer / actionable error (`migrateOffer`), never
135
+ * a detached spawn.
288
136
  *
289
- * Defaulting policy mirrors `startSettleMs`: when a stub `spawner` is
290
- * injected (the test path) `which` defaults to a permissive resolver
291
- * (`() => "<stub>"`) so existing stub-spawner tests don't trip the preflight
292
- * against binaries that aren't on the test host's PATH (`parachute-vault`,
293
- * `notes-serve`). Production (no spawner override) gets the real `Bun.which`.
294
- * Tests that want to exercise the missing-binary branch inject `which`
295
- * explicitly (e.g. `which: () => null`).
137
+ * Everything here is injectable so tests can (a) force the unit-installed
138
+ * branch without a real launchd/systemd, and (b) assert the module-ops /
139
+ * manager calls without a live hub. Production wires the real
140
+ * {@link driveModuleOp} / {@link ensureHubUnit} / {@link stopHubUnit} /
141
+ * {@link restartHubUnit} against an opened hub.db + the resolved hub origin.
142
+ *
143
+ * `unitInstalled` is the discriminant that decides whether the box is already
144
+ * supervised. When OMITTED entirely it defaults to `false` → the verb runs the
145
+ * no-unit path (auto-offer / error). The production CLI dispatch passes
146
+ * `supervisor: {}` so the real `isHubUnitInstalled` probe decides.
296
147
  */
297
- which?: (cmd: string) => string | null;
298
- hub?: {
299
- ensureRunning?: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
300
- stop?: (opts: StopHubOpts) => Promise<boolean>;
148
+ supervisor?: {
149
+ /**
150
+ * Is a hub unit installed (the dual-dispatch discriminant)? Production
151
+ * uses `isHubUnitInstalled(hubUnitDeps)`. Tests set this `true`/`false`
152
+ * directly to pick the branch deterministically. When set, it wins over
153
+ * the `hubUnitDeps`-derived detection.
154
+ */
155
+ unitInstalled?: boolean;
156
+ /** Deps for the real `isHubUnitInstalled` probe + the hub-unit manager ops. */
157
+ hubUnitDeps?: HubUnitDeps;
158
+ /** Drive a per-module op against the running hub (reads operator.token). */
159
+ driveModuleOp?: (
160
+ short: string,
161
+ op: ModuleOp,
162
+ deps: DriveModuleOpDeps,
163
+ ) => Promise<ModuleOpResult>;
164
+ /** Ensure the hub unit is up before a module op (§3.2). */
165
+ ensureHubUnit?: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
166
+ /** Stop the hub unit via the platform manager (NEVER a PID signal, §3.3). */
167
+ stopHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
168
+ /** Restart the hub unit via the platform manager (NEVER a PID signal, §3.3). */
169
+ restartHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
301
170
  /**
302
- * Self-heal the operator token's stale `iss` after `start hub` (hub#481).
303
- * Production opens hub.db at `<configDir>/hub.db` and delegates to
304
- * `selfHealOperatorTokenIssuer`. Tests inject a stub to assert the call
305
- * happens or to make it throw and prove a self-heal failure never fails
306
- * `start hub`.
171
+ * Probe whether the loopback hub answers `/health`. Used by `stop <svc>`:
172
+ * if the hub is down, the supervised module is already down (children die
173
+ * with the hub) — report "already stopped" WITHOUT starting the hub.
174
+ * Production reuses the hub-unit deps' `probeHealth`.
307
175
  */
308
- selfHealOperatorToken?: (args: {
309
- issuer: string;
310
- configDir: string;
311
- log: (line: string) => void;
312
- }) => Promise<OperatorIssuerHealStatus>;
176
+ probeHubHealth?: (port: number) => Promise<boolean>;
177
+ /**
178
+ * Open the hub DB used to validate/auto-rotate the operator token in
179
+ * `driveModuleOp`. Production opens `<configDir>/hub.db`; tests inject an
180
+ * in-memory/seeded db. Returns a handle the caller closes.
181
+ */
182
+ openDb?: (configDir: string) => import("bun:sqlite").Database;
183
+ /** Loopback hub base URL override (default derives from the hub port). */
184
+ baseUrl?: string;
185
+ };
186
+ /**
187
+ * §7.5 auto-detect-and-offer seam. When a verb takes the DETACHED arm (no hub
188
+ * unit installed) and a prior detached install is detected, the verb offers
189
+ * the supervised cutover (interactive) or prints the command (non-TTY) BEFORE
190
+ * doing detached work. Injectable so tests can (a) stub the offer to assert it
191
+ * fires / migrates / declines, and (b) DISABLE it entirely (`enabled:false`)
192
+ * so the hundreds of existing detached-arm lifecycle tests don't trip an
193
+ * interactive prompt. Production wires the real `offerMigrateToSupervised`.
194
+ *
195
+ * Default when OMITTED: disabled, so existing tests (which never opt in) stay
196
+ * deterministic. The production CLI dispatch passes `{ enabled: true }`.
197
+ */
198
+ migrateOffer?: {
199
+ /** Master switch. Default `false` when the whole block is omitted. */
200
+ enabled?: boolean;
201
+ /** The offer implementation (default `offerMigrateToSupervised`). */
202
+ offer?: (opts: MigrateOfferOpts) => Promise<MigrateOfferResult>;
313
203
  };
314
204
  }
315
205
 
316
206
  interface Resolved {
317
- spawner: Spawner;
318
- kill: KillFn;
319
- alive: AliveFn;
320
- sleep: SleepFn;
321
- now: () => number;
322
207
  manifestPath: string;
323
208
  configDir: string;
324
209
  log: (line: string) => void;
325
- killWaitMs: number;
326
- pollIntervalMs: number;
327
- startSettleMs: number;
328
- portListening: PortListeningFn;
329
- startReadyMs: number;
330
- startReadyPollMs: number;
331
- which: (cmd: string) => string | null;
332
210
  hubOrigin: string | undefined;
333
- ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
334
- stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
335
- selfHealOperatorTokenFn: (args: {
336
- issuer: string;
337
- configDir: string;
338
- log: (line: string) => void;
339
- }) => Promise<OperatorIssuerHealStatus>;
211
+ sup: ResolvedSupervisor;
212
+ /** §7.5 resolved auto-offer (enabled flag + the offer impl). */
213
+ migrateOffer: {
214
+ enabled: boolean;
215
+ offer: (opts: MigrateOfferOpts) => Promise<MigrateOfferResult>;
216
+ };
340
217
  }
341
218
 
342
- /**
343
- * Production self-heal: open hub.db at `<configDir>/hub.db`, run
344
- * `selfHealOperatorTokenIssuer`, and close the db. Derives the db path the
345
- * same way the rest of the repo does (`hubDbPath(configDir)`); `openHubDb`
346
- * runs migrations + WAL on open, matching `commands/auth.ts`. Tests override
347
- * this whole seam, so the db-open only happens on the production path.
348
- */
349
- async function defaultSelfHealOperatorToken(args: {
350
- issuer: string;
351
- configDir: string;
352
- log: (line: string) => void;
353
- }): Promise<OperatorIssuerHealStatus> {
354
- const db = openHubDb(hubDbPath(args.configDir));
355
- try {
356
- return await selfHealOperatorTokenIssuer(db, {
357
- issuer: args.issuer,
358
- configDir: args.configDir,
359
- log: args.log,
360
- });
361
- } finally {
362
- db.close();
363
- }
219
+ /** Resolved supervisor-path seams (see `LifecycleOpts.supervisor`). */
220
+ interface ResolvedSupervisor {
221
+ /** Whether a hub unit is installed — the dual-dispatch discriminant. */
222
+ unitInstalled: boolean;
223
+ hubUnitDeps: HubUnitDeps;
224
+ driveModuleOp: (short: string, op: ModuleOp, deps: DriveModuleOpDeps) => Promise<ModuleOpResult>;
225
+ ensureHubUnit: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
226
+ stopHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
227
+ restartHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
228
+ probeHubHealth: (port: number) => Promise<boolean>;
229
+ openDb: (configDir: string) => import("bun:sqlite").Database;
230
+ baseUrl: string | undefined;
364
231
  }
365
232
 
366
233
  function resolve(opts: LifecycleOpts): Resolved {
367
234
  const configDir = opts.configDir ?? CONFIG_DIR;
368
235
  return {
369
- spawner: opts.spawner ?? defaultSpawner,
370
- kill: opts.kill ?? defaultKill,
371
- alive: opts.alive ?? defaultAlive,
372
- sleep: opts.sleep ?? defaultSleep,
373
- now: opts.now ?? Date.now,
374
236
  manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
375
237
  configDir,
376
238
  log: opts.log ?? ((line) => console.log(line)),
377
- killWaitMs: opts.killWaitMs ?? 10_000,
378
- pollIntervalMs: opts.pollIntervalMs ?? 200,
379
- // See `LifecycleOpts.startSettleMs` doc. Production (no spawner
380
- // override, no alive override) gets the 250ms settle. Tests that
381
- // inject a stub spawner without a stub alive get 0 — `defaultAlive`
382
- // against a fake pid would always report dead and break unrelated
383
- // tests. Tests that want to exercise the settle path explicitly
384
- // override `alive`, which re-enables the default 250ms.
385
- startSettleMs:
386
- opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
387
- portListening: opts.portListening ?? defaultPortListening,
388
- // Same defaulting policy as startSettleMs: production (no spawner
389
- // override) gets the real 4s readiness window; tests that inject a stub
390
- // spawner get 0 (skipped) unless they explicitly opt in via
391
- // `portListening` or `startReadyMs`, so existing stub-spawner tests don't
392
- // start probing a fake port.
393
- startReadyMs:
394
- opts.startReadyMs ??
395
- (opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
396
- startReadyPollMs: opts.startReadyPollMs ?? 200,
397
- // Same defaulting policy as startSettleMs/startReadyMs: production (no
398
- // spawner override) preflights with the real Bun.which; stub-spawner tests
399
- // get a permissive resolver so the preflight doesn't trip against binaries
400
- // that aren't on the test host's PATH. Explicit `which` always wins.
401
- which:
402
- opts.which ?? (opts.spawner === undefined ? Bun.which : () => "/stub/bin/preflight-skipped"),
403
239
  hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
404
- ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
405
- stopHubFn: opts.hub?.stop ?? stopHub,
406
- selfHealOperatorTokenFn: opts.hub?.selfHealOperatorToken ?? defaultSelfHealOperatorToken,
240
+ sup: resolveSupervisor(opts.supervisor),
241
+ migrateOffer: {
242
+ // Default OFF when omitted so the existing supervised-arm + no-unit
243
+ // lifecycle tests (which don't opt in) don't trip an interactive prompt.
244
+ // The production CLI dispatch passes `{ enabled: true }`.
245
+ enabled: opts.migrateOffer?.enabled ?? false,
246
+ offer: opts.migrateOffer?.offer ?? offerMigrateToSupervised,
247
+ },
248
+ };
249
+ }
250
+
251
+ /**
252
+ * Resolve the supervisor-path seams.
253
+ *
254
+ * The discriminant `unitInstalled` decides whether the box is already supervised:
255
+ * - When the caller PROVIDES a `supervisor` block (even `{}`, which the
256
+ * production CLI dispatch passes), `unitInstalled` is the explicit override
257
+ * if set, else the real `isHubUnitInstalled` probe over the hub-unit deps.
258
+ * - When the caller OMITS `supervisor` entirely, `unitInstalled` defaults to
259
+ * `false` → the verb runs the no-unit path (§7.5 auto-offer / actionable
260
+ * error). Deterministic regardless of whether the test host happens to have a
261
+ * real hub unit installed.
262
+ */
263
+ function resolveSupervisor(opts: LifecycleOpts["supervisor"]): ResolvedSupervisor {
264
+ const hubUnitDeps = opts?.hubUnitDeps ?? defaultHubUnitDeps;
265
+ // No `supervisor` block at all → no-unit path, deterministically. Only probe
266
+ // the real filesystem when the caller opted in (production CLI passes
267
+ // `supervisor: {}`; tests pass the seams they want to assert).
268
+ const unitInstalled =
269
+ opts === undefined ? false : (opts.unitInstalled ?? isHubUnitInstalled(hubUnitDeps));
270
+ return {
271
+ unitInstalled,
272
+ hubUnitDeps,
273
+ driveModuleOp: opts?.driveModuleOp ?? driveModuleOpImpl,
274
+ ensureHubUnit: opts?.ensureHubUnit ?? ensureHubUnitImpl,
275
+ stopHubUnit: opts?.stopHubUnit ?? stopHubUnitImpl,
276
+ restartHubUnit: opts?.restartHubUnit ?? restartHubUnitImpl,
277
+ probeHubHealth: opts?.probeHubHealth ?? hubUnitDeps.probeHealth,
278
+ openDb: opts?.openDb ?? ((configDir) => openHubDb(hubDbPath(configDir))),
279
+ baseUrl: opts?.baseUrl,
407
280
  };
408
281
  }
409
282
 
283
+ /**
284
+ * §7.5 auto-detect-and-offer hook for the no-unit case of start/stop/restart.
285
+ *
286
+ * Called when a verb finds NO hub unit installed (Phase 5b removed the detached
287
+ * spawners, so there is no detached arm to fall back to). When the offer is
288
+ * enabled, it runs `offerMigrateToSupervised` (which itself checks "no unit +
289
+ * prior detached" and prompts / prints). Returns `true` ONLY when the operator
290
+ * accepted AND the cutover succeeded — i.e. the box is NOW supervised, so the
291
+ * caller can dispatch through the supervisor path. Every other outcome (offer
292
+ * disabled, no-offer, declined, printed in a non-TTY, migrate-failed) returns
293
+ * `false` → the caller surfaces the actionable "run `parachute migrate
294
+ * --to-supervised`" error (NOT a detached spawn — that path is gone).
295
+ *
296
+ * The migrate-failed case deliberately returns `false`: a failed cutover leaves
297
+ * the box un-migrated (the cutover is fail-safe + re-runnable), so the verb
298
+ * surfaces the error rather than dispatching into a supervisor that isn't up.
299
+ */
300
+ async function maybeOfferAndMigrate(r: Resolved): Promise<boolean> {
301
+ if (!r.migrateOffer.enabled) return false;
302
+ const result = await r.migrateOffer.offer({
303
+ configDir: r.configDir,
304
+ manifestPath: r.manifestPath,
305
+ log: r.log,
306
+ });
307
+ if (result.outcome === "migrated") {
308
+ // The box is now supervised. Flip the resolved discriminant so the verb
309
+ // takes the supervisor arm (the unit is freshly installed; `unitInstalled`
310
+ // was resolved as false before the offer).
311
+ r.sup.unitInstalled = true;
312
+ return true;
313
+ }
314
+ return false;
315
+ }
316
+
317
+ /**
318
+ * Phase 5b single-path gate (the point-of-no-return). The supervised path is the
319
+ * ONLY runtime — the detached spawners are retired. So every per-module verb must
320
+ * first establish that a hub unit is installed; if it isn't, there is no detached
321
+ * fallback to take. Resolution order:
322
+ *
323
+ * 1. Unit installed → ready; dispatch through the supervisor.
324
+ * 2. No unit → run the §7.5 auto-detect-and-offer. If the operator accepts the
325
+ * cutover and it succeeds, the box is now supervised → ready.
326
+ * 3. Still no unit (offer disabled / no prior-detached evidence / declined /
327
+ * printed in a non-TTY / migrate-failed) → surface the actionable error and
328
+ * return NOT ready. The verb returns a non-zero exit; it NEVER spawns a
329
+ * detached daemon (that machinery is gone).
330
+ *
331
+ * The offer itself logs its own context (interactive prompt / printed command),
332
+ * so when it fired we don't double-print the bare error. We only print the
333
+ * actionable fallback line when no offer was surfaced (offer disabled or no
334
+ * prior-detached evidence — a genuinely-unmigrated or clean box driven by a
335
+ * script).
336
+ */
337
+ async function requireSupervisedOrOffer(r: Resolved): Promise<boolean> {
338
+ if (r.sup.unitInstalled) return true;
339
+ const migrated = await maybeOfferAndMigrate(r);
340
+ if (migrated) return true;
341
+ // No unit and not migrated. If the offer was enabled it already surfaced its
342
+ // own guidance (prompt / printed command / declined note); otherwise print the
343
+ // actionable command so a script on a never-migrated box isn't left guessing.
344
+ if (!r.migrateOffer.enabled) {
345
+ r.log(
346
+ "No supervised hub unit is installed. Run `parachute migrate --to-supervised` to install it,",
347
+ );
348
+ r.log("or run `parachute serve` in the foreground.");
349
+ }
350
+ return false;
351
+ }
352
+
353
+ /**
354
+ * Resolve the hub origin used as the operator token's `iss` validator in the
355
+ * supervisor path. Unlike {@link resolveHubOrigin} (which returns `undefined`
356
+ * for pure loopback so the spawn env omits PARACHUTE_HUB_ORIGIN), the operator
357
+ * token ALWAYS carries an `iss`, so this falls back to the canonical loopback
358
+ * origin. Mirrors `commands/auth.ts`'s `resolveHubIssuer` so the issuer the CLI
359
+ * validates the token against matches what `auth rotate-operator` minted under.
360
+ * The fallback differs cosmetically — here `readHubPort(configDir) ??
361
+ * HUB_UNIT_DEFAULT_PORT`, in auth.ts `127.0.0.1:${HUB_DEFAULT_PORT}` — but both
362
+ * resolve to 1939 under canonical-ports today, so they agree in practice.
363
+ * See #508: consolidate with auth.ts:resolveHubIssuer to prevent drift.
364
+ */
365
+ function resolveOperatorTokenIssuer(hubOrigin: string | undefined, configDir: string): string {
366
+ if (hubOrigin) return hubOrigin;
367
+ const port = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;
368
+ return `http://127.0.0.1:${port}`;
369
+ }
370
+
410
371
  /**
411
372
  * Source of truth order for `PARACHUTE_HUB_ORIGIN`:
412
373
  * 1. explicit override (flag / opt)
@@ -422,538 +383,216 @@ function resolveHubOrigin(override: string | undefined, configDir: string): stri
422
383
  return deriveHubOrigin({ exposeFqdn, hubPort: readHubPort(configDir) });
423
384
  }
424
385
 
425
- interface ResolvedTarget {
426
- short: string;
427
- entry: ServiceEntry;
428
- /**
429
- * Lifecycle spec resolved at request time. First-party comes from
430
- * `getSpec(short)`; third-party comes from
431
- * `getSpecFromInstallDir(entry.installDir, ...)`. May be undefined when
432
- * a row has neither — `start` prints the actionable "no installDir"
433
- * re-install message for an installDir-less third-party row, or
434
- * "lifecycle not yet supported" otherwise; `stop`/`logs` keep working
435
- * via pidfile/logfile semantics keyed by `short`.
436
- */
437
- spec: ServiceSpec | undefined;
386
+ export async function start(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
387
+ const r = resolve(opts);
388
+ // Phase 5b single-path (design §8 Phase 5 + Appendix). The supervised path is
389
+ // the ONLY runtime — the detached spawners are retired. A box without a hub
390
+ // unit gets the §7.5 auto-offer / actionable error, NEVER a detached spawn.
391
+ if (!(await requireSupervisedOrOffer(r))) return 1;
392
+ return startViaSupervisor(svc, r);
438
393
  }
439
394
 
440
- async function specForEntry(
441
- short: string,
442
- entry: ServiceEntry,
443
- ): Promise<{ spec: ServiceSpec | undefined; error?: string }> {
444
- const firstParty = getSpec(short);
445
- // KNOWN_MODULES shorts (vault / scribe / runner — post hub#310 FALLBACK
446
- // retirement): if installDir is stamped (typical post-self-register),
447
- // compose the spec from the module's own `.parachute/module.json` so the
448
- // module is authoritative for its startCmd / paths / health. Falls back
449
- // to the minimal `getSpec` (which carries an imperative `extras.startCmd`
450
- // matching the module's canonical declaration) when installDir is absent
451
- // or module.json is unreadable — covers legacy services.json rows from
452
- // before installDir stamping landed.
453
- const km = KNOWN_MODULES[short];
454
- if (km) {
455
- if (entry.installDir) {
456
- try {
457
- const manifest = await readModuleManifest(entry.installDir);
458
- if (manifest) return { spec: composeKnownModuleSpec(km, manifest) };
459
- } catch (err) {
460
- if (err instanceof ModuleManifestError) {
461
- // Surface the parse/validation error but keep the legacy
462
- // imperative-startCmd spec so `start` can still spawn — better
463
- // than no lifecycle at all when a module ships a typo'd manifest.
464
- return { spec: firstParty, error: err.message };
465
- }
466
- throw err;
467
- }
468
- }
469
- return { spec: firstParty };
470
- }
471
- // FIRST_PARTY_FALLBACKS shorts (notes / channel): the vendored manifest
472
- // is authoritative — startCmd is composed from extras + manifest at
473
- // `getSpec` time, no installDir read needed.
474
- if (firstParty) return { spec: firstParty };
475
- // Third-party rows: spec lives in the module's installDir/module.json.
476
- if (!entry.installDir) return { spec: undefined };
477
- try {
478
- const spec = await getSpecFromInstallDir(entry.installDir, entry.name);
479
- return { spec: spec ?? undefined };
480
- } catch (err) {
481
- if (err instanceof ModuleManifestError) {
482
- return { spec: undefined, error: err.message };
483
- }
484
- throw err;
485
- }
395
+ export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
396
+ const r = resolve(opts);
397
+ // Phase 5b single-path: supervised is the only runtime (see `start`).
398
+ if (!(await requireSupervisedOrOffer(r))) return 1;
399
+ return stopViaSupervisor(svc, r);
400
+ }
401
+
402
+ export async function restart(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
403
+ const r = resolve(opts);
404
+ // Phase 5b single-path: supervised is the only runtime. The 404-fallthrough
405
+ // (a not-currently-supervised module start, §6.2) lives in
406
+ // `restartViaSupervisor`, which makes `restart <svc>` total over module state
407
+ // just as the retired detached stop+start was.
408
+ if (!(await requireSupervisedOrOffer(r))) return 1;
409
+ return restartViaSupervisor(svc, r);
486
410
  }
487
411
 
412
+ // ---------------------------------------------------------------------------
413
+ // Supervisor-path verb dispatch (design §3.3) — the ONLY runtime as of Phase 5b.
414
+ //
415
+ // `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
416
+ // loopback module-ops API (per-module verbs) or the platform manager (hub
417
+ // verbs / no-svc). The detached arm was retired in Phase 5b — a box with no hub
418
+ // unit goes through `requireSupervisedOrOffer` (§7.5 auto-offer / actionable
419
+ // error), never a detached spawn.
420
+ // ---------------------------------------------------------------------------
421
+
488
422
  /**
489
- * Services selected by the `[svc]` positional. `undefined` targets every
490
- * manageable service (first-party shortnames OR third-party rows that
491
- * carry `installDir`). Unknown names get a friendly error up front rather
492
- * than a confusing spawn failure downstream.
493
- *
494
- * Third-party modules are addressed by the `name` field from their
495
- * `module.json` (which is what install copied to `entry.name` for
496
- * third-party). First-party are addressed by their short name (vault,
497
- * notes, …) and matched via `shortNameForManifest`.
498
- *
499
- * Named-path detail: a third-party row whose name matches but lacks
500
- * `installDir` resolves to the entry with `spec: undefined` (rather than
501
- * an "unknown service" error). `stop`/`logs` handle the spec-less case
502
- * via pidfile/logfile semantics; `start` surfaces an actionable
503
- * re-install hint downstream. The genuinely-unknown path (no first-party
504
- * fallback AND no row in services.json) still errors as `unknown service`.
423
+ * Drive a single module-op against the running hub, mapping the module-ops
424
+ * client's errors to actionable CLI output (§3.1). Opens hub.db (to validate /
425
+ * auto-rotate the operator token), resolves the issuer the token was minted
426
+ * under, and closes the db afterward. Returns the result on success; on a
427
+ * surfaced error `result` is left `undefined` so the caller can decide (e.g. the restart
428
+ * 404-fallthrough). Re-throws nothing the caller can't handle: the operator-
429
+ * token errors are caught and printed here; HTTP errors are returned typed to the caller.
505
430
  */
506
- async function resolveTargets(
507
- svc: string | undefined,
508
- manifestPath: string,
509
- ): Promise<{ targets: ResolvedTarget[] } | { error: string }> {
510
- const manifest = readManifest(manifestPath);
511
- if (manifest.services.length === 0) {
512
- return { error: "No services installed yet. Try: parachute install vault" };
513
- }
514
-
515
- if (svc !== undefined) {
516
- // Try first-party (svc is a short name → known fallback).
517
- const firstPartySpec = getSpec(svc);
518
- if (firstPartySpec) {
519
- const entry = manifest.services.find((s) => s.name === firstPartySpec.manifestName);
520
- if (!entry) {
521
- return { error: `${svc} isn't installed. Run \`parachute install ${svc}\` first.` };
522
- }
523
- // KNOWN_MODULES path (hub#310): `getSpec` returns a startCmd-less
524
- // minimal spec for vault / scribe / runner. Compose the full
525
- // spawnable spec by reading installDir's module.json so `start` /
526
- // `restart` see the real startCmd. FIRST_PARTY_FALLBACKS path:
527
- // `firstPartySpec.startCmd` is already populated, and `specForEntry`
528
- // short-circuits without re-reading.
529
- const { spec, error } = await specForEntry(svc, entry);
530
- if (error) return { error: `${svc}: invalid module.json — ${error}` };
531
- return { targets: [{ short: svc, entry, spec: spec ?? firstPartySpec }] };
532
- }
533
- // Third-party: match a services.json row by name. Rows with `installDir`
534
- // resolve a full spec from the on-disk module.json. Rows without it are
535
- // still managed (stop/logs use pidfile/logfile semantics keyed by short
536
- // name), but with `spec: undefined` — `start` will surface an
537
- // installDir-specific error downstream rather than reject up front.
538
- const entry = manifest.services.find((s) => s.name === svc);
539
- if (entry) {
540
- if (entry.installDir) {
541
- const { spec, error } = await specForEntry(svc, entry);
542
- if (error) return { error: `${svc}: invalid module.json — ${error}` };
543
- return { targets: [{ short: svc, entry, spec }] };
544
- }
545
- return { targets: [{ short: svc, entry, spec: undefined }] };
546
- }
547
- return {
548
- error: `unknown service "${svc}". known: ${knownServices().join(", ")}`,
431
+ async function driveSupervisorOp(
432
+ short: string,
433
+ op: ModuleOp,
434
+ r: Resolved,
435
+ ): Promise<{ result?: ModuleOpResult; httpError?: ModuleOpHttpError; failed: boolean }> {
436
+ const issuer = resolveOperatorTokenIssuer(r.hubOrigin, r.configDir);
437
+ const db = r.sup.openDb(r.configDir);
438
+ try {
439
+ const deps: DriveModuleOpDeps = {
440
+ db,
441
+ issuer,
442
+ configDir: r.configDir,
443
+ ...(r.sup.baseUrl !== undefined ? { baseUrl: r.sup.baseUrl } : {}),
549
444
  };
550
- }
551
-
552
- const targets: ResolvedTarget[] = [];
553
- for (const entry of manifest.services) {
554
- const short = shortNameForManifest(entry.name);
555
- if (short) {
556
- // KNOWN_MODULES path (hub#310): minimal `getSpec` returns no startCmd
557
- // for vault / scribe / runner — read installDir's module.json to
558
- // compose the spawnable spec. FIRST_PARTY_FALLBACKS shorts get
559
- // back the same vendored-startCmd-bearing spec from `getSpec`.
560
- const { spec } = await specForEntry(short, entry);
561
- targets.push({ short, entry, spec });
562
- continue;
445
+ const result = await r.sup.driveModuleOp(short, op, deps);
446
+ return { result, failed: false };
447
+ } catch (err) {
448
+ if (err instanceof NoOperatorTokenError || err instanceof OperatorTokenExpiredError) {
449
+ // Surface the already-actionable message (don't raw-throw a 401, §3.1).
450
+ r.log(`✗ ${short}: ${err.message}`);
451
+ return { failed: true };
563
452
  }
564
- if (entry.installDir) {
565
- const { spec } = await specForEntry(entry.name, entry);
566
- targets.push({ short: entry.name, entry, spec });
453
+ if (err instanceof ModuleOpHttpError) {
454
+ // Return the typed HTTP error so the caller can branch (404-fallthrough,
455
+ // not_installed hint). Callers that don't branch print it via
456
+ // `surfaceModuleOpHttpError`.
457
+ return { httpError: err, failed: true };
567
458
  }
459
+ // Unknown error — surface its message rather than crashing the CLI.
460
+ r.log(`✗ ${short}: ${err instanceof Error ? err.message : String(err)}`);
461
+ return { failed: true };
462
+ } finally {
463
+ db.close();
568
464
  }
569
- if (targets.length === 0) {
570
- return { error: "No manageable services in services.json." };
571
- }
572
- return { targets };
573
465
  }
574
466
 
575
- export async function start(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
576
- const r = resolve(opts);
577
- if (svc === HUB_SVC) return startHubSvc(r);
578
- const picked = await resolveTargets(svc, r.manifestPath);
579
- if ("error" in picked) {
580
- r.log(picked.error);
581
- return 1;
582
- }
583
-
584
- let failures = 0;
585
- for (const { short, entry, spec } of picked.targets) {
586
- const state = processState(short, r.configDir, r.alive);
587
- if (state.status === "running") {
588
- r.log(`${short} already running (pid ${state.pid}).`);
589
- continue;
590
- }
591
- if (state.pid !== undefined) {
592
- // Stale PID file for a dead process — clear it before we spawn fresh.
593
- clearPid(short, r.configDir);
594
- }
595
-
596
- const cmd = spec?.startCmd?.(entry);
597
- if (!cmd || cmd.length === 0) {
598
- // Distinguish the missing-installDir case from "spec resolved but has
599
- // no startCmd" — the former is fixable by re-registering the module,
600
- // the latter is a hub-level limitation. Third-party rows hit the first
601
- // branch when their self-registration predates the installDir contract.
602
- if (!getSpec(short) && !entry.installDir) {
603
- r.log(
604
- `${short}: services.json entry has no installDir, so the start command can't be resolved. Re-run \`parachute install <path-to-${short}>\` to refresh its registration, or upgrade the module to a version that self-registers with installDir.`,
605
- );
606
- } else {
607
- r.log(`${short}: lifecycle not yet supported for this service.`);
608
- }
609
- failures++;
610
- continue;
611
- }
612
-
613
- const logFile = ensureLogPath(short, r.configDir);
614
- // Merge `<configDir>/<short>/.env` into the spawn env so service-specific
615
- // values (auto-wired SCRIBE_AUTH_TOKEN/SCRIBE_URL on vault, GROQ/OPENAI
616
- // API keys on scribe written by the install prompt) reach the daemon.
617
- // Vault still loads its own .env at runtime (it has its own start.sh
618
- // wrapper for launchd / systemd) — this is idempotent there. Hub-origin
619
- // override wins on collision; that's the live-exposure source of truth.
620
- const fileEnv = readEnvFileValues(join(r.configDir, short, ".env"));
621
- // PORT override (hub#356): same shape as `spawnSupervised` in
622
- // api-modules-ops.ts. Without this, operators running `parachute start
623
- // vault` inside a container that has PORT in env (Render / Fly / etc.)
624
- // hit EADDRINUSE on hub's port. Local dev typically doesn't set PORT, so
625
- // this is a no-op there. fileEnv wins on collision so per-service .env
626
- // can still override if an operator deliberately set PORT in there.
627
- const env: Record<string, string> = { PORT: String(entry.port), ...fileEnv };
628
- if (r.hubOrigin) env[HUB_ORIGIN_ENV] = r.hubOrigin;
629
- const spawnerOpts: { env?: Record<string, string>; cwd?: string } = {};
630
- if (Object.keys(env).length > 0) spawnerOpts.env = env;
631
- // Third-party modules ship clean relative startCmds — `cwd: installDir`
632
- // makes those resolve. First-party fallbacks use absolute / PATH binaries
633
- // so their cwd is irrelevant; passing it doesn't hurt.
634
- if (entry.installDir) spawnerOpts.cwd = entry.installDir;
635
- const passOpts =
636
- spawnerOpts.env !== undefined || spawnerOpts.cwd !== undefined ? spawnerOpts : undefined;
637
-
638
- // Pre-flight the startCmd binary (`@openparachute/depcheck`) so a missing
639
- // executable surfaces the friendly install UX inline AND is persisted onto
640
- // the services.json row, so a *later* `parachute status` (a separate
641
- // invocation that only reads the manifest) + the SPA modules pane show
642
- // "vault: failed to start — parachute-vault not installed" with install
643
- // info, rather than a bare "failed"/orphan-timeout. The binary is `cmd[0]`
644
- // (e.g. `parachute-vault` for an npm install, `bun` for a bun-linked one).
645
- const startBinary = cmd[0];
646
- if (startBinary) {
647
- try {
648
- ensureExecutable(startBinary, { which: r.which });
649
- } catch (err) {
650
- if (err instanceof MissingDependencyError) {
651
- failures++;
652
- r.log(`✗ ${short} failed to start:`);
653
- for (const line of err.message.split("\n")) r.log(` ${line}`);
654
- recordStartError(entry.name, err.toWire(), r.manifestPath);
655
- continue;
656
- }
657
- throw err;
658
- }
659
- }
660
-
661
- let pid: number;
662
- try {
663
- pid = r.spawner.spawn(cmd, logFile, passOpts);
664
- } catch (err) {
665
- // Belt-and-suspenders: a missing binary that slipped past the pre-flight
666
- // (race) still becomes a MissingDependencyError via rethrowIfMissing.
667
- if (startBinary) {
668
- try {
669
- rethrowIfMissing(err, startBinary);
670
- } catch (missing) {
671
- if (missing instanceof MissingDependencyError) {
672
- failures++;
673
- r.log(`✗ ${short} failed to start:`);
674
- for (const line of missing.message.split("\n")) r.log(` ${line}`);
675
- recordStartError(entry.name, missing.toWire(), r.manifestPath);
676
- continue;
677
- }
678
- }
679
- }
680
- failures++;
681
- const msg = err instanceof Error ? err.message : String(err);
682
- r.log(`✗ ${short} failed to start: ${msg}`);
683
- continue;
684
- }
685
- // A successful spawn clears any stale start-error recorded from a prior
686
- // missing-dependency failure so `parachute status` doesn't keep showing it.
687
- clearStartError(entry.name, r.manifestPath);
688
- writePid(short, pid, r.configDir);
689
-
690
- // Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
691
- // proves the kernel forked the process — it says nothing about whether the
692
- // service survived its boot or bound its port. Two silent-start shapes:
693
- //
694
- // (1) spawn-then-immediately-die (hub#194): the child throws before
695
- // listening (notes-serve's Bun.resolveSync failing for bun-linked
696
- // installs) and exits microseconds later. Caught by the settle below.
697
- //
698
- // (2) alive-but-never-bound (hub#487): the port is already held by an
699
- // orphan, the child hits EADDRINUSE, but its process *lingers* (or a
700
- // supervisor retries) long enough to clear the liveness check. `start`
701
- // would report "✓ started" while `parachute status` shows it inactive
702
- // because nothing answers on the port. Aaron hit exactly this with an
703
- // orphan holding vault's 1940 on a fresh EC2 box. Caught by the
704
- // port-readiness poll below.
705
- //
706
- // On any failure we surface the tail of the logfile so the operator sees
707
- // the real boot error inline, and we specifically call out EADDRINUSE with
708
- // the `lsof -ti:<port>` remedy.
709
- const reportStartFailure = (reason: string): void => {
710
- clearPid(short, r.configDir);
711
- failures++;
712
- const tail = readLogTail(logFile, 20);
713
- if (detectAddrInUse(tail)) {
714
- r.log(
715
- `✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
716
- );
717
- } else {
718
- r.log(`✗ ${short} failed to start: ${reason}`);
719
- }
720
- if (tail.length > 0) {
721
- r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
722
- for (const line of tail) r.log(` │ ${line}`);
723
- } else {
724
- r.log(` Tail the log for details: tail -50 ${logFile}`);
725
- }
726
- };
727
-
728
- if (r.startSettleMs > 0) {
729
- await r.sleep(r.startSettleMs);
730
- if (!r.alive(pid)) {
731
- reportStartFailure(
732
- `spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
733
- );
734
- continue;
735
- }
736
- }
737
-
738
- // Port-readiness poll (hub#487). The process is alive; now confirm it
739
- // actually bound its port before claiming success. Poll up to
740
- // `startReadyMs`, re-checking liveness each iteration so a *later* death
741
- // (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
742
- // that stays alive but never binds within the window gets a non-fatal
743
- // warning rather than a hard failure — some daemons legitimately do slow
744
- // boot work, and we'd rather not flip a healthy-but-slow start to red.
745
- if (r.startReadyMs > 0) {
746
- const deadline = r.now() + r.startReadyMs;
747
- let listening = false;
748
- let died = false;
749
- while (r.now() < deadline) {
750
- if (!r.alive(pid)) {
751
- died = true;
752
- break;
753
- }
754
- if (await r.portListening(entry.port)) {
755
- listening = true;
756
- break;
757
- }
758
- await r.sleep(r.startReadyPollMs);
759
- }
760
- if (died) {
761
- reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
762
- continue;
763
- }
764
- if (!listening) {
765
- // Last-chance liveness check — the loop may have exited on the
766
- // deadline right as the process died.
767
- if (!r.alive(pid)) {
768
- reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
769
- continue;
770
- }
771
- r.log(
772
- `⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
773
- );
774
- r.log(
775
- ` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
776
- );
777
- if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
778
- if (short === "vault") persistVaultHubOriginForStart(r);
779
- continue;
780
- }
781
- }
782
-
783
- r.log(`✓ ${short} started (pid ${pid}); logs: ${logFile}`);
784
- if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
785
- if (short === "vault") persistVaultHubOriginForStart(r);
467
+ /** Print a module-ops HTTP error with an actionable hint for the known codes. */
468
+ function surfaceModuleOpHttpError(short: string, err: ModuleOpHttpError, r: Resolved): void {
469
+ if (err.status === 400 && err.code === "not_installed") {
470
+ r.log(
471
+ `✗ ${short} is not installed — run \`parachute install ${short}\` first, then \`parachute start ${short}\`.`,
472
+ );
473
+ return;
786
474
  }
787
- return failures === 0 ? 0 : 1;
475
+ r.log(`✗ ${short}: ${err.message}`);
788
476
  }
789
477
 
790
478
  /**
791
- * Durable-persist vault's `PARACHUTE_HUB_ORIGIN` on a vault `start`. Two cases,
792
- * in order:
793
- *
794
- * 1. The resolved spawn origin (`r.hubOrigin`) is a real public origin — write
795
- * it. This is the long-standing happy path: an exposure is live, the
796
- * launchd / systemd daemon (which boots vault out-of-band and never sees
797
- * this spawn env) needs it in `.env` to validate hub-minted JWTs' `iss`.
798
- * `persistVaultHubOrigin` skips loopback / unchanged values itself.
799
- *
800
- * 2. Self-heal: even when `r.hubOrigin` resolved to loopback or undefined
801
- * (e.g. the hub.port file outran the expose-state read, or this is a bare
802
- * `restart vault` on a deploy whose `.env` was never written), consult
803
- * `expose-state.json` directly. If it advertises a public origin and
804
- * vault's persisted value is unset / loopback, write the public origin.
805
- * This is what lets an EXISTING broken Cloudflare deploy self-correct on
806
- * the next `parachute restart vault`, not only fresh exposes.
479
+ * Ensure the hub unit is up, mapping `ensureHubUnit`'s structured outcome to a
480
+ * CLI exit signal. Returns true when the hub is up (already-up / started),
481
+ * false when it isn't (and the messages were surfaced).
807
482
  *
808
- * Case 1 covers the override / freshly-resolved path; case 2 catches the gap
809
- * the Cloudflare 401 P0 fell through. See `vault-hub-origin-env.ts`.
483
+ * The `no-unit` outcome shouldn't reach here: `requireSupervisedOrOffer` gates
484
+ * every verb on `unitInstalled === true` before dispatching to the supervisor
485
+ * path, which is the same `isHubUnitInstalled` probe `ensureHubUnit` uses to
486
+ * decide `no-unit`. The defensive arm below still surfaces any non-up outcome's
487
+ * messages rather than silently succeeding.
810
488
  */
811
- function persistVaultHubOriginForStart(r: Resolved): void {
812
- if (r.hubOrigin) persistVaultHubOrigin(r.configDir, r.hubOrigin, r.log);
813
- selfHealVaultHubOrigin(r.configDir, r.log, join(r.configDir, "expose-state.json"));
489
+ async function ensureHubForOp(r: Resolved, port: number): Promise<boolean> {
490
+ const ensured = await r.sup.ensureHubUnit({
491
+ port,
492
+ deps: r.sup.hubUnitDeps,
493
+ log: r.log,
494
+ });
495
+ if (ensured.outcome === "already-up" || ensured.outcome === "started") return true;
496
+ for (const m of ensured.messages) r.log(m);
497
+ return false;
814
498
  }
815
499
 
816
- export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
817
- const r = resolve(opts);
818
- if (svc === HUB_SVC) return stopHubSvc(r);
819
- const picked = await resolveTargets(svc, r.manifestPath);
820
- if ("error" in picked) {
821
- r.log(picked.error);
822
- return 1;
500
+ /** `start <svc>` / `start` (no svc) over the supervisor (§3.3). */
501
+ async function startViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
502
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
503
+ // `start hub` / `start` (no svc): ensure the hub unit is up — it transitively
504
+ // boots every installed module from services.json via bootSupervisedModules.
505
+ if (svc === HUB_SVC || svc === undefined) {
506
+ const up = await ensureHubForOp(r, port);
507
+ if (!up) return 1;
508
+ r.log(svc === HUB_SVC ? "✓ hub is up." : "✓ hub is up (all installed modules booted).");
509
+ return 0;
823
510
  }
824
-
825
- let failures = 0;
826
- for (const { short } of picked.targets) {
827
- const pid = readPid(short, r.configDir);
828
- if (pid === undefined) {
829
- r.log(`${short} wasn't running.`);
830
- continue;
831
- }
832
- if (!r.alive(pid)) {
833
- clearPid(short, r.configDir);
834
- r.log(`${short} wasn't running (cleaned stale pid file).`);
835
- continue;
836
- }
837
-
838
- try {
839
- r.kill(pid, "SIGTERM");
840
- } catch (err) {
841
- failures++;
842
- r.log(`✗ ${short}: SIGTERM failed: ${err instanceof Error ? err.message : String(err)}`);
843
- continue;
844
- }
845
-
846
- const deadline = r.now() + r.killWaitMs;
847
- while (r.now() < deadline && r.alive(pid)) {
848
- await r.sleep(r.pollIntervalMs);
849
- }
850
-
851
- if (r.alive(pid)) {
852
- r.log(`${short} didn't exit after ${r.killWaitMs}ms; sending SIGKILL.`);
853
- try {
854
- r.kill(pid, "SIGKILL");
855
- } catch (err) {
856
- failures++;
857
- r.log(`✗ ${short}: SIGKILL failed: ${err instanceof Error ? err.message : String(err)}`);
858
- continue;
859
- }
860
- }
861
-
862
- clearPid(short, r.configDir);
863
- r.log(`✓ ${short} stopped.`);
511
+ // `start <svc>`: ensure the hub is up first (chicken-and-egg §3.2), then drive
512
+ // a pure supervisor.start of the already-installed module.
513
+ if (!(await ensureHubForOp(r, port))) return 1;
514
+ const { result, httpError, failed } = await driveSupervisorOp(svc, "start", r);
515
+ if (httpError) {
516
+ surfaceModuleOpHttpError(svc, httpError, r);
517
+ return 1;
864
518
  }
865
- return failures === 0 ? 0 : 1;
866
- }
867
-
868
- export async function restart(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
869
- const stopCode = await stop(svc, opts);
870
- if (stopCode !== 0) return stopCode;
871
- return await start(svc, opts);
519
+ if (failed || !result) return 1;
520
+ r.log(`✓ ${svc} started.`);
521
+ return 0;
872
522
  }
873
523
 
874
- /**
875
- * Start the internal hub. Delegates to `ensureHubRunning`, which owns the
876
- * port-fallback probe, the port-file write, and the issuer flag — none of
877
- * which fit a generic `SERVICE_SPECS` entry. The hub origin (when known)
878
- * doubles as the OAuth `iss` claim, so we forward it as `issuer`.
879
- *
880
- * Silences `ensureHubRunning`'s own log and emits our own `✓ hub started …`
881
- * line so the output matches the service-start shape (`✓ vault started
882
- * (pid X); logs: …`) and `stopHubSvc`'s `✓ hub stopped.` symmetry.
883
- */
884
- async function startHubSvc(r: Resolved): Promise<number> {
885
- const ensureOpts: EnsureHubOpts = { configDir: r.configDir, log: () => {} };
886
- if (r.hubOrigin) ensureOpts.issuer = r.hubOrigin;
887
- try {
888
- const result = await r.ensureHub(ensureOpts);
889
- if (result.started) {
890
- const logFile = logPathFor(HUB_SVC, r.configDir);
891
- r.log(`✓ hub started (pid ${result.pid}) on port ${result.port}; logs: ${logFile}`);
892
- } else {
893
- r.log(`hub already running (pid ${result.pid}) on port ${result.port}.`);
524
+ /** `stop <svc>` / `stop` (no svc) over the supervisor / platform manager (§3.3). */
525
+ async function stopViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
526
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
527
+ // `stop hub` / `stop` (no svc): stop the hub UNIT via the platform manager.
528
+ // MUST go through the manager — a PID signal would be undone by launchd
529
+ // KeepAlive / systemd Restart=always (R17). Children die with the hub.
530
+ if (svc === HUB_SVC || svc === undefined) {
531
+ const res = r.sup.stopHubUnit(r.sup.hubUnitDeps);
532
+ for (const m of res.messages) r.log(m);
533
+ if (res.outcome === "ok") {
534
+ r.log("✓ hub stopped (all supervised modules stopped with it).");
535
+ return 0;
894
536
  }
895
- // Self-heal a stale operator-token issuer (hub#481). Runs whether the hub
896
- // was freshly started OR already running — a token stamped at loopback
897
- // before exposure must heal even when the hub is already up. The loopback /
898
- // provenance guards live inside `selfHealOperatorTokenIssuer`, so the only
899
- // gate here is "is there a real issuer to heal toward?".
900
- await selfHealOperatorTokenOnStart(r);
537
+ return 1;
538
+ }
539
+ // `stop <svc>`: a supervised module dies WITH the hub. If the hub isn't
540
+ // reachable, the module is already down — report success WITHOUT starting the
541
+ // hub (do NOT ensureHubUnit just to stop one module). Only when the hub is up
542
+ // do we drive the supervisor's stop.
543
+ if (!(await r.sup.probeHubHealth(port))) {
544
+ r.log(`✓ ${svc} already stopped (the hub isn't running, so its modules are down).`);
901
545
  return 0;
902
- } catch (err) {
903
- r.log(`✗ hub failed to start: ${err instanceof Error ? err.message : String(err)}`);
546
+ }
547
+ const { httpError, failed, result } = await driveSupervisorOp(svc, "stop", r);
548
+ if (httpError) {
549
+ surfaceModuleOpHttpError(svc, httpError, r);
904
550
  return 1;
905
551
  }
552
+ if (failed || !result) return 1;
553
+ r.log(`✓ ${svc} stopped.`);
554
+ return 0;
906
555
  }
907
556
 
908
- /**
909
- * Re-issue the operator token under the hub's current origin when its `iss`
910
- * went stale after an init-at-loopback expose transition (hub#481). Mirrors
911
- * `persistVaultHubOriginForStart`'s quiet style: emit a single line only when
912
- * a rotation actually happens; stay silent for fresh / absent / skipped.
913
- *
914
- * The ENTIRE self-heal is wrapped here so it can NEVER block or fail
915
- * `start hub` — a db-open error, a corrupt token, anything — degrades to a
916
- * brief warning and `start hub` still returns 0.
917
- */
918
- async function selfHealOperatorTokenOnStart(r: Resolved): Promise<void> {
919
- if (!r.hubOrigin) return;
920
- try {
921
- const status = await r.selfHealOperatorTokenFn({
922
- issuer: r.hubOrigin,
923
- configDir: r.configDir,
924
- log: r.log,
925
- });
926
- if (status.kind === "rotated") {
927
- r.log(` refreshed operator.token issuer → ${r.hubOrigin} (was stale after exposure)`);
557
+ /** `restart <svc>` / `restart` (no svc) over the supervisor / manager (§3.3). */
558
+ async function restartViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
559
+ // `restart hub` / `restart` (no svc): restart the hub UNIT via the platform
560
+ // manager. NOT a per-module fan-out — restarting the hub re-boots all modules
561
+ // anyway. MUST go through the manager (never a PID signal, R17).
562
+ if (svc === HUB_SVC || svc === undefined) {
563
+ const res = r.sup.restartHubUnit(r.sup.hubUnitDeps);
564
+ for (const m of res.messages) r.log(m);
565
+ if (res.outcome === "ok") {
566
+ r.log("✓ hub restarted (all modules re-booted).");
567
+ return 0;
928
568
  }
929
- } catch (err) {
930
- r.log(
931
- ` note: operator.token issuer self-heal skipped (${
932
- err instanceof Error ? err.message : String(err)
933
- })`,
934
- );
569
+ return 1;
935
570
  }
936
- }
937
-
938
- /**
939
- * Stop the internal hub. `stopHub` returns false when nothing was running
940
- * (no pidfile, or stale pidfile cleared) — that's a clean no-op for the
941
- * operator, so we still exit 0.
942
- */
943
- async function stopHubSvc(r: Resolved): Promise<number> {
944
- try {
945
- const stopped = await r.stopHubFn({
946
- configDir: r.configDir,
947
- log: r.log,
948
- killWaitMs: r.killWaitMs,
949
- pollIntervalMs: r.pollIntervalMs,
950
- });
951
- r.log(stopped ? "✓ hub stopped." : "hub wasn't running.");
952
- return 0;
953
- } catch (err) {
954
- r.log(`✗ hub failed to stop: ${err instanceof Error ? err.message : String(err)}`);
571
+ // `restart <svc>`: ensure the hub is up, then drive supervisor.restart.
572
+ const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
573
+ if (!(await ensureHubForOp(r, port))) return 1;
574
+ const restartRes = await driveSupervisorOp(svc, "restart", r);
575
+ if (restartRes.httpError) {
576
+ // 404-fallthrough (§6.2): a module that isn't currently supervised (crashed
577
+ // out of budget, skipped at boot, installed out-of-band) returns 404
578
+ // `not_supervised`. `restart` must be total over module state (matching the
579
+ // detached stop+start), so fall through to a pure `start`.
580
+ if (restartRes.httpError.status === 404 && restartRes.httpError.code === "not_supervised") {
581
+ const startRes = await driveSupervisorOp(svc, "start", r);
582
+ if (startRes.httpError) {
583
+ surfaceModuleOpHttpError(svc, startRes.httpError, r);
584
+ return 1;
585
+ }
586
+ if (startRes.failed || !startRes.result) return 1;
587
+ r.log(`✓ ${svc} started.`);
588
+ return 0;
589
+ }
590
+ surfaceModuleOpHttpError(svc, restartRes.httpError, r);
955
591
  return 1;
956
592
  }
593
+ if (restartRes.failed || !restartRes.result) return 1;
594
+ r.log(`✓ ${svc} restarted.`);
595
+ return 0;
957
596
  }
958
597
 
959
598
  export interface LogsOpts {