@openparachute/hub 0.6.2 → 0.6.3-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +87 -35
  2. package/package.json +1 -1
  3. package/src/__tests__/api-hub-upgrade.test.ts +690 -0
  4. package/src/__tests__/api-modules-ops.test.ts +359 -3
  5. package/src/__tests__/api-modules.test.ts +54 -0
  6. package/src/__tests__/expose-cloudflare.test.ts +163 -72
  7. package/src/__tests__/expose-off-auto.test.ts +26 -1
  8. package/src/__tests__/expose.test.ts +260 -240
  9. package/src/__tests__/hub-control.test.ts +1 -242
  10. package/src/__tests__/hub-server.test.ts +64 -0
  11. package/src/__tests__/hub-unit.test.ts +574 -0
  12. package/src/__tests__/init.test.ts +219 -2
  13. package/src/__tests__/lifecycle.test.ts +416 -1448
  14. package/src/__tests__/managed-unit.test.ts +575 -0
  15. package/src/__tests__/migrate-cutover.test.ts +840 -0
  16. package/src/__tests__/migrate-offer.test.ts +240 -0
  17. package/src/__tests__/migrate.test.ts +132 -0
  18. package/src/__tests__/module-ops-client.test.ts +556 -0
  19. package/src/__tests__/port-probe.test.ts +23 -0
  20. package/src/__tests__/setup-wizard.test.ts +130 -0
  21. package/src/__tests__/status-supervisor.test.ts +504 -0
  22. package/src/__tests__/status.test.ts +157 -708
  23. package/src/__tests__/supervisor.test.ts +471 -6
  24. package/src/__tests__/upgrade.test.ts +351 -5
  25. package/src/api-hub-upgrade.ts +384 -0
  26. package/src/api-hub.ts +2 -1
  27. package/src/api-modules-ops.ts +221 -0
  28. package/src/api-modules.ts +18 -2
  29. package/src/cli.ts +97 -12
  30. package/src/cloudflare/connector-service.ts +117 -322
  31. package/src/commands/expose-cloudflare.ts +63 -71
  32. package/src/commands/expose-supervisor.ts +247 -0
  33. package/src/commands/expose.ts +59 -48
  34. package/src/commands/init.ts +225 -12
  35. package/src/commands/lifecycle.ts +455 -816
  36. package/src/commands/migrate-cutover.ts +837 -0
  37. package/src/commands/migrate.ts +71 -2
  38. package/src/commands/serve-boot.ts +71 -25
  39. package/src/commands/status.ts +535 -235
  40. package/src/commands/upgrade.ts +100 -2
  41. package/src/help.ts +128 -68
  42. package/src/hub-control.ts +23 -162
  43. package/src/hub-server.ts +39 -0
  44. package/src/hub-unit.ts +735 -0
  45. package/src/hub-upgrade-helper.ts +306 -0
  46. package/src/hub-upgrade-mode.ts +209 -0
  47. package/src/hub-upgrade-status.ts +150 -0
  48. package/src/managed-unit.ts +692 -0
  49. package/src/migrate-offer.ts +186 -0
  50. package/src/module-ops-client.ts +457 -0
  51. package/src/port-probe.ts +50 -0
  52. package/src/process-state.ts +19 -3
  53. package/src/setup-wizard.ts +80 -1
  54. package/src/supervisor.ts +389 -38
  55. package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
  56. package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
  57. package/web/ui/dist/index.html +2 -2
  58. package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
@@ -0,0 +1,837 @@
1
+ /**
2
+ * `parachute migrate --to-supervised` (and `--teardown`) — the idempotent
3
+ * detached→supervised CUTOVER, Phase 5a of the hub-as-supervisor unification
4
+ * (design `parachute.computer/design/2026-06-01-hub-as-supervisor-unification.md`
5
+ * §7.1–§7.5).
6
+ *
7
+ * This file is the MACHINERY; the BRIDGE stays intact. After 5a an un-migrated
8
+ * box still works on the detached path (`defaultSpawner` / `ensureHubRunning`
9
+ * remain — Phase 5b retires them). The cutover is opt-in (`--to-supervised`) or
10
+ * auto-offered (§7.5, in `lifecycle.ts`). It NEVER runs implicitly.
11
+ *
12
+ * The cutover is the most dangerous operation in the CLI: it stops real running
13
+ * services and installs a process-manager unit. So the ORDERING is load-bearing
14
+ * and the whole path is FAIL-SAFE + RESUMABLE:
15
+ *
16
+ * §7.1 ordering (stop-detached-FIRST-then-start-unit, to dodge the port-1939
17
+ * double-spawn race the canonical-ports 1939-pin would turn into a crash-loop):
18
+ * 1. DETECT the current model (detached hub alive? each module alive?). If a
19
+ * hub unit already exists AND the hub is supervised → idempotent no-op.
20
+ * 2. WRITE the unit file WITHOUT starting it (`installManagedUnit start:false`
21
+ * — daemon-reload but NOT enable --now / bootstrap). This is the §7.1
22
+ * race-avoider: the unit is on disk + resumable, but no second hub is
23
+ * started yet.
24
+ * 3. STOP the detached processes — `stopHub` for the hub, a per-module
25
+ * pidfile stop for each module.
26
+ * 4. §7.2 ORPHAN SWEEP — lsof per services.json port + the hub port; adopt +
27
+ * kill any process still bound to a declared port (mirrors stopHub's 1939
28
+ * orphan-adoption, per-module-port).
29
+ * 5. VERIFY the hub port + each module port is free (bounded poll). If a port
30
+ * won't free, FAIL leaving the unit written-but-not-started so a retry is
31
+ * clean.
32
+ * 6. START the unit (`installManagedUnit start:true` / enable --now). The hub
33
+ * comes up on a free 1939 and boots modules from services.json.
34
+ * 7. VERIFY the hub answers /health and the expected modules are running.
35
+ * 8. The cloudflared connector (if any) is left intact — it's its own unit.
36
+ *
37
+ * RESUMABILITY: a partial cutover (unit written, not started) is the canonical
38
+ * recoverable state. Re-running `--to-supervised` from there:
39
+ * - DETECT sees a unit installed but the hub NOT supervised (no /health) → it
40
+ * does NOT no-op; it re-runs steps 2-7. Step 2 (write start:false) is
41
+ * idempotent (overwrites the same file), the stop steps are no-ops if the
42
+ * detached procs already died, and step 6 brings the unit up.
43
+ *
44
+ * FAIL-SAFE: every failure leaves a recoverable state. The one state we refuse
45
+ * to leave the box in is detached-stopped + unit-failed-to-start + no
46
+ * recovery path. Step 6's start-failure leaves the unit written (re-runnable);
47
+ * step 5's port-won't-free fails after the stop + sweep but BEFORE the unit is
48
+ * started, with the unit written for a clean retry.
49
+ *
50
+ * EVERYTHING is behind injectable seams (the `CutoverDeps`) so the destructive
51
+ * tests run in a sandbox `PARACHUTE_HOME` with NO real Bun.spawn / systemctl /
52
+ * launchctl / lsof / process kills.
53
+ */
54
+
55
+ import { spawnSync } from "node:child_process";
56
+ import { fileURLToPath } from "node:url";
57
+ import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
58
+ import {
59
+ HUB_DEFAULT_PORT,
60
+ type KillFn,
61
+ type PidOnPortFn,
62
+ type StopHubOpts,
63
+ defaultPidOnPort,
64
+ stopHub,
65
+ } from "../hub-control.ts";
66
+ import {
67
+ type HubUnitDeps,
68
+ type InstallAndStartHubUnitOpts,
69
+ type InstallAndStartHubUnitResult,
70
+ defaultHubUnitDeps,
71
+ hubUnitMessages,
72
+ installAndStartHubUnit,
73
+ isHubUnitInstalled,
74
+ } from "../hub-unit.ts";
75
+ import {
76
+ HUB_LAUNCHD_LABEL,
77
+ HUB_SYSTEMD_UNIT_NAME,
78
+ type ManagedUnit,
79
+ type ManagedUnitDeps,
80
+ type ManagedUnitRemoveResult,
81
+ buildHubManagedUnit,
82
+ installManagedUnit,
83
+ removeManagedUnit,
84
+ } from "../managed-unit.ts";
85
+ import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
86
+ import { type AliveFn, clearPid, readPid } from "../process-state.ts";
87
+ import { shortNameForManifest } from "../service-spec.ts";
88
+ import { type ServiceEntry, readManifestLenient } from "../services-manifest.ts";
89
+
90
/**
 * Resolve the absolute filesystem path of this hub checkout's `src/cli.ts`.
 *
 * The hub unit's `ExecStart` / `ProgramArguments` runs `serve` against that
 * entry. The path is derived from this module's own URL: this file lives in
 * `src/commands/`, so `../cli.ts` is one directory up. Mirrors `init.ts`'s
 * `defaultHubCliPath`.
 *
 * @returns The `file:` URL of `../cli.ts` converted to a filesystem path.
 */
export function defaultHubCliPath(): string {
  const cliUrl = new URL("../cli.ts", import.meta.url);
  return fileURLToPath(cliUrl);
}
99
+
100
/**
 * Shape of the orphan-sweep ownership probe: given a pid, return that
 * process's command line, or `undefined` when it cannot be read. See
 * `CutoverDeps.ownerOfPid`.
 */
export type OwnerProbeFn = (pid: number) => string | undefined;

/**
 * Production `ownerOfPid`, backed by `ps -o command= -p <pid>` (the trailing
 * `=` suppresses the header line, so the output is just the process's argv).
 *
 * Best-effort by design: a non-zero exit status, an output with no non-blank
 * line, or any thrown error all yield `undefined`, so the caller treats the
 * process as UNATTRIBUTABLE and refuses to kill it. Mirrors
 * `defaultPidOnPort`'s shell-out-and-swallow shape.
 *
 * @param pid Process id to inspect.
 * @returns The first non-blank output line, trimmed, or `undefined`.
 */
export const defaultOwnerOfPid: OwnerProbeFn = (pid) => {
  try {
    const ps = spawnSync("ps", ["-o", "command=", "-p", String(pid)], {
      encoding: "utf8",
      timeout: 2000,
    });
    // A failed/timed-out spawn reports a non-zero (or null) status.
    if (ps.status !== 0) return undefined;
    for (const rawLine of ps.stdout.split("\n")) {
      const candidate = rawLine.trim();
      if (candidate.length > 0) return candidate;
    }
    return undefined;
  } catch {
    return undefined;
  }
};
131
+
132
/**
 * Every side effect the cutover performs, gathered behind one injectable seam.
 * Production wires the real implementations (`defaultCutoverDeps`); tests
 * substitute fakes so no real process is stopped, no real unit is installed
 * and no real port is probed.
 */
export interface CutoverDeps {
  /** Liveness probe for a pid (combined with the pidfile readers: "is the detached proc alive?"). */
  alive: AliveFn;
  /** Deliver a signal to a pid (module stop + orphan-sweep kill). The default is group-aware (negative pid). */
  kill: KillFn;
  /** Resolve which pid is bound to a port (the orphan sweep's lsof). */
  pidOnPort: PidOnPortFn;
  /**
   * Best-effort command line of a pid — the orphan sweep's ownership probe.
   * Yields the process's argv as one string (e.g. `bun .../server.ts --port 1940`)
   * or undefined when it can't be read (pid gone, permission, no `ps`).
   * `sweepOrphanOnPort` consults it before adopting + killing whatever holds a
   * MODULE port, so the cutover never blind-kills an operator's unrelated
   * process that merely squats a declared port. Injectable so tests can drive
   * attribution without shelling out to `ps`.
   */
  ownerOfPid: OwnerProbeFn;
  /** TCP connect-probe; resolves true while something is listening (used by the verify-ports-free poll). */
  portListening: PortListeningFn;
  /** Stop the detached hub (SIGTERM→SIGKILL + 1939 orphan adoption). */
  stopHub: (opts: StopHubOpts) => Promise<boolean>;
  /**
   * Install + start the hub unit (the §7.1 step-6 start); production is
   * `installAndStartHubUnit`. Reusing this higher-level builder for the start
   * keeps env capture / bun resolution / the readiness wait aligned with
   * `init`. The step-2 write-without-start goes through the separate
   * `writeUnitWithoutStarting` seam.
   */
  installAndStartHubUnit: (
    opts: InstallAndStartHubUnitOpts,
  ) => Promise<InstallAndStartHubUnitResult>;
  /**
   * Write the hub unit file WITHOUT starting it (§7.1 step 2 — the
   * race-avoider). Production builds the descriptor and calls
   * `installManagedUnit` with `start: false`; tests stub it. The result's
   * `written` flag is true once the unit file is on disk and false when the
   * write did not happen (no unit on disk → not resumable from here).
   */
  writeUnitWithoutStarting: (opts: WriteUnitOpts) => WriteUnitResult;
  /** Whether a hub unit file is installed (the §7.1 step-1 detect discriminant). */
  isHubUnitInstalled: (deps: HubUnitDeps) => boolean;
  /** Whether the loopback hub answers /health on `port` (detect "supervised" + final verify). */
  probeHealth: (port: number) => Promise<boolean>;
  /** Pause between port-free / readiness polls (tests pin this to 0). */
  sleep: (ms: number) => Promise<void>;
  /** The hub-unit deps handed to the install / detect / manager calls. */
  hubUnitDeps: HubUnitDeps;
}
184
+
185
/** Inputs for the write-the-unit-without-starting step (§7.1 step 2). */
export interface WriteUnitOpts {
  /** PARACHUTE_HOME captured into the unit; the hub log path is derived under it. */
  parachuteHome: string;
  /** Absolute path of the `cli.ts` entry the unit runs. */
  cliPath: string;
  /** Hub port recorded in the unit. */
  port: number;
  /** Hub-unit deps used to build and install the unit. */
  deps: HubUnitDeps;
}
191
+
192
/** Outcome of the write-the-unit-without-starting step (§7.1 step 2). */
export interface WriteUnitResult {
  /** True when the unit file is on disk (resumable); false when the write failed. */
  written: boolean;
  /** "installed" (file on disk) or "fallback" (no manager / write failed). */
  outcome: "installed" | "fallback";
  /**
   * Why a `fallback` happened, so the caller can map it to the right
   * `CutoverOutcome` rather than conflating the two causes (a bun-not-found or
   * write failure must not be reported as "no service manager"):
   * - "no-manager" → no systemd/launchd here (the supervised model is impossible);
   * - "write-failed" → a manager exists but the unit couldn't be written (bun
   *   unresolvable, write/daemon-reload failure).
   * Undefined when `outcome === "installed"`.
   */
  cause?: "no-manager" | "write-failed";
  /** Human-readable lines describing what happened, for the caller to log. */
  messages: string[];
}
210
+
211
/**
 * Build the colon-separated search path handed to the hub unit builder: the
 * bun install's `bin` directory first, then `/usr/local/bin`, `/usr/bin` and
 * `/bin`.
 *
 * @param bunInstall Root of the bun install (its `bin` subdirectory is used).
 */
function defaultUnitPath(bunInstall: string): string {
  return `${bunInstall}/bin:/usr/local/bin:/usr/bin:/bin`;
}

/**
 * Production `writeUnitWithoutStarting`: build the hub `ManagedUnit` descriptor
 * (captures the operator's current PARACHUTE_HOME per §4.2, resolves abs bun)
 * and call `installManagedUnit` with `start: false` — daemon-reload /
 * write-the-plist but NEVER enable --now / bootstrap. The §7.1 step-2
 * race-avoider.
 *
 * @param opts Home directory, cli path, hub port and hub-unit deps for the unit.
 * @returns `written: true` with `outcome: "installed"` when the unit file is on
 *   disk; otherwise a `fallback` whose `cause` distinguishes a host with no
 *   service manager from a failed build/write.
 */
export function defaultWriteUnitWithoutStarting(opts: WriteUnitOpts): WriteUnitResult {
  const { deps } = opts;
  const bunInstall = `${deps.homeDir()}/.bun`;
  const path = defaultUnitPath(bunInstall);
  const logPath = `${opts.parachuteHome}/hub/logs/hub.log`;
  let unit: ManagedUnit;
  try {
    unit = buildHubManagedUnit({
      parachuteHome: opts.parachuteHome,
      port: opts.port,
      bunInstall,
      path,
      cliPath: opts.cliPath,
      logPath,
      deps,
    });
  } catch (err) {
    // `bun` couldn't be resolved — refuse to bake a broken ExecStart. No unit on
    // disk: not resumable from here. A manager may well exist; this is a WRITE
    // failure (can't compose a valid unit), NOT a no-manager host — surface it as
    // such so the operator sees "bun not found / could not write the unit".
    return {
      written: false,
      outcome: "fallback",
      cause: "write-failed",
      messages: [err instanceof Error ? err.message : String(err)],
    };
  }
  const res = installManagedUnit({
    unit,
    deps,
    messages: hubUnitMessages(),
    start: false,
  });
  // `installed` → the file is on disk (resumable). `fallback` → either no manager
  // (host can't host a unit) or the install/write failed (manager present). Thread
  // the manager's `reason` through so the caller distinguishes them; default to
  // "write-failed" if (somehow) absent — the conservative non-no-manager message.
  return {
    written: res.outcome === "installed",
    outcome: res.outcome,
    cause: res.outcome === "fallback" ? (res.reason ?? "write-failed") : undefined,
    messages: res.messages,
  };
}
266
+
267
/**
 * Group-aware kill — INLINED from `lifecycle.ts`'s `defaultKill` (NOT imported,
 * to avoid the import cycle `lifecycle.ts → migrate-offer.ts → migrate-cutover.ts`).
 * MUST stay byte-equivalent to lifecycle's group-aware kill.
 *
 * Modules are spawned `detached: true` by `defaultSpawner` (lifecycle.ts), so the
 * recorded pid is a process-GROUP leader (pid == pgid). A wrapper startCmd like
 * `pnpm exec tsx server.ts` leaves the real server as a GRANDCHILD inside that
 * group. The cutover originally used the BARE-PID `hub-control.ts:defaultKill`
 * (`process.kill(pid, sig)`), which signals only the wrapper — the tsx grandchild
 * survives, keeps holding the module's port, `waitPortFree` times out, and the
 * cutover returns `port-stuck` on the FIRST run for any wrapper-startCmd module
 * (the exact hub#88 footgun). `process.kill(-pid, sig)` signals the whole group;
 * ESRCH (legacy pidfile written before detached-spawn, or the leader already
 * exited and the group emptied) falls back to a bare-pid signal so the intent
 * still lands when there's a positive-pid process to receive it.
 *
 * Errors other than ESRCH from the group signal are rethrown, and the bare-pid
 * fallback is not guarded — it may throw as well; callers wrap this in their
 * own try/catch.
 */
const groupAwareKill: KillFn = (pid, signal) => {
  try {
    // A negative pid targets the whole process group led by `pid`.
    process.kill(-pid, signal);
  } catch (err) {
    // Only "no such process/group" triggers the fallback; anything else propagates.
    if ((err as NodeJS.ErrnoException).code !== "ESRCH") throw err;
    // No group to signal — try the bare pid instead.
    process.kill(pid, signal);
  }
};
292
+
293
/**
 * Group-aware liveness — INLINED from `lifecycle.ts`'s `defaultAlive` (same
 * import-cycle reason as `groupAwareKill`). Returns true if the process GROUP
 * (pgid == pid) still has any member, so the stop-then-wait loop keeps polling
 * until the wrapper AND its grandchild are both gone (the bare-pid
 * `process-state.ts:defaultAlive` would report the leader dead while the
 * grandchild lingers, prematurely clearing the pidfile + skipping the SIGKILL
 * escalation, re-opening the hub#88 port hold). ESRCH on the group probe (legacy
 * pidfile, or the leader exited and the group emptied) falls back to a bare-pid
 * check so a positive-pid process is still honored.
 *
 * The two probes treat errors differently: on the group probe any error other
 * than ESRCH counts as "alive", whereas on the bare-pid fallback any error at
 * all counts as "not alive".
 */
const groupAwareAlive: AliveFn = (pid) => {
  try {
    // Signal 0 delivers nothing; it only tests whether the group can be signalled.
    process.kill(-pid, 0);
    return true;
  } catch (err) {
    // A non-ESRCH failure is reported as alive; ESRCH falls through to the bare-pid probe.
    if ((err as NodeJS.ErrnoException).code !== "ESRCH") return true;
  }
  try {
    // Fallback: probe the bare pid.
    process.kill(pid, 0);
    return true;
  } catch {
    return false;
  }
};
318
+
319
/**
 * Production wiring for `CutoverDeps`: group-aware kill/alive, the real
 * port/pid/owner probes, the real hub stop + unit install/write/detect calls,
 * and a `setTimeout`-backed sleep. `cutoverToSupervised` layers any
 * caller-supplied overrides on top of this object.
 */
export const defaultCutoverDeps: CutoverDeps = {
  alive: groupAwareAlive,
  kill: groupAwareKill,
  pidOnPort: defaultPidOnPort,
  ownerOfPid: defaultOwnerOfPid,
  portListening: defaultPortListening,
  stopHub,
  installAndStartHubUnit,
  writeUnitWithoutStarting: defaultWriteUnitWithoutStarting,
  isHubUnitInstalled,
  probeHealth: defaultHubUnitDeps.probeHealth,
  sleep: (ms) =>
    new Promise((resolve) => {
      setTimeout(resolve, ms);
    }),
  hubUnitDeps: defaultHubUnitDeps,
};
333
+
334
/** Options for `cutoverToSupervised`; every field is optional. */
export interface CutoverOpts {
  /** Parachute config/home directory (default `CONFIG_DIR`). */
  configDir?: string;
  /** Path of the services manifest to read module ports from (default `SERVICES_MANIFEST_PATH`). */
  manifestPath?: string;
  /** Hub port (default `HUB_DEFAULT_PORT`, i.e. 1939). */
  port?: number;
  /** Absolute cli.ts path the unit runs `serve` against (default `defaultHubCliPath()`). */
  cliPath?: string;
  /** Sink for progress lines (default `console.log`). */
  log?: (line: string) => void;
  /** Overrides merged over `defaultCutoverDeps` (tests inject fakes here). */
  deps?: Partial<CutoverDeps>;
  /** Port-free / readiness budget in ms (default 15s). */
  timeoutMs?: number;
  /** Poll interval in ms (default 250). */
  pollMs?: number;
}
348
+
349
/** Terminal states a cutover run can report. */
export type CutoverOutcome =
  /** A hub unit is already installed AND the hub answers /health → nothing to do. */
  | "already-migrated"
  /** The full cutover ran end-to-end; the hub is supervised + healthy. */
  | "migrated"
  /** No service manager (container / init-less) — the cutover is impossible here. */
  | "no-manager"
  /** A declared port wouldn't free; the unit is written-but-not-started, re-runnable. */
  | "port-stuck"
  /** The unit failed to start; it stays written-but-not-started, re-runnable. */
  | "start-failed"
  /** The unit came up but never answered /health within the budget. */
  | "verify-timeout"
  /** The unit file couldn't even be written (e.g. bun unresolvable). */
  | "write-failed";
364
+
365
/** Structured result of a cutover run. */
export interface CutoverResult {
  /** Which terminal state the run ended in. */
  outcome: CutoverOutcome;
  /** The hub port. */
  port: number;
  /** Human-readable summary lines for the caller to print. */
  messages: string[];
}
371
+
372
/** One services.json module as the cutover sees it: its short name and declared port. */
interface ModuleTarget {
  /** Short module name (falls back to the manifest name when no short form exists). */
  short: string;
  /** Port the module declares in services.json. */
  port: number;
}
377
+
378
/**
 * Collect the short name + declared port of every module in services.json.
 *
 * Lenient: if the manifest cannot be read, an empty list is returned instead
 * of throwing. A module with no short-name mapping keeps its manifest name.
 *
 * @param manifestPath Path of the services manifest to read.
 */
function moduleTargets(manifestPath: string): ModuleTarget[] {
  let entries: ServiceEntry[];
  try {
    entries = readManifestLenient(manifestPath).services;
  } catch {
    return [];
  }
  return entries.map((entry) => ({
    short: shortNameForManifest(entry.name) ?? entry.name,
    port: entry.port,
  }));
}
393
+
394
/**
 * Stop a single detached module by its pidfile (mirrors lifecycle.ts's detached
 * stop arm). SIGTERM → bounded wait → SIGKILL → clear pidfile. A missing
 * pidfile is a no-op; a stale one (recorded pid no longer alive) is just
 * cleared.
 *
 * Resolves with no value. The pidfile is cleared on every path that found one,
 * and the SIGKILL is not re-verified — a process that is somehow still bound
 * to its port is left for the later orphan-sweep / verify-ports-free steps.
 *
 * @param target Module short name + declared port (only `short` is used here).
 * @param configDir Directory the pidfiles live under.
 * @param deps Injected `alive` / `kill` / `sleep` side effects.
 * @param killWaitMs How long to wait after SIGTERM before escalating to SIGKILL.
 * @param pollMs Delay between liveness polls; `0` (or less) skips the wait entirely.
 * @param log Sink for progress lines.
 */
async function stopDetachedModule(
  target: ModuleTarget,
  configDir: string,
  deps: CutoverDeps,
  killWaitMs: number,
  pollMs: number,
  log: (line: string) => void,
): Promise<void> {
  const pid = readPid(target.short, configDir);
  if (pid === undefined) return;
  if (!deps.alive(pid)) {
    // Stale pidfile: the recorded process is already gone.
    clearPid(target.short, configDir);
    return;
  }
  try {
    deps.kill(pid, "SIGTERM");
  } catch {
    // Gone between alive() and kill(); treat as stopped.
    clearPid(target.short, configDir);
    return;
  }
  const deadline = Date.now() + killWaitMs;
  while (Date.now() < deadline && deps.alive(pid)) {
    if (pollMs > 0) await deps.sleep(pollMs);
    else break; // No poll interval (tests): don't spin, go straight to the escalation check.
  }
  if (deps.alive(pid)) {
    log(` ${target.short} didn't exit; sending SIGKILL.`);
    try {
      deps.kill(pid, "SIGKILL");
    } catch {
      // Racing a just-exited process.
    }
  }
  clearPid(target.short, configDir);
  log(` ✓ stopped ${target.short}`);
}
436
+
437
/**
 * Decide whether a pid found holding a MODULE port can plausibly be attributed
 * to that parachute module — the guard that keeps the cutover from
 * blind-killing an operator's unrelated process that merely squats a declared
 * port. The pid is attributable when ANY of these hold:
 *   - it equals the module's RECORDED pid (no command-line probe is made);
 *   - its command line contains `parachute`, case-insensitively (the
 *     `~/.parachute/...` install path and the `@openparachute/<mod>` package
 *     name both carry this marker);
 *   - its command line contains `startCmdHint`, case-insensitively (when a
 *     hint is supplied — the visible call site always passes undefined; the
 *     seam is kept for a future services.json-derived start command).
 * If the command line cannot be read and the pid doesn't match, the process is
 * NOT attributable and must not be killed.
 *
 * NOTE: the module's bare short name (`vault`/`runner`/`scribe`/`notes`) is
 * deliberately NOT a marker — for a process KILL it is too loose (a `runner`
 * substring would match an unrelated CI runner squatting the port). `short` is
 * accepted but unused.
 *
 * @returns The verdict plus the probed command line (undefined when no probe
 *   was made or it yielded nothing).
 */
function orphanAttributable(args: {
  orphan: number;
  recordedPid: number | undefined;
  short: string;
  startCmdHint: string | undefined;
  ownerOfPid: OwnerProbeFn;
}): { attributable: boolean; cmdline: string | undefined } {
  const { orphan, recordedPid, startCmdHint, ownerOfPid } = args;

  // A pid we recorded ourselves needs no further evidence.
  const isRecorded = recordedPid !== undefined && orphan === recordedPid;
  if (isRecorded) return { attributable: true, cmdline: undefined };

  const cmdline = ownerOfPid(orphan);
  if (cmdline === undefined) return { attributable: false, cmdline: undefined };

  const lowered = cmdline.toLowerCase();
  const markers: string[] = ["parachute"];
  if (startCmdHint) markers.push(startCmdHint.toLowerCase());

  let attributable = false;
  for (const marker of markers) {
    if (marker.length > 0 && lowered.includes(marker)) {
      attributable = true;
      break;
    }
  }
  return { attributable, cmdline };
}
480
+
481
/**
 * §7.2 orphan sweep for one port: look up the pid bound to it and, if that
 * process is alive, adopt + kill it (mirrors stopHub's 1939 orphan-adoption,
 * per-module-port). A stale-pidfile-but-alive module is invisible to `readPid`;
 * without this sweep it stays bound and the supervised re-spawn hits
 * EADDRINUSE.
 *
 * OWNERSHIP CHECK (module ports only): when `attribute` is supplied, the
 * process is killed only if `orphanAttributable` can tie it to that parachute
 * module. An operator's own dev server squatting a module's port is NOT
 * nuked — a warning is logged and the process is left running; the later
 * verify-ports-free step turns the still-held port into a `port-stuck` outcome
 * the operator resolves. For the HUB port, omit `attribute` to keep the
 * blind-adopt behavior.
 *
 * The kill is SIGTERM followed immediately by a best-effort SIGKILL if the
 * process still reports alive — there is no wait here; the verify-ports-free
 * step polls and escalates if the port stays held.
 *
 * @returns true when the holder was adopted + signalled (or nothing live held
 *   the port); false when an UNATTRIBUTABLE process was found and deliberately
 *   left running.
 */
function sweepOrphanOnPort(
  port: number,
  label: string,
  deps: CutoverDeps,
  log: (line: string) => void,
  attribute?: { recordedPid: number | undefined; short: string; startCmdHint: string | undefined },
): boolean {
  const orphan = deps.pidOnPort(port);
  // Nothing bound, or the bound pid is already dead: nothing to sweep.
  if (orphan === undefined || !deps.alive(orphan)) return true;

  if (attribute !== undefined) {
    const verdict = orphanAttributable({
      orphan,
      recordedPid: attribute.recordedPid,
      short: attribute.short,
      startCmdHint: attribute.startCmdHint,
      ownerOfPid: deps.ownerOfPid,
    });
    if (!verdict.attributable) {
      const desc = verdict.cmdline ? `${verdict.cmdline}` : "command line unavailable";
      const warning = [
        ` ⚠ port ${port} for ${label} is held by an unrelated process (PID ${orphan}, ${desc}); refusing to kill it.`,
        " The cutover only adopts processes it can attribute to this module. Stop that process yourself,",
        " then re-run `parachute migrate --to-supervised`.",
      ];
      for (const line of warning) log(line);
      return false;
    }
  }

  log(` orphan on ${label} port ${port} (PID ${orphan}) — stopping it.`);
  try {
    deps.kill(orphan, "SIGTERM");
  } catch {
    // Already gone.
    return true;
  }
  if (!deps.alive(orphan)) return true;
  try {
    deps.kill(orphan, "SIGKILL");
  } catch {
    // Racing a just-exited process.
  }
  return true;
}
549
+
550
/**
 * Wait (bounded) for a port to stop accepting connections. The §7.1 step-5
 * race-guard: the unit must not start until 1939 (and each module port) is
 * released, or the new hub crash-loops on EADDRINUSE under Restart=always.
 *
 * Probes first, then sleeps `pollMs` between probes until the deadline passes;
 * a non-positive `pollMs` ends the polling after the first probe. One last
 * probe is always made after polling ends.
 *
 * @returns true as soon as a probe finds the port free; false if the final
 *   probe still finds a listener.
 */
async function waitPortFree(
  port: number,
  deps: CutoverDeps,
  timeoutMs: number,
  pollMs: number,
): Promise<boolean> {
  const giveUpAt = Date.now() + timeoutMs;
  let keepPolling = true;
  while (keepPolling) {
    const busy = await deps.portListening(port);
    if (!busy) return true;
    keepPolling = Date.now() < giveUpAt && pollMs > 0;
    if (keepPolling) await deps.sleep(pollMs);
  }
  const stillBusy = await deps.portListening(port);
  return !stillBusy;
}
571
+
572
/**
 * Wait (bounded) for the hub's /health to answer — the §7.1 step-7 verify.
 *
 * Probes first, then sleeps `pollMs` between probes until the deadline passes;
 * a non-positive `pollMs` ends the polling after the first probe. One last
 * probe is always made after polling ends, and its result is the answer.
 *
 * @returns true as soon as a probe succeeds; otherwise the result of the final
 *   probe.
 */
async function waitHubHealthy(
  port: number,
  deps: CutoverDeps,
  timeoutMs: number,
  pollMs: number,
): Promise<boolean> {
  const giveUpAt = Date.now() + timeoutMs;
  let keepPolling = true;
  while (keepPolling) {
    const healthy = await deps.probeHealth(port);
    if (healthy) return true;
    keepPolling = Date.now() < giveUpAt && pollMs > 0;
    if (keepPolling) await deps.sleep(pollMs);
  }
  return deps.probeHealth(port);
}
590
+
591
+ /**
592
+ * The idempotent detached→supervised cutover (§7.1). See the file header for the
593
+ * ordering + fail-safe + resumability contract. Returns a structured outcome;
594
+ * the CLI maps it to an exit code + messaging.
595
+ */
596
+ export async function cutoverToSupervised(opts: CutoverOpts = {}): Promise<CutoverResult> {
597
+ const configDir = opts.configDir ?? CONFIG_DIR;
598
+ const manifestPath = opts.manifestPath ?? SERVICES_MANIFEST_PATH;
599
+ const port = opts.port ?? HUB_DEFAULT_PORT;
600
+ const cliPath = opts.cliPath ?? defaultHubCliPath();
601
+ const log = opts.log ?? ((line) => console.log(line));
602
+ const timeoutMs = opts.timeoutMs ?? 15_000;
603
+ const pollMs = opts.pollMs ?? 250;
604
+ const deps: CutoverDeps = { ...defaultCutoverDeps, ...(opts.deps ?? {}) };
605
+
606
+ const targets = moduleTargets(manifestPath);
607
+
608
+ // --- Step 1: DETECT the current model (and the idempotent no-op). ---
609
+ const unitInstalled = deps.isHubUnitInstalled(deps.hubUnitDeps);
610
+ const hubHealthy = await deps.probeHealth(port);
611
+ if (unitInstalled && hubHealthy) {
612
+ // A unit exists AND the hub answers /health → already supervised. No-op.
613
+ return {
614
+ outcome: "already-migrated",
615
+ port,
616
+ messages: ["Already migrated — a supervised hub unit is installed and healthy."],
617
+ };
618
+ }
619
+
620
+ log("Migrating to the supervised model (parachute serve under a process manager).");
621
+ if (unitInstalled) {
622
+ // A unit is on disk but the hub isn't answering — a partial/failed prior
623
+ // cutover (unit written, not started), or the unit is stopped. Resume.
624
+ log("Found a hub unit already written (resuming a prior cutover).");
625
+ }
626
+
627
+ // --- Step 2: WRITE the unit WITHOUT starting it (the §7.1 race-avoider). ---
628
+ log("Writing the hub unit file (not starting it yet)…");
629
+ const write = deps.writeUnitWithoutStarting({
630
+ parachuteHome: configDir,
631
+ cliPath,
632
+ port,
633
+ deps: deps.hubUnitDeps,
634
+ });
635
+ for (const m of write.messages) log(` ${m}`);
636
+ if (!write.written) {
637
+ // Distinguish the two fallback causes (MUST-FIX NIT). Both bail cleanly here —
638
+ // we're still BEFORE step 3, so nothing has been stopped — but with accurate
639
+ // messaging so a bun-not-found / write failure doesn't masquerade as a
640
+ // missing-service-manager host.
641
+ if (write.cause === "no-manager") {
642
+ // No service manager on this host (container / init-less) — there is no
643
+ // unit to install; the runtime here is foreground `serve`.
644
+ return {
645
+ outcome: "no-manager",
646
+ port,
647
+ messages: [
648
+ "This host has no service manager (systemd/launchd) — the supervised model needs one.",
649
+ "Run `parachute serve` in the foreground, or use a platform that provides a manager.",
650
+ ...write.messages,
651
+ ],
652
+ };
653
+ }
654
+ // The write itself failed (bun unresolvable, or the manager errored writing
655
+ // the unit). A manager may exist — this is NOT a no-manager host.
656
+ return {
657
+ outcome: "write-failed",
658
+ port,
659
+ messages: [
660
+ "Could not write the hub unit file (bun not found, or the service manager errored) — no changes made.",
661
+ ...write.messages,
662
+ ],
663
+ };
664
+ }
665
+
666
+ // --- Step 3: STOP the detached processes (hub FIRST is not required vs
667
+ // modules, but we stop the hub then each module so children of the detached
668
+ // hub, if any, are released before their ports are swept). ---
669
+ log("Stopping the detached hub + modules…");
670
+ const stopped = await deps.stopHub({ configDir, log: (l) => log(` ${l}`) });
671
+ if (stopped) log(" ✓ stopped the detached hub");
672
+ for (const target of targets) {
673
+ await stopDetachedModule(target, configDir, deps, timeoutMs, pollMs, log);
674
+ }
675
+
676
+ // --- Step 4: §7.2 ORPHAN SWEEP — per services.json port + the hub port. ---
677
+ // The HUB port keeps the pre-existing blind-adopt (mirrors stopHub's 1939
678
+ // orphan-adoption — out of scope for MUST-FIX 2). The MODULE ports get the
679
+ // ownership check: we read the module's recorded pid (so a still-alive process
680
+ // we already know about is trivially attributable) and only adopt+kill an
681
+ // orphan we can attribute to that parachute module; an UNATTRIBUTABLE squatter
682
+ // is left running with a warning, and the verify-ports-free step turns the
683
+ // still-held port into `port-stuck`.
684
+ log("Sweeping orphaned processes still bound to declared ports…");
685
+ sweepOrphanOnPort(port, "hub", deps, log);
686
+ for (const target of targets) {
687
+ sweepOrphanOnPort(target.port, target.short, deps, log, {
688
+ recordedPid: readPid(target.short, configDir),
689
+ short: target.short,
690
+ startCmdHint: undefined,
691
+ });
692
+ }
693
+
694
+ // --- Step 5: VERIFY the hub port + each module port is free. ---
695
+ // Fail leaving the unit written-but-not-started so a retry is clean (§7.1).
696
+ log("Verifying ports are free before starting the unit…");
697
+ const portsToCheck: Array<{ port: number; label: string }> = [
698
+ { port, label: "hub" },
699
+ ...targets.map((t) => ({ port: t.port, label: t.short })),
700
+ ];
701
+ for (const p of portsToCheck) {
702
+ const free = await waitPortFree(p.port, deps, timeoutMs, pollMs);
703
+ if (!free) {
704
+ return {
705
+ outcome: "port-stuck",
706
+ port,
707
+ messages: [
708
+ `Port ${p.port} (${p.label}) is still held after stopping the detached processes.`,
709
+ "The hub unit is written but NOT started — your box is unchanged except the unit file.",
710
+ `Find what's holding the port (\`lsof -iTCP:${p.port}\`), stop it, then re-run \`parachute migrate --to-supervised\`.`,
711
+ ],
712
+ };
713
+ }
714
+ }
715
+
716
+ // --- Step 6: START the unit (enable --now / bootstrap). ---
717
+ log("Starting the hub unit…");
718
+ const started = await deps.installAndStartHubUnit({
719
+ parachuteHome: configDir,
720
+ cliPath,
721
+ port,
722
+ log: (l) => log(` ${l}`),
723
+ });
724
+ if (started.outcome === "no-manager") {
725
+ // The manager vanished between step 2 and step 6 (extremely unlikely), or
726
+ // the install degraded. The detached procs are stopped + the unit is on
727
+ // disk → re-runnable once the manager is available. Surface clearly.
728
+ return {
729
+ outcome: "start-failed",
730
+ port,
731
+ messages: [
732
+ "Could not start the hub unit via the service manager.",
733
+ "The unit file is written; re-run `parachute migrate --to-supervised` once the service manager is available,",
734
+ "or run `parachute serve` in the foreground.",
735
+ ...started.messages,
736
+ ],
737
+ };
738
+ }
739
+ if (started.outcome !== "started") {
740
+ // `timeout` / `start-failed` — the unit was (re)installed but the hub didn't
741
+ // become ready. Re-runnable; surface the unit log the helper tailed.
742
+ return {
743
+ outcome: started.outcome === "timeout" ? "verify-timeout" : "start-failed",
744
+ port,
745
+ messages: [
746
+ "The hub unit was started but the hub didn't come up cleanly.",
747
+ "Re-run `parachute migrate --to-supervised`, or check `parachute logs hub`.",
748
+ ...started.messages,
749
+ ],
750
+ };
751
+ }
752
+
753
+ // --- Step 7: VERIFY the hub answers /health. ---
754
+ log("Verifying the supervised hub is healthy…");
755
+ const healthy = await waitHubHealthy(port, deps, timeoutMs, pollMs);
756
+ if (!healthy) {
757
+ return {
758
+ outcome: "verify-timeout",
759
+ port,
760
+ messages: [
761
+ `The hub unit started but did not answer /health on 127.0.0.1:${port}.`,
762
+ "Re-run `parachute migrate --to-supervised`, or check `parachute logs hub`.",
763
+ ...started.messages,
764
+ ],
765
+ };
766
+ }
767
+
768
+ // --- Step 8: the cloudflared connector (if any) is left intact — it's its
769
+ // own unit; tailscale needs nothing. (Nothing to do here — documented for the
770
+ // reader; the connector unit is never touched by the hub cutover.) ---
771
+
772
+ return {
773
+ outcome: "migrated",
774
+ port,
775
+ messages: [
776
+ "✓ Migrated to the supervised model.",
777
+ "The hub now runs under your platform's process manager (it survives reboots),",
778
+ "and modules are supervised children that boot from services.json.",
779
+ "Per-module CLI verbs (`parachute start|stop|restart <svc>`) now drive the running hub.",
780
+ ],
781
+ };
782
+ }
783
+
784
+ // ---------------------------------------------------------------------------
785
+ // §7.4 teardown — the rollback path.
786
+ // ---------------------------------------------------------------------------
787
+
788
/**
 * Options accepted by `teardownHubUnit`. Every member is optional; the
 * function substitutes its own default for anything left out.
 */
export interface TeardownOpts {
  /** Sink for operator-facing lines. Falls back to `console.log`. */
  log?: (line: string) => void;
  /** Injectable managed-unit deps (falls back to the default hub-unit deps). */
  deps?: ManagedUnitDeps;
  /**
   * Test seam: a stand-in for the managed-unit removal routine. It receives
   * the hub's launchd label and systemd unit name, the deps to operate with,
   * and one message builder per manager flavour, and reports whether a unit
   * was removed together with the lines to show the operator.
   */
  remove?: (request: {
    launchdLabel: string;
    systemdUnitName: string;
    deps: ManagedUnitDeps;
    removedLaunchdMessage: (launchdLabel: string) => string;
    removedSystemdMessage: (systemdUnitName: string) => string;
  }) => ManagedUnitRemoveResult;
}
801
+
802
/**
 * `parachute migrate --teardown` (§7.4): remove the hub's managed unit. This is
 * the rollback path for a cutover that misbehaves.
 *
 * Only the HUB unit is targeted: the removal request carries the hub's launchd
 * label and systemd unit name and nothing else, so the cloudflared connector
 * unit is untouched, and no detached hub is re-spawned. After a successful
 * removal the operator is told how to bring a hub back (`parachute serve` in
 * the foreground, or re-running the migration).
 *
 * When no unit was removed, a single "nothing to tear down" line is logged and
 * the result's own messages are NOT logged (they are still returned).
 *
 * NOTE(review): this function adds no error handling around the removal call,
 * so the "teardown never throws" contract rests on the removal routine being
 * best-effort — confirm against `removeManagedUnit`.
 *
 * @param opts Optional logger, managed-unit deps, and removal-routine override.
 * @returns The removal routine's result, passed through unchanged.
 */
export function teardownHubUnit(opts: TeardownOpts = {}): { removed: boolean; messages: string[] } {
  const emit = opts.log ?? ((line) => console.log(line));
  const unitDeps = opts.deps ?? defaultHubUnitDeps;
  const removeUnit = opts.remove ?? removeManagedUnit;

  const result = removeUnit({
    launchdLabel: HUB_LAUNCHD_LABEL,
    systemdUnitName: HUB_SYSTEMD_UNIT_NAME,
    deps: unitDeps,
    removedLaunchdMessage: (label) =>
      `Removed launchd LaunchAgent ${label} — the hub no longer starts on login/boot.`,
    removedSystemdMessage: (unitName) =>
      `Removed systemd unit ${unitName} — the hub no longer starts on boot.`,
  });

  // Nothing was installed: say so and hand the result straight back.
  if (!result.removed) {
    emit("No hub unit was installed — nothing to tear down.");
    return result;
  }

  // Removal happened: relay the routine's messages, then the follow-up guidance.
  const report = [
    ...result.messages,
    "",
    "The supervised hub unit is gone. To run the hub now, either:",
    "  - `parachute serve` (foreground), or",
    "  - `parachute migrate --to-supervised` to reinstall the unit.",
  ];
  for (const line of report) emit(line);
  return result;
}