@openparachute/hub 0.6.1 → 0.6.3-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,735 @@
1
+ /**
2
+ * "Ensure the hub UNIT is up" — the Phase 3a successor to the detached
3
+ * `ensureHubRunning` spawn (`hub-control.ts:200`).
4
+ *
5
+ * Under the hub-as-supervisor unification (design:
6
+ * `parachute.computer/design/2026-06-01-hub-as-supervisor-unification.md` §3.2)
7
+ * the hub no longer runs as a detached, `unref()`'d `bun hub-server.ts` tracked
8
+ * by a pidfile. It runs as `parachute serve` (foreground hub + in-process
9
+ * Supervisor) under a per-platform process manager — a launchd LaunchAgent on
10
+ * macOS, a systemd unit on Linux, the container runtime on Render/Fly. So
11
+ * "ensure the hub is up" becomes "ensure the hub UNIT is started", driven
12
+ * through the platform manager, NEVER a detached spawn.
13
+ *
14
+ * THE §3.2 ALGORITHM (`ensureHubUnit`):
15
+ * 1. Probe the loopback hub (`GET /health` on the configured port). If it
16
+ * answers → return "already up".
17
+ * 2. If down, start the hub unit via the platform manager:
18
+ * `systemctl [--user] start parachute-hub.service` (system vs user by
19
+ * uid, mirroring `managed-unit.ts`) / `launchctl kickstart -k
20
+ * gui/<uid>/computer.parachute.hub`.
21
+ * 3. If NO unit is installed → fail with an actionable "run `parachute
22
+ * migrate`" message (or, from `init`, init installs it — see
23
+ * `installAndStartHubUnit`). NEVER a detached fallback spawn.
24
+ * 4. If NO manager at all (no systemctl AND no launchctl) → clear
25
+ * foreground-`serve`-only message (R19 / D1). Don't hang, don't spawn.
26
+ * 5. Wait for hub readiness by polling the hub port (`defaultPortListening`).
27
+ * On timeout, surface the unit's recent log so a wedged hub is
28
+ * diagnosable, not a silent hang.
29
+ *
30
+ * EVERYTHING behind the injectable {@link HubUnitDeps} seam (mirroring
31
+ * `ManagedUnitDeps` / `ensureHubRunning`'s seam) so it's unit-testable without
32
+ * touching the real OS.
33
+ *
34
+ * SCOPE (Phase 3a): this is a NEW helper. It does NOT replace
35
+ * `ensureHubRunning` — `expose` / `expose-cloudflare` / `lifecycle` keep using
36
+ * the old detached path until their phases (4/5). Only `init` adopts the new
37
+ * path in 3a (via `installAndStartHubUnit`), and 3b/Phase 4 will adopt
38
+ * `ensureHubUnit` for the per-module-verb bringup.
39
+ */
40
+
41
+ import {
42
+ HUB_LAUNCHD_LABEL,
43
+ HUB_SYSTEMD_UNIT_NAME,
44
+ type ManagedUnit,
45
+ type ManagedUnitDeps,
46
+ type ManagedUnitInstallResult,
47
+ type ManagedUnitMessages,
48
+ type ServiceCommandResult,
49
+ buildHubManagedUnit,
50
+ defaultManagedUnitDeps,
51
+ installManagedUnit,
52
+ launchdPlistPathForLabel,
53
+ systemdUnitPathForName,
54
+ } from "./managed-unit.ts";
55
+ import { type PortListeningFn, defaultPortListening } from "./port-probe.ts";
56
+
57
/**
 * Default canonical hub port (the 1939 pin). `ensureHubUnit` and
 * `installAndStartHubUnit` fall back to this value when the caller does not
 * supply a `port` option.
 */
export const HUB_UNIT_DEFAULT_PORT = 1939;
59
+
60
/**
 * Injectable side-effect seam for the ensure-hub-unit machinery. EXTENDS
 * `ManagedUnitDeps` (platform / getuid / homeDir / userName / which / run /
 * file ops — so the same fake drives both `ensureHubUnit` AND the
 * `installManagedUnit` call inside `installAndStartHubUnit` with no unsafe
 * cast) plus the three extra probes ensure-hub needs: an HTTP `/health` probe,
 * a TCP port-listening probe, and a `sleep` (all deterministically stubbable).
 *
 * Production wires {@link defaultHubUnitDeps}; tests inject fakes so no
 * launchctl/systemctl/socket/HTTP call ever touches the real OS.
 */
export interface HubUnitDeps extends ManagedUnitDeps {
  /**
   * HTTP `/health` probe of the loopback hub. Resolves true when the hub
   * answers 2xx, false on connection-refused / non-2xx / timeout. Production
   * uses a bounded `fetch`; tests inject a deterministic stub.
   */
  probeHealth: (port: number) => Promise<boolean>;
  /**
   * TCP connect-probe used for readiness polling after the unit has been
   * started (production reuses `defaultPortListening`).
   */
  portListening: PortListeningFn;
  /** Sleep between readiness polls, in milliseconds (tests pin to 0). */
  sleep: (ms: number) => Promise<void>;
}
83
+
84
/**
 * Default `/health` probe: a bounded `fetch` to `http://127.0.0.1:<port>/health`.
 * Any non-2xx / network error / timeout → false (treated as "hub not up").
 * 1.5s timeout so a wedged-but-listening socket doesn't hang the probe.
 *
 * Only the status line matters, so the (unread) response body is cancelled
 * before returning; otherwise the runtime may keep the loopback connection
 * open until the body is garbage-collected.
 *
 * @param port Loopback port the hub is expected to listen on.
 * @returns `true` when the hub answered with a 2xx status, `false` otherwise.
 *   Never rejects.
 */
async function defaultProbeHealth(port: number): Promise<boolean> {
  let res: Response;
  try {
    res = await fetch(`http://127.0.0.1:${port}/health`, {
      signal: AbortSignal.timeout(1500),
    });
  } catch {
    // Connection refused, DNS/network failure, or the 1.5s abort fired.
    return false;
  }
  try {
    // Release the connection; the payload itself is irrelevant to the verdict.
    await res.body?.cancel();
  } catch {
    // Best-effort cleanup — a failed cancel must not change the probe result.
  }
  return res.ok;
}
99
+
100
/**
 * Production wiring for {@link HubUnitDeps}: everything from
 * `defaultManagedUnitDeps`, plus the real `/health` probe, the real TCP port
 * probe, and a timer-backed `sleep`.
 */
export const defaultHubUnitDeps: HubUnitDeps = {
  ...defaultManagedUnitDeps,
  probeHealth: defaultProbeHealth,
  portListening: defaultPortListening,
  sleep: (ms) =>
    new Promise((resolve) => {
      setTimeout(resolve, ms);
    }),
};
106
+
107
/**
 * Every way an "ensure / install-and-start the hub unit" attempt can end.
 * Shared by `ensureHubUnit` and `installAndStartHubUnit`; the caller maps the
 * outcome to exit codes and messaging.
 */
export type HubUnitOutcome =
  /** Hub already answered `/health` — no manager call was needed. */
  | "already-up"
  /** The manager was driven to start the unit and the hub became ready. */
  | "started"
  /** No unit is installed (actionable error). */
  | "no-unit"
  /**
   * No service manager (systemd/launchd) is available (actionable error).
   * `installAndStartHubUnit` also reports this when the unit install itself
   * comes back with its `fallback` outcome.
   */
  | "no-manager"
  /** The unit was started but the hub never answered within the timeout. */
  | "timeout"
  /**
   * The platform manager rejected the start command. `installAndStartHubUnit`
   * also reports this when building the unit definition throws.
   */
  | "start-failed";
120
+
121
/** Structured result of `ensureHubUnit`; the caller decides how to present it. */
export interface EnsureHubUnitResult {
  /** How the attempt ended (see {@link HubUnitOutcome}). */
  outcome: HubUnitOutcome;
  /** The hub port that was probed. */
  port: number;
  /** Human-readable lines the caller should surface (errors, log tails). */
  messages: string[];
}
128
+
129
/** Options for `ensureHubUnit`; every field is optional. */
export interface EnsureHubUnitOpts {
  /** Hub port to probe + wait on (default 1939). */
  port?: number;
  /** Injectable deps (defaults to production). */
  deps?: HubUnitDeps;
  /** Readiness budget in ms (default 15s). */
  readyTimeoutMs?: number;
  /** Poll interval in ms (default 250). */
  readyPollMs?: number;
  /** How many trailing log lines to surface on timeout (default 50). */
  logTailLines?: number;
  /**
   * Progress sink. `ensureHubUnit` calls it once, just before driving the
   * service manager to start the unit. Defaults to a no-op.
   */
  log?: (line: string) => void;
}
142
+
143
/**
 * Reports whether the hub's unit file exists on disk for the current platform.
 *
 * - macOS: the LaunchAgent plist for `HUB_LAUNCHD_LABEL` under the home dir.
 * - Linux: the systemd unit file for `HUB_SYSTEMD_UNIT_NAME`; the system vs
 *   user location is chosen by whether the uid is 0 (a missing uid is treated
 *   as non-root).
 * - Any other platform: always `false`.
 *
 * @param deps Side-effect seam (platform, uid, home dir, file-exists check).
 * @returns `true` when the unit file is present.
 */
export function isHubUnitInstalled(deps: HubUnitDeps): boolean {
  const homeDir = deps.homeDir();
  switch (deps.platform) {
    case "darwin": {
      const plistPath = launchdPlistPathForLabel(HUB_LAUNCHD_LABEL, homeDir);
      return deps.exists(plistPath);
    }
    case "linux": {
      const isRoot = (deps.getuid() ?? 1000) === 0;
      const unitPath = systemdUnitPathForName(HUB_SYSTEMD_UNIT_NAME, homeDir, isRoot);
      return deps.exists(unitPath);
    }
    default:
      return false;
  }
}
155
+
156
/**
 * Is a service manager available on this platform at all? The check is purely
 * "does the manager's CLI resolve via `deps.which`": `launchctl` on macOS,
 * `systemctl` on Linux. Every other platform — and a box where the binary is
 * missing (a bare container, an init-less host) — has no manager, which is the
 * foreground-`serve`-only path (R19/D1).
 *
 * @param deps Side-effect seam (platform + `which`).
 * @returns `true` when the platform's manager binary was found.
 */
export function hasServiceManager(deps: HubUnitDeps): boolean {
  const managerBinary =
    deps.platform === "darwin" ? "launchctl" : deps.platform === "linux" ? "systemctl" : null;
  return managerBinary !== null && deps.which(managerBinary) !== null;
}
166
+
167
/**
 * The "no service manager found" actionable message (R19 / D1). Returned by
 * every entry point in this module that first checks `hasServiceManager`, and
 * reused as the launchctl/systemctl-missing text in `hubUnitMessages`.
 */
export const NO_MANAGER_MESSAGE =
  "no service manager (systemd/launchd) found — run `parachute serve` in the foreground, or use a platform that provides one";

/**
 * The "no hub unit installed" actionable message (§3.2 step 3). Returned when
 * a manager exists but `isHubUnitInstalled` finds no unit file.
 */
export const NO_UNIT_MESSAGE = "no hub unit installed — run `parachute migrate` to install it";
173
+
174
/**
 * Ask the platform manager to start the hub unit and hand back the raw command
 * result, so the caller can surface stderr when the manager refuses.
 *
 * - macOS: `launchctl kickstart -k gui/<uid>/<label>` (force-restart-or-start).
 *   A missing uid falls back to 0.
 * - Everything else is treated as systemd: `systemctl start <unit>` when the
 *   uid is 0, `systemctl --user start <unit>` otherwise (a missing uid counts
 *   as non-root).
 *
 * NOTE: `launchctl kickstart` requires the unit to be bootstrapped already
 * (the install path does that). If it isn't loaded, kickstart returns non-zero
 * — which surfaces as a start-failure with the manager's stderr, not a hang.
 *
 * @param deps Side-effect seam (platform, uid, command runner).
 * @returns The manager command's exit code / stdout / stderr.
 */
function startHubUnitViaManager(deps: HubUnitDeps): ServiceCommandResult {
  if (deps.platform !== "darwin") {
    const userScoped = (deps.getuid() ?? 1000) !== 0;
    const argv = userScoped
      ? ["systemctl", "--user", "start", HUB_SYSTEMD_UNIT_NAME]
      : ["systemctl", "start", HUB_SYSTEMD_UNIT_NAME];
    return deps.run(argv);
  }
  const serviceTarget = `gui/${deps.getuid() ?? 0}/${HUB_LAUNCHD_LABEL}`;
  return deps.run(["launchctl", "kickstart", "-k", serviceTarget]);
}
195
+
196
/**
 * Best-effort diagnostics for a readiness timeout (§3.2 step 5), so a wedged
 * hub is diagnosable. Queries the platform's native source only:
 *
 * - Linux, when `journalctl` resolves: `journalctl [--user] -u <unit> -n <lines>
 *   --no-pager` — a genuine tail of the unit's output.
 * - macOS: `launchctl print gui/<uid>/<label>` — the service STATE DESCRIPTOR
 *   (load state, last exit code, pid, env), NOT a log tail. Still useful (a
 *   crash-looping unit shows its last exit code), but it won't show the hub's
 *   recent stderr. A richer launchd equivalent would be
 *   `log show --predicate 'process == "bun"' --last 5m` (or a tail of the
 *   unit's logPath) — a possible future refinement; not wired now.
 *
 * There is no further fallback: when the command fails, prints nothing, or
 * throws, the result is simply empty. Never throws — diagnostics must not mask
 * the timeout itself.
 *
 * @param deps Side-effect seam (platform, uid, `which`, command runner).
 * @param lines Number of trailing journal lines to request (Linux only).
 * @returns A two-element `[heading, output]` array, or `[]` when nothing
 *   could be collected.
 */
function tailHubUnitLog(deps: HubUnitDeps, lines: number): string[] {
  try {
    if (deps.platform === "linux") {
      if (deps.which("journalctl") === null) return [];
      const userScoped = (deps.getuid() ?? 1000) !== 0;
      const argv = ["journalctl"];
      if (userScoped) argv.push("--user");
      argv.push("-u", HUB_SYSTEMD_UNIT_NAME, "-n", String(lines), "--no-pager");
      const journal = deps.run(argv);
      const hasOutput = journal.code === 0 && journal.stdout.trim().length > 0;
      return hasOutput ? ["Recent hub unit log (journalctl):", journal.stdout.trimEnd()] : [];
    }
    if (deps.platform === "darwin") {
      const serviceTarget = `gui/${deps.getuid() ?? 0}/${HUB_LAUNCHD_LABEL}`;
      const descriptor = deps.run(["launchctl", "print", serviceTarget]);
      const hasOutput = descriptor.code === 0 && descriptor.stdout.trim().length > 0;
      return hasOutput
        ? ["Hub unit state (launchctl print):", descriptor.stdout.trimEnd()]
        : [];
    }
    return [];
  } catch {
    // Diagnostics are best-effort; a failing manager call yields no extra lines.
    return [];
  }
}
242
+
243
/**
 * Outcome of a `stop hub` / `restart hub` via the platform manager (§3.3).
 * Produced by `stopHubUnit` and `restartHubUnit`.
 */
export type HubUnitManagerOpOutcome =
  /** The manager command succeeded (the unit was stopped / restarted). */
  | "ok"
  /** No service manager (systemd/launchd) is available. */
  | "no-manager"
  /** No hub unit is installed. */
  | "no-unit"
  /** The platform manager rejected the command (carries its stderr). */
  | "failed";
253
+
254
/** Structured result of `stopHubUnit` / `restartHubUnit`. */
export interface HubUnitManagerOpResult {
  /** How the manager operation ended (see {@link HubUnitManagerOpOutcome}). */
  outcome: HubUnitManagerOpOutcome;
  /** Human-readable lines the caller should surface (empty on `ok`). */
  messages: string[];
}
259
+
260
/**
 * Stop the hub UNIT via the platform manager (design §3.3 `stop hub` row).
 *
 * Always goes through the manager — NEVER a PID signal. launchd `KeepAlive`
 * and systemd `Restart=always` would immediately respawn a killed PID (R17),
 * so a `kill` would be silently undone. The manager call withdraws the
 * keep-alive intent so the hub actually stays down:
 *   - launchd → `launchctl bootout gui/<uid>/<label>` (unloads + stops; a
 *     subsequent `start hub` re-bootstraps via the install path / `init`).
 *   - systemd → `systemctl [--user] stop <unit>` (Restart=always does not
 *     re-trigger on an explicit `stop`).
 *
 * Children die with the hub (`serve`'s stop() SIGTERMs all supervised children
 * before `server.stop()`), so stopping the unit stops every module too.
 *
 * Does NOT install a unit when none exists, and does NOT signal any PID.
 *
 * @param deps Side-effect seam (platform, uid, command runner, probes).
 * @returns `no-manager` / `no-unit` with the matching actionable message when
 *   the precondition fails; `failed` with the manager's stderr (or stdout, or
 *   "unknown error") on a non-zero exit; otherwise `ok` with no messages.
 */
export function stopHubUnit(deps: HubUnitDeps): HubUnitManagerOpResult {
  if (!hasServiceManager(deps)) {
    return { outcome: "no-manager", messages: [NO_MANAGER_MESSAGE] };
  }
  if (!isHubUnitInstalled(deps)) {
    return { outcome: "no-unit", messages: [NO_UNIT_MESSAGE] };
  }

  let argv: string[];
  if (deps.platform === "darwin") {
    // bootout unloads + stops the LaunchAgent so KeepAlive can't resurrect it.
    argv = ["launchctl", "bootout", `gui/${deps.getuid() ?? 0}/${HUB_LAUNCHD_LABEL}`];
  } else {
    const userScoped = (deps.getuid() ?? 1000) !== 0;
    argv = userScoped
      ? ["systemctl", "--user", "stop", HUB_SYSTEMD_UNIT_NAME]
      : ["systemctl", "stop", HUB_SYSTEMD_UNIT_NAME];
  }

  const stopped = deps.run(argv);
  if (stopped.code === 0) {
    return { outcome: "ok", messages: [] };
  }
  const detail = stopped.stderr.trim() || stopped.stdout.trim() || "unknown error";
  return {
    outcome: "failed",
    messages: [`failed to stop the hub unit via the service manager (${detail})`],
  };
}
304
+
305
/**
 * Restart the hub UNIT via the platform manager (design §3.3 `restart hub`
 * row). Always goes through the manager — NEVER a PID signal (same R17
 * reasoning as {@link stopHubUnit}). NOT a per-module fan-out: restarting the
 * hub tears down all supervised children and re-boots every module from
 * `services.json`, so a unit restart is already a total restart of the box's
 * modules.
 *   - launchd → `launchctl kickstart -k gui/<uid>/<label>` (force-restart;
 *     the same command the start path uses, which on an already-loaded unit
 *     kills + relaunches).
 *   - systemd → `systemctl [--user] restart <unit>`.
 *
 * @param deps Side-effect seam (platform, uid, command runner, probes).
 * @returns `no-manager` / `no-unit` with the matching actionable message when
 *   the precondition fails; `failed` with the manager's stderr (or stdout, or
 *   "unknown error") on a non-zero exit; otherwise `ok` with no messages.
 */
export function restartHubUnit(deps: HubUnitDeps): HubUnitManagerOpResult {
  if (!hasServiceManager(deps)) {
    return { outcome: "no-manager", messages: [NO_MANAGER_MESSAGE] };
  }
  if (!isHubUnitInstalled(deps)) {
    return { outcome: "no-unit", messages: [NO_UNIT_MESSAGE] };
  }

  let argv: string[];
  if (deps.platform === "darwin") {
    argv = ["launchctl", "kickstart", "-k", `gui/${deps.getuid() ?? 0}/${HUB_LAUNCHD_LABEL}`];
  } else {
    const userScoped = (deps.getuid() ?? 1000) !== 0;
    argv = userScoped
      ? ["systemctl", "--user", "restart", HUB_SYSTEMD_UNIT_NAME]
      : ["systemctl", "restart", HUB_SYSTEMD_UNIT_NAME];
  }

  const restarted = deps.run(argv);
  if (restarted.code === 0) {
    return { outcome: "ok", messages: [] };
  }
  const detail = restarted.stderr.trim() || restarted.stdout.trim() || "unknown error";
  return {
    outcome: "failed",
    messages: [`failed to restart the hub unit via the service manager (${detail})`],
  };
}
343
+
344
/**
 * Run-state of the hub UNIT as reported by the platform manager (design §6.4).
 * This is the manager's view — NOT a liveness verdict. The hub answering
 * `/health` is the liveness signal; the caller (`status`) composes the two
 * (manager says `active` + `/health` answers → "running"; `active` but no
 * `/health` yet → "starting/unhealthy"; `failed` → "failed").
 */
export type HubUnitState =
  /** systemd `is-active` → `active`; launchd `print` → `state = running`. */
  | "active"
  /**
   * systemd `is-active` → `activating` / `reloading` (and `deactivating`,
   * which is folded in here as another transient state); launchd transient.
   */
  | "activating"
  /** systemd `is-active` → `failed`; launchd nonzero `last exit code`. */
  | "failed"
  /** systemd `is-active` → `inactive` / `dead`; launchd not-running, clean. */
  | "inactive"
  /** A hub unit is installed but the manager couldn't classify it. */
  | "unknown"
  /** No hub unit file is installed on this platform. */
  | "no-unit"
  /**
   * No on-box service manager exists at all (container runtime / init-less
   * host). There is nothing to query — `status` reports "container runtime
   * (managed)" and leans on `/health` for liveness (§6.4).
   */
  | "no-manager";
370
+
371
/** Result of `queryHubUnitState`: the classified state plus optional diagnostics. */
export interface HubUnitStateResult {
  /** The manager-reported run-state (see {@link HubUnitState}). */
  state: HubUnitState;
  /** Last exit code, when the manager surfaced one (launchd / failed unit). */
  lastExitCode?: number;
  /** Raw manager output (trimmed), for diagnostics on `unknown` / `failed`. */
  detail?: string;
}
378
+
379
/**
 * Lookup from a `systemctl is-active` state word to our smaller
 * {@link HubUnitState} vocabulary.
 *
 * `deactivating` (an in-flight stop transition) deliberately shares the
 * `activating` bucket with the other transient tokens: it is a transition, not
 * a terminal state, so a unit being stopped may briefly read as transient
 * before the next poll sees `inactive`. Better that than flapping to a false
 * `active`. NOTE(review): the previous comment called this bucket `pending`,
 * presumably the label the `status` caller renders — confirm.
 */
const SYSTEMD_ACTIVE_TOKEN_STATES: ReadonlyMap<string, HubUnitState> = new Map<
  string,
  HubUnitState
>([
  ["active", "active"],
  ["activating", "activating"],
  ["reloading", "activating"],
  ["deactivating", "activating"],
  ["failed", "failed"],
  ["inactive", "inactive"],
  ["dead", "inactive"],
]);

/**
 * Map a systemd `systemctl is-active` stdout token to a {@link HubUnitState}.
 * Surrounding whitespace is ignored; the transient tokens collapse onto
 * `activating` so `status` doesn't have to know them all.
 *
 * @param token The state word as printed by `is-active` (may carry a newline).
 * @returns The mapped state, or `unknown` for any token not in the table.
 */
function mapSystemdActiveToken(token: string): HubUnitState {
  return SYSTEMD_ACTIVE_TOKEN_STATES.get(token.trim()) ?? "unknown";
}
407
+
408
/**
 * Parse a launchd `launchctl print gui/<uid>/<label>` state descriptor into a
 * {@link HubUnitStateResult}. The descriptor is multi-line key/value; only the
 * first `state = …` line and the `last exit code = …` line are read.
 *
 * Classification, in order:
 *   1. `state = running` → `active`.
 *   2. Not running with a numeric last-exit: nonzero → `failed` (KeepAlive
 *      units that crash-loop surface a nonzero last-exit even between
 *      respawns); zero → `inactive`.
 *   3. Not running, a `state = …` line was recognised, but no numeric
 *      last-exit (e.g. launchd reporting `last exit code = (never exited)`
 *      for a unit that is loaded but has not run) → `inactive`.
 *   4. Neither line could be parsed: non-empty text → `unknown` (a layout we
 *      don't recognise must NOT be misread as a false `inactive`, which would
 *      tell the operator the hub is stopped when it may be running); empty
 *      text → `inactive` (the label isn't loaded).
 *
 * `lastExitCode` is included whenever a numeric exit code was found, and
 * `detail` (the trimmed descriptor) whenever the input was non-empty; neither
 * key is present otherwise.
 *
 * @param stdout Raw stdout of `launchctl print`.
 * @returns The classified state with optional exit code and raw detail.
 */
function parseLaunchctlPrint(stdout: string): HubUnitStateResult {
  const stateToken = /^\s*state\s*=\s*(\S+)/im.exec(stdout)?.[1]?.toLowerCase();
  const exitToken = /last exit code\s*=\s*(-?\d+)/i.exec(stdout)?.[1];
  const lastExitCode = exitToken !== undefined ? Number(exitToken) : undefined;
  const detail = stdout.trim();

  // Attach the optional fields only when they carry information, so callers
  // never see an explicit `undefined` property.
  const classify = (state: HubUnitStateResult["state"]): HubUnitStateResult => ({
    state,
    ...(lastExitCode !== undefined ? { lastExitCode } : {}),
    ...(detail.length > 0 ? { detail } : {}),
  });

  if (stateToken === "running") return classify("active");
  if (lastExitCode !== undefined) return classify(lastExitCode !== 0 ? "failed" : "inactive");
  // A recognised state line that isn't `running` means the unit is loaded but
  // idle, even when launchd printed no numeric exit code.
  if (stateToken !== undefined) return classify("inactive");
  return classify(detail.length > 0 ? "unknown" : "inactive");
}
448
+
449
/**
 * Query the platform manager for the hub unit's run-state (design §6.4 hub
 * row). The `status`-side counterpart to {@link stopHubUnit} /
 * {@link restartHubUnit}: a READ, never a mutation.
 *
 *   - No manager (container / init-less) → `no-manager`. `status` reports
 *     "container runtime (managed)" — there's nothing on-box to query, and the
 *     `/health` answer is the liveness signal.
 *   - No unit file installed → `no-unit` (a legacy detached box that somehow
 *     reached this read; the dual-dispatch branch in `status` guards against it).
 *   - systemd → `systemctl [--user] is-active <unit>`. `is-active` exits 0
 *     only when active, so the state word is read from stdout (stderr when
 *     stdout is blank) regardless of exit code and mapped via
 *     {@link mapSystemdActiveToken}. No output at all → `unknown`; an
 *     unrecognised word → `unknown` with that word as `detail`.
 *   - launchd → `launchctl print gui/<uid>/<label>`, parsed via
 *     {@link parseLaunchctlPrint}. Blank stdout means the label isn't loaded
 *     (unit file on disk, never bootstrapped) → `inactive`, carrying the
 *     manager's stderr as `detail` when there is any.
 *
 * A manager command that throws degrades to `unknown` with the error message
 * in `detail`, so `status` can render a sensible row rather than crash.
 *
 * @param deps Side-effect seam (platform, uid, command runner, probes).
 * @returns The classified state with optional diagnostics.
 */
export function queryHubUnitState(deps: HubUnitDeps): HubUnitStateResult {
  if (!hasServiceManager(deps)) return { state: "no-manager" };
  if (!isHubUnitInstalled(deps)) return { state: "no-unit" };

  try {
    if (deps.platform !== "darwin") {
      // linux / systemd
      const userScoped = (deps.getuid() ?? 1000) !== 0;
      const argv = userScoped
        ? ["systemctl", "--user", "is-active", HUB_SYSTEMD_UNIT_NAME]
        : ["systemctl", "is-active", HUB_SYSTEMD_UNIT_NAME];
      const probe = deps.run(argv);
      const token = probe.stdout.trim() || probe.stderr.trim();
      if (token.length === 0) return { state: "unknown" };
      const state = mapSystemdActiveToken(token);
      if (state === "unknown") return { state, detail: token };
      return { state };
    }

    const serviceTarget = `gui/${deps.getuid() ?? 0}/${HUB_LAUNCHD_LABEL}`;
    const descriptor = deps.run(["launchctl", "print", serviceTarget]);
    if (descriptor.stdout.trim().length > 0) {
      return parseLaunchctlPrint(descriptor.stdout);
    }
    const stderrText = descriptor.stderr.trim();
    return stderrText ? { state: "inactive", detail: stderrText } : { state: "inactive" };
  } catch (err) {
    // A manager-query failure must never crash `status` — degrade to unknown.
    return { state: "unknown", detail: err instanceof Error ? err.message : String(err) };
  }
}
497
+
498
/**
 * Ensure the hub UNIT is up (design §3.2). Returns a structured outcome — the
 * CALLER decides exit codes / messaging (so `init` and the future
 * per-module-verb path can present it differently).
 *
 * Sequence:
 *   1. Probe `/health`; an answer means `already-up` with no manager call.
 *   2. No service manager at all → `no-manager` (R19/D1).
 *   3. No unit file installed → `no-unit`. This function never installs one
 *      (`init` installs; per-module verbs tell the operator to run
 *      `parachute migrate`) and never spawns a detached hub.
 *   4. Start the unit via the manager; a non-zero exit → `start-failed` with
 *      the manager's stderr (or stdout) in the message.
 *   5. Poll the TCP port until it listens (`started`) or the budget runs out
 *      (`timeout`, with the unit's recent log appended for diagnosis).
 *
 * @param opts Port, deps, readiness budget / poll interval, log-tail size and
 *   progress sink; all optional.
 * @returns The outcome, the probed port, and any lines to surface.
 */
export async function ensureHubUnit(opts: EnsureHubUnitOpts = {}): Promise<EnsureHubUnitResult> {
  const deps = opts.deps ?? defaultHubUnitDeps;
  const port = opts.port ?? HUB_UNIT_DEFAULT_PORT;
  const budgetMs = opts.readyTimeoutMs ?? 15_000;
  const pollMs = opts.readyPollMs ?? 250;
  const tailLines = opts.logTailLines ?? 50;
  const emit = opts.log ?? (() => {});

  const finish = (
    outcome: EnsureHubUnitResult["outcome"],
    messages: string[] = [],
  ): EnsureHubUnitResult => ({ outcome, port, messages });

  // Already answering on loopback: nothing to drive.
  if (await deps.probeHealth(port)) return finish("already-up");

  // The manager check comes before the unit check: without systemctl or
  // launchctl a background unit can't exist at all.
  if (!hasServiceManager(deps)) return finish("no-manager", [NO_MANAGER_MESSAGE]);
  if (!isHubUnitInstalled(deps)) return finish("no-unit", [NO_UNIT_MESSAGE]);

  emit("Hub not responding — starting the hub unit via the service manager…");
  const startResult = startHubUnitViaManager(deps);
  if (startResult.code !== 0) {
    const detail = startResult.stderr.trim() || startResult.stdout.trim() || "unknown error";
    return finish("start-failed", [
      `failed to start the hub unit via the service manager (${detail}) — run \`parachute migrate\` to (re)install it, or \`parachute serve\` in the foreground`,
    ]);
  }

  // Readiness: probe, then sleep-and-reprobe while the budget lasts. A
  // non-positive poll interval skips the sleeping entirely.
  const deadline = Date.now() + budgetMs;
  let listening = await deps.portListening(port);
  while (!listening && !(Date.now() >= deadline) && pollMs > 0) {
    await deps.sleep(pollMs);
    listening = await deps.portListening(port);
  }
  // One last look after giving up on the loop (covers a zero poll interval and
  // a clock that jumped past the deadline).
  if (!listening) listening = await deps.portListening(port);
  if (listening) return finish("started");

  return finish("timeout", [
    `hub unit started but did not become ready on 127.0.0.1:${port} within ${Math.round(
      budgetMs / 1000,
    )}s`,
    ...tailHubUnitLog(deps, tailLines),
  ]);
}
571
+
572
/** Structured result of `installAndStartHubUnit`. */
export interface InstallAndStartHubUnitResult {
  /** Outcome of the post-install readiness wait (see {@link HubUnitOutcome}). */
  outcome: HubUnitOutcome;
  /** The hub port. */
  port: number;
  /**
   * Result of the unit-file install step. When the install was never
   * attempted (no manager, or the unit could not be built) this is a
   * synthesized `fallback` result carrying the explanatory message.
   */
  install: ManagedUnitInstallResult;
  /** Human-readable lines the caller should surface. */
  messages: string[];
}
582
+
583
/** Options for `installAndStartHubUnit`. */
export interface InstallAndStartHubUnitOpts {
  /** The operator's CURRENT `PARACHUTE_HOME` (captured per §4.2). */
  parachuteHome: string;
  /**
   * Absolute path to `parachute-hub`'s `src/cli.ts` the unit runs `serve`
   * against. Caller resolves it (the bun-linked checkout or installed bin).
   */
  cliPath: string;
  /** Hub port (default 1939). */
  port?: number;
  /** `$BUN_INSTALL` to bake into the unit env (default `$HOME/.bun`). */
  bunInstall?: string;
  /** PATH to bake into the unit env (default a bun-bin-first sane PATH). */
  path?: string;
  /**
   * Log file the hub's stdout+stderr is written to (default
   * `<parachuteHome>/hub/logs/hub.log`).
   */
  logPath?: string;
  /** Injectable deps (defaults to production). */
  deps?: HubUnitDeps;
  /** Readiness budget in ms (default 15s). */
  readyTimeoutMs?: number;
  /** Poll interval in ms (default 250). */
  readyPollMs?: number;
  /**
   * Progress sink. Receives each install message once the unit install has
   * succeeded. Defaults to a no-op.
   */
  log?: (line: string) => void;
}
607
+
608
/**
 * Builds the message catalogue handed to `installManagedUnit` for the hub unit
 * (hub wording, mirroring the connector's). Both "manager binary missing"
 * entries reuse `NO_MANAGER_MESSAGE`; the two "installed" entries mention that
 * the hub "now runs" only when the install also started it.
 *
 * @returns A fresh `ManagedUnitMessages` object on every call.
 */
export function hubUnitMessages(): ManagedUnitMessages {
  const catalogue: ManagedUnitMessages = {
    launchctlMissing: NO_MANAGER_MESSAGE,
    systemctlMissing: NO_MANAGER_MESSAGE,
    lingerWarning:
      "Note: could not enable lingering (loginctl enable-linger) — the hub will run while you're logged in but may not start on a cold boot before login. Re-run as root (a system unit needs no linger) if you want cold-boot survival.",
    writeFailedPrefix: "Failed to write the hub unit file",
    launchctlLoadFailedPrefix: "launchctl could not load the hub unit",
    daemonReloadFailedPrefix: "systemctl daemon-reload failed",
    enableFailedPrefix: "systemctl enable --now failed",
    launchdInstalled: (label, started) => {
      const runsNow = started ? "now runs and " : "";
      return `Installed launchd LaunchAgent ${label} — the hub ${runsNow}starts on login/boot.`;
    },
    systemdInstalled: (unitName, root, started) => {
      const unitKind = root ? "system" : "user";
      const runsNow = started ? "now runs and " : "";
      return `Installed systemd ${unitKind} unit ${unitName} — the hub ${runsNow}starts on boot.`;
    },
  };
  return catalogue;
}
625
+
626
/**
 * Default PATH baked into the hub unit when the caller supplies none: bun's
 * global bin directory first (so supervised children resolve a bun-linked
 * binary on cold boot, R20), followed by the usual system directories.
 *
 * @param bunInstall The `$BUN_INSTALL` directory; `/bin` is appended to it.
 * @returns A colon-separated PATH string.
 */
function defaultUnitPath(bunInstall: string): string {
  const searchDirs = [`${bunInstall}/bin`, "/usr/local/bin", "/usr/bin", "/bin"];
  return searchDirs.join(":");
}
634
+
635
/**
 * Build + install + start the hub unit, then wait for hub readiness (design
 * §3.3 init row / appendix c). This is the `init`-side bringup that REPLACES
 * the detached `ensureHubRunning` spawn:
 *   1. No service manager → return `no-manager` immediately, WITHOUT building
 *      or spawning anything — the container/init-less path is foreground
 *      `serve`, not `init` (§3.2 step 4).
 *   2. `buildHubManagedUnit` (captures the operator's current PARACHUTE_HOME).
 *      If it throws (e.g. `bun` can't be resolved to an absolute path) the
 *      result is `start-failed` carrying the error text.
 *   3. `installManagedUnit(unit, { start: true })`. An install that comes back
 *      as `fallback` is returned as `no-manager` with the install's messages.
 *   4. Wait for hub readiness (port poll); on timeout, append the unit log.
 *
 * @param opts Home / CLI path (required) plus optional port, env, deps,
 *   readiness budget and progress sink.
 * @returns The outcome, the port, the install result, and lines to surface.
 */
export async function installAndStartHubUnit(
  opts: InstallAndStartHubUnitOpts,
): Promise<InstallAndStartHubUnitResult> {
  const deps = opts.deps ?? defaultHubUnitDeps;
  const port = opts.port ?? HUB_UNIT_DEFAULT_PORT;
  const bunInstall = opts.bunInstall ?? `${deps.homeDir()}/.bun`;
  const unitPathEnv = opts.path ?? defaultUnitPath(bunInstall);
  const unitLogPath = opts.logPath ?? `${opts.parachuteHome}/hub/logs/hub.log`;
  const emit = opts.log ?? (() => {});

  // A platform with no manager can't host a unit — answer with the clear
  // foreground-serve message BEFORE building a unit we can't install. On a
  // container the runtime CMD is `serve`, not `init`.
  if (!hasServiceManager(deps)) {
    return {
      outcome: "no-manager",
      port,
      install: { outcome: "fallback", messages: [NO_MANAGER_MESSAGE] },
      messages: [NO_MANAGER_MESSAGE],
    };
  }

  let unit: ManagedUnit;
  try {
    unit = buildHubManagedUnit({
      parachuteHome: opts.parachuteHome,
      port,
      bunInstall,
      path: unitPathEnv,
      cliPath: opts.cliPath,
      logPath: unitLogPath,
      // HubUnitDeps extends ManagedUnitDeps — pass it straight through.
      deps,
    });
  } catch (err) {
    // The unit could not be built — refuse to bake a broken ExecStart.
    // Surface the reason; the caller treats this as a hard failure.
    const reason = err instanceof Error ? err.message : String(err);
    return {
      outcome: "start-failed",
      port,
      install: { outcome: "fallback", messages: [reason] },
      messages: [reason],
    };
  }

  const install = installManagedUnit({
    unit,
    deps,
    messages: hubUnitMessages(),
    start: true,
  });

  if (install.outcome === "fallback") {
    // The manager probe passed but install still degraded (write failed,
    // enable failed, etc.). Surface the install messages; no unit is running.
    // NOTE(review): this is reported as `no-manager` even though a manager was
    // found — confirm callers rely on that label before changing it.
    return { outcome: "no-manager", port, install, messages: install.messages };
  }

  for (const line of install.messages) emit(line);

  // The unit's RunAtLoad / enable --now already started it, so all that is
  // left is to poll the port: probe, then sleep-and-reprobe while the budget
  // lasts. A non-positive poll interval skips the sleeping entirely.
  const budgetMs = opts.readyTimeoutMs ?? 15_000;
  const pollMs = opts.readyPollMs ?? 250;
  const deadline = Date.now() + budgetMs;
  let listening = await deps.portListening(port);
  while (!listening && !(Date.now() >= deadline) && pollMs > 0) {
    await deps.sleep(pollMs);
    listening = await deps.portListening(port);
  }
  // One last look after giving up on the loop.
  if (!listening) listening = await deps.portListening(port);
  if (listening) {
    return { outcome: "started", port, install, messages: install.messages };
  }

  return {
    outcome: "timeout",
    port,
    install,
    messages: [
      ...install.messages,
      `hub unit installed but did not become ready on 127.0.0.1:${port} within ${Math.round(
        budgetMs / 1000,
      )}s`,
      ...tailHubUnitLog(deps, 50),
    ],
  };
}