@openparachute/hub 0.6.2 → 0.6.3-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +87 -35
  2. package/package.json +1 -1
  3. package/src/__tests__/api-hub-upgrade.test.ts +690 -0
  4. package/src/__tests__/api-modules-ops.test.ts +359 -3
  5. package/src/__tests__/api-modules.test.ts +54 -0
  6. package/src/__tests__/expose-cloudflare.test.ts +163 -72
  7. package/src/__tests__/expose-off-auto.test.ts +26 -1
  8. package/src/__tests__/expose.test.ts +260 -240
  9. package/src/__tests__/hub-control.test.ts +1 -242
  10. package/src/__tests__/hub-server.test.ts +64 -0
  11. package/src/__tests__/hub-unit.test.ts +574 -0
  12. package/src/__tests__/init.test.ts +219 -2
  13. package/src/__tests__/lifecycle.test.ts +416 -1448
  14. package/src/__tests__/managed-unit.test.ts +575 -0
  15. package/src/__tests__/migrate-cutover.test.ts +840 -0
  16. package/src/__tests__/migrate-offer.test.ts +240 -0
  17. package/src/__tests__/migrate.test.ts +132 -0
  18. package/src/__tests__/module-ops-client.test.ts +556 -0
  19. package/src/__tests__/port-probe.test.ts +23 -0
  20. package/src/__tests__/setup-wizard.test.ts +130 -0
  21. package/src/__tests__/status-supervisor.test.ts +504 -0
  22. package/src/__tests__/status.test.ts +157 -708
  23. package/src/__tests__/supervisor.test.ts +471 -6
  24. package/src/__tests__/upgrade.test.ts +351 -5
  25. package/src/api-hub-upgrade.ts +384 -0
  26. package/src/api-hub.ts +2 -1
  27. package/src/api-modules-ops.ts +221 -0
  28. package/src/api-modules.ts +18 -2
  29. package/src/cli.ts +97 -12
  30. package/src/cloudflare/connector-service.ts +117 -322
  31. package/src/commands/expose-cloudflare.ts +63 -71
  32. package/src/commands/expose-supervisor.ts +247 -0
  33. package/src/commands/expose.ts +59 -48
  34. package/src/commands/init.ts +225 -12
  35. package/src/commands/lifecycle.ts +455 -816
  36. package/src/commands/migrate-cutover.ts +837 -0
  37. package/src/commands/migrate.ts +71 -2
  38. package/src/commands/serve-boot.ts +71 -25
  39. package/src/commands/status.ts +535 -235
  40. package/src/commands/upgrade.ts +100 -2
  41. package/src/help.ts +128 -68
  42. package/src/hub-control.ts +23 -162
  43. package/src/hub-server.ts +39 -0
  44. package/src/hub-unit.ts +735 -0
  45. package/src/hub-upgrade-helper.ts +306 -0
  46. package/src/hub-upgrade-mode.ts +209 -0
  47. package/src/hub-upgrade-status.ts +150 -0
  48. package/src/managed-unit.ts +692 -0
  49. package/src/migrate-offer.ts +186 -0
  50. package/src/module-ops-client.ts +457 -0
  51. package/src/port-probe.ts +50 -0
  52. package/src/process-state.ts +19 -3
  53. package/src/setup-wizard.ts +80 -1
  54. package/src/supervisor.ts +389 -38
  55. package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
  56. package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
  57. package/web/ui/dist/index.html +2 -2
  58. package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
@@ -1,5 +1,15 @@
1
+ import type { Database } from "bun:sqlite";
1
2
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
2
- import { HUB_SVC, readHubPort } from "../hub-control.ts";
3
+ import { readHubPort } from "../hub-control.ts";
4
+ import { hubDbPath, openHubDb } from "../hub-db.ts";
5
+ import {
6
+ HUB_UNIT_DEFAULT_PORT,
7
+ type HubUnitDeps,
8
+ type HubUnitState,
9
+ type HubUnitStateResult,
10
+ defaultHubUnitDeps,
11
+ queryHubUnitState as queryHubUnitStateImpl,
12
+ } from "../hub-unit.ts";
3
13
  import {
4
14
  type DetectInstallSourceDeps,
5
15
  detectHubInstallSource,
@@ -7,20 +17,20 @@ import {
7
17
  formatInstallSourceLabel,
8
18
  isStale,
9
19
  } from "../install-source.ts";
10
- import { type AliveFn, defaultAlive, formatUptime, processState } from "../process-state.ts";
20
+ import {
21
+ type DriveModuleOpDeps,
22
+ type ModuleStatesResult,
23
+ NoOperatorTokenError,
24
+ OperatorTokenExpiredError,
25
+ fetchModuleStates as fetchModuleStatesImpl,
26
+ } from "../module-ops-client.ts";
11
27
  import { canonicalPortForManifest, getSpec, shortNameForManifest } from "../service-spec.ts";
12
28
  import { type ServiceEntry, readManifest } from "../services-manifest.ts";
13
29
 
14
- export type FetchFn = (url: string, init?: RequestInit) => Promise<Response>;
15
-
16
30
  export interface StatusOpts {
17
31
  manifestPath?: string;
18
- fetchImpl?: FetchFn;
19
32
  print?: (line: string) => void;
20
- timeoutMs?: number;
21
33
  configDir?: string;
22
- alive?: AliveFn;
23
- now?: () => Date;
24
34
  /**
25
35
  * Test seam for install-source detection. Production reads the filesystem
26
36
  * + shells out to git; tests inject stubs so each case (npm / bun-linked /
@@ -34,54 +44,41 @@ export interface StatusOpts {
34
44
  * source classification doesn't depend on the test runner's location.
35
45
  */
36
46
  hubSrcDir?: string;
37
- }
38
-
39
- export interface ProbeResult {
40
- entry: ServiceEntry;
41
- healthy: boolean;
42
- statusCode?: number;
43
- error?: string;
44
- latencyMs: number;
45
- }
46
-
47
- export async function probe(
48
- entry: ServiceEntry,
49
- fetchImpl: FetchFn,
50
- timeoutMs: number,
51
- ): Promise<ProbeResult> {
52
- const url = `http://localhost:${entry.port}${entry.health}`;
53
- const controller = new AbortController();
54
- const timer = setTimeout(() => controller.abort(), timeoutMs);
55
- const start = performance.now();
56
- try {
57
- const res = await fetchImpl(url, { signal: controller.signal });
58
- const latencyMs = Math.round(performance.now() - start);
59
- // A 401 is the service replying "I'm up but this endpoint requires auth" —
60
- // that's strictly healthy from a liveness perspective. Vault's
61
- // canonical health path `/vault/<name>/health` is auth-gated; without
62
- // this carve-out, `parachute status` shows vault as "failing" on every
63
- // fresh install (first impression UX disaster despite vault being fine).
64
- // 5xx → unhealthy; 200-class → healthy; 401 → healthy + auth-gated.
65
- // Other 4xx (404 / 400 / etc.) still count as unhealthy — those mean
66
- // the configured health path doesn't exist or is shaped wrong.
67
- const healthy = res.ok || res.status === 401;
68
- return {
69
- entry,
70
- healthy,
71
- statusCode: res.status,
72
- latencyMs,
73
- };
74
- } catch (err) {
75
- const latencyMs = Math.round(performance.now() - start);
76
- return {
77
- entry,
78
- healthy: false,
79
- error: err instanceof Error ? err.message : String(err),
80
- latencyMs,
81
- };
82
- } finally {
83
- clearTimeout(timer);
84
- }
47
+ /**
48
+ * Supervisor-path seams (design §6.4) — the ONLY runtime as of Phase 5b.
49
+ * `status` reads the hub row from the PLATFORM MANAGER (`queryHubUnitState`)
50
+ * + `/health`, and the module rows from the RUNNING supervisor (`GET
51
+ * /api/modules` via the operator-token→Bearer path). The detached
52
+ * pidfile/`processState` arm was retired in Phase 5b.
53
+ *
54
+ * Everything here is injectable so tests drive it without a real
55
+ * launchd/systemd/socket/HTTP call. Production wires the real machinery; the
56
+ * read paths are bounded + degrade gracefully on every failure (no manager,
57
+ * hub down, no token, API error) so `status` never hangs or crashes.
58
+ */
59
+ supervisor?: {
60
+ /** Deps for `queryHubUnitState` + the `/health` probe. */
61
+ hubUnitDeps?: HubUnitDeps;
62
+ /** Query the platform manager for the hub unit's run-state (§6.4 hub row). */
63
+ queryHubUnitState?: (deps: HubUnitDeps) => HubUnitStateResult;
64
+ /**
65
+ * Probe whether the loopback hub answers `/health`. The liveness signal for
66
+ * the hub row (§6.4) AND the gate for reading module states: if the hub is
67
+ * down, skip the API read and show modules degraded. Production reuses the
68
+ * hub-unit deps' bounded `probeHealth`.
69
+ */
70
+ probeHubHealth?: (port: number) => Promise<boolean>;
71
+ /** Read the running supervisor's module states (§6.4 module rows). */
72
+ fetchModuleStates?: (deps: DriveModuleOpDeps) => Promise<ModuleStatesResult>;
73
+ /**
74
+ * Open the hub DB used to validate/auto-rotate the operator token in
75
+ * `fetchModuleStates`. Production opens `<configDir>/hub.db`; tests inject a
76
+ * seeded db. Returns a handle the caller closes.
77
+ */
78
+ openDb?: (configDir: string) => Database;
79
+ /** Loopback hub base URL override (default derives from the hub port). */
80
+ baseUrl?: string;
81
+ };
85
82
  }
86
83
 
87
84
  function formatRow(cells: string[], widths: number[]): string {
@@ -154,6 +151,16 @@ interface StatusRow {
154
151
  * just showing it inactive. Cleared on the next successful start.
155
152
  */
156
153
  startErrorNote?: string;
154
+ /**
155
+ * Hub-row-only manager-context note (Phase 3c, §6.4). Surfaces the platform
156
+ * manager's view when it adds signal the STATE column can't carry:
157
+ * - "container runtime (managed)" on Render/Fly (no on-box manager).
158
+ * - "service manager reports active; /health not answering yet (starting or
159
+ * unhealthy)" when the unit is up but the hub isn't serving.
160
+ * - the manager's failed-unit detail / last-exit code.
161
+ * Printed on a continuation line like the other notes.
162
+ */
163
+ managerNote?: string;
157
164
  }
158
165
 
159
166
  /**
@@ -170,190 +177,103 @@ function urlForEntry(entry: ServiceEntry, short: string | undefined): string | u
170
177
  return `http://127.0.0.1:${entry.port}${first}`;
171
178
  }
172
179
 
173
- function hubRow(
174
- configDir: string,
175
- alive: AliveFn,
176
- nowDate: Date,
177
- hubSrcDir: string,
180
+ /**
181
+ * The MANIFEST-derived portion of a module row — port/version/URL/drift/source/
182
+ * stale and the persisted `lastStartError` note. The supervisor read fills in
183
+ * the run-state fields (STATE / PID / UPTIME) on top.
184
+ *
185
+ * Pure over the manifest entry + install-source deps; no process / network read.
186
+ */
187
+ interface ManifestRowBase {
188
+ short: string | undefined;
189
+ url: string | undefined;
190
+ driftWarning?: string;
191
+ sourceLabel: string;
192
+ staleNote?: string;
193
+ /** The persisted `lastStartError` note (detached preflight wrote it). */
194
+ manifestStartErrorNote?: string;
195
+ }
196
+
197
+ function manifestRowBase(
198
+ entry: ServiceEntry,
178
199
  installSourceDeps: DetectInstallSourceDeps,
179
- ): StatusRow | undefined {
180
- const proc = processState(HUB_SVC, configDir, alive);
181
- if (proc.status === "unknown") return undefined;
182
- const port = readHubPort(configDir);
183
- const portLabel = port !== undefined ? String(port) : "-";
184
- // Hub doesn't self-probe (it'd be probing itself over loopback). Treat
185
- // "running pidfile" as `active` and "stopped" as `inactive` — the same
186
- // STATE rollup every other row uses, just without the probe input.
187
- const stateLabel: StateLabel = proc.status === "running" ? "active" : "inactive";
188
- const pidLabel = proc.status === "running" && proc.pid !== undefined ? String(proc.pid) : "-";
189
- const uptimeLabel =
190
- proc.status === "running" && proc.startedAt ? formatUptime(proc.startedAt, nowDate) : "-";
191
- const source = detectHubInstallSource(hubSrcDir, installSourceDeps);
192
- return {
193
- service: "parachute-hub (internal)",
194
- port: portLabel,
195
- version: source.livePackageVersion ?? "-",
196
- stateLabel,
197
- pidLabel,
198
- uptimeLabel,
199
- healthDetail: "-",
200
- latencyLabel: "-",
201
- sourceLabel: formatInstallSourceLabel(source),
202
- url: port !== undefined ? `http://127.0.0.1:${port}` : undefined,
203
- healthy: true,
204
- skipped: true,
205
- };
200
+ ): ManifestRowBase {
201
+ // Third-party rows (with `installDir`) live under `~/.parachute/<entry.name>/`,
202
+ // matching what `parachute start` uses as the short. First-party rows still
203
+ // map manifestName → short via the canonical fallback.
204
+ const short = shortNameForManifest(entry.name) ?? (entry.installDir ? entry.name : undefined);
205
+ const url = urlForEntry(entry, short);
206
+
207
+ // Canonical-port drift detection (hub#195). Only fires for known first-party
208
+ // services where we have a canonical assignment. Third-party rows have no
209
+ // canonical to compare against. Informational — operators may have moved a
210
+ // service off canonical deliberately.
211
+ const canonical = canonicalPortForManifest(entry.name);
212
+ const driftWarning =
213
+ canonical !== undefined && canonical !== entry.port
214
+ ? `canonical port is ${canonical}`
215
+ : undefined;
216
+
217
+ // Install-source detection (hub#243). One filesystem walk + maybe one
218
+ // `git rev-parse` per row. Failures degrade silently to `unknown`.
219
+ const detectArgs: { entryName: string; installDir?: string } = { entryName: entry.name };
220
+ if (entry.installDir !== undefined) detectArgs.installDir = entry.installDir;
221
+ const source = detectInstallSource(detectArgs, installSourceDeps);
222
+ const sourceLabel = formatInstallSourceLabel(source);
223
+ const staleNote = isStale(entry.version, source)
224
+ ? `STALE: services.json cached ${entry.version}; live package.json ${source.livePackageVersion}`
225
+ : undefined;
226
+
227
+ // Persisted last-start failure (lifecycle preflight wrote a missing-dependency
228
+ // wire onto services.json). Surface a one-line summary; the full install
229
+ // recipe lives in services.json + the admin SPA card.
230
+ const manifestStartErrorNote =
231
+ entry.lastStartError !== undefined
232
+ ? entry.lastStartError.binary !== undefined
233
+ ? `failed to start: ${entry.lastStartError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`
234
+ : `failed to start: ${entry.lastStartError.error_description.split("\n")[0]}`
235
+ : undefined;
236
+
237
+ return { short, url, driftWarning, sourceLabel, staleNote, manifestStartErrorNote };
206
238
  }
207
239
 
208
240
  export async function status(opts: StatusOpts = {}): Promise<number> {
209
241
  const manifestPath = opts.manifestPath ?? SERVICES_MANIFEST_PATH;
210
- const fetchImpl = opts.fetchImpl ?? fetch;
211
242
  const print = opts.print ?? ((line) => console.log(line));
212
- const timeoutMs = opts.timeoutMs ?? 1500;
213
243
  const configDir = opts.configDir ?? CONFIG_DIR;
214
- const alive = opts.alive ?? defaultAlive;
215
- const now = opts.now ?? (() => new Date());
216
244
  const installSourceDeps = opts.installSourceDeps ?? {};
217
245
  const hubSrcDir = opts.hubSrcDir ?? import.meta.dir;
218
246
 
219
247
  const manifest = readManifest(manifestPath);
220
- if (manifest.services.length === 0) {
221
- print("No services installed yet.");
222
- print("Try: parachute install vault");
223
- return 0;
224
- }
225
-
226
- const nowDate = now();
227
-
228
- /**
229
- * Per-row resolution: look up the short name so we can read PID state,
230
- * skip the health probe when the process is known-stopped (ECONNREFUSED
231
- * noise isn't informative), and report it as running/stopped + uptime.
232
- *
233
- * Third-party services we don't know about fall back to probing and show
234
- * "-" for process columns.
235
- */
236
- const rows: StatusRow[] = await Promise.all(
237
- manifest.services.map(async (entry) => {
238
- // Third-party rows (with `installDir`) live under `~/.parachute/<entry.name>/`,
239
- // matching what `parachute start` uses as the short. First-party rows still
240
- // map manifestName → short via the canonical fallback.
241
- const short = shortNameForManifest(entry.name) ?? (entry.installDir ? entry.name : undefined);
242
- const proc = short ? processState(short, configDir, alive) : undefined;
243
-
244
- const pidLabel =
245
- proc?.status === "running" && proc.pid !== undefined ? String(proc.pid) : "-";
246
- const uptimeLabel =
247
- proc?.status === "running" && proc.startedAt ? formatUptime(proc.startedAt, nowDate) : "-";
248
-
249
- const url = urlForEntry(entry, short);
250
-
251
- // Canonical-port drift detection (hub#195). Only fires for known
252
- // first-party services where we have a canonical assignment. Third-party
253
- // rows have no canonical to compare against. Warning is informational —
254
- // operators may have moved a service off canonical deliberately.
255
- // Note: multi-vault instance rows (`parachute-vault-<instance>`) don't
256
- // match a canonical manifest name, so drift warnings don't fire for
257
- // them. Intentional — see `canonicalPortForManifest` for the rationale.
258
- const canonical = canonicalPortForManifest(entry.name);
259
- const driftWarning =
260
- canonical !== undefined && canonical !== entry.port
261
- ? `canonical port is ${canonical}`
262
- : undefined;
263
-
264
- // Install-source detection (hub#243). One filesystem walk + maybe one
265
- // `git rev-parse` per row. Failures degrade silently to `unknown` —
266
- // status output should never error out on a missing checkout dir.
267
- const detectArgs: { entryName: string; installDir?: string } = { entryName: entry.name };
268
- if (entry.installDir !== undefined) detectArgs.installDir = entry.installDir;
269
- const source = detectInstallSource(detectArgs, installSourceDeps);
270
- const sourceLabel = formatInstallSourceLabel(source);
271
- const staleNote = isStale(entry.version, source)
272
- ? `STALE: services.json cached ${entry.version}; live package.json ${source.livePackageVersion}`
273
- : undefined;
274
-
275
- // Persisted last-start failure (lifecycle preflight wrote a missing-
276
- // dependency wire). Surface a one-line summary; the full install recipe
277
- // lives in services.json + the admin SPA card. Keeps `parachute status`
278
- // scannable while still telling the operator "this is why it's down."
279
- const startErrorNote =
280
- entry.lastStartError !== undefined
281
- ? entry.lastStartError.binary !== undefined
282
- ? `failed to start: ${entry.lastStartError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`
283
- : `failed to start: ${entry.lastStartError.error_description.split("\n")[0]}`
284
- : undefined;
285
-
286
- // Only skip probe when we know the process is dead (PID file was
287
- // present but kill(pid, 0) failed). "unknown" status (no PID file)
288
- // still probes — externally-managed services should report health.
289
- if (proc?.status === "stopped") {
290
- return {
291
- service: entry.name,
292
- port: String(entry.port),
293
- version: entry.version,
294
- // Operator deliberately stopped (or pidfile-but-dead) maps to
295
- // `inactive` per design-system.md §6 — same surface as "never
296
- // started." No probe is informative when we know the process
297
- // is dead.
298
- stateLabel: "inactive",
299
- pidLabel,
300
- uptimeLabel,
301
- healthDetail: "-",
302
- latencyLabel: "-",
303
- sourceLabel,
304
- url,
305
- healthy: false,
306
- skipped: true,
307
- driftWarning,
308
- staleNote,
309
- startErrorNote,
310
- };
311
- }
312
248
 
313
- const p = await probe(entry, fetchImpl, timeoutMs);
314
- const healthDetail = p.healthy
315
- ? "ok"
316
- : p.statusCode !== undefined
317
- ? `http ${p.statusCode}`
318
- : (p.error ?? "down");
319
- // STATE rollup per design-system.md §6:
320
- // - probe ok → `active`
321
- // - probe failed → `failing` (the probe ran, so the
322
- // process is up enough to answer or
323
- // refuse — it's failing, not stopped)
324
- // - no PID file + probe fails → `failing` too (externally-managed
325
- // row that's down is still "failing"
326
- // from the operator's view)
327
- // The `pending` state isn't reachable from `parachute status` today
328
- // — pending-OAuth surfaces in the admin SPA, not the CLI. If a
329
- // future surface adds it (e.g. supervisor reports `pending-config`
330
- // for unconfigured modules), wire it here.
331
- const stateLabel: StateLabel = p.healthy ? "active" : "failing";
332
- return {
333
- service: entry.name,
334
- port: String(entry.port),
335
- version: entry.version,
336
- stateLabel,
337
- pidLabel,
338
- uptimeLabel,
339
- healthDetail,
340
- latencyLabel: `${p.latencyMs}ms`,
341
- sourceLabel,
342
- url,
343
- healthy: p.healthy,
344
- skipped: false,
345
- driftWarning,
346
- staleNote,
347
- startErrorNote,
348
- };
349
- }),
350
- );
351
-
352
- // Hub is an internal service — not in services.json, but users notice
353
- // when it's dead. Only show it if we've seen it run.
354
- const hub = hubRow(configDir, alive, nowDate, hubSrcDir, installSourceDeps);
355
- if (hub) rows.push(hub);
249
+ // Supervised path only (Phase 5b — the detached pidfile arm is retired). Read
250
+ // the hub row from the platform manager + `/health` and the module rows from
251
+ // the RUNNING supervisor (§6.4). The hub row is meaningful even with zero
252
+ // modules installed (the hub runs under a unit), so a "no modules" table is
253
+ // rendered rather than the old "No services installed yet." early return.
254
+ const sup = resolveStatusSupervisor(opts.supervisor);
255
+ const rows = await buildSupervisorRows({
256
+ manifest,
257
+ configDir,
258
+ installSourceDeps,
259
+ hubSrcDir,
260
+ sup,
261
+ });
262
+ renderRows(rows, print);
263
+ // A row is `healthy: false` + `!skipped` only when the supervisor (or the
264
+ // hub-row manager/health composition) says so (crashed / failing). A
265
+ // stopped/inactive row is expected (skipped, exit 0); a `failing` one exits 1.
266
+ const anyUnhealthy = rows.some((r) => !r.skipped && !r.healthy);
267
+ return anyUnhealthy ? 1 : 0;
268
+ }
356
269
 
270
+ /**
271
+ * Render the status table + continuation lines. Shared by the detached arm and
272
+ * the Phase 3c supervisor arm so the table shape (design-system.md §6 columns +
273
+ * the `→`/`!` continuation prefixes) is identical regardless of where each
274
+ * row's run-state was sourced. Pure over `rows` + the `print` sink.
275
+ */
276
+ function renderRows(rows: StatusRow[], print: (line: string) => void): void {
357
277
  // Header per design-system.md §6 "CLI status column shape":
358
278
  // SERVICE PORT VERSION STATE PID UPTIME LATENCY SOURCE
359
279
  // Pre-F shape was SERVICE PORT VERSION PROCESS PID UPTIME HEALTH LATENCY
@@ -397,17 +317,397 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
397
317
  if (row.stateLabel === "failing" && row.healthDetail !== "-" && row.healthDetail.length > 0) {
398
318
  print(` ! probe: ${row.healthDetail}`);
399
319
  }
320
+ if (row.managerNote) print(` ! ${row.managerNote}`);
400
321
  if (row.driftWarning) print(` ! ${row.driftWarning}`);
401
322
  if (row.staleNote) print(` ! ${row.staleNote}`);
402
323
  if (row.startErrorNote) print(` ! ${row.startErrorNote}`);
403
324
  }
325
+ }
404
326
 
405
- /**
406
- * Overall exit: non-zero if any *probed* service is unhealthy. A stopped
407
- * service is expected ("I haven't started it yet"), not a failure — users
408
- * want `parachute status` to return 0 after a fresh install before they
409
- * `parachute start`. Health regressions among running services still 1.
410
- */
411
- const anyUnhealthy = rows.some((r) => !r.skipped && !r.healthy);
412
- return anyUnhealthy ? 1 : 0;
327
+ // ---------------------------------------------------------------------------
328
+ // Supervisor-path status (design §6.4) — the ONLY runtime as of Phase 5b.
329
+ //
330
+ // `status` reads the hub row from the PLATFORM MANAGER (`queryHubUnitState`) +
331
+ // a `/health` probe, and the module rows from the RUNNING supervisor (`GET
332
+ // /api/modules` via the operator-token→Bearer path). Every read is bounded +
333
+ // degrades gracefully — `status` is a diagnostic and must NEVER hang or crash
334
+ // regardless of hub/manager/token state. The detached pidfile arm was retired
335
+ // in Phase 5b.
336
+ // ---------------------------------------------------------------------------
337
+
338
+ /** Resolved supervisor-path seams (see `StatusOpts.supervisor`). */
339
+ interface ResolvedStatusSupervisor {
340
+ hubUnitDeps: HubUnitDeps;
341
+ queryHubUnitState: (deps: HubUnitDeps) => HubUnitStateResult;
342
+ probeHubHealth: (port: number) => Promise<boolean>;
343
+ fetchModuleStates: (deps: DriveModuleOpDeps) => Promise<ModuleStatesResult>;
344
+ openDb: (configDir: string) => Database;
345
+ baseUrl: string | undefined;
346
+ }
347
+
348
+ /**
349
+ * Resolve the supervisor-path seams. Production passes `supervisor: {}` (or
350
+ * omits it) and gets the real impls; tests inject the seams they want to assert.
351
+ */
352
+ function resolveStatusSupervisor(opts: StatusOpts["supervisor"]): ResolvedStatusSupervisor {
353
+ const hubUnitDeps = opts?.hubUnitDeps ?? defaultHubUnitDeps;
354
+ return {
355
+ hubUnitDeps,
356
+ queryHubUnitState: opts?.queryHubUnitState ?? queryHubUnitStateImpl,
357
+ probeHubHealth: opts?.probeHubHealth ?? hubUnitDeps.probeHealth,
358
+ fetchModuleStates: opts?.fetchModuleStates ?? fetchModuleStatesImpl,
359
+ openDb: opts?.openDb ?? ((configDir) => openHubDb(hubDbPath(configDir))),
360
+ baseUrl: opts?.baseUrl,
361
+ };
362
+ }
363
+
364
+ /**
365
+ * Resolve the issuer the operator token is validated against — the hub's
366
+ * current loopback origin. Mirrors lifecycle.ts's `resolveOperatorTokenIssuer`
367
+ * fallback (`readHubPort ?? HUB_UNIT_DEFAULT_PORT`); both resolve to 1939 under
368
+ * canonical-ports, so they agree with what `auth rotate-operator` minted under.
369
+ */
370
+ function statusOperatorTokenIssuer(configDir: string): string {
371
+ return `http://127.0.0.1:${readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT}`;
372
+ }
373
+
374
+ /**
375
+ * Map a supervisor `ModuleState.status` to the canonical STATE rollup
376
+ * (design-system.md §6). `running` is `active`; `crashed` is `failing`;
377
+ * `starting` / `restarting` are `pending` (in-flight operator-visible
378
+ * transition); `stopped` is `inactive`. An unknown/absent status (module not
379
+ * tracked by the supervisor — never booted, skipped at boot) is `inactive`.
380
+ */
381
+ function mapSupervisorStatus(status: string | null): {
382
+ stateLabel: StateLabel;
383
+ healthy: boolean;
384
+ skipped: boolean;
385
+ } {
386
+ switch (status) {
387
+ case "running":
388
+ return { stateLabel: "active", healthy: true, skipped: false };
389
+ case "crashed":
390
+ return { stateLabel: "failing", healthy: false, skipped: false };
391
+ case "starting":
392
+ case "restarting":
393
+ // In-flight transition — supervised, mid-operation. `pending` is the
394
+ // canonical "needs-attention transient" rollup; treat as not-a-failure
395
+ // (skipped) so a mid-restart module doesn't flip `status` to exit 1.
396
+ return { stateLabel: "pending", healthy: true, skipped: true };
397
+ default:
398
+ // stopped / null / unknown — operator-stopped or never started. The
399
+ // `skipped: true` + `healthy: false` pairing is DELIBERATE, not a mismatch:
400
+ // - `healthy: false` is honest — an inactive module is genuinely not
401
+ // serving (so a detail renderer can style it as down, not green).
402
+ // - `skipped: true` keeps the exit-code check (`rows.some(r => !r.skipped
403
+ // && !r.healthy)` at the call site, ~:385) from counting an
404
+ // operator-stopped module as a FAILURE — `parachute stop vault` then
405
+ // `status` must still exit 0.
406
+ // This is the same combination + exit semantics the detached arm uses for
407
+ // its `inactive` (operator-stopped) rows.
408
+ return { stateLabel: "inactive", healthy: false, skipped: true };
409
+ }
410
+ }
411
+
412
+ /**
413
+ * Format a supervisor `startError` (the structured missing-dependency /
414
+ * started-but-unbound wire, §6.5) into the same one-line note the detached arm
415
+ * shows from `services.json.lastStartError` (#188). Returns undefined when
416
+ * there's no usable detail.
417
+ */
418
+ function supervisorStartErrorNote(startError: unknown): string | undefined {
419
+ if (!startError || typeof startError !== "object") return undefined;
420
+ const e = startError as { binary?: unknown; error_description?: unknown };
421
+ if (typeof e.binary === "string" && e.binary.length > 0) {
422
+ return `failed to start: ${e.binary} not installed — see /admin/modules for install steps`;
423
+ }
424
+ if (typeof e.error_description === "string" && e.error_description.length > 0) {
425
+ return `failed to start: ${e.error_description.split("\n")[0]}`;
426
+ }
427
+ return undefined;
428
+ }
429
+
430
+ interface BuildSupervisorRowsArgs {
431
+ manifest: ReturnType<typeof readManifest>;
432
+ configDir: string;
433
+ installSourceDeps: DetectInstallSourceDeps;
434
+ hubSrcDir: string;
435
+ sup: ResolvedStatusSupervisor;
436
+ }
437
+
438
+ /**
439
+ * Build the full status rows on a UNIT-MANAGED box (design §6.4): module rows
440
+ * from the running supervisor, the hub row from the platform manager + /health.
441
+ * Never throws — every read is wrapped + degrades to a sensible readout.
442
+ */
443
+ async function buildSupervisorRows(args: BuildSupervisorRowsArgs): Promise<StatusRow[]> {
444
+ const { manifest, configDir, installSourceDeps, hubSrcDir, sup } = args;
445
+ const port = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;
446
+
447
+ // Probe the hub once: it's both the hub row's liveness signal AND the gate for
448
+ // whether the supervisor (module states) is reachable. Bounded; never throws.
449
+ let hubHealthy = false;
450
+ try {
451
+ hubHealthy = await sup.probeHubHealth(port);
452
+ } catch {
453
+ hubHealthy = false;
454
+ }
455
+
456
+ // Read the running supervisor's module states — ONLY when the hub answers
457
+ // (children die with the hub, so a down hub means every module is down; no
458
+ // point calling, and the call would just connection-refuse). Degrade on every
459
+ // failure path: no token, expired token, HTTP error, anything — `status`
460
+ // shows what it can rather than crashing.
461
+ let states: ModuleStatesResult | undefined;
462
+ let moduleReadNote: string | undefined;
463
+ if (hubHealthy) {
464
+ const db = sup.openDb(configDir);
465
+ try {
466
+ states = await sup.fetchModuleStates({
467
+ db,
468
+ issuer: statusOperatorTokenIssuer(configDir),
469
+ configDir,
470
+ ...(sup.baseUrl !== undefined ? { baseUrl: sup.baseUrl } : {}),
471
+ });
472
+ } catch (err) {
473
+ if (err instanceof NoOperatorTokenError || err instanceof OperatorTokenExpiredError) {
474
+ // No / expired operator token: we can't read module run-state, but the
475
+ // hub is up. Show the manifest-derived rows with an actionable note —
476
+ // do NOT 401-crash status (§6.4 graceful degradation).
477
+ moduleReadNote =
478
+ "couldn't read live module state — run `parachute auth rotate-operator` to mint an operator token";
479
+ } else {
480
+ // HTTP error / parse / anything else — degrade with the message.
481
+ moduleReadNote = `couldn't read live module state (${
482
+ err instanceof Error ? err.message : String(err)
483
+ })`;
484
+ }
485
+ } finally {
486
+ db.close();
487
+ }
488
+ }
489
+
490
+ const stateByShort = new Map<string, ModuleStatesResult["modules"][number]>();
491
+ for (const m of states?.modules ?? []) {
492
+ if (m.short) stateByShort.set(m.short, m);
493
+ }
494
+
495
+ const rows: StatusRow[] = manifest.services.map((entry) => {
496
+ const base = manifestRowBase(entry, installSourceDeps);
497
+ const snap = base.short ? stateByShort.get(base.short) : undefined;
498
+
499
+ if (!hubHealthy) {
500
+ // Hub is down → every supervised module is down with it. Show `inactive`
501
+ // (expected, not a failure) with a note rather than a probe failure.
502
+ return {
503
+ service: entry.name,
504
+ port: String(entry.port),
505
+ version: entry.version,
506
+ stateLabel: "inactive",
507
+ pidLabel: "-",
508
+ uptimeLabel: "-",
509
+ healthDetail: "-",
510
+ latencyLabel: "-",
511
+ sourceLabel: base.sourceLabel,
512
+ url: base.url,
513
+ healthy: false,
514
+ skipped: true,
515
+ ...(base.driftWarning ? { driftWarning: base.driftWarning } : {}),
516
+ ...(base.staleNote ? { staleNote: base.staleNote } : {}),
517
+ managerNote: "hub is down — its modules are stopped",
518
+ };
519
+ }
520
+
521
+ const { stateLabel, healthy, skipped } = mapSupervisorStatus(snap?.supervisor_status ?? null);
522
+ // Prefer the supervisor's structured start-error (live), else the persisted
523
+ // services.json note — same friendly surface either way (#188).
524
+ const startErrorNote =
525
+ supervisorStartErrorNote(snap?.supervisor_start_error) ?? base.manifestStartErrorNote;
526
+ const healthDetail =
527
+ stateLabel === "failing" ? `supervisor: ${snap?.supervisor_status ?? "crashed"}` : "-";
528
+
529
+ const row: StatusRow = {
530
+ service: entry.name,
531
+ port: String(entry.port),
532
+ version: entry.version,
533
+ stateLabel,
534
+ pidLabel: snap?.pid !== undefined && snap?.pid !== null ? String(snap.pid) : "-",
535
+ uptimeLabel: "-",
536
+ healthDetail,
537
+ latencyLabel: "-",
538
+ sourceLabel: base.sourceLabel,
539
+ url: base.url,
540
+ healthy,
541
+ skipped,
542
+ };
543
+ if (base.driftWarning) row.driftWarning = base.driftWarning;
544
+ if (base.staleNote) row.staleNote = base.staleNote;
545
+ if (startErrorNote) row.startErrorNote = startErrorNote;
546
+ // Surface the degraded-read note ONCE — on the first module row so the
547
+ // operator sees why run-state is missing, without repeating it on every row.
548
+ if (moduleReadNote) {
549
+ row.managerNote = moduleReadNote;
550
+ moduleReadNote = undefined;
551
+ }
552
+ return row;
553
+ });
554
+
555
+ const hub = buildSupervisorHubRow({
556
+ configDir,
557
+ hubSrcDir,
558
+ installSourceDeps,
559
+ sup,
560
+ port,
561
+ hubHealthy,
562
+ });
563
+ // If the degraded-read note never landed on a module row (empty manifest),
564
+ // surface it on the hub row so the operator still sees the actionable hint.
565
+ if (moduleReadNote && !hub.managerNote) hub.managerNote = moduleReadNote;
566
+ rows.push(hub);
567
+ return rows;
568
+ }
569
+
570
/** Inputs consumed by `buildSupervisorHubRow` when composing the hub's own status row. */
interface BuildSupervisorHubRowArgs {
  /** Resolved status supervisor; supplies `queryHubUnitState` and the `hubUnitDeps` handed to it. */
  sup: ResolvedStatusSupervisor;
  /** Outcome of the hub `/health` check — the liveness half of the verdict. */
  hubHealthy: boolean;
  /** Hub port; rendered in the port column and in the `http://127.0.0.1:<port>` URL. */
  port: number;
  /** Hub source directory, passed to `detectHubInstallSource`. */
  hubSrcDir: string;
  /** Dependencies forwarded to `detectHubInstallSource`. */
  installSourceDeps: DetectInstallSourceDeps;
  /** Config directory supplied by the caller. */
  configDir: string;
}
578
+
579
/**
 * Compose the hub's own status row from two signals (design §6.4): the
 * platform service manager's unit state (`sup.queryHubUnitState`) and the
 * `/health` result (`hubHealthy`).
 *
 * Verdict table:
 *   - `no-manager` (container / Render / Fly): `/health` alone decides —
 *     `active` when healthy, otherwise `failing` with detail "down"; always
 *     annotated "container runtime (managed)".
 *   - `failed`: always `failing`, even when `/health` currently answers; the
 *     note carries the last exit code when the manager reported one.
 *   - `active`: `active` when `/health` answers, otherwise `failing` with a
 *     "starting or unhealthy" note.
 *   - `activating`: `active` when `/health` already answers, otherwise
 *     `pending` with a "starting" note; reported as healthy in both cases.
 *   - anything else (inactive / unknown / no-unit): `/health` is the
 *     tiebreaker — `active` when it answers, else `inactive` (with a note
 *     when the manager state is unknown).
 *
 * A throwing manager query is caught and treated as state `unknown`, so the
 * row then rests on the `/health` verdict.
 *
 * @param args - see {@link BuildSupervisorHubRowArgs}.
 * @returns the hub's status row; pid and uptime are always reported as "-".
 */
function buildSupervisorHubRow(args: BuildSupervisorHubRowArgs): StatusRow {
  const { sup, port, hubHealthy } = args;
  const installSource = detectHubInstallSource(args.hubSrcDir, args.installSourceDeps);

  // Columns that are the same whatever the verdict turns out to be.
  const common: Pick<
    StatusRow,
    "service" | "port" | "version" | "healthDetail" | "latencyLabel" | "sourceLabel" | "url"
  > = {
    service: "parachute-hub (internal)",
    port: String(port),
    version: installSource.livePackageVersion ?? "-",
    healthDetail: "-",
    latencyLabel: "-",
    sourceLabel: formatInstallSourceLabel(installSource),
    url: `http://127.0.0.1:${port}`,
  };

  // Ask the service manager. Status must not crash on a manager failure, so a
  // throw leaves the state at "unknown" and the exit code unset.
  let unitState: HubUnitState = "unknown";
  let exitCode: number | undefined;
  try {
    const { state, lastExitCode } = sup.queryHubUnitState(sup.hubUnitDeps);
    unitState = state;
    exitCode = lastExitCode;
  } catch {
    unitState = "unknown";
  }

  // Single place where a verdict becomes a full row. `healthDetail` and
  // `managerNote` are only written when the verdict supplies them, so rows
  // without a note carry no `managerNote` key at all.
  const toRow = (verdict: {
    stateLabel: StatusRow["stateLabel"];
    healthy: boolean;
    skipped: boolean;
    healthDetail?: string;
    managerNote?: string;
  }): StatusRow => ({
    ...common,
    stateLabel: verdict.stateLabel,
    pidLabel: "-",
    uptimeLabel: "-",
    ...(verdict.healthDetail !== undefined ? { healthDetail: verdict.healthDetail } : {}),
    healthy: verdict.healthy,
    skipped: verdict.skipped,
    ...(verdict.managerNote !== undefined ? { managerNote: verdict.managerNote } : {}),
  });

  switch (unitState) {
    case "no-manager":
      // Nothing on-box to query: `/health` is the only liveness signal.
      return toRow({
        stateLabel: hubHealthy ? "active" : "failing",
        healthDetail: hubHealthy ? "-" : "down",
        healthy: hubHealthy,
        skipped: hubHealthy,
        managerNote: "container runtime (managed)",
      });

    case "failed":
      // A failed unit is reported as failing even if a respawn happens to be
      // answering `/health` at this moment.
      return toRow({
        stateLabel: "failing",
        healthDetail: hubHealthy ? "service manager reports failed" : "down",
        healthy: false,
        skipped: false,
        managerNote:
          exitCode === undefined
            ? "service manager reports the hub unit failed"
            : `service manager reports the hub unit failed (last exit code ${exitCode})`,
      });

    case "active":
      if (!hubHealthy) {
        // Unit is up per the manager but not serving: still starting, or wedged.
        return toRow({
          stateLabel: "failing",
          healthDetail: "manager active, /health not answering",
          healthy: false,
          skipped: false,
          managerNote:
            "service manager reports active; /health not answering yet (starting or unhealthy)",
        });
      }
      return toRow({ stateLabel: "active", healthy: true, skipped: true });

    case "activating":
      // Transient bring-up: already-answering counts as active, otherwise the
      // row is shown as pending (in flight) with a note.
      return toRow({
        stateLabel: hubHealthy ? "active" : "pending",
        healthy: true,
        skipped: true,
        ...(hubHealthy ? {} : { managerNote: "service manager reports the hub unit is starting" }),
      });

    default:
      // inactive / unknown / no-unit (defensive): `/health` breaks the tie.
      if (hubHealthy) {
        return toRow({ stateLabel: "active", healthy: true, skipped: true });
      }
      return toRow({
        stateLabel: "inactive",
        healthy: false,
        skipped: true,
        ...(unitState === "unknown" ? { managerNote: "service manager state unknown" } : {}),
      });
  }
}