@openparachute/hub 0.6.2 → 0.6.3-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,16 @@
1
+ import type { Database } from "bun:sqlite";
1
2
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
2
3
  import { HUB_SVC, readHubPort } from "../hub-control.ts";
4
+ import { hubDbPath, openHubDb } from "../hub-db.ts";
5
+ import {
6
+ HUB_UNIT_DEFAULT_PORT,
7
+ type HubUnitDeps,
8
+ type HubUnitState,
9
+ type HubUnitStateResult,
10
+ defaultHubUnitDeps,
11
+ isHubUnitInstalled,
12
+ queryHubUnitState as queryHubUnitStateImpl,
13
+ } from "../hub-unit.ts";
3
14
  import {
4
15
  type DetectInstallSourceDeps,
5
16
  detectHubInstallSource,
@@ -7,6 +18,13 @@ import {
7
18
  formatInstallSourceLabel,
8
19
  isStale,
9
20
  } from "../install-source.ts";
21
+ import {
22
+ type DriveModuleOpDeps,
23
+ type ModuleStatesResult,
24
+ NoOperatorTokenError,
25
+ OperatorTokenExpiredError,
26
+ fetchModuleStates as fetchModuleStatesImpl,
27
+ } from "../module-ops-client.ts";
10
28
  import { type AliveFn, defaultAlive, formatUptime, processState } from "../process-state.ts";
11
29
  import { canonicalPortForManifest, getSpec, shortNameForManifest } from "../service-spec.ts";
12
30
  import { type ServiceEntry, readManifest } from "../services-manifest.ts";
@@ -34,6 +52,54 @@ export interface StatusOpts {
34
52
  * source classification doesn't depend on the test runner's location.
35
53
  */
36
54
  hubSrcDir?: string;
55
+ /**
56
+ * Phase 3c supervisor-path seams (design §6.4). When a hub UNIT is installed
57
+ * (launchd/systemd/container — detected via {@link isHubUnitInstalled}),
58
+ * `status` reads the hub row from the PLATFORM MANAGER (`queryHubUnitState`)
59
+ * + `/health`, and the module rows from the RUNNING supervisor (`GET
60
+ * /api/modules` via the operator-token→Bearer path). On a legacy detached box
61
+ * (no hub unit) it keeps the EXACT pidfile/`processState` behavior, unchanged
62
+ * until Phase 5 retires it.
63
+ *
64
+ * Everything here is injectable so tests force either arm without a real
65
+ * launchd/systemd/socket/HTTP call. Production wires the real machinery; the
66
+ * read paths are bounded + degrade gracefully on every failure (no manager,
67
+ * hub down, no token, API error) so `status` never hangs or crashes.
68
+ */
69
+ supervisor?: {
70
+ /**
71
+ * Is a hub unit installed (the dual-dispatch discriminant)? Production uses
72
+ * `isHubUnitInstalled(hubUnitDeps)`. Tests set this `true`/`false` directly
73
+ * to pick the branch deterministically. When set, it wins over the
74
+ * `hubUnitDeps`-derived detection.
75
+ *
76
+ * Defaulting: when the caller OMITS the `supervisor` block entirely (every
77
+ * existing status test), the arm defaults to detached — so those tests stay
78
+ * deterministic regardless of whether the test host has a real hub unit.
79
+ */
80
+ unitInstalled?: boolean;
81
+ /** Deps for `isHubUnitInstalled` + `queryHubUnitState` + the `/health` probe. */
82
+ hubUnitDeps?: HubUnitDeps;
83
+ /** Query the platform manager for the hub unit's run-state (§6.4 hub row). */
84
+ queryHubUnitState?: (deps: HubUnitDeps) => HubUnitStateResult;
85
+ /**
86
+ * Probe whether the loopback hub answers `/health`. The liveness signal for
87
+ * the hub row (§6.4) AND the gate for reading module states: if the hub is
88
+ * down, skip the API read and show modules degraded. Production reuses the
89
+ * hub-unit deps' bounded `probeHealth`.
90
+ */
91
+ probeHubHealth?: (port: number) => Promise<boolean>;
92
+ /** Read the running supervisor's module states (§6.4 module rows). */
93
+ fetchModuleStates?: (deps: DriveModuleOpDeps) => Promise<ModuleStatesResult>;
94
+ /**
95
+ * Open the hub DB used to validate/auto-rotate the operator token in
96
+ * `fetchModuleStates`. Production opens `<configDir>/hub.db`; tests inject a
97
+ * seeded db. Returns a handle the caller closes.
98
+ */
99
+ openDb?: (configDir: string) => Database;
100
+ /** Loopback hub base URL override (default derives from the hub port). */
101
+ baseUrl?: string;
102
+ };
37
103
  }
38
104
 
39
105
  export interface ProbeResult {
@@ -154,6 +220,16 @@ interface StatusRow {
154
220
  * just showing it inactive. Cleared on the next successful start.
155
221
  */
156
222
  startErrorNote?: string;
223
+ /**
224
+ * Hub-row-only manager-context note (Phase 3c, §6.4). Surfaces the platform
225
+ * manager's view when it adds signal the STATE column can't carry:
226
+ * - "container runtime (managed)" on Render/Fly (no on-box manager).
227
+ * - "service manager reports active; /health not answering yet (starting or
228
+ * unhealthy)" when the unit is up but the hub isn't serving.
229
+ * - the manager's failed-unit detail / last-exit code.
230
+ * Printed on a continuation line like the other notes.
231
+ */
232
+ managerNote?: string;
157
233
  }
158
234
 
159
235
  /**
@@ -170,6 +246,70 @@ function urlForEntry(entry: ServiceEntry, short: string | undefined): string | u
170
246
  return `http://127.0.0.1:${entry.port}${first}`;
171
247
  }
172
248
 
249
/**
 * Manifest-derived part of a module status row.
 *
 * These fields are computed by `manifestRowBase` from the services-manifest
 * entry plus install-source detection only, so they come out the same whether
 * the row's run-state (STATE / PID / UPTIME) is later filled in from a pidfile
 * (detached arm) or from the running supervisor (Phase 3c arm).
 */
interface ManifestRowBase {
  /**
   * Short service name: the canonical short for a known manifest name, else the
   * manifest name itself when the entry has an `installDir`, else undefined.
   */
  short: string | undefined;
  /** Row URL as produced by `urlForEntry`; undefined when none can be built. */
  url: string | undefined;
  /** Set when a canonical port exists for the entry and differs from `entry.port`. */
  driftWarning?: string;
  /** Human-readable install-source label (`formatInstallSourceLabel`). */
  sourceLabel: string;
  /** Set when `isStale` reports the cached manifest version as stale. */
  staleNote?: string;
  /** One-line summary of the entry's persisted `lastStartError`, if any. */
  manifestStartErrorNote?: string;
}
269
+
270
/**
 * Derive the manifest-only fields of a module status row.
 *
 * Reads nothing but the manifest `entry` and whatever `detectInstallSource`
 * consults through `installSourceDeps`; run-state (STATE / PID / UPTIME) is
 * layered on by the caller.
 *
 * @param entry - The services-manifest entry the row describes.
 * @param installSourceDeps - Injectable deps for install-source detection.
 * @returns The shared short/url/drift/source/stale/start-error fields.
 */
function manifestRowBase(
  entry: ServiceEntry,
  installSourceDeps: DetectInstallSourceDeps,
): ManifestRowBase {
  // First-party names map through the canonical table; a third-party row (one
  // carrying `installDir`) uses its manifest name as the short.
  const canonicalShort = shortNameForManifest(entry.name);
  const short = canonicalShort ?? (entry.installDir ? entry.name : undefined);
  const url = urlForEntry(entry, short);

  // Canonical-port drift (hub#195). Informational only: rows with no canonical
  // assignment (third-party, and multi-vault instance rows such as
  // `parachute-vault-<instance>`) never produce a warning.
  const canonicalPort = canonicalPortForManifest(entry.name);
  let driftWarning: string | undefined;
  if (canonicalPort !== undefined && canonicalPort !== entry.port) {
    driftWarning = `canonical port is ${canonicalPort}`;
  }

  // Install-source detection (hub#243). `installDir` is only attached when the
  // entry actually has one, so the key is absent rather than `undefined`.
  const detectArgs: { entryName: string; installDir?: string } = { entryName: entry.name };
  if (entry.installDir !== undefined) {
    detectArgs.installDir = entry.installDir;
  }
  const installSource = detectInstallSource(detectArgs, installSourceDeps);
  const sourceLabel = formatInstallSourceLabel(installSource);

  let staleNote: string | undefined;
  if (isStale(entry.version, installSource)) {
    staleNote = `STALE: services.json cached ${entry.version}; live package.json ${installSource.livePackageVersion}`;
  }

  // Persisted last-start failure: a missing binary gets the install hint,
  // anything else shows the first line of the recorded description.
  let manifestStartErrorNote: string | undefined;
  const lastError = entry.lastStartError;
  if (lastError !== undefined) {
    if (lastError.binary !== undefined) {
      manifestStartErrorNote = `failed to start: ${lastError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`;
    } else {
      manifestStartErrorNote = `failed to start: ${lastError.error_description.split("\n")[0]}`;
    }
  }

  return { short, url, driftWarning, sourceLabel, staleNote, manifestStartErrorNote };
}
312
+
173
313
  function hubRow(
174
314
  configDir: string,
175
315
  alive: AliveFn,
@@ -217,6 +357,36 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
217
357
  const hubSrcDir = opts.hubSrcDir ?? import.meta.dir;
218
358
 
219
359
  const manifest = readManifest(manifestPath);
360
+
361
+ // Phase 3c dual-dispatch (design §6.4). On a box with a hub unit installed
362
+ // (launchd/systemd/container), read the hub row from the platform manager +
363
+ // `/health` and the module rows from the RUNNING supervisor; otherwise fall
364
+ // through to the unchanged detached arm below. Phase 5 deletes the else-arm —
365
+ // keep this a clean top-level branch so that deletion is a one-liner.
366
+ //
367
+ // Branched BEFORE the detached arm's empty-manifest early return: on a
368
+ // unit-managed box the hub row is meaningful even with zero modules installed
369
+ // (the hub IS running under a unit), so the supervisor arm renders the hub row
370
+ // + a "no modules" table rather than the detached "No services installed yet."
371
+ const sup = resolveStatusSupervisor(opts.supervisor);
372
+ if (sup.unitInstalled) {
373
+ const rows = await buildSupervisorRows({
374
+ manifest,
375
+ configDir,
376
+ installSourceDeps,
377
+ hubSrcDir,
378
+ sup,
379
+ });
380
+ renderRows(rows, print);
381
+ // The supervisor arm marks a row `healthy: false` + `!skipped` only when the
382
+ // supervisor (or the hub-row manager/health composition) says so (crashed /
383
+ // failing) — same exit contract as the detached arm: a stopped/inactive row
384
+ // is expected (skipped, exit 0), a `failing` one exits 1.
385
+ const anyUnhealthy = rows.some((r) => !r.skipped && !r.healthy);
386
+ return anyUnhealthy ? 1 : 0;
387
+ }
388
+ // --- no-unit detached fallback (unchanged; preserved until Phase 5) ---
389
+
220
390
  if (manifest.services.length === 0) {
221
391
  print("No services installed yet.");
222
392
  print("Try: parachute install vault");
@@ -235,10 +405,16 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
235
405
  */
236
406
  const rows: StatusRow[] = await Promise.all(
237
407
  manifest.services.map(async (entry) => {
238
- // Third-party rows (with `installDir`) live under `~/.parachute/<entry.name>/`,
239
- // matching what `parachute start` uses as the short. First-party rows still
240
- // map manifestName → short via the canonical fallback.
241
- const short = shortNameForManifest(entry.name) ?? (entry.installDir ? entry.name : undefined);
408
+ // MANIFEST-derived fields shared with the supervisor arm (port/version/
409
+ // URL/drift/source/stale + the persisted lastStartError note).
410
+ const {
411
+ short,
412
+ url,
413
+ driftWarning,
414
+ sourceLabel,
415
+ staleNote,
416
+ manifestStartErrorNote: startErrorNote,
417
+ } = manifestRowBase(entry, installSourceDeps);
242
418
  const proc = short ? processState(short, configDir, alive) : undefined;
243
419
 
244
420
  const pidLabel =
@@ -246,43 +422,6 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
246
422
  const uptimeLabel =
247
423
  proc?.status === "running" && proc.startedAt ? formatUptime(proc.startedAt, nowDate) : "-";
248
424
 
249
- const url = urlForEntry(entry, short);
250
-
251
- // Canonical-port drift detection (hub#195). Only fires for known
252
- // first-party services where we have a canonical assignment. Third-party
253
- // rows have no canonical to compare against. Warning is informational —
254
- // operators may have moved a service off canonical deliberately.
255
- // Note: multi-vault instance rows (`parachute-vault-<instance>`) don't
256
- // match a canonical manifest name, so drift warnings don't fire for
257
- // them. Intentional — see `canonicalPortForManifest` for the rationale.
258
- const canonical = canonicalPortForManifest(entry.name);
259
- const driftWarning =
260
- canonical !== undefined && canonical !== entry.port
261
- ? `canonical port is ${canonical}`
262
- : undefined;
263
-
264
- // Install-source detection (hub#243). One filesystem walk + maybe one
265
- // `git rev-parse` per row. Failures degrade silently to `unknown` —
266
- // status output should never error out on a missing checkout dir.
267
- const detectArgs: { entryName: string; installDir?: string } = { entryName: entry.name };
268
- if (entry.installDir !== undefined) detectArgs.installDir = entry.installDir;
269
- const source = detectInstallSource(detectArgs, installSourceDeps);
270
- const sourceLabel = formatInstallSourceLabel(source);
271
- const staleNote = isStale(entry.version, source)
272
- ? `STALE: services.json cached ${entry.version}; live package.json ${source.livePackageVersion}`
273
- : undefined;
274
-
275
- // Persisted last-start failure (lifecycle preflight wrote a missing-
276
- // dependency wire). Surface a one-line summary; the full install recipe
277
- // lives in services.json + the admin SPA card. Keeps `parachute status`
278
- // scannable while still telling the operator "this is why it's down."
279
- const startErrorNote =
280
- entry.lastStartError !== undefined
281
- ? entry.lastStartError.binary !== undefined
282
- ? `failed to start: ${entry.lastStartError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`
283
- : `failed to start: ${entry.lastStartError.error_description.split("\n")[0]}`
284
- : undefined;
285
-
286
425
  // Only skip probe when we know the process is dead (PID file was
287
426
  // present but kill(pid, 0) failed). "unknown" status (no PID file)
288
427
  // still probes — externally-managed services should report health.
@@ -354,6 +493,25 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
354
493
  const hub = hubRow(configDir, alive, nowDate, hubSrcDir, installSourceDeps);
355
494
  if (hub) rows.push(hub);
356
495
 
496
+ renderRows(rows, print);
497
+
498
+ /**
499
+ * Overall exit: non-zero if any *probed* service is unhealthy. A stopped
500
+ * service is expected ("I haven't started it yet"), not a failure — users
501
+ * want `parachute status` to return 0 after a fresh install before they
502
+ * `parachute start`. Health regressions among running services still 1.
503
+ */
504
+ const anyUnhealthy = rows.some((r) => !r.skipped && !r.healthy);
505
+ return anyUnhealthy ? 1 : 0;
506
+ }
507
+
508
+ /**
509
+ * Render the status table + continuation lines. Shared by the detached arm and
510
+ * the Phase 3c supervisor arm so the table shape (design-system.md §6 columns +
511
+ * the `→`/`!` continuation prefixes) is identical regardless of where each
512
+ * row's run-state was sourced. Pure over `rows` + the `print` sink.
513
+ */
514
+ function renderRows(rows: StatusRow[], print: (line: string) => void): void {
357
515
  // Header per design-system.md §6 "CLI status column shape":
358
516
  // SERVICE PORT VERSION STATE PID UPTIME LATENCY SOURCE
359
517
  // Pre-F shape was SERVICE PORT VERSION PROCESS PID UPTIME HEALTH LATENCY
@@ -397,17 +555,406 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
397
555
  if (row.stateLabel === "failing" && row.healthDetail !== "-" && row.healthDetail.length > 0) {
398
556
  print(` ! probe: ${row.healthDetail}`);
399
557
  }
558
+ if (row.managerNote) print(` ! ${row.managerNote}`);
400
559
  if (row.driftWarning) print(` ! ${row.driftWarning}`);
401
560
  if (row.staleNote) print(` ! ${row.staleNote}`);
402
561
  if (row.startErrorNote) print(` ! ${row.startErrorNote}`);
403
562
  }
563
+ }
404
564
 
405
- /**
406
- * Overall exit: non-zero if any *probed* service is unhealthy. A stopped
407
- * service is expected ("I haven't started it yet"), not a failure — users
408
- * want `parachute status` to return 0 after a fresh install before they
409
- * `parachute start`. Health regressions among running services still 1.
410
- */
411
- const anyUnhealthy = rows.some((r) => !r.skipped && !r.healthy);
412
- return anyUnhealthy ? 1 : 0;
565
+ // ---------------------------------------------------------------------------
566
+ // Phase 3c supervisor-path status (design §6.4).
567
+ //
568
+ // When a hub unit is installed, `status` reads the hub row from the PLATFORM
569
+ // MANAGER (`queryHubUnitState`) + a `/health` probe, and the module rows from
570
+ // the RUNNING supervisor (`GET /api/modules` via the operator-token→Bearer
571
+ // path). Every read is bounded + degrades gracefully — `status` is a
572
+ // diagnostic and must NEVER hang or crash regardless of hub/manager/token
573
+ // state. The detached arm above is untouched; Phase 5 deletes it.
574
+ // ---------------------------------------------------------------------------
575
+
576
/**
 * Fully-resolved form of `StatusOpts.supervisor`: every seam has a concrete
 * value (caller override or production default) except `baseUrl`, which stays
 * undefined when the caller did not supply one.
 */
interface ResolvedStatusSupervisor {
  /** Dual-dispatch discriminant: true selects the supervisor arm of `status`. */
  unitInstalled: boolean;
  /** Deps handed to `queryHubUnitState` (and used for the default health probe). */
  hubUnitDeps: HubUnitDeps;
  /** Asks the platform manager for the hub unit's run-state. */
  queryHubUnitState: (deps: HubUnitDeps) => HubUnitStateResult;
  /** Resolves true when the hub answers `/health` on the given port. */
  probeHubHealth: (port: number) => Promise<boolean>;
  /** Reads the running supervisor's module states. */
  fetchModuleStates: (deps: DriveModuleOpDeps) => Promise<ModuleStatesResult>;
  /** Opens the hub DB for a config dir; the caller closes the returned handle. */
  openDb: (configDir: string) => Database;
  /** Optional loopback hub base URL override, passed through untouched. */
  baseUrl: string | undefined;
}
587
+
588
/**
 * Fill in the Phase 3c supervisor seams with production defaults.
 *
 * Discriminant policy:
 *  - `opts` omitted entirely → `unitInstalled` is `false` and the real
 *    `isHubUnitInstalled` probe is never invoked (keeps callers that pass no
 *    `supervisor` block on the detached arm deterministically).
 *  - `opts` present → an explicit `opts.unitInstalled` wins; otherwise
 *    `isHubUnitInstalled` is evaluated against the resolved hub-unit deps.
 *
 * @param opts - The caller's `StatusOpts.supervisor` block, if any.
 * @returns Every seam resolved; `baseUrl` remains undefined when not supplied.
 */
function resolveStatusSupervisor(opts: StatusOpts["supervisor"]): ResolvedStatusSupervisor {
  const hubUnitDeps = opts?.hubUnitDeps ?? defaultHubUnitDeps;

  let unitInstalled = false;
  if (opts !== undefined) {
    // `??` keeps the probe lazy: it only runs when no explicit override was given.
    unitInstalled = opts.unitInstalled ?? isHubUnitInstalled(hubUnitDeps);
  }

  const queryHubUnitState = opts?.queryHubUnitState ?? queryHubUnitStateImpl;
  const probeHubHealth = opts?.probeHubHealth ?? hubUnitDeps.probeHealth;
  const fetchModuleStates = opts?.fetchModuleStates ?? fetchModuleStatesImpl;
  const openDb = opts?.openDb ?? ((configDir) => openHubDb(hubDbPath(configDir)));

  return {
    unitInstalled,
    hubUnitDeps,
    queryHubUnitState,
    probeHubHealth,
    fetchModuleStates,
    openDb,
    baseUrl: opts?.baseUrl,
  };
}
610
+
611
/**
 * Issuer string the operator token is validated against: the hub's loopback
 * origin, `http://127.0.0.1:<port>`.
 *
 * The port is whatever `readHubPort(configDir)` reports, falling back to
 * `HUB_UNIT_DEFAULT_PORT` when that yields null/undefined.
 *
 * @param configDir - Hub config directory to read the port from.
 * @returns The loopback origin used as the token issuer.
 */
function statusOperatorTokenIssuer(configDir: string): string {
  const hubPort = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;
  return `http://127.0.0.1:${hubPort}`;
}
620
+
621
/**
 * Translate a supervisor module status into the STATE rollup plus the two
 * flags the exit-code check reads (`!skipped && !healthy` counts as a failure).
 *
 *  - `running`                 → `active`,   healthy, not skipped
 *  - `crashed`                 → `failing`,  unhealthy, not skipped (exit 1)
 *  - `starting` / `restarting` → `pending`,  healthy, skipped
 *  - anything else (`stopped`, `null`, unrecognised) → `inactive`,
 *    unhealthy but skipped, so an operator-stopped module never fails `status`
 *
 * @param status - The supervisor's status string, or `null` when untracked.
 * @returns A fresh `{ stateLabel, healthy, skipped }` object on every call.
 */
function mapSupervisorStatus(status: string | null): {
  stateLabel: StateLabel;
  healthy: boolean;
  skipped: boolean;
} {
  if (status === "running") {
    return { stateLabel: "active", healthy: true, skipped: false };
  }

  if (status === "crashed") {
    return { stateLabel: "failing", healthy: false, skipped: false };
  }

  // Mid-transition: supervised and in flight. Marked skipped so a module that
  // is merely restarting does not push the overall exit code to 1.
  if (status === "starting" || status === "restarting") {
    return { stateLabel: "pending", healthy: true, skipped: true };
  }

  // stopped / null / unknown. `healthy: false` is accurate (it is not serving);
  // `skipped: true` keeps it out of the failure count.
  return { stateLabel: "inactive", healthy: false, skipped: true };
}
658
+
659
/**
 * Turn a supervisor `startError` payload into the one-line "failed to start"
 * note shown under a status row.
 *
 * The payload is untyped wire data, so every field is checked at runtime:
 *  - a non-empty string `binary` produces the "not installed" hint;
 *  - otherwise a string `error_description` contributes its first NON-BLANK
 *    line, trimmed. Leading blank lines are skipped and CRLF line endings do
 *    not leave a stray `\r` in the output;
 *  - anything else yields `undefined`.
 *
 * @param startError - The supervisor's structured start error (any shape).
 * @returns The note text, or `undefined` when there is no usable detail.
 */
function supervisorStartErrorNote(startError: unknown): string | undefined {
  if (typeof startError !== "object" || startError === null) return undefined;

  const { binary, error_description: description } = startError as {
    binary?: unknown;
    error_description?: unknown;
  };

  if (typeof binary === "string" && binary.length > 0) {
    return `failed to start: ${binary} not installed — see /admin/modules for install steps`;
  }

  if (typeof description === "string") {
    // Take the first line that carries text; a description that is empty or
    // whitespace-only has nothing worth printing.
    const firstLine = description
      .split(/\r?\n/)
      .map((line) => line.trim())
      .find((line) => line.length > 0);
    if (firstLine !== undefined) return `failed to start: ${firstLine}`;
  }

  return undefined;
}
676
+
677
/** Inputs to `buildSupervisorRows`, bundled as one options object. */
interface BuildSupervisorRowsArgs {
  /** Parsed services manifest; one module row is produced per `services` entry. */
  manifest: ReturnType<typeof readManifest>;
  /** Hub config directory (hub port lookup, hub DB location, token issuer). */
  configDir: string;
  /** Injectable deps for install-source detection on module and hub rows. */
  installSourceDeps: DetectInstallSourceDeps;
  /** Directory used to classify the hub's own install source. */
  hubSrcDir: string;
  /** Resolved supervisor seams (probe, manager query, module-state fetch, DB). */
  sup: ResolvedStatusSupervisor;
}
684
+
685
/**
 * Build the status rows for a unit-managed box (design §6.4): one row per
 * manifest entry, followed by the hub row.
 *
 * Flow:
 *  1. Probe the hub's `/health` once. The result is both the hub row's
 *     liveness input and the gate for contacting the supervisor.
 *  2. If the hub answers, open the hub DB and fetch the supervisor's module
 *     states. Opening the DB, the fetch itself, and closing the DB are all
 *     guarded: any failure becomes a one-line "couldn't read live module
 *     state" note instead of an exception escaping `status`.
 *  3. Merge each manifest entry with its supervisor snapshot (matched on the
 *     short name). When the hub is down every module row is `inactive` with a
 *     "hub is down" note.
 *  4. The degraded-read note is attached exactly once: to the first module row,
 *     or to the hub row when there are no module rows and the hub row has no
 *     note of its own.
 *
 * @param args - Manifest, config dir, install-source deps and resolved seams.
 * @returns Module rows in manifest order with the hub row appended last.
 */
async function buildSupervisorRows(args: BuildSupervisorRowsArgs): Promise<StatusRow[]> {
  const { manifest, configDir, installSourceDeps, hubSrcDir, sup } = args;
  const port = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;

  // A probe that throws is treated the same as a probe that answers "down".
  let hubHealthy = false;
  try {
    hubHealthy = await sup.probeHubHealth(port);
  } catch {
    hubHealthy = false;
  }

  // Only talk to the supervisor when the hub answers: with the hub down the
  // call could only fail, and the rows below are rendered as stopped anyway.
  let states: ModuleStatesResult | undefined;
  let moduleReadNote: string | undefined;
  if (hubHealthy) {
    let db: Database | undefined;
    try {
      // Opened INSIDE the try so a missing/locked/corrupt hub DB degrades to a
      // note like every other read failure rather than crashing `status`.
      db = sup.openDb(configDir);
      states = await sup.fetchModuleStates({
        db,
        issuer: statusOperatorTokenIssuer(configDir),
        configDir,
        ...(sup.baseUrl !== undefined ? { baseUrl: sup.baseUrl } : {}),
      });
    } catch (err) {
      if (err instanceof NoOperatorTokenError || err instanceof OperatorTokenExpiredError) {
        // No / expired operator token: the hub is up but run-state is
        // unreadable. Point the operator at the command that fixes it.
        moduleReadNote =
          "couldn't read live module state — run `parachute auth rotate-operator` to mint an operator token";
      } else {
        // DB open failure, HTTP error, parse error, anything else.
        moduleReadNote = `couldn't read live module state (${
          err instanceof Error ? err.message : String(err)
        })`;
      }
    } finally {
      // Best-effort close: a failing close must not discard the states we
      // already read, nor turn a diagnostic command into a crash.
      try {
        db?.close();
      } catch {
        // Intentionally ignored — nothing useful to report for a close failure.
      }
    }
  }

  // Index supervisor snapshots by short name for the per-entry lookup below.
  const stateByShort = new Map<string, ModuleStatesResult["modules"][number]>();
  for (const m of states?.modules ?? []) {
    if (m.short) stateByShort.set(m.short, m);
  }

  const rows: StatusRow[] = manifest.services.map((entry) => {
    const base = manifestRowBase(entry, installSourceDeps);

    if (!hubHealthy) {
      // Hub is down → its supervised modules are shown as `inactive`
      // (skipped, so not counted as a failure) with an explanatory note.
      return {
        service: entry.name,
        port: String(entry.port),
        version: entry.version,
        stateLabel: "inactive",
        pidLabel: "-",
        uptimeLabel: "-",
        healthDetail: "-",
        latencyLabel: "-",
        sourceLabel: base.sourceLabel,
        url: base.url,
        healthy: false,
        skipped: true,
        ...(base.driftWarning ? { driftWarning: base.driftWarning } : {}),
        ...(base.staleNote ? { staleNote: base.staleNote } : {}),
        managerNote: "hub is down — its modules are stopped",
      };
    }

    const snap = base.short ? stateByShort.get(base.short) : undefined;
    const { stateLabel, healthy, skipped } = mapSupervisorStatus(snap?.supervisor_status ?? null);
    // Prefer the supervisor's live start error; fall back to the note persisted
    // in services.json.
    const startErrorNote =
      supervisorStartErrorNote(snap?.supervisor_start_error) ?? base.manifestStartErrorNote;
    const healthDetail =
      stateLabel === "failing" ? `supervisor: ${snap?.supervisor_status ?? "crashed"}` : "-";

    const row: StatusRow = {
      service: entry.name,
      port: String(entry.port),
      version: entry.version,
      stateLabel,
      pidLabel: snap?.pid !== undefined && snap?.pid !== null ? String(snap.pid) : "-",
      uptimeLabel: "-",
      healthDetail,
      latencyLabel: "-",
      sourceLabel: base.sourceLabel,
      url: base.url,
      healthy,
      skipped,
    };
    if (base.driftWarning) row.driftWarning = base.driftWarning;
    if (base.staleNote) row.staleNote = base.staleNote;
    if (startErrorNote) row.startErrorNote = startErrorNote;
    // Attach the degraded-read note to the FIRST module row only; clearing it
    // here is what stops it repeating on every subsequent row.
    if (moduleReadNote) {
      row.managerNote = moduleReadNote;
      moduleReadNote = undefined;
    }
    return row;
  });

  const hub = buildSupervisorHubRow({
    configDir,
    hubSrcDir,
    installSourceDeps,
    sup,
    port,
    hubHealthy,
  });
  // Still set here only when no module row consumed it (empty manifest): show
  // it on the hub row unless the hub row already carries its own note.
  if (moduleReadNote && !hub.managerNote) hub.managerNote = moduleReadNote;
  rows.push(hub);
  return rows;
}
816
+
817
/** Inputs to `buildSupervisorHubRow`, bundled as one options object. */
interface BuildSupervisorHubRowArgs {
  /** Hub config directory, as passed along by the caller. */
  configDir: string;
  /** Directory used to detect the hub's own install source and live version. */
  hubSrcDir: string;
  /** Injectable deps for install-source detection. */
  installSourceDeps: DetectInstallSourceDeps;
  /** Resolved supervisor seams; supplies the manager query and its deps. */
  sup: ResolvedStatusSupervisor;
  /** Hub port shown in the PORT column and used to build the row URL. */
  port: number;
  /** Result of the caller's single `/health` probe. */
  hubHealthy: boolean;
}
825
+
826
/**
 * Compose the hub's status row from the platform manager's view of the hub
 * unit and the caller's `/health` probe result (design §6.4).
 *
 * | manager state          | /health up                | /health down                     |
 * | ---------------------- | ------------------------- | -------------------------------- |
 * | `no-manager`           | active (+ managed note)   | failing "down" (+ managed note)  |
 * | `failed`               | failing (manager failed)  | failing "down"                   |
 * | `active`               | active                    | failing (starting or unhealthy)  |
 * | `activating`           | active                    | pending (+ starting note)        |
 * | anything else          | active                    | inactive (+ note if `unknown`)   |
 *
 * A manager query that throws is treated as state `unknown`, so the verdict
 * falls back to `/health` alone. PID and UPTIME are always `-` on this row.
 *
 * @param args - Hub source dir, install-source deps, seams, port, probe result.
 * @returns The hub's `StatusRow`.
 */
function buildSupervisorHubRow(args: BuildSupervisorHubRowArgs): StatusRow {
  const { hubSrcDir, installSourceDeps, sup, port, hubHealthy } = args;
  const hubSource = detectHubInstallSource(hubSrcDir, installSourceDeps);

  // Columns that are identical for every outcome below.
  const common: Omit<
    StatusRow,
    "stateLabel" | "pidLabel" | "uptimeLabel" | "healthy" | "skipped"
  > & {
    healthDetail: string;
  } = {
    service: "parachute-hub (internal)",
    port: String(port),
    version: hubSource.livePackageVersion ?? "-",
    healthDetail: "-",
    latencyLabel: "-",
    sourceLabel: formatInstallSourceLabel(hubSource),
    url: `http://127.0.0.1:${port}`,
  };

  // Assemble a finished row: shared columns, the state, the fixed "-" PID and
  // UPTIME placeholders, then the outcome-specific fields.
  const hubRowWith = (
    stateLabel: StatusRow["stateLabel"],
    rest: Pick<StatusRow, "healthy" | "skipped"> &
      Partial<Pick<StatusRow, "healthDetail" | "managerNote">>,
  ): StatusRow => ({
    ...common,
    stateLabel,
    pidLabel: "-",
    uptimeLabel: "-",
    ...rest,
  });

  let managerState: HubUnitState;
  let lastExitCode: number | undefined;
  try {
    const unit = sup.queryHubUnitState(sup.hubUnitDeps);
    managerState = unit.state;
    lastExitCode = unit.lastExitCode;
  } catch {
    // A failing manager query must not take `status` down with it.
    managerState = "unknown";
  }

  switch (managerState) {
    case "no-manager":
      // No on-box manager (container runtime): /health is the only signal.
      return hubRowWith(hubHealthy ? "active" : "failing", {
        healthDetail: hubHealthy ? "-" : "down",
        healthy: hubHealthy,
        skipped: hubHealthy,
        managerNote: "container runtime (managed)",
      });

    case "failed":
      // Reported as failing even when /health happens to answer right now.
      return hubRowWith("failing", {
        healthDetail: hubHealthy ? "service manager reports failed" : "down",
        healthy: false,
        skipped: false,
        managerNote:
          lastExitCode !== undefined
            ? `service manager reports the hub unit failed (last exit code ${lastExitCode})`
            : "service manager reports the hub unit failed",
      });

    case "active":
      if (hubHealthy) {
        return hubRowWith("active", { healthy: true, skipped: true });
      }
      // Unit is up per the manager but the hub is not serving.
      return hubRowWith("failing", {
        healthDetail: "manager active, /health not answering",
        healthy: false,
        skipped: false,
        managerNote:
          "service manager reports active; /health not answering yet (starting or unhealthy)",
      });

    case "activating":
      // Transient bring-up: already serving counts as active, else pending.
      return hubRowWith(hubHealthy ? "active" : "pending", {
        healthy: true,
        skipped: true,
        ...(hubHealthy ? {} : { managerNote: "service manager reports the hub unit is starting" }),
      });

    default:
      // inactive / unknown / no-unit: /health breaks the tie.
      if (hubHealthy) {
        return hubRowWith("active", { healthy: true, skipped: true });
      }
      return hubRowWith("inactive", {
        healthy: false,
        skipped: true,
        ...(managerState === "unknown" ? { managerNote: "service manager state unknown" } : {}),
      });
  }
}