@openparachute/hub 0.5.14-rc.8 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87):
  1. package/README.md +109 -15
  2. package/package.json +7 -3
  3. package/src/__tests__/account-home-ui.test.ts +251 -15
  4. package/src/__tests__/account-vault-token.test.ts +355 -0
  5. package/src/__tests__/admin-vaults.test.ts +70 -4
  6. package/src/__tests__/api-mint-token.test.ts +693 -5
  7. package/src/__tests__/api-modules-ops.test.ts +45 -0
  8. package/src/__tests__/api-revoke-token.test.ts +384 -0
  9. package/src/__tests__/api-users.test.ts +7 -2
  10. package/src/__tests__/auth.test.ts +157 -30
  11. package/src/__tests__/cli.test.ts +44 -5
  12. package/src/__tests__/expose-2fa-warning.test.ts +31 -17
  13. package/src/__tests__/expose-auth-preflight.test.ts +71 -72
  14. package/src/__tests__/expose-cloudflare.test.ts +482 -14
  15. package/src/__tests__/expose.test.ts +52 -2
  16. package/src/__tests__/hub-server.test.ts +97 -0
  17. package/src/__tests__/hub.test.ts +85 -6
  18. package/src/__tests__/init.test.ts +102 -1
  19. package/src/__tests__/lifecycle.test.ts +464 -2
  20. package/src/__tests__/oauth-handlers.test.ts +1252 -83
  21. package/src/__tests__/oauth-ui.test.ts +12 -1
  22. package/src/__tests__/operator-token-issuer-self-heal.test.ts +412 -0
  23. package/src/__tests__/resource-binding.test.ts +97 -0
  24. package/src/__tests__/scope-explanations.test.ts +77 -12
  25. package/src/__tests__/services-manifest.test.ts +122 -4
  26. package/src/__tests__/setup-wizard.test.ts +335 -15
  27. package/src/__tests__/status.test.ts +36 -0
  28. package/src/__tests__/two-factor-flow.test.ts +602 -0
  29. package/src/__tests__/two-factor.test.ts +183 -0
  30. package/src/__tests__/upgrade.test.ts +78 -1
  31. package/src/__tests__/users.test.ts +68 -0
  32. package/src/__tests__/vault-auth-status.test.ts +47 -6
  33. package/src/__tests__/vault-hub-origin-env.test.ts +263 -0
  34. package/src/account-home-ui.ts +488 -38
  35. package/src/account-vault-token.ts +282 -0
  36. package/src/admin-handlers.ts +159 -4
  37. package/src/admin-login-ui.ts +49 -5
  38. package/src/admin-vaults.ts +48 -15
  39. package/src/api-account.ts +14 -0
  40. package/src/api-mint-token.ts +132 -24
  41. package/src/api-modules-ops.ts +49 -11
  42. package/src/api-revoke-token.ts +107 -21
  43. package/src/api-users.ts +29 -3
  44. package/src/cli.ts +26 -21
  45. package/src/clients.ts +18 -6
  46. package/src/cloudflare/config.ts +10 -4
  47. package/src/cloudflare/detect.ts +39 -44
  48. package/src/commands/auth.ts +165 -24
  49. package/src/commands/expose-2fa-warning.ts +34 -32
  50. package/src/commands/expose-auth-preflight.ts +89 -78
  51. package/src/commands/expose-cloudflare.ts +370 -12
  52. package/src/commands/expose.ts +8 -0
  53. package/src/commands/init.ts +33 -2
  54. package/src/commands/lifecycle.ts +386 -17
  55. package/src/commands/status.ts +22 -0
  56. package/src/commands/upgrade.ts +55 -11
  57. package/src/commands/wizard.ts +8 -4
  58. package/src/env-file.ts +10 -0
  59. package/src/help.ts +3 -1
  60. package/src/hub-db.ts +39 -1
  61. package/src/hub-server.ts +52 -0
  62. package/src/hub.ts +82 -14
  63. package/src/oauth-handlers.ts +298 -21
  64. package/src/oauth-ui.ts +10 -0
  65. package/src/operator-token.ts +151 -0
  66. package/src/pending-login.ts +116 -0
  67. package/src/rate-limit.ts +51 -0
  68. package/src/resource-binding.ts +134 -0
  69. package/src/scope-attenuation.ts +85 -0
  70. package/src/scope-explanations.ts +131 -14
  71. package/src/services-manifest.ts +112 -0
  72. package/src/setup-wizard.ts +77 -7
  73. package/src/tailscale/run.ts +28 -11
  74. package/src/totp.ts +201 -0
  75. package/src/two-factor-handlers.ts +287 -0
  76. package/src/two-factor-store.ts +181 -0
  77. package/src/two-factor-ui.ts +462 -0
  78. package/src/users.ts +58 -0
  79. package/src/vault/auth-status.ts +71 -19
  80. package/src/vault-hub-origin-env.ts +163 -0
  81. package/web/ui/dist/assets/index-BiBlvEaj.css +1 -0
  82. package/web/ui/dist/assets/index-CIN3mnmf.js +61 -0
  83. package/web/ui/dist/index.html +2 -2
  84. package/src/__tests__/vault-tokens-create-interactive.test.ts +0 -183
  85. package/src/commands/vault-tokens-create-interactive.ts +0 -143
  86. package/web/ui/dist/assets/index-7DtAXz7y.css +0 -1
  87. package/web/ui/dist/assets/index-tRmPbbC7.js +0 -61
@@ -1,5 +1,11 @@
1
- import { existsSync, openSync } from "node:fs";
1
+ import { existsSync, openSync, readFileSync } from "node:fs";
2
+ import { Socket } from "node:net";
2
3
  import { join } from "node:path";
4
+ import {
5
+ MissingDependencyError,
6
+ ensureExecutable,
7
+ rethrowIfMissing,
8
+ } from "@openparachute/depcheck";
3
9
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
4
10
  import { readEnvFileValues } from "../env-file.ts";
5
11
  import { readExposeState } from "../expose-state.ts";
@@ -12,8 +18,10 @@ import {
12
18
  readHubPort,
13
19
  stopHub,
14
20
  } from "../hub-control.ts";
21
+ import { hubDbPath, openHubDb } from "../hub-db.ts";
15
22
  import { HUB_ORIGIN_ENV, deriveHubOrigin } from "../hub-origin.ts";
16
23
  import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
24
+ import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
17
25
  import {
18
26
  type AliveFn,
19
27
  clearPid,
@@ -32,7 +40,13 @@ import {
32
40
  knownServices,
33
41
  shortNameForManifest,
34
42
  } from "../service-spec.ts";
35
- import { type ServiceEntry, readManifest } from "../services-manifest.ts";
43
+ import {
44
+ type ServiceEntry,
45
+ clearStartError,
46
+ readManifest,
47
+ recordStartError,
48
+ } from "../services-manifest.ts";
49
+ import { persistVaultHubOrigin, selfHealVaultHubOrigin } from "../vault-hub-origin-env.ts";
36
50
 
37
51
  /**
38
52
  * Tiny seam over `Bun.spawn` for lifecycle tests. The real spawner opens the
@@ -83,6 +97,44 @@ export const defaultSpawner: Spawner = {
83
97
  export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
84
98
  export type SleepFn = (ms: number) => Promise<void>;
85
99
 
100
+ /**
101
+ * "Is something listening on this TCP port on loopback?" seam. Pairs with the
102
+ * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
103
+ * shape (hub#487): a service that lives long enough to clear the liveness
104
+ * check but never binds its port because the port is already held (EADDRINUSE
105
+ * from an orphan). The recorded pid stays alive (vault's process supervisor
106
+ * retries / lingers) so `alive(pid)` says "running" while `parachute status`
107
+ * shows it inactive because nothing answers on the port.
108
+ *
109
+ * Tests inject a deterministic stub; production uses `defaultPortListening`.
110
+ */
111
+ export type PortListeningFn = (port: number) => Promise<boolean>;
112
+
113
+ /**
114
+ * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
115
+ * accepted. A successful connect means *something* is listening; we close
116
+ * immediately. Connection refused / timeout means nothing is bound yet.
117
+ * `node:net` rather than `Bun.connect` because the latter has no clean
118
+ * "connection refused → false" without a custom socket handler, and the net
119
+ * Socket's `error`/`connect` events map directly onto the boolean we want.
120
+ */
121
+ export const defaultPortListening: PortListeningFn = (port) =>
122
+ new Promise((resolve) => {
123
+ const socket = new Socket();
124
+ let settled = false;
125
+ const done = (listening: boolean) => {
126
+ if (settled) return;
127
+ settled = true;
128
+ socket.destroy();
129
+ resolve(listening);
130
+ };
131
+ socket.setTimeout(1000);
132
+ socket.once("connect", () => done(true));
133
+ socket.once("timeout", () => done(false));
134
+ socket.once("error", () => done(false));
135
+ socket.connect(port, "127.0.0.1");
136
+ });
137
+
86
138
  /**
87
139
  * Group-aware liveness: returns true if the process group (pgid == pid)
88
140
  * still has any member. Pairs with `defaultSpawner`'s `detached: true` —
@@ -129,6 +181,35 @@ export const defaultKill: KillFn = (pid, signal) => {
129
181
 
130
182
  export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
131
183
 
184
+ /**
185
+ * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
186
+ * real boot error when a start fails — operators shouldn't have to manually
187
+ * `tail` the log to learn *why* the daemon died. Returns [] on any read
188
+ * error (missing file, permissions) so the caller falls back to the generic
189
+ * "tail the log" hint without throwing.
190
+ */
191
+ function readLogTail(logFile: string, n: number): string[] {
192
+ try {
193
+ const content = readFileSync(logFile, "utf8");
194
+ const trimmed = content.replace(/\n$/, "");
195
+ if (trimmed === "") return [];
196
+ return trimmed.split("\n").slice(-n);
197
+ } catch {
198
+ return [];
199
+ }
200
+ }
201
+
202
+ /**
203
+ * Heuristic EADDRINUSE detector over a logfile tail. cloudflared, Bun, and
204
+ * Node all surface port collisions with recognizable phrases; we match the
205
+ * common ones rather than parse a structured error (there isn't one across
206
+ * runtimes). False positives are harmless — the worst case is we *also* print
207
+ * the port-in-use remedy on an unrelated failure, which is still actionable.
208
+ */
209
+ function detectAddrInUse(logTail: readonly string[]): boolean {
210
+ return logTail.some((line) => /EADDRINUSE|address already in use|port .* in use/i.test(line));
211
+ }
212
+
132
213
  export interface LifecycleOpts {
133
214
  spawner?: Spawner;
134
215
  kill?: KillFn;
@@ -160,6 +241,30 @@ export interface LifecycleOpts {
160
241
  * settle.
161
242
  */
162
243
  startSettleMs?: number;
244
+ /**
245
+ * Probe whether the service's port is listening, post-spawn. Pairs with the
246
+ * settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
247
+ * process survives the liveness window (vault lingers / retries) but never
248
+ * binds because the port is already held, so `start` would otherwise report
249
+ * "✓ started" while `status` shows it inactive. Tests inject a stub;
250
+ * production uses `defaultPortListening` (a loopback TCP connect probe).
251
+ */
252
+ portListening?: PortListeningFn;
253
+ /**
254
+ * How long `start` polls for the service to bind its port after the
255
+ * liveness settle passes. Default 4000ms in production — long enough to
256
+ * cover vault/scribe cold-boot (DB open, route registration) without making
257
+ * a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
258
+ * first time the port answers we declare success. If the window elapses
259
+ * with the process still alive but the port silent, we print a non-fatal
260
+ * warning (the daemon may still be coming up) rather than failing — only a
261
+ * *dead* process is a hard failure. Defaulting policy mirrors
262
+ * `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
263
+ * production path (no spawner override) is active.
264
+ */
265
+ startReadyMs?: number;
266
+ /** Poll interval while waiting for the port to come up. Default 200ms. */
267
+ startReadyPollMs?: number;
163
268
  /**
164
269
  * Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
165
270
  * unset, `start` derives it from `expose-state.json` (when exposed) or
@@ -175,9 +280,36 @@ export interface LifecycleOpts {
175
280
  * `ensureHubRunning` and `lifecycle.stop("hub")` dispatches to
176
281
  * `stopHub`. Tests inject stubs to avoid spawning real bun processes.
177
282
  */
283
+ /**
284
+ * PATH-resolution seam for the start preflight (`@openparachute/depcheck`
285
+ * `ensureExecutable`). Production uses the real `Bun.which`; a missing
286
+ * startCmd binary then surfaces the friendly missing-dependency UX +
287
+ * persists it to services.json.
288
+ *
289
+ * Defaulting policy mirrors `startSettleMs`: when a stub `spawner` is
290
+ * injected (the test path) `which` defaults to a permissive resolver
291
+ * (`() => "<stub>"`) so existing stub-spawner tests don't trip the preflight
292
+ * against binaries that aren't on the test host's PATH (`parachute-vault`,
293
+ * `notes-serve`). Production (no spawner override) gets the real `Bun.which`.
294
+ * Tests that want to exercise the missing-binary branch inject `which`
295
+ * explicitly (e.g. `which: () => null`).
296
+ */
297
+ which?: (cmd: string) => string | null;
178
298
  hub?: {
179
299
  ensureRunning?: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
180
300
  stop?: (opts: StopHubOpts) => Promise<boolean>;
301
+ /**
302
+ * Self-heal the operator token's stale `iss` after `start hub` (hub#481).
303
+ * Production opens hub.db at `<configDir>/hub.db` and delegates to
304
+ * `selfHealOperatorTokenIssuer`. Tests inject a stub to assert the call
305
+ * happens — or to make it throw and prove a self-heal failure never fails
306
+ * `start hub`.
307
+ */
308
+ selfHealOperatorToken?: (args: {
309
+ issuer: string;
310
+ configDir: string;
311
+ log: (line: string) => void;
312
+ }) => Promise<OperatorIssuerHealStatus>;
181
313
  };
182
314
  }
183
315
 
@@ -193,9 +325,42 @@ interface Resolved {
193
325
  killWaitMs: number;
194
326
  pollIntervalMs: number;
195
327
  startSettleMs: number;
328
+ portListening: PortListeningFn;
329
+ startReadyMs: number;
330
+ startReadyPollMs: number;
331
+ which: (cmd: string) => string | null;
196
332
  hubOrigin: string | undefined;
197
333
  ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
198
334
  stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
335
+ selfHealOperatorTokenFn: (args: {
336
+ issuer: string;
337
+ configDir: string;
338
+ log: (line: string) => void;
339
+ }) => Promise<OperatorIssuerHealStatus>;
340
+ }
341
+
342
+ /**
343
+ * Production self-heal: open hub.db at `<configDir>/hub.db`, run
344
+ * `selfHealOperatorTokenIssuer`, and close the db. Derives the db path the
345
+ * same way the rest of the repo does (`hubDbPath(configDir)`); `openHubDb`
346
+ * runs migrations + WAL on open, matching `commands/auth.ts`. Tests override
347
+ * this whole seam, so the db-open only happens on the production path.
348
+ */
349
+ async function defaultSelfHealOperatorToken(args: {
350
+ issuer: string;
351
+ configDir: string;
352
+ log: (line: string) => void;
353
+ }): Promise<OperatorIssuerHealStatus> {
354
+ const db = openHubDb(hubDbPath(args.configDir));
355
+ try {
356
+ return await selfHealOperatorTokenIssuer(db, {
357
+ issuer: args.issuer,
358
+ configDir: args.configDir,
359
+ log: args.log,
360
+ });
361
+ } finally {
362
+ db.close();
363
+ }
199
364
  }
200
365
 
201
366
  function resolve(opts: LifecycleOpts): Resolved {
@@ -219,9 +384,26 @@ function resolve(opts: LifecycleOpts): Resolved {
219
384
  // override `alive`, which re-enables the default 250ms.
220
385
  startSettleMs:
221
386
  opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
387
+ portListening: opts.portListening ?? defaultPortListening,
388
+ // Same defaulting policy as startSettleMs: production (no spawner
389
+ // override) gets the real 4s readiness window; tests that inject a stub
390
+ // spawner get 0 (skipped) unless they explicitly opt in via
391
+ // `portListening` or `startReadyMs`, so existing stub-spawner tests don't
392
+ // start probing a fake port.
393
+ startReadyMs:
394
+ opts.startReadyMs ??
395
+ (opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
396
+ startReadyPollMs: opts.startReadyPollMs ?? 200,
397
+ // Same defaulting policy as startSettleMs/startReadyMs: production (no
398
+ // spawner override) preflights with the real Bun.which; stub-spawner tests
399
+ // get a permissive resolver so the preflight doesn't trip against binaries
400
+ // that aren't on the test host's PATH. Explicit `which` always wins.
401
+ which:
402
+ opts.which ?? (opts.spawner === undefined ? Bun.which : () => "/stub/bin/preflight-skipped"),
222
403
  hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
223
404
  ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
224
405
  stopHubFn: opts.hub?.stop ?? stopHub,
406
+ selfHealOperatorTokenFn: opts.hub?.selfHealOperatorToken ?? defaultSelfHealOperatorToken,
225
407
  };
226
408
  }
227
409
 
@@ -452,42 +634,185 @@ export async function start(svc: string | undefined, opts: LifecycleOpts = {}):
452
634
  if (entry.installDir) spawnerOpts.cwd = entry.installDir;
453
635
  const passOpts =
454
636
  spawnerOpts.env !== undefined || spawnerOpts.cwd !== undefined ? spawnerOpts : undefined;
637
+
638
+ // Pre-flight the startCmd binary (`@openparachute/depcheck`) so a missing
639
+ // executable surfaces the friendly install UX inline AND is persisted onto
640
+ // the services.json row, so a *later* `parachute status` (a separate
641
+ // invocation that only reads the manifest) + the SPA modules pane show
642
+ // "vault: failed to start — parachute-vault not installed" with install
643
+ // info, rather than a bare "failed"/orphan-timeout. The binary is `cmd[0]`
644
+ // (e.g. `parachute-vault` for an npm install, `bun` for a bun-linked one).
645
+ const startBinary = cmd[0];
646
+ if (startBinary) {
647
+ try {
648
+ ensureExecutable(startBinary, { which: r.which });
649
+ } catch (err) {
650
+ if (err instanceof MissingDependencyError) {
651
+ failures++;
652
+ r.log(`✗ ${short} failed to start:`);
653
+ for (const line of err.message.split("\n")) r.log(` ${line}`);
654
+ recordStartError(entry.name, err.toWire(), r.manifestPath);
655
+ continue;
656
+ }
657
+ throw err;
658
+ }
659
+ }
660
+
455
661
  let pid: number;
456
662
  try {
457
663
  pid = r.spawner.spawn(cmd, logFile, passOpts);
458
664
  } catch (err) {
665
+ // Belt-and-suspenders: a missing binary that slipped past the pre-flight
666
+ // (race) still becomes a MissingDependencyError via rethrowIfMissing.
667
+ if (startBinary) {
668
+ try {
669
+ rethrowIfMissing(err, startBinary);
670
+ } catch (missing) {
671
+ if (missing instanceof MissingDependencyError) {
672
+ failures++;
673
+ r.log(`✗ ${short} failed to start:`);
674
+ for (const line of missing.message.split("\n")) r.log(` ${line}`);
675
+ recordStartError(entry.name, missing.toWire(), r.manifestPath);
676
+ continue;
677
+ }
678
+ }
679
+ }
459
680
  failures++;
460
681
  const msg = err instanceof Error ? err.message : String(err);
461
682
  r.log(`✗ ${short} failed to start: ${msg}`);
462
683
  continue;
463
684
  }
685
+ // A successful spawn clears any stale start-error recorded from a prior
686
+ // missing-dependency failure so `parachute status` doesn't keep showing it.
687
+ clearStartError(entry.name, r.manifestPath);
464
688
  writePid(short, pid, r.configDir);
465
689
 
466
- // Settle-poll for spawn-then-immediately-die (hub#194). A spawn returning
467
- // a pid only proves the kernel forked the process; the child may exit
468
- // microseconds later if its main code path throws before listening
469
- // (e.g. notes-serve's Bun.resolveSync failing for bun-linked installs).
470
- // Without this poll, we'd report success and the operator would chase
471
- // a phantom 502.
690
+ // Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
691
+ // proves the kernel forked the process — it says nothing about whether the
692
+ // service survived its boot or bound its port. Two silent-start shapes:
693
+ //
694
+ // (1) spawn-then-immediately-die (hub#194): the child throws before
695
+ // listening (notes-serve's Bun.resolveSync failing for bun-linked
696
+ // installs) and exits microseconds later. Caught by the settle below.
697
+ //
698
+ // (2) alive-but-never-bound (hub#487): the port is already held by an
699
+ // orphan, the child hits EADDRINUSE, but its process *lingers* (or a
700
+ // supervisor retries) long enough to clear the liveness check. `start`
701
+ // would report "✓ started" while `parachute status` shows it inactive
702
+ // because nothing answers on the port. Aaron hit exactly this with an
703
+ // orphan holding vault's 1940 on a fresh EC2 box. Caught by the
704
+ // port-readiness poll below.
705
+ //
706
+ // On any failure we surface the tail of the logfile so the operator sees
707
+ // the real boot error inline, and we specifically call out EADDRINUSE with
708
+ // the `lsof -ti:<port>` remedy.
709
+ const reportStartFailure = (reason: string): void => {
710
+ clearPid(short, r.configDir);
711
+ failures++;
712
+ const tail = readLogTail(logFile, 20);
713
+ if (detectAddrInUse(tail)) {
714
+ r.log(
715
+ `✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
716
+ );
717
+ } else {
718
+ r.log(`✗ ${short} failed to start: ${reason}`);
719
+ }
720
+ if (tail.length > 0) {
721
+ r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
722
+ for (const line of tail) r.log(` │ ${line}`);
723
+ } else {
724
+ r.log(` Tail the log for details: tail -50 ${logFile}`);
725
+ }
726
+ };
727
+
472
728
  if (r.startSettleMs > 0) {
473
729
  await r.sleep(r.startSettleMs);
474
730
  if (!r.alive(pid)) {
475
- clearPid(short, r.configDir);
476
- failures++;
731
+ reportStartFailure(
732
+ `spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
733
+ );
734
+ continue;
735
+ }
736
+ }
737
+
738
+ // Port-readiness poll (hub#487). The process is alive; now confirm it
739
+ // actually bound its port before claiming success. Poll up to
740
+ // `startReadyMs`, re-checking liveness each iteration so a *later* death
741
+ // (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
742
+ // that stays alive but never binds within the window gets a non-fatal
743
+ // warning rather than a hard failure — some daemons legitimately do slow
744
+ // boot work, and we'd rather not flip a healthy-but-slow start to red.
745
+ if (r.startReadyMs > 0) {
746
+ const deadline = r.now() + r.startReadyMs;
747
+ let listening = false;
748
+ let died = false;
749
+ while (r.now() < deadline) {
750
+ if (!r.alive(pid)) {
751
+ died = true;
752
+ break;
753
+ }
754
+ if (await r.portListening(entry.port)) {
755
+ listening = true;
756
+ break;
757
+ }
758
+ await r.sleep(r.startReadyPollMs);
759
+ }
760
+ if (died) {
761
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
762
+ continue;
763
+ }
764
+ if (!listening) {
765
+ // Last-chance liveness check — the loop may have exited on the
766
+ // deadline right as the process died.
767
+ if (!r.alive(pid)) {
768
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
769
+ continue;
770
+ }
477
771
  r.log(
478
- `✗ ${short} failed to start: spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
772
+ `⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
479
773
  );
480
- r.log(` Tail the log for details: tail -50 ${logFile}`);
774
+ r.log(
775
+ ` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
776
+ );
777
+ if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
778
+ if (short === "vault") persistVaultHubOriginForStart(r);
481
779
  continue;
482
780
  }
483
781
  }
484
782
 
485
783
  r.log(`✓ ${short} started (pid ${pid}); logs: ${logFile}`);
486
784
  if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
785
+ if (short === "vault") persistVaultHubOriginForStart(r);
487
786
  }
488
787
  return failures === 0 ? 0 : 1;
489
788
  }
490
789
 
790
+ /**
791
+ * Durable-persist vault's `PARACHUTE_HUB_ORIGIN` on a vault `start`. Two cases,
792
+ * in order:
793
+ *
794
+ * 1. The resolved spawn origin (`r.hubOrigin`) is a real public origin — write
795
+ * it. This is the long-standing happy path: an exposure is live, the
796
+ * launchd / systemd daemon (which boots vault out-of-band and never sees
797
+ * this spawn env) needs it in `.env` to validate hub-minted JWTs' `iss`.
798
+ * `persistVaultHubOrigin` skips loopback / unchanged values itself.
799
+ *
800
+ * 2. Self-heal: even when `r.hubOrigin` resolved to loopback or undefined
801
+ * (e.g. the hub.port file outran the expose-state read, or this is a bare
802
+ * `restart vault` on a deploy whose `.env` was never written), consult
803
+ * `expose-state.json` directly. If it advertises a public origin and
804
+ * vault's persisted value is unset / loopback, write the public origin.
805
+ * This is what lets an EXISTING broken Cloudflare deploy self-correct on
806
+ * the next `parachute restart vault`, not only fresh exposes.
807
+ *
808
+ * Case 1 covers the override / freshly-resolved path; case 2 catches the gap
809
+ * the Cloudflare 401 P0 fell through. See `vault-hub-origin-env.ts`.
810
+ */
811
+ function persistVaultHubOriginForStart(r: Resolved): void {
812
+ if (r.hubOrigin) persistVaultHubOrigin(r.configDir, r.hubOrigin, r.log);
813
+ selfHealVaultHubOrigin(r.configDir, r.log, join(r.configDir, "expose-state.json"));
814
+ }
815
+
491
816
  export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
492
817
  const r = resolve(opts);
493
818
  if (svc === HUB_SVC) return stopHubSvc(r);
@@ -567,6 +892,12 @@ async function startHubSvc(r: Resolved): Promise<number> {
567
892
  } else {
568
893
  r.log(`hub already running (pid ${result.pid}) on port ${result.port}.`);
569
894
  }
895
+ // Self-heal a stale operator-token issuer (hub#481). Runs whether the hub
896
+ // was freshly started OR already running — a token stamped at loopback
897
+ // before exposure must heal even when the hub is already up. The loopback /
898
+ // provenance guards live inside `selfHealOperatorTokenIssuer`, so the only
899
+ // gate here is "is there a real issuer to heal toward?".
900
+ await selfHealOperatorTokenOnStart(r);
570
901
  return 0;
571
902
  } catch (err) {
572
903
  r.log(`✗ hub failed to start: ${err instanceof Error ? err.message : String(err)}`);
@@ -574,6 +905,36 @@ async function startHubSvc(r: Resolved): Promise<number> {
574
905
  }
575
906
  }
576
907
 
908
+ /**
909
+ * Re-issue the operator token under the hub's current origin when its `iss`
910
+ * went stale after an init-at-loopback → expose transition (hub#481). Mirrors
911
+ * `persistVaultHubOriginForStart`'s quiet style: emit a single line only when
912
+ * a rotation actually happens; stay silent for fresh / absent / skipped.
913
+ *
914
+ * The ENTIRE self-heal is wrapped here so it can NEVER block or fail
915
+ * `start hub` — a db-open error, a corrupt token, anything — degrades to a
916
+ * brief warning and `start hub` still returns 0.
917
+ */
918
+ async function selfHealOperatorTokenOnStart(r: Resolved): Promise<void> {
919
+ if (!r.hubOrigin) return;
920
+ try {
921
+ const status = await r.selfHealOperatorTokenFn({
922
+ issuer: r.hubOrigin,
923
+ configDir: r.configDir,
924
+ log: r.log,
925
+ });
926
+ if (status.kind === "rotated") {
927
+ r.log(` refreshed operator.token issuer → ${r.hubOrigin} (was stale after exposure)`);
928
+ }
929
+ } catch (err) {
930
+ r.log(
931
+ ` note: operator.token issuer self-heal skipped (${
932
+ err instanceof Error ? err.message : String(err)
933
+ })`,
934
+ );
935
+ }
936
+ }
937
+
577
938
  /**
578
939
  * Stop the internal hub. `stopHub` returns false when nothing was running
579
940
  * (no pidfile, or stale pidfile cleared) — that's a clean no-op for the
@@ -659,11 +1020,19 @@ export async function logs(svc: string, opts: LogsOpts = {}): Promise<number> {
659
1020
  spawn(cmd) {
660
1021
  // Inherit env so `tail` sees PATH, etc. Bun.spawn defaults to empty
661
1022
  // env — see api-modules-ops.ts:defaultRun.
662
- const proc = Bun.spawn([...cmd], {
663
- stdio: ["ignore", "inherit", "inherit"],
664
- env: process.env,
665
- });
666
- return proc.pid;
1023
+ try {
1024
+ const proc = Bun.spawn([...cmd], {
1025
+ stdio: ["ignore", "inherit", "inherit"],
1026
+ env: process.env,
1027
+ });
1028
+ return proc.pid;
1029
+ } catch (err) {
1030
+ // A missing `tail` (minimal container without coreutils) surfaces
1031
+ // the friendly install UX instead of a raw spawn throw. The CLI
1032
+ // top-level catch in cli.ts renders the MissingDependencyError.
1033
+ rethrowIfMissing(err, "tail");
1034
+ throw err;
1035
+ }
667
1036
  },
668
1037
  };
669
1038
  spawner.spawn(["tail", "-n", String(lines), "-f", path], path);
@@ -146,6 +146,14 @@ interface StatusRow {
146
146
  * stale-after-rebuild row without comparing columns by eye.
147
147
  */
148
148
  staleNote?: string;
149
+ /**
150
+ * Persisted last-start failure (`lastStartError`, written by the lifecycle
151
+ * start preflight when a startCmd binary is missing). Surfaced on a
152
+ * continuation line so a *later* `parachute status` explains why the row
153
+ * isn't active — "failed to start: <binary> not installed" — rather than
154
+ * just showing it inactive. Cleared on the next successful start.
155
+ */
156
+ startErrorNote?: string;
149
157
  }
150
158
 
151
159
  /**
@@ -264,6 +272,17 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
264
272
  ? `STALE: services.json cached ${entry.version}; live package.json ${source.livePackageVersion}`
265
273
  : undefined;
266
274
 
275
+ // Persisted last-start failure (lifecycle preflight wrote a missing-
276
+ // dependency wire). Surface a one-line summary; the full install recipe
277
+ // lives in services.json + the admin SPA card. Keeps `parachute status`
278
+ // scannable while still telling the operator "this is why it's down."
279
+ const startErrorNote =
280
+ entry.lastStartError !== undefined
281
+ ? entry.lastStartError.binary !== undefined
282
+ ? `failed to start: ${entry.lastStartError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`
283
+ : `failed to start: ${entry.lastStartError.error_description.split("\n")[0]}`
284
+ : undefined;
285
+
267
286
  // Only skip probe when we know the process is dead (PID file was
268
287
  // present but kill(pid, 0) failed). "unknown" status (no PID file)
269
288
  // still probes — externally-managed services should report health.
@@ -287,6 +306,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
287
306
  skipped: true,
288
307
  driftWarning,
289
308
  staleNote,
309
+ startErrorNote,
290
310
  };
291
311
  }
292
312
 
@@ -324,6 +344,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
324
344
  skipped: false,
325
345
  driftWarning,
326
346
  staleNote,
347
+ startErrorNote,
327
348
  };
328
349
  }),
329
350
  );
@@ -378,6 +399,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
378
399
  }
379
400
  if (row.driftWarning) print(` ! ${row.driftWarning}`);
380
401
  if (row.staleNote) print(` ! ${row.staleNote}`);
402
+ if (row.startErrorNote) print(` ! ${row.startErrorNote}`);
381
403
  }
382
404
 
383
405
  /**
@@ -72,25 +72,69 @@ export interface UpgradeRunner {
72
72
  ): Promise<{ code: number; stdout: string }>;
73
73
  }
74
74
 
75
+ /**
76
+ * Exit code we synthesize when a binary can't be spawned at all. 127 is the
77
+ * POSIX shell convention for "command not found" — it lets every git call
78
+ * degrade to a normal non-zero result instead of crashing the whole command.
79
+ */
80
+ const SPAWN_NOT_FOUND_CODE = 127;
81
+
82
+ /**
83
+ * True when an error thrown by `Bun.spawn` means "the executable doesn't
84
+ * exist on this host" (ENOENT). On a minimal server with no `git` installed —
85
+ * a legitimate, common shape for a published-npm install on the canonical
86
+ * install path — `Bun.spawn(["git", ...])` throws *synchronously* with this
87
+ * shape. We catch it so `parachute upgrade` degrades to the npm path rather
88
+ * than dying with an uncaught `Executable not found in $PATH: "git"`.
89
+ */
90
+ function isSpawnNotFound(err: unknown): boolean {
91
+ if (typeof err !== "object" || err === null) return false;
92
+ const code = (err as { code?: unknown }).code;
93
+ const message = (err as { message?: unknown }).message;
94
+ return (
95
+ code === "ENOENT" ||
96
+ (typeof message === "string" && message.includes("Executable not found in $PATH"))
97
+ );
98
+ }
99
+
75
100
  export const defaultRunner: UpgradeRunner = {
76
101
  async run(cmd, opts) {
77
102
  // Inherit env so `bun add -g` etc. see TMPDIR, BUN_INSTALL, PATH, HOME.
78
103
  // Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
79
- const proc = Bun.spawn([...cmd], {
80
- cwd: opts?.cwd,
81
- stdio: ["inherit", "inherit", "inherit"],
82
- env: process.env,
83
- });
104
+ let proc: Bun.Subprocess;
105
+ try {
106
+ proc = Bun.spawn([...cmd], {
107
+ cwd: opts?.cwd,
108
+ stdio: ["inherit", "inherit", "inherit"],
109
+ env: process.env,
110
+ });
111
+ } catch (err) {
112
+ // Binary not on this host (e.g. no `git` on a minimal server). Degrade
113
+ // to a non-zero exit rather than letting the throw crash the command.
114
+ if (isSpawnNotFound(err)) return SPAWN_NOT_FOUND_CODE;
115
+ throw err;
116
+ }
84
117
  return await proc.exited;
85
118
  },
86
119
  async capture(cmd, opts) {
87
120
  // Inherit env — same rationale as `run` above.
88
- const proc = Bun.spawn([...cmd], {
89
- cwd: opts?.cwd,
90
- stdout: "pipe",
91
- stderr: "pipe",
92
- env: process.env,
93
- });
121
+ let proc: Bun.Subprocess<"ignore", "pipe", "pipe">;
122
+ try {
123
+ proc = Bun.spawn([...cmd], {
124
+ cwd: opts?.cwd,
125
+ stdout: "pipe",
126
+ stderr: "pipe",
127
+ env: process.env,
128
+ });
129
+ } catch (err) {
130
+ // See `run` above: ENOENT (binary-not-found) becomes a captured
131
+ // non-zero result so every git call degrades to "command failed".
132
+ if (isSpawnNotFound(err)) {
133
+ const bin = cmd[0] ?? "command";
134
+ return { code: SPAWN_NOT_FOUND_CODE, stdout: `${bin}: not found on this host\n` };
135
+ }
136
+ throw err;
137
+ }
94
138
  const [stdout, stderr] = await Promise.all([
95
139
  new Response(proc.stdout).text(),
96
140
  new Response(proc.stderr).text(),