@openparachute/hub 0.5.14-rc.2 → 0.5.14-rc.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/README.md +109 -15
  2. package/package.json +7 -3
  3. package/src/__tests__/account-home-ui.test.ts +251 -15
  4. package/src/__tests__/account-vault-token.test.ts +355 -0
  5. package/src/__tests__/admin-vaults.test.ts +70 -4
  6. package/src/__tests__/api-mint-token.test.ts +693 -5
  7. package/src/__tests__/api-modules-config.test.ts +16 -10
  8. package/src/__tests__/api-modules-ops.test.ts +45 -0
  9. package/src/__tests__/api-modules.test.ts +92 -75
  10. package/src/__tests__/api-ready.test.ts +135 -0
  11. package/src/__tests__/api-revoke-token.test.ts +384 -0
  12. package/src/__tests__/api-users.test.ts +7 -2
  13. package/src/__tests__/auth.test.ts +157 -30
  14. package/src/__tests__/cli.test.ts +44 -5
  15. package/src/__tests__/cloudflare-detect.test.ts +60 -5
  16. package/src/__tests__/expose-2fa-warning.test.ts +31 -17
  17. package/src/__tests__/expose-auth-preflight.test.ts +71 -72
  18. package/src/__tests__/expose-cloudflare.test.ts +582 -11
  19. package/src/__tests__/expose-interactive.test.ts +10 -4
  20. package/src/__tests__/expose-public-auto.test.ts +5 -1
  21. package/src/__tests__/expose.test.ts +52 -2
  22. package/src/__tests__/hub-server.test.ts +396 -10
  23. package/src/__tests__/hub.test.ts +85 -6
  24. package/src/__tests__/init.test.ts +928 -0
  25. package/src/__tests__/lifecycle.test.ts +464 -2
  26. package/src/__tests__/migrate.test.ts +433 -51
  27. package/src/__tests__/oauth-handlers.test.ts +1252 -83
  28. package/src/__tests__/oauth-ui.test.ts +12 -1
  29. package/src/__tests__/operator-token-issuer-self-heal.test.ts +412 -0
  30. package/src/__tests__/proxy-error-ui.test.ts +212 -0
  31. package/src/__tests__/proxy-state.test.ts +192 -0
  32. package/src/__tests__/resource-binding.test.ts +97 -0
  33. package/src/__tests__/scope-explanations.test.ts +77 -12
  34. package/src/__tests__/services-manifest.test.ts +122 -4
  35. package/src/__tests__/setup-wizard.test.ts +633 -53
  36. package/src/__tests__/status.test.ts +36 -0
  37. package/src/__tests__/two-factor-flow.test.ts +602 -0
  38. package/src/__tests__/two-factor.test.ts +183 -0
  39. package/src/__tests__/upgrade.test.ts +78 -1
  40. package/src/__tests__/users.test.ts +68 -0
  41. package/src/__tests__/vault-auth-status.test.ts +312 -11
  42. package/src/__tests__/vault-hub-origin-env.test.ts +263 -0
  43. package/src/__tests__/wizard.test.ts +372 -0
  44. package/src/account-home-ui.ts +488 -38
  45. package/src/account-vault-token.ts +282 -0
  46. package/src/admin-handlers.ts +159 -4
  47. package/src/admin-login-ui.ts +49 -5
  48. package/src/admin-vaults.ts +48 -15
  49. package/src/api-account.ts +14 -0
  50. package/src/api-mint-token.ts +132 -24
  51. package/src/api-modules-ops.ts +49 -11
  52. package/src/api-modules.ts +29 -12
  53. package/src/api-ready.ts +102 -0
  54. package/src/api-revoke-token.ts +107 -21
  55. package/src/api-users.ts +29 -3
  56. package/src/cli.ts +112 -25
  57. package/src/clients.ts +18 -6
  58. package/src/cloudflare/config.ts +10 -4
  59. package/src/cloudflare/detect.ts +82 -20
  60. package/src/commands/auth.ts +165 -24
  61. package/src/commands/expose-2fa-warning.ts +34 -32
  62. package/src/commands/expose-auth-preflight.ts +89 -78
  63. package/src/commands/expose-cloudflare.ts +471 -16
  64. package/src/commands/expose-interactive.ts +10 -11
  65. package/src/commands/expose-public-auto.ts +6 -4
  66. package/src/commands/expose.ts +8 -0
  67. package/src/commands/init.ts +594 -0
  68. package/src/commands/install.ts +33 -2
  69. package/src/commands/lifecycle.ts +386 -17
  70. package/src/commands/migrate.ts +293 -41
  71. package/src/commands/status.ts +22 -0
  72. package/src/commands/upgrade.ts +55 -11
  73. package/src/commands/wizard.ts +847 -0
  74. package/src/env-file.ts +10 -0
  75. package/src/help.ts +157 -15
  76. package/src/hub-db.ts +39 -1
  77. package/src/hub-server.ts +119 -13
  78. package/src/hub-settings.ts +11 -0
  79. package/src/hub.ts +82 -14
  80. package/src/oauth-handlers.ts +298 -21
  81. package/src/oauth-ui.ts +10 -0
  82. package/src/operator-token.ts +151 -0
  83. package/src/pending-login.ts +116 -0
  84. package/src/proxy-error-ui.ts +506 -0
  85. package/src/proxy-state.ts +131 -0
  86. package/src/rate-limit.ts +51 -0
  87. package/src/resource-binding.ts +134 -0
  88. package/src/scope-attenuation.ts +85 -0
  89. package/src/scope-explanations.ts +131 -14
  90. package/src/services-manifest.ts +112 -0
  91. package/src/setup-wizard.ts +738 -125
  92. package/src/tailscale/run.ts +28 -11
  93. package/src/totp.ts +201 -0
  94. package/src/two-factor-handlers.ts +287 -0
  95. package/src/two-factor-store.ts +181 -0
  96. package/src/two-factor-ui.ts +462 -0
  97. package/src/users.ts +58 -0
  98. package/src/vault/auth-status.ts +200 -25
  99. package/src/vault-hub-origin-env.ts +163 -0
  100. package/web/ui/dist/assets/index-BiBlvEaj.css +1 -0
  101. package/web/ui/dist/assets/index-CIN3mnmf.js +61 -0
  102. package/web/ui/dist/index.html +2 -2
  103. package/src/__tests__/vault-tokens-create-interactive.test.ts +0 -183
  104. package/src/commands/vault-tokens-create-interactive.ts +0 -143
  105. package/web/ui/dist/assets/index-7DtAXz7y.css +0 -1
  106. package/web/ui/dist/assets/index-tRmPbbC7.js +0 -61
@@ -190,6 +190,31 @@ export interface InstallOpts {
190
190
  * (#45) to pre-collect the answer up front. Ignored for non-vault installs.
191
191
  */
192
192
  vaultName?: string;
193
+ /**
194
+ * "Install the module, but don't create a first vault instance" (hub#168 — the
195
+ * wizard-parity work for Aaron's 2026-05-28 directive: "always install the
196
+ * vault module, but creating a vault should be optional").
197
+ *
198
+ * Default: false (today's behavior — install runs the service's `init` and
199
+ * starts the daemon, which for vault auto-creates a `default` row).
200
+ *
201
+ * When true:
202
+ * - The `bun add -g <pkg>` step still runs (puts the binary on PATH).
203
+ * - `spec.init` is SKIPPED. For vault this means no `parachute-vault init`
204
+ * → no default-vault row is created from this code path.
205
+ * - `lifecycle.start` is SKIPPED. The supervisor/wizard owns spawning;
206
+ * starting vault here would trigger its server-side auto-init (which
207
+ * creates a `default` vault on first boot when `listVaults().length === 0`).
208
+ * - services.json is still seeded (`spec.seedEntry`) + installDir stamped
209
+ * so subsequent supervisor spawns find the module + module.json.
210
+ *
211
+ * Intended for `parachute init` — install the module so the wizard can offer
212
+ * Create/Import/Skip without a follow-up bun-add round-trip, but defer
213
+ * vault-instance creation to whichever path the wizard's vault step takes.
214
+ * On the existing CLI surfaces (`parachute install vault`, `parachute setup`),
215
+ * leave it false so today's behavior is unchanged.
216
+ */
217
+ noCreate?: boolean;
193
218
  /**
194
219
  * `parachute install scribe` only: pre-pick the transcription provider so
195
220
  * the prompt doesn't fire. Validated against scribe's known providers — an
@@ -708,7 +733,7 @@ export async function install(input: string, opts: InstallOpts = {}): Promise<nu
708
733
  ? spec.manifestName
709
734
  : manifest.name;
710
735
 
711
- if (spec.init) {
736
+ if (spec.init && !opts.noCreate) {
712
737
  // Forward --vault-name from the InstallOpts when set so `parachute setup`
713
738
  // (and any future programmatic caller) can pre-answer the name prompt.
714
739
  const initCmd =
@@ -721,6 +746,8 @@ export async function install(input: string, opts: InstallOpts = {}): Promise<nu
721
746
  log(`${initCmd.join(" ")} exited ${initCode}`);
722
747
  return initCode;
723
748
  }
749
+ } else if (spec.init && opts.noCreate) {
750
+ log(`(skipping ${spec.init.join(" ")} — --no-create: module installed, no instance created)`);
724
751
  }
725
752
 
726
753
  // Hub-as-port-authority (#53): pick the service's port now and reflect it
@@ -849,7 +876,11 @@ export async function install(input: string, opts: InstallOpts = {}): Promise<nu
849
876
  // wondering why nothing happened. Always end with the daemon running unless
850
877
  // the caller opted out (CI / piped scripts). Idempotent: if the service is
851
878
  // already up, lifecycle.start no-ops via the existing PID-file check.
852
- if (!opts.noStart) {
879
+ //
880
+ // `noCreate` (hub#168) also suppresses auto-start: starting vault would
881
+ // trigger its server-side first-boot auto-init (creating a default vault),
882
+ // which is exactly what --no-create is supposed to defer.
883
+ if (!opts.noStart && !opts.noCreate) {
853
884
  const startService =
854
885
  opts.startService ??
855
886
  ((short: string) => lifecycleStart(short, { manifestPath, configDir, log }));
@@ -1,5 +1,11 @@
1
- import { existsSync, openSync } from "node:fs";
1
+ import { existsSync, openSync, readFileSync } from "node:fs";
2
+ import { Socket } from "node:net";
2
3
  import { join } from "node:path";
4
+ import {
5
+ MissingDependencyError,
6
+ ensureExecutable,
7
+ rethrowIfMissing,
8
+ } from "@openparachute/depcheck";
3
9
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
4
10
  import { readEnvFileValues } from "../env-file.ts";
5
11
  import { readExposeState } from "../expose-state.ts";
@@ -12,8 +18,10 @@ import {
12
18
  readHubPort,
13
19
  stopHub,
14
20
  } from "../hub-control.ts";
21
+ import { hubDbPath, openHubDb } from "../hub-db.ts";
15
22
  import { HUB_ORIGIN_ENV, deriveHubOrigin } from "../hub-origin.ts";
16
23
  import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
24
+ import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
17
25
  import {
18
26
  type AliveFn,
19
27
  clearPid,
@@ -32,7 +40,13 @@ import {
32
40
  knownServices,
33
41
  shortNameForManifest,
34
42
  } from "../service-spec.ts";
35
- import { type ServiceEntry, readManifest } from "../services-manifest.ts";
43
+ import {
44
+ type ServiceEntry,
45
+ clearStartError,
46
+ readManifest,
47
+ recordStartError,
48
+ } from "../services-manifest.ts";
49
+ import { persistVaultHubOrigin, selfHealVaultHubOrigin } from "../vault-hub-origin-env.ts";
36
50
 
37
51
  /**
38
52
  * Tiny seam over `Bun.spawn` for lifecycle tests. The real spawner opens the
@@ -83,6 +97,44 @@ export const defaultSpawner: Spawner = {
83
97
  export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
84
98
  export type SleepFn = (ms: number) => Promise<void>;
85
99
 
100
+ /**
101
+ * "Is something listening on this TCP port on loopback?" seam. Pairs with the
102
+ * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
103
+ * shape (hub#487): a service that lives long enough to clear the liveness
104
+ * check but never binds its port because the port is already held (EADDRINUSE
105
+ * from an orphan). The recorded pid stays alive (vault's process supervisor
106
+ * retries / lingers) so `alive(pid)` says "running" while `parachute status`
107
+ * shows it inactive because nothing answers on the port.
108
+ *
109
+ * Tests inject a deterministic stub; production uses `defaultPortListening`.
110
+ */
111
+ export type PortListeningFn = (port: number) => Promise<boolean>;
112
+
113
+ /**
114
+ * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
115
+ * accepted. A successful connect means *something* is listening; we close
116
+ * immediately. Connection refused / timeout means nothing is bound yet.
117
+ * `node:net` rather than `Bun.connect` because the latter has no clean
118
+ * "connection refused → false" without a custom socket handler, and the net
119
+ * Socket's `error`/`connect` events map directly onto the boolean we want.
120
+ */
121
+ export const defaultPortListening: PortListeningFn = (port) =>
122
+ new Promise((resolve) => {
123
+ const socket = new Socket();
124
+ let settled = false;
125
+ const done = (listening: boolean) => {
126
+ if (settled) return;
127
+ settled = true;
128
+ socket.destroy();
129
+ resolve(listening);
130
+ };
131
+ socket.setTimeout(1000);
132
+ socket.once("connect", () => done(true));
133
+ socket.once("timeout", () => done(false));
134
+ socket.once("error", () => done(false));
135
+ socket.connect(port, "127.0.0.1");
136
+ });
137
+
86
138
  /**
87
139
  * Group-aware liveness: returns true if the process group (pgid == pid)
88
140
  * still has any member. Pairs with `defaultSpawner`'s `detached: true` —
@@ -129,6 +181,35 @@ export const defaultKill: KillFn = (pid, signal) => {
129
181
 
130
182
  export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
131
183
 
184
+ /**
185
+ * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
186
+ * real boot error when a start fails — operators shouldn't have to manually
187
+ * `tail` the log to learn *why* the daemon died. Returns [] on any read
188
+ * error (missing file, permissions) so the caller falls back to the generic
189
+ * "tail the log" hint without throwing.
190
+ */
191
+ function readLogTail(logFile: string, n: number): string[] {
192
+ try {
193
+ const content = readFileSync(logFile, "utf8");
194
+ const trimmed = content.replace(/\n$/, "");
195
+ if (trimmed === "") return [];
196
+ return trimmed.split("\n").slice(-n);
197
+ } catch {
198
+ return [];
199
+ }
200
+ }
201
+
202
+ /**
203
+ * Heuristic EADDRINUSE detector over a logfile tail. cloudflared, Bun, and
204
+ * Node all surface port collisions with recognizable phrases; we match the
205
+ * common ones rather than parse a structured error (there isn't one across
206
+ * runtimes). False positives are harmless — the worst case is we *also* print
207
+ * the port-in-use remedy on an unrelated failure, which is still actionable.
208
+ */
209
+ function detectAddrInUse(logTail: readonly string[]): boolean {
210
+ return logTail.some((line) => /EADDRINUSE|address already in use|port .* in use/i.test(line));
211
+ }
212
+
132
213
  export interface LifecycleOpts {
133
214
  spawner?: Spawner;
134
215
  kill?: KillFn;
@@ -160,6 +241,30 @@ export interface LifecycleOpts {
160
241
  * settle.
161
242
  */
162
243
  startSettleMs?: number;
244
+ /**
245
+ * Probe whether the service's port is listening, post-spawn. Pairs with the
246
+ * settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
247
+ * process survives the liveness window (vault lingers / retries) but never
248
+ * binds because the port is already held, so `start` would otherwise report
249
+ * "✓ started" while `status` shows it inactive. Tests inject a stub;
250
+ * production uses `defaultPortListening` (a loopback TCP connect probe).
251
+ */
252
+ portListening?: PortListeningFn;
253
+ /**
254
+ * How long `start` polls for the service to bind its port after the
255
+ * liveness settle passes. Default 4000ms in production — long enough to
256
+ * cover vault/scribe cold-boot (DB open, route registration) without making
257
+ * a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
258
+ * first time the port answers we declare success. If the window elapses
259
+ * with the process still alive but the port silent, we print a non-fatal
260
+ * warning (the daemon may still be coming up) rather than failing — only a
261
+ * *dead* process is a hard failure. Defaulting policy mirrors
262
+ * `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
263
+ * production path (no spawner override) is active.
264
+ */
265
+ startReadyMs?: number;
266
+ /** Poll interval while waiting for the port to come up. Default 200ms. */
267
+ startReadyPollMs?: number;
163
268
  /**
164
269
  * Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
165
270
  * unset, `start` derives it from `expose-state.json` (when exposed) or
@@ -175,9 +280,36 @@ export interface LifecycleOpts {
175
280
  * `ensureHubRunning` and `lifecycle.stop("hub")` dispatches to
176
281
  * `stopHub`. Tests inject stubs to avoid spawning real bun processes.
177
282
  */
283
+ /**
284
+ * PATH-resolution seam for the start preflight (`@openparachute/depcheck`
285
+ * `ensureExecutable`). Production uses the real `Bun.which`; a missing
286
+ * startCmd binary then surfaces the friendly missing-dependency UX +
287
+ * persists it to services.json.
288
+ *
289
+ * Defaulting policy mirrors `startSettleMs`: when a stub `spawner` is
290
+ * injected (the test path) `which` defaults to a permissive resolver
291
+ * (`() => "<stub>"`) so existing stub-spawner tests don't trip the preflight
292
+ * against binaries that aren't on the test host's PATH (`parachute-vault`,
293
+ * `notes-serve`). Production (no spawner override) gets the real `Bun.which`.
294
+ * Tests that want to exercise the missing-binary branch inject `which`
295
+ * explicitly (e.g. `which: () => null`).
296
+ */
297
+ which?: (cmd: string) => string | null;
178
298
  hub?: {
179
299
  ensureRunning?: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
180
300
  stop?: (opts: StopHubOpts) => Promise<boolean>;
301
+ /**
302
+ * Self-heal the operator token's stale `iss` after `start hub` (hub#481).
303
+ * Production opens hub.db at `<configDir>/hub.db` and delegates to
304
+ * `selfHealOperatorTokenIssuer`. Tests inject a stub to assert the call
305
+ * happens — or to make it throw and prove a self-heal failure never fails
306
+ * `start hub`.
307
+ */
308
+ selfHealOperatorToken?: (args: {
309
+ issuer: string;
310
+ configDir: string;
311
+ log: (line: string) => void;
312
+ }) => Promise<OperatorIssuerHealStatus>;
181
313
  };
182
314
  }
183
315
 
@@ -193,9 +325,42 @@ interface Resolved {
193
325
  killWaitMs: number;
194
326
  pollIntervalMs: number;
195
327
  startSettleMs: number;
328
+ portListening: PortListeningFn;
329
+ startReadyMs: number;
330
+ startReadyPollMs: number;
331
+ which: (cmd: string) => string | null;
196
332
  hubOrigin: string | undefined;
197
333
  ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
198
334
  stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
335
+ selfHealOperatorTokenFn: (args: {
336
+ issuer: string;
337
+ configDir: string;
338
+ log: (line: string) => void;
339
+ }) => Promise<OperatorIssuerHealStatus>;
340
+ }
341
+
342
+ /**
343
+ * Production self-heal: open hub.db at `<configDir>/hub.db`, run
344
+ * `selfHealOperatorTokenIssuer`, and close the db. Derives the db path the
345
+ * same way the rest of the repo does (`hubDbPath(configDir)`); `openHubDb`
346
+ * runs migrations + WAL on open, matching `commands/auth.ts`. Tests override
347
+ * this whole seam, so the db-open only happens on the production path.
348
+ */
349
+ async function defaultSelfHealOperatorToken(args: {
350
+ issuer: string;
351
+ configDir: string;
352
+ log: (line: string) => void;
353
+ }): Promise<OperatorIssuerHealStatus> {
354
+ const db = openHubDb(hubDbPath(args.configDir));
355
+ try {
356
+ return await selfHealOperatorTokenIssuer(db, {
357
+ issuer: args.issuer,
358
+ configDir: args.configDir,
359
+ log: args.log,
360
+ });
361
+ } finally {
362
+ db.close();
363
+ }
199
364
  }
200
365
 
201
366
  function resolve(opts: LifecycleOpts): Resolved {
@@ -219,9 +384,26 @@ function resolve(opts: LifecycleOpts): Resolved {
219
384
  // override `alive`, which re-enables the default 250ms.
220
385
  startSettleMs:
221
386
  opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
387
+ portListening: opts.portListening ?? defaultPortListening,
388
+ // Same defaulting policy as startSettleMs: production (no spawner
389
+ // override) gets the real 4s readiness window; tests that inject a stub
390
+ // spawner get 0 (skipped) unless they explicitly opt in via
391
+ // `portListening` or `startReadyMs`, so existing stub-spawner tests don't
392
+ // start probing a fake port.
393
+ startReadyMs:
394
+ opts.startReadyMs ??
395
+ (opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
396
+ startReadyPollMs: opts.startReadyPollMs ?? 200,
397
+ // Same defaulting policy as startSettleMs/startReadyMs: production (no
398
+ // spawner override) preflights with the real Bun.which; stub-spawner tests
399
+ // get a permissive resolver so the preflight doesn't trip against binaries
400
+ // that aren't on the test host's PATH. Explicit `which` always wins.
401
+ which:
402
+ opts.which ?? (opts.spawner === undefined ? Bun.which : () => "/stub/bin/preflight-skipped"),
222
403
  hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
223
404
  ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
224
405
  stopHubFn: opts.hub?.stop ?? stopHub,
406
+ selfHealOperatorTokenFn: opts.hub?.selfHealOperatorToken ?? defaultSelfHealOperatorToken,
225
407
  };
226
408
  }
227
409
 
@@ -452,42 +634,185 @@ export async function start(svc: string | undefined, opts: LifecycleOpts = {}):
452
634
  if (entry.installDir) spawnerOpts.cwd = entry.installDir;
453
635
  const passOpts =
454
636
  spawnerOpts.env !== undefined || spawnerOpts.cwd !== undefined ? spawnerOpts : undefined;
637
+
638
+ // Pre-flight the startCmd binary (`@openparachute/depcheck`) so a missing
639
+ // executable surfaces the friendly install UX inline AND is persisted onto
640
+ // the services.json row, so a *later* `parachute status` (a separate
641
+ // invocation that only reads the manifest) + the SPA modules pane show
642
+ // "vault: failed to start — parachute-vault not installed" with install
643
+ // info, rather than a bare "failed"/orphan-timeout. The binary is `cmd[0]`
644
+ // (e.g. `parachute-vault` for an npm install, `bun` for a bun-linked one).
645
+ const startBinary = cmd[0];
646
+ if (startBinary) {
647
+ try {
648
+ ensureExecutable(startBinary, { which: r.which });
649
+ } catch (err) {
650
+ if (err instanceof MissingDependencyError) {
651
+ failures++;
652
+ r.log(`✗ ${short} failed to start:`);
653
+ for (const line of err.message.split("\n")) r.log(` ${line}`);
654
+ recordStartError(entry.name, err.toWire(), r.manifestPath);
655
+ continue;
656
+ }
657
+ throw err;
658
+ }
659
+ }
660
+
455
661
  let pid: number;
456
662
  try {
457
663
  pid = r.spawner.spawn(cmd, logFile, passOpts);
458
664
  } catch (err) {
665
+ // Belt-and-suspenders: a missing binary that slipped past the pre-flight
666
+ // (race) still becomes a MissingDependencyError via rethrowIfMissing.
667
+ if (startBinary) {
668
+ try {
669
+ rethrowIfMissing(err, startBinary);
670
+ } catch (missing) {
671
+ if (missing instanceof MissingDependencyError) {
672
+ failures++;
673
+ r.log(`✗ ${short} failed to start:`);
674
+ for (const line of missing.message.split("\n")) r.log(` ${line}`);
675
+ recordStartError(entry.name, missing.toWire(), r.manifestPath);
676
+ continue;
677
+ }
678
+ }
679
+ }
459
680
  failures++;
460
681
  const msg = err instanceof Error ? err.message : String(err);
461
682
  r.log(`✗ ${short} failed to start: ${msg}`);
462
683
  continue;
463
684
  }
685
+ // A successful spawn clears any stale start-error recorded from a prior
686
+ // missing-dependency failure so `parachute status` doesn't keep showing it.
687
+ clearStartError(entry.name, r.manifestPath);
464
688
  writePid(short, pid, r.configDir);
465
689
 
466
- // Settle-poll for spawn-then-immediately-die (hub#194). A spawn returning
467
- // a pid only proves the kernel forked the process; the child may exit
468
- // microseconds later if its main code path throws before listening
469
- // (e.g. notes-serve's Bun.resolveSync failing for bun-linked installs).
470
- // Without this poll, we'd report success and the operator would chase
471
- // a phantom 502.
690
+ // Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
691
+ // proves the kernel forked the process — it says nothing about whether the
692
+ // service survived its boot or bound its port. Two silent-start shapes:
693
+ //
694
+ // (1) spawn-then-immediately-die (hub#194): the child throws before
695
+ // listening (notes-serve's Bun.resolveSync failing for bun-linked
696
+ // installs) and exits microseconds later. Caught by the settle below.
697
+ //
698
+ // (2) alive-but-never-bound (hub#487): the port is already held by an
699
+ // orphan, the child hits EADDRINUSE, but its process *lingers* (or a
700
+ // supervisor retries) long enough to clear the liveness check. `start`
701
+ // would report "✓ started" while `parachute status` shows it inactive
702
+ // because nothing answers on the port. Aaron hit exactly this with an
703
+ // orphan holding vault's 1940 on a fresh EC2 box. Caught by the
704
+ // port-readiness poll below.
705
+ //
706
+ // On any failure we surface the tail of the logfile so the operator sees
707
+ // the real boot error inline, and we specifically call out EADDRINUSE with
708
+ // the `lsof -ti:<port>` remedy.
709
+ const reportStartFailure = (reason: string): void => {
710
+ clearPid(short, r.configDir);
711
+ failures++;
712
+ const tail = readLogTail(logFile, 20);
713
+ if (detectAddrInUse(tail)) {
714
+ r.log(
715
+ `✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
716
+ );
717
+ } else {
718
+ r.log(`✗ ${short} failed to start: ${reason}`);
719
+ }
720
+ if (tail.length > 0) {
721
+ r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
722
+ for (const line of tail) r.log(` │ ${line}`);
723
+ } else {
724
+ r.log(` Tail the log for details: tail -50 ${logFile}`);
725
+ }
726
+ };
727
+
472
728
  if (r.startSettleMs > 0) {
473
729
  await r.sleep(r.startSettleMs);
474
730
  if (!r.alive(pid)) {
475
- clearPid(short, r.configDir);
476
- failures++;
731
+ reportStartFailure(
732
+ `spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
733
+ );
734
+ continue;
735
+ }
736
+ }
737
+
738
+ // Port-readiness poll (hub#487). The process is alive; now confirm it
739
+ // actually bound its port before claiming success. Poll up to
740
+ // `startReadyMs`, re-checking liveness each iteration so a *later* death
741
+ // (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
742
+ // that stays alive but never binds within the window gets a non-fatal
743
+ // warning rather than a hard failure — some daemons legitimately do slow
744
+ // boot work, and we'd rather not flip a healthy-but-slow start to red.
745
+ if (r.startReadyMs > 0) {
746
+ const deadline = r.now() + r.startReadyMs;
747
+ let listening = false;
748
+ let died = false;
749
+ while (r.now() < deadline) {
750
+ if (!r.alive(pid)) {
751
+ died = true;
752
+ break;
753
+ }
754
+ if (await r.portListening(entry.port)) {
755
+ listening = true;
756
+ break;
757
+ }
758
+ await r.sleep(r.startReadyPollMs);
759
+ }
760
+ if (died) {
761
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
762
+ continue;
763
+ }
764
+ if (!listening) {
765
+ // Last-chance liveness check — the loop may have exited on the
766
+ // deadline right as the process died.
767
+ if (!r.alive(pid)) {
768
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
769
+ continue;
770
+ }
477
771
  r.log(
478
- `✗ ${short} failed to start: spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
772
+ `⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
479
773
  );
480
- r.log(` Tail the log for details: tail -50 ${logFile}`);
774
+ r.log(
775
+ ` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
776
+ );
777
+ if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
778
+ if (short === "vault") persistVaultHubOriginForStart(r);
481
779
  continue;
482
780
  }
483
781
  }
484
782
 
485
783
  r.log(`✓ ${short} started (pid ${pid}); logs: ${logFile}`);
486
784
  if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
785
+ if (short === "vault") persistVaultHubOriginForStart(r);
487
786
  }
488
787
  return failures === 0 ? 0 : 1;
489
788
  }
490
789
 
790
+ /**
791
+ * Durable-persist vault's `PARACHUTE_HUB_ORIGIN` on a vault `start`. Two cases,
792
+ * in order:
793
+ *
794
+ * 1. The resolved spawn origin (`r.hubOrigin`) is a real public origin — write
795
+ * it. This is the long-standing happy path: an exposure is live, the
796
+ * launchd / systemd daemon (which boots vault out-of-band and never sees
797
+ * this spawn env) needs it in `.env` to validate hub-minted JWTs' `iss`.
798
+ * `persistVaultHubOrigin` skips loopback / unchanged values itself.
799
+ *
800
+ * 2. Self-heal: even when `r.hubOrigin` resolved to loopback or undefined
801
+ * (e.g. the hub.port file outran the expose-state read, or this is a bare
802
+ * `restart vault` on a deploy whose `.env` was never written), consult
803
+ * `expose-state.json` directly. If it advertises a public origin and
804
+ * vault's persisted value is unset / loopback, write the public origin.
805
+ * This is what lets an EXISTING broken Cloudflare deploy self-correct on
806
+ * the next `parachute restart vault`, not only fresh exposes.
807
+ *
808
+ * Case 1 covers the override / freshly-resolved path; case 2 catches the gap
809
+ * the Cloudflare 401 P0 fell through. See `vault-hub-origin-env.ts`.
810
+ */
811
+ function persistVaultHubOriginForStart(r: Resolved): void {
812
+ if (r.hubOrigin) persistVaultHubOrigin(r.configDir, r.hubOrigin, r.log);
813
+ selfHealVaultHubOrigin(r.configDir, r.log, join(r.configDir, "expose-state.json"));
814
+ }
815
+
491
816
  export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
492
817
  const r = resolve(opts);
493
818
  if (svc === HUB_SVC) return stopHubSvc(r);
@@ -567,6 +892,12 @@ async function startHubSvc(r: Resolved): Promise<number> {
567
892
  } else {
568
893
  r.log(`hub already running (pid ${result.pid}) on port ${result.port}.`);
569
894
  }
895
+ // Self-heal a stale operator-token issuer (hub#481). Runs whether the hub
896
+ // was freshly started OR already running — a token stamped at loopback
897
+ // before exposure must heal even when the hub is already up. The loopback /
898
+ // provenance guards live inside `selfHealOperatorTokenIssuer`, so the only
899
+ // gate here is "is there a real issuer to heal toward?".
900
+ await selfHealOperatorTokenOnStart(r);
570
901
  return 0;
571
902
  } catch (err) {
572
903
  r.log(`✗ hub failed to start: ${err instanceof Error ? err.message : String(err)}`);
@@ -574,6 +905,36 @@ async function startHubSvc(r: Resolved): Promise<number> {
574
905
  }
575
906
  }
576
907
 
908
+ /**
909
+ * Re-issue the operator token under the hub's current origin when its `iss`
910
+ * went stale after an init-at-loopback → expose transition (hub#481). Mirrors
911
+ * `persistVaultHubOriginForStart`'s quiet style: emit a single line only when
912
+ * a rotation actually happens; stay silent for fresh / absent / skipped.
913
+ *
914
+ * The ENTIRE self-heal is wrapped here so it can NEVER block or fail
915
+ * `start hub` — a db-open error, a corrupt token, anything — degrades to a
916
+ * brief warning and `start hub` still returns 0.
917
+ */
918
+ async function selfHealOperatorTokenOnStart(r: Resolved): Promise<void> {
919
+ if (!r.hubOrigin) return;
920
+ try {
921
+ const status = await r.selfHealOperatorTokenFn({
922
+ issuer: r.hubOrigin,
923
+ configDir: r.configDir,
924
+ log: r.log,
925
+ });
926
+ if (status.kind === "rotated") {
927
+ r.log(` refreshed operator.token issuer → ${r.hubOrigin} (was stale after exposure)`);
928
+ }
929
+ } catch (err) {
930
+ r.log(
931
+ ` note: operator.token issuer self-heal skipped (${
932
+ err instanceof Error ? err.message : String(err)
933
+ })`,
934
+ );
935
+ }
936
+ }
937
+
577
938
  /**
578
939
  * Stop the internal hub. `stopHub` returns false when nothing was running
579
940
  * (no pidfile, or stale pidfile cleared) — that's a clean no-op for the
@@ -659,11 +1020,19 @@ export async function logs(svc: string, opts: LogsOpts = {}): Promise<number> {
659
1020
  spawn(cmd) {
660
1021
  // Inherit env so `tail` sees PATH, etc. Bun.spawn defaults to empty
661
1022
  // env — see api-modules-ops.ts:defaultRun.
662
- const proc = Bun.spawn([...cmd], {
663
- stdio: ["ignore", "inherit", "inherit"],
664
- env: process.env,
665
- });
666
- return proc.pid;
1023
+ try {
1024
+ const proc = Bun.spawn([...cmd], {
1025
+ stdio: ["ignore", "inherit", "inherit"],
1026
+ env: process.env,
1027
+ });
1028
+ return proc.pid;
1029
+ } catch (err) {
1030
+ // A missing `tail` (minimal container without coreutils) surfaces
1031
+ // the friendly install UX instead of a raw spawn throw. The CLI
1032
+ // top-level catch in cli.ts renders the MissingDependencyError.
1033
+ rethrowIfMissing(err, "tail");
1034
+ throw err;
1035
+ }
667
1036
  },
668
1037
  };
669
1038
  spawner.spawn(["tail", "-n", String(lines), "-f", path], path);