@openparachute/hub 0.5.14-rc.9 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -0
- package/package.json +7 -3
- package/src/__tests__/account-home-ui.test.ts +251 -15
- package/src/__tests__/account-vault-token.test.ts +355 -0
- package/src/__tests__/admin-vaults.test.ts +70 -4
- package/src/__tests__/api-mint-token.test.ts +30 -21
- package/src/__tests__/api-modules-ops.test.ts +45 -0
- package/src/__tests__/api-users.test.ts +7 -2
- package/src/__tests__/auth.test.ts +157 -30
- package/src/__tests__/cli.test.ts +44 -5
- package/src/__tests__/expose-2fa-warning.test.ts +31 -17
- package/src/__tests__/expose-auth-preflight.test.ts +71 -72
- package/src/__tests__/expose-cloudflare.test.ts +482 -14
- package/src/__tests__/expose.test.ts +52 -2
- package/src/__tests__/hub-server.test.ts +97 -0
- package/src/__tests__/hub.test.ts +85 -6
- package/src/__tests__/init.test.ts +102 -1
- package/src/__tests__/lifecycle.test.ts +464 -2
- package/src/__tests__/oauth-handlers.test.ts +1252 -83
- package/src/__tests__/oauth-ui.test.ts +12 -1
- package/src/__tests__/operator-token-issuer-self-heal.test.ts +412 -0
- package/src/__tests__/resource-binding.test.ts +97 -0
- package/src/__tests__/scope-explanations.test.ts +41 -12
- package/src/__tests__/services-manifest.test.ts +122 -4
- package/src/__tests__/setup-wizard.test.ts +335 -15
- package/src/__tests__/status.test.ts +36 -0
- package/src/__tests__/two-factor-flow.test.ts +602 -0
- package/src/__tests__/two-factor.test.ts +183 -0
- package/src/__tests__/upgrade.test.ts +78 -1
- package/src/__tests__/users.test.ts +68 -0
- package/src/__tests__/vault-auth-status.test.ts +47 -6
- package/src/__tests__/vault-hub-origin-env.test.ts +263 -0
- package/src/account-home-ui.ts +488 -38
- package/src/account-vault-token.ts +282 -0
- package/src/admin-handlers.ts +159 -4
- package/src/admin-login-ui.ts +49 -5
- package/src/admin-vaults.ts +48 -15
- package/src/api-account.ts +14 -0
- package/src/api-modules-ops.ts +49 -11
- package/src/api-users.ts +29 -3
- package/src/cli.ts +26 -21
- package/src/clients.ts +18 -6
- package/src/cloudflare/config.ts +10 -4
- package/src/cloudflare/detect.ts +39 -44
- package/src/commands/auth.ts +165 -24
- package/src/commands/expose-2fa-warning.ts +34 -32
- package/src/commands/expose-auth-preflight.ts +89 -78
- package/src/commands/expose-cloudflare.ts +370 -12
- package/src/commands/expose.ts +8 -0
- package/src/commands/init.ts +33 -2
- package/src/commands/lifecycle.ts +386 -17
- package/src/commands/status.ts +22 -0
- package/src/commands/upgrade.ts +55 -11
- package/src/commands/wizard.ts +8 -4
- package/src/env-file.ts +10 -0
- package/src/help.ts +3 -1
- package/src/hub-db.ts +39 -1
- package/src/hub-server.ts +52 -0
- package/src/hub.ts +82 -14
- package/src/oauth-handlers.ts +298 -21
- package/src/oauth-ui.ts +10 -0
- package/src/operator-token.ts +151 -0
- package/src/pending-login.ts +116 -0
- package/src/rate-limit.ts +51 -0
- package/src/resource-binding.ts +134 -0
- package/src/scope-explanations.ts +46 -18
- package/src/services-manifest.ts +112 -0
- package/src/setup-wizard.ts +77 -7
- package/src/tailscale/run.ts +28 -11
- package/src/totp.ts +201 -0
- package/src/two-factor-handlers.ts +287 -0
- package/src/two-factor-store.ts +181 -0
- package/src/two-factor-ui.ts +462 -0
- package/src/users.ts +58 -0
- package/src/vault/auth-status.ts +71 -19
- package/src/vault-hub-origin-env.ts +163 -0
- package/web/ui/dist/assets/index-BiBlvEaj.css +1 -0
- package/web/ui/dist/assets/index-CIN3mnmf.js +61 -0
- package/web/ui/dist/index.html +2 -2
- package/src/__tests__/vault-tokens-create-interactive.test.ts +0 -183
- package/src/commands/vault-tokens-create-interactive.ts +0 -143
- package/web/ui/dist/assets/index-7DtAXz7y.css +0 -1
- package/web/ui/dist/assets/index-tRmPbbC7.js +0 -61
|
@@ -1,5 +1,11 @@
|
|
|
1
|
-
import { existsSync, openSync } from "node:fs";
|
|
1
|
+
import { existsSync, openSync, readFileSync } from "node:fs";
|
|
2
|
+
import { Socket } from "node:net";
|
|
2
3
|
import { join } from "node:path";
|
|
4
|
+
import {
|
|
5
|
+
MissingDependencyError,
|
|
6
|
+
ensureExecutable,
|
|
7
|
+
rethrowIfMissing,
|
|
8
|
+
} from "@openparachute/depcheck";
|
|
3
9
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
4
10
|
import { readEnvFileValues } from "../env-file.ts";
|
|
5
11
|
import { readExposeState } from "../expose-state.ts";
|
|
@@ -12,8 +18,10 @@ import {
|
|
|
12
18
|
readHubPort,
|
|
13
19
|
stopHub,
|
|
14
20
|
} from "../hub-control.ts";
|
|
21
|
+
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
15
22
|
import { HUB_ORIGIN_ENV, deriveHubOrigin } from "../hub-origin.ts";
|
|
16
23
|
import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
|
|
24
|
+
import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
|
|
17
25
|
import {
|
|
18
26
|
type AliveFn,
|
|
19
27
|
clearPid,
|
|
@@ -32,7 +40,13 @@ import {
|
|
|
32
40
|
knownServices,
|
|
33
41
|
shortNameForManifest,
|
|
34
42
|
} from "../service-spec.ts";
|
|
35
|
-
import {
|
|
43
|
+
import {
|
|
44
|
+
type ServiceEntry,
|
|
45
|
+
clearStartError,
|
|
46
|
+
readManifest,
|
|
47
|
+
recordStartError,
|
|
48
|
+
} from "../services-manifest.ts";
|
|
49
|
+
import { persistVaultHubOrigin, selfHealVaultHubOrigin } from "../vault-hub-origin-env.ts";
|
|
36
50
|
|
|
37
51
|
/**
|
|
38
52
|
* Tiny seam over `Bun.spawn` for lifecycle tests. The real spawner opens the
|
|
@@ -83,6 +97,44 @@ export const defaultSpawner: Spawner = {
|
|
|
83
97
|
export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
|
|
84
98
|
export type SleepFn = (ms: number) => Promise<void>;
|
|
85
99
|
|
|
100
|
+
/**
 * Seam answering "is anything accepting TCP connections on this loopback
 * port?". It complements the spawn-then-die settle (hub#194) by covering the
 * other silent-start shape (hub#487): a service whose pid stays alive past the
 * liveness check but which never binds its port because something else already
 * holds it (EADDRINUSE from an orphan). In that state `alive(pid)` reports
 * "running" while nothing answers on the port.
 *
 * Production wires in `defaultPortListening`; tests supply a deterministic
 * stub.
 */
export type PortListeningFn = (port: number) => Promise<boolean>;

/**
 * Default connect-probe. Opens a TCP socket to `127.0.0.1:<port>` and resolves
 * `true` as soon as the connection is accepted, or `false` on a socket error
 * (e.g. connection refused) or after the 1000ms socket timeout. The socket is
 * destroyed before the promise settles, and only the first event decides the
 * outcome. The promise never rejects.
 *
 * Built on `node:net` because the Socket's `connect` / `error` / `timeout`
 * events map directly onto the boolean result.
 *
 * @param port TCP port to probe on IPv4 loopback.
 * @returns Whether something accepted the connection.
 */
export const defaultPortListening: PortListeningFn = (port) => {
  return new Promise<boolean>((resolve) => {
    const probe = new Socket();
    let answered = false;

    // First event wins; later events (e.g. an error after a timeout) are ignored.
    function finish(isListening: boolean): void {
      if (!answered) {
        answered = true;
        probe.destroy();
        resolve(isListening);
      }
    }

    probe.setTimeout(1000);
    probe.once("connect", () => finish(true));
    probe.once("timeout", () => finish(false));
    probe.once("error", () => finish(false));
    probe.connect(port, "127.0.0.1");
  });
};
|
|
137
|
+
|
|
86
138
|
/**
|
|
87
139
|
* Group-aware liveness: returns true if the process group (pgid == pid)
|
|
88
140
|
* still has any member. Pairs with `defaultSpawner`'s `detached: true` —
|
|
@@ -129,6 +181,35 @@ export const defaultKill: KillFn = (pid, signal) => {
|
|
|
129
181
|
|
|
130
182
|
export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
131
183
|
|
|
184
|
+
/**
 * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
 * real boot error when a start fails, so operators don't have to `tail` the
 * log by hand to learn why the daemon died.
 *
 * A single trailing newline is ignored, so a normally-terminated log doesn't
 * yield a phantom empty last line. Both LF and CRLF line endings are handled.
 *
 * @param logFile Path of the logfile to read.
 * @param n Maximum number of trailing lines to return. Fractions are floored;
 *   anything below 1 (including `NaN`) yields `[]`.
 * @returns Up to `n` trailing lines, oldest first. Returns `[]` when the file
 *   is empty or on any read error (missing file, permissions), so the caller
 *   can fall back to the generic "tail the log" hint. Never throws.
 */
function readLogTail(logFile: string, n: number): string[] {
  // `slice(-0)` is `slice(0)` and would hand back the WHOLE file, and a
  // negative count would drop lines from the front instead of taking a tail.
  // Normalise first so "no lines requested" really means no lines.
  const count = Math.floor(n);
  if (!(count >= 1)) return [];
  try {
    const content = readFileSync(logFile, "utf8");
    const trimmed = content.replace(/\r?\n$/, "");
    if (trimmed === "") return [];
    return trimmed.split(/\r?\n/).slice(-count);
  } catch {
    // Best-effort by design: an unreadable log must not mask the start failure.
    return [];
  }
}
|
|
201
|
+
|
|
202
|
+
/**
 * Heuristic port-collision detector over a logfile tail. There is no
 * structured error shared across runtimes, so this looks for the common
 * phrasings, case-insensitively: `EADDRINUSE`, "address already in use", or
 * "port … in use". A false positive is harmless: the worst case is that the
 * port-in-use remedy is also printed for an unrelated failure.
 *
 * @param logTail Trailing log lines to scan.
 * @returns `true` when at least one line matches a port-collision phrase.
 */
function detectAddrInUse(logTail: readonly string[]): boolean {
  const portCollision = /EADDRINUSE|address already in use|port .* in use/i;
  for (const entry of logTail) {
    if (portCollision.test(entry)) return true;
  }
  return false;
}
|
|
212
|
+
|
|
132
213
|
export interface LifecycleOpts {
|
|
133
214
|
spawner?: Spawner;
|
|
134
215
|
kill?: KillFn;
|
|
@@ -160,6 +241,30 @@ export interface LifecycleOpts {
|
|
|
160
241
|
* settle.
|
|
161
242
|
*/
|
|
162
243
|
startSettleMs?: number;
|
|
244
|
+
/**
|
|
245
|
+
* Probe whether the service's port is listening, post-spawn. Pairs with the
|
|
246
|
+
* settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
|
|
247
|
+
* process survives the liveness window (vault lingers / retries) but never
|
|
248
|
+
* binds because the port is already held, so `start` would otherwise report
|
|
249
|
+
* "✓ started" while `status` shows it inactive. Tests inject a stub;
|
|
250
|
+
* production uses `defaultPortListening` (a loopback TCP connect probe).
|
|
251
|
+
*/
|
|
252
|
+
portListening?: PortListeningFn;
|
|
253
|
+
/**
|
|
254
|
+
* How long `start` polls for the service to bind its port after the
|
|
255
|
+
* liveness settle passes. Default 4000ms in production — long enough to
|
|
256
|
+
* cover vault/scribe cold-boot (DB open, route registration) without making
|
|
257
|
+
* a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
|
|
258
|
+
* first time the port answers we declare success. If the window elapses
|
|
259
|
+
* with the process still alive but the port silent, we print a non-fatal
|
|
260
|
+
* warning (the daemon may still be coming up) rather than failing — only a
|
|
261
|
+
* *dead* process is a hard failure. Defaulting policy mirrors
|
|
262
|
+
* `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
|
|
263
|
+
* production path (no spawner override) is active.
|
|
264
|
+
*/
|
|
265
|
+
startReadyMs?: number;
|
|
266
|
+
/** Poll interval while waiting for the port to come up. Default 200ms. */
|
|
267
|
+
startReadyPollMs?: number;
|
|
163
268
|
/**
|
|
164
269
|
* Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
|
|
165
270
|
* unset, `start` derives it from `expose-state.json` (when exposed) or
|
|
@@ -175,9 +280,36 @@ export interface LifecycleOpts {
|
|
|
175
280
|
* `ensureHubRunning` and `lifecycle.stop("hub")` dispatches to
|
|
176
281
|
* `stopHub`. Tests inject stubs to avoid spawning real bun processes.
|
|
177
282
|
*/
|
|
283
|
+
/**
|
|
284
|
+
* PATH-resolution seam for the start preflight (`@openparachute/depcheck`
|
|
285
|
+
* `ensureExecutable`). Production uses the real `Bun.which`; a missing
|
|
286
|
+
* startCmd binary then surfaces the friendly missing-dependency UX +
|
|
287
|
+
* persists it to services.json.
|
|
288
|
+
*
|
|
289
|
+
* Defaulting policy mirrors `startSettleMs`: when a stub `spawner` is
|
|
290
|
+
* injected (the test path) `which` defaults to a permissive resolver
|
|
291
|
+
* (`() => "<stub>"`) so existing stub-spawner tests don't trip the preflight
|
|
292
|
+
* against binaries that aren't on the test host's PATH (`parachute-vault`,
|
|
293
|
+
* `notes-serve`). Production (no spawner override) gets the real `Bun.which`.
|
|
294
|
+
* Tests that want to exercise the missing-binary branch inject `which`
|
|
295
|
+
* explicitly (e.g. `which: () => null`).
|
|
296
|
+
*/
|
|
297
|
+
which?: (cmd: string) => string | null;
|
|
178
298
|
hub?: {
|
|
179
299
|
ensureRunning?: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
|
|
180
300
|
stop?: (opts: StopHubOpts) => Promise<boolean>;
|
|
301
|
+
/**
|
|
302
|
+
* Self-heal the operator token's stale `iss` after `start hub` (hub#481).
|
|
303
|
+
* Production opens hub.db at `<configDir>/hub.db` and delegates to
|
|
304
|
+
* `selfHealOperatorTokenIssuer`. Tests inject a stub to assert the call
|
|
305
|
+
* happens — or to make it throw and prove a self-heal failure never fails
|
|
306
|
+
* `start hub`.
|
|
307
|
+
*/
|
|
308
|
+
selfHealOperatorToken?: (args: {
|
|
309
|
+
issuer: string;
|
|
310
|
+
configDir: string;
|
|
311
|
+
log: (line: string) => void;
|
|
312
|
+
}) => Promise<OperatorIssuerHealStatus>;
|
|
181
313
|
};
|
|
182
314
|
}
|
|
183
315
|
|
|
@@ -193,9 +325,42 @@ interface Resolved {
|
|
|
193
325
|
killWaitMs: number;
|
|
194
326
|
pollIntervalMs: number;
|
|
195
327
|
startSettleMs: number;
|
|
328
|
+
portListening: PortListeningFn;
|
|
329
|
+
startReadyMs: number;
|
|
330
|
+
startReadyPollMs: number;
|
|
331
|
+
which: (cmd: string) => string | null;
|
|
196
332
|
hubOrigin: string | undefined;
|
|
197
333
|
ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
|
|
198
334
|
stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
|
|
335
|
+
selfHealOperatorTokenFn: (args: {
|
|
336
|
+
issuer: string;
|
|
337
|
+
configDir: string;
|
|
338
|
+
log: (line: string) => void;
|
|
339
|
+
}) => Promise<OperatorIssuerHealStatus>;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/**
 * Production implementation of the operator-token self-heal seam. Opens the
 * hub database at `hubDbPath(configDir)`, delegates to
 * `selfHealOperatorTokenIssuer`, and always closes the database afterwards,
 * including when the heal throws.
 *
 * NOTE(review): the previous comment states that `openHubDb` runs migrations
 * and enables WAL on open (matching `commands/auth.ts`); that is not visible
 * from this block. Tests are expected to override this seam, so the db-open
 * should only happen on the production path.
 *
 * @param args.issuer Issuer (hub origin) to heal the token toward.
 * @param args.configDir Config directory the db path is derived from.
 * @param args.log Line logger forwarded to the heal routine.
 * @returns The status reported by `selfHealOperatorTokenIssuer`.
 */
async function defaultSelfHealOperatorToken(args: {
  issuer: string;
  configDir: string;
  log: (line: string) => void;
}): Promise<OperatorIssuerHealStatus> {
  const { issuer, configDir, log } = args;
  const db = openHubDb(hubDbPath(configDir));
  try {
    // Await inside the `try` so the db stays open until the heal has settled.
    const status = await selfHealOperatorTokenIssuer(db, { issuer, configDir, log });
    return status;
  } finally {
    db.close();
  }
}
|
|
200
365
|
|
|
201
366
|
function resolve(opts: LifecycleOpts): Resolved {
|
|
@@ -219,9 +384,26 @@ function resolve(opts: LifecycleOpts): Resolved {
|
|
|
219
384
|
// override `alive`, which re-enables the default 250ms.
|
|
220
385
|
startSettleMs:
|
|
221
386
|
opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
|
|
387
|
+
portListening: opts.portListening ?? defaultPortListening,
|
|
388
|
+
// Same defaulting policy as startSettleMs: production (no spawner
|
|
389
|
+
// override) gets the real 4s readiness window; tests that inject a stub
|
|
390
|
+
// spawner get 0 (skipped) unless they explicitly opt in via
|
|
391
|
+
// `portListening` or `startReadyMs`, so existing stub-spawner tests don't
|
|
392
|
+
// start probing a fake port.
|
|
393
|
+
startReadyMs:
|
|
394
|
+
opts.startReadyMs ??
|
|
395
|
+
(opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
|
|
396
|
+
startReadyPollMs: opts.startReadyPollMs ?? 200,
|
|
397
|
+
// Same defaulting policy as startSettleMs/startReadyMs: production (no
|
|
398
|
+
// spawner override) preflights with the real Bun.which; stub-spawner tests
|
|
399
|
+
// get a permissive resolver so the preflight doesn't trip against binaries
|
|
400
|
+
// that aren't on the test host's PATH. Explicit `which` always wins.
|
|
401
|
+
which:
|
|
402
|
+
opts.which ?? (opts.spawner === undefined ? Bun.which : () => "/stub/bin/preflight-skipped"),
|
|
222
403
|
hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
|
|
223
404
|
ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
|
|
224
405
|
stopHubFn: opts.hub?.stop ?? stopHub,
|
|
406
|
+
selfHealOperatorTokenFn: opts.hub?.selfHealOperatorToken ?? defaultSelfHealOperatorToken,
|
|
225
407
|
};
|
|
226
408
|
}
|
|
227
409
|
|
|
@@ -452,42 +634,185 @@ export async function start(svc: string | undefined, opts: LifecycleOpts = {}):
|
|
|
452
634
|
if (entry.installDir) spawnerOpts.cwd = entry.installDir;
|
|
453
635
|
const passOpts =
|
|
454
636
|
spawnerOpts.env !== undefined || spawnerOpts.cwd !== undefined ? spawnerOpts : undefined;
|
|
637
|
+
|
|
638
|
+
// Pre-flight the startCmd binary (`@openparachute/depcheck`) so a missing
|
|
639
|
+
// executable surfaces the friendly install UX inline AND is persisted onto
|
|
640
|
+
// the services.json row, so a *later* `parachute status` (a separate
|
|
641
|
+
// invocation that only reads the manifest) + the SPA modules pane show
|
|
642
|
+
// "vault: failed to start — parachute-vault not installed" with install
|
|
643
|
+
// info, rather than a bare "failed"/orphan-timeout. The binary is `cmd[0]`
|
|
644
|
+
// (e.g. `parachute-vault` for an npm install, `bun` for a bun-linked one).
|
|
645
|
+
const startBinary = cmd[0];
|
|
646
|
+
if (startBinary) {
|
|
647
|
+
try {
|
|
648
|
+
ensureExecutable(startBinary, { which: r.which });
|
|
649
|
+
} catch (err) {
|
|
650
|
+
if (err instanceof MissingDependencyError) {
|
|
651
|
+
failures++;
|
|
652
|
+
r.log(`✗ ${short} failed to start:`);
|
|
653
|
+
for (const line of err.message.split("\n")) r.log(` ${line}`);
|
|
654
|
+
recordStartError(entry.name, err.toWire(), r.manifestPath);
|
|
655
|
+
continue;
|
|
656
|
+
}
|
|
657
|
+
throw err;
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
|
|
455
661
|
let pid: number;
|
|
456
662
|
try {
|
|
457
663
|
pid = r.spawner.spawn(cmd, logFile, passOpts);
|
|
458
664
|
} catch (err) {
|
|
665
|
+
// Belt-and-suspenders: a missing binary that slipped past the pre-flight
|
|
666
|
+
// (race) still becomes a MissingDependencyError via rethrowIfMissing.
|
|
667
|
+
if (startBinary) {
|
|
668
|
+
try {
|
|
669
|
+
rethrowIfMissing(err, startBinary);
|
|
670
|
+
} catch (missing) {
|
|
671
|
+
if (missing instanceof MissingDependencyError) {
|
|
672
|
+
failures++;
|
|
673
|
+
r.log(`✗ ${short} failed to start:`);
|
|
674
|
+
for (const line of missing.message.split("\n")) r.log(` ${line}`);
|
|
675
|
+
recordStartError(entry.name, missing.toWire(), r.manifestPath);
|
|
676
|
+
continue;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
}
|
|
459
680
|
failures++;
|
|
460
681
|
const msg = err instanceof Error ? err.message : String(err);
|
|
461
682
|
r.log(`✗ ${short} failed to start: ${msg}`);
|
|
462
683
|
continue;
|
|
463
684
|
}
|
|
685
|
+
// A successful spawn clears any stale start-error recorded from a prior
|
|
686
|
+
// missing-dependency failure so `parachute status` doesn't keep showing it.
|
|
687
|
+
clearStartError(entry.name, r.manifestPath);
|
|
464
688
|
writePid(short, pid, r.configDir);
|
|
465
689
|
|
|
466
|
-
//
|
|
467
|
-
//
|
|
468
|
-
//
|
|
469
|
-
//
|
|
470
|
-
//
|
|
471
|
-
//
|
|
690
|
+
// Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
|
|
691
|
+
// proves the kernel forked the process — it says nothing about whether the
|
|
692
|
+
// service survived its boot or bound its port. Two silent-start shapes:
|
|
693
|
+
//
|
|
694
|
+
// (1) spawn-then-immediately-die (hub#194): the child throws before
|
|
695
|
+
// listening (notes-serve's Bun.resolveSync failing for bun-linked
|
|
696
|
+
// installs) and exits microseconds later. Caught by the settle below.
|
|
697
|
+
//
|
|
698
|
+
// (2) alive-but-never-bound (hub#487): the port is already held by an
|
|
699
|
+
// orphan, the child hits EADDRINUSE, but its process *lingers* (or a
|
|
700
|
+
// supervisor retries) long enough to clear the liveness check. `start`
|
|
701
|
+
// would report "✓ started" while `parachute status` shows it inactive
|
|
702
|
+
// because nothing answers on the port. Aaron hit exactly this with an
|
|
703
|
+
// orphan holding vault's 1940 on a fresh EC2 box. Caught by the
|
|
704
|
+
// port-readiness poll below.
|
|
705
|
+
//
|
|
706
|
+
// On any failure we surface the tail of the logfile so the operator sees
|
|
707
|
+
// the real boot error inline, and we specifically call out EADDRINUSE with
|
|
708
|
+
// the `lsof -ti:<port>` remedy.
|
|
709
|
+
const reportStartFailure = (reason: string): void => {
|
|
710
|
+
clearPid(short, r.configDir);
|
|
711
|
+
failures++;
|
|
712
|
+
const tail = readLogTail(logFile, 20);
|
|
713
|
+
if (detectAddrInUse(tail)) {
|
|
714
|
+
r.log(
|
|
715
|
+
`✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
|
|
716
|
+
);
|
|
717
|
+
} else {
|
|
718
|
+
r.log(`✗ ${short} failed to start: ${reason}`);
|
|
719
|
+
}
|
|
720
|
+
if (tail.length > 0) {
|
|
721
|
+
r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
|
|
722
|
+
for (const line of tail) r.log(` │ ${line}`);
|
|
723
|
+
} else {
|
|
724
|
+
r.log(` Tail the log for details: tail -50 ${logFile}`);
|
|
725
|
+
}
|
|
726
|
+
};
|
|
727
|
+
|
|
472
728
|
if (r.startSettleMs > 0) {
|
|
473
729
|
await r.sleep(r.startSettleMs);
|
|
474
730
|
if (!r.alive(pid)) {
|
|
475
|
-
|
|
476
|
-
|
|
731
|
+
reportStartFailure(
|
|
732
|
+
`spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
|
|
733
|
+
);
|
|
734
|
+
continue;
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
// Port-readiness poll (hub#487). The process is alive; now confirm it
|
|
739
|
+
// actually bound its port before claiming success. Poll up to
|
|
740
|
+
// `startReadyMs`, re-checking liveness each iteration so a *later* death
|
|
741
|
+
// (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
|
|
742
|
+
// that stays alive but never binds within the window gets a non-fatal
|
|
743
|
+
// warning rather than a hard failure — some daemons legitimately do slow
|
|
744
|
+
// boot work, and we'd rather not flip a healthy-but-slow start to red.
|
|
745
|
+
if (r.startReadyMs > 0) {
|
|
746
|
+
const deadline = r.now() + r.startReadyMs;
|
|
747
|
+
let listening = false;
|
|
748
|
+
let died = false;
|
|
749
|
+
while (r.now() < deadline) {
|
|
750
|
+
if (!r.alive(pid)) {
|
|
751
|
+
died = true;
|
|
752
|
+
break;
|
|
753
|
+
}
|
|
754
|
+
if (await r.portListening(entry.port)) {
|
|
755
|
+
listening = true;
|
|
756
|
+
break;
|
|
757
|
+
}
|
|
758
|
+
await r.sleep(r.startReadyPollMs);
|
|
759
|
+
}
|
|
760
|
+
if (died) {
|
|
761
|
+
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
762
|
+
continue;
|
|
763
|
+
}
|
|
764
|
+
if (!listening) {
|
|
765
|
+
// Last-chance liveness check — the loop may have exited on the
|
|
766
|
+
// deadline right as the process died.
|
|
767
|
+
if (!r.alive(pid)) {
|
|
768
|
+
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
769
|
+
continue;
|
|
770
|
+
}
|
|
477
771
|
r.log(
|
|
478
|
-
|
|
772
|
+
`⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
|
|
479
773
|
);
|
|
480
|
-
r.log(
|
|
774
|
+
r.log(
|
|
775
|
+
` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
|
|
776
|
+
);
|
|
777
|
+
if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
|
|
778
|
+
if (short === "vault") persistVaultHubOriginForStart(r);
|
|
481
779
|
continue;
|
|
482
780
|
}
|
|
483
781
|
}
|
|
484
782
|
|
|
485
783
|
r.log(`✓ ${short} started (pid ${pid}); logs: ${logFile}`);
|
|
486
784
|
if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
|
|
785
|
+
if (short === "vault") persistVaultHubOriginForStart(r);
|
|
487
786
|
}
|
|
488
787
|
return failures === 0 ? 0 : 1;
|
|
489
788
|
}
|
|
490
789
|
|
|
790
|
+
/**
 * Durably persist vault's `PARACHUTE_HUB_ORIGIN` when vault is started. Two
 * steps run in order:
 *
 *   1. If a spawn origin was resolved (`r.hubOrigin` is truthy), hand it to
 *      `persistVaultHubOrigin`. The intent, per the original notes, is that a
 *      launchd / systemd daemon booting vault out-of-band never sees this
 *      spawn env and needs the value in `.env` to validate the `iss` of
 *      hub-minted JWTs. `persistVaultHubOrigin` is described as skipping
 *      loopback / unchanged values itself — see `vault-hub-origin-env.ts`.
 *
 *   2. Always run `selfHealVaultHubOrigin` against
 *      `<configDir>/expose-state.json`, even when step 1 was skipped because
 *      the origin resolved to undefined. Per the original notes this lets an
 *      existing deploy whose `.env` was never written self-correct on the next
 *      `parachute restart vault` (the gap behind the Cloudflare 401 P0).
 *
 * @param r Resolved lifecycle options; reads `configDir`, `hubOrigin`, `log`.
 */
function persistVaultHubOriginForStart(r: Resolved): void {
  const { configDir, hubOrigin, log } = r;
  const exposeStatePath = join(configDir, "expose-state.json");

  if (hubOrigin) {
    persistVaultHubOrigin(configDir, hubOrigin, log);
  }
  selfHealVaultHubOrigin(configDir, log, exposeStatePath);
}
|
|
815
|
+
|
|
491
816
|
export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
|
|
492
817
|
const r = resolve(opts);
|
|
493
818
|
if (svc === HUB_SVC) return stopHubSvc(r);
|
|
@@ -567,6 +892,12 @@ async function startHubSvc(r: Resolved): Promise<number> {
|
|
|
567
892
|
} else {
|
|
568
893
|
r.log(`hub already running (pid ${result.pid}) on port ${result.port}.`);
|
|
569
894
|
}
|
|
895
|
+
// Self-heal a stale operator-token issuer (hub#481). Runs whether the hub
|
|
896
|
+
// was freshly started OR already running — a token stamped at loopback
|
|
897
|
+
// before exposure must heal even when the hub is already up. The loopback /
|
|
898
|
+
// provenance guards live inside `selfHealOperatorTokenIssuer`, so the only
|
|
899
|
+
// gate here is "is there a real issuer to heal toward?".
|
|
900
|
+
await selfHealOperatorTokenOnStart(r);
|
|
570
901
|
return 0;
|
|
571
902
|
} catch (err) {
|
|
572
903
|
r.log(`✗ hub failed to start: ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -574,6 +905,36 @@ async function startHubSvc(r: Resolved): Promise<number> {
|
|
|
574
905
|
}
|
|
575
906
|
}
|
|
576
907
|
|
|
908
|
+
/**
 * After `start hub`, re-issue the operator token under the hub's current
 * origin when its `iss` has gone stale following an init-at-loopback → expose
 * transition (hub#481).
 *
 * Output is deliberately quiet: one line is logged only when the heal reports
 * `kind === "rotated"`; every other outcome is silent. When no hub origin was
 * resolved there is nothing to heal toward, and the function returns at once.
 *
 * The whole heal runs inside a `try`/`catch`, so a failure in the seam (a
 * db-open error, a corrupt token, anything that throws) degrades to a short
 * note in the log and never propagates to the caller.
 *
 * @param r Resolved lifecycle options; reads `hubOrigin`, `configDir`, `log`,
 *   and invokes `selfHealOperatorTokenFn`.
 */
async function selfHealOperatorTokenOnStart(r: Resolved): Promise<void> {
  const issuer = r.hubOrigin;
  if (!issuer) return;

  try {
    const { kind } = await r.selfHealOperatorTokenFn({
      issuer,
      configDir: r.configDir,
      log: r.log,
    });
    if (kind === "rotated") {
      r.log(` refreshed operator.token issuer → ${issuer} (was stale after exposure)`);
    }
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    r.log(` note: operator.token issuer self-heal skipped (${detail})`);
  }
}
|
|
937
|
+
|
|
577
938
|
/**
|
|
578
939
|
* Stop the internal hub. `stopHub` returns false when nothing was running
|
|
579
940
|
* (no pidfile, or stale pidfile cleared) — that's a clean no-op for the
|
|
@@ -659,11 +1020,19 @@ export async function logs(svc: string, opts: LogsOpts = {}): Promise<number> {
|
|
|
659
1020
|
spawn(cmd) {
|
|
660
1021
|
// Inherit env so `tail` sees PATH, etc. Bun.spawn defaults to empty
|
|
661
1022
|
// env — see api-modules-ops.ts:defaultRun.
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
1023
|
+
try {
|
|
1024
|
+
const proc = Bun.spawn([...cmd], {
|
|
1025
|
+
stdio: ["ignore", "inherit", "inherit"],
|
|
1026
|
+
env: process.env,
|
|
1027
|
+
});
|
|
1028
|
+
return proc.pid;
|
|
1029
|
+
} catch (err) {
|
|
1030
|
+
// A missing `tail` (minimal container without coreutils) surfaces
|
|
1031
|
+
// the friendly install UX instead of a raw spawn throw. The CLI
|
|
1032
|
+
// top-level catch in cli.ts renders the MissingDependencyError.
|
|
1033
|
+
rethrowIfMissing(err, "tail");
|
|
1034
|
+
throw err;
|
|
1035
|
+
}
|
|
667
1036
|
},
|
|
668
1037
|
};
|
|
669
1038
|
spawner.spawn(["tail", "-n", String(lines), "-f", path], path);
|
package/src/commands/status.ts
CHANGED
|
@@ -146,6 +146,14 @@ interface StatusRow {
|
|
|
146
146
|
* stale-after-rebuild row without comparing columns by eye.
|
|
147
147
|
*/
|
|
148
148
|
staleNote?: string;
|
|
149
|
+
/**
|
|
150
|
+
* Persisted last-start failure (`lastStartError`, written by the lifecycle
|
|
151
|
+
* start preflight when a startCmd binary is missing). Surfaced on a
|
|
152
|
+
* continuation line so a *later* `parachute status` explains why the row
|
|
153
|
+
* isn't active — "failed to start: <binary> not installed" — rather than
|
|
154
|
+
* just showing it inactive. Cleared on the next successful start.
|
|
155
|
+
*/
|
|
156
|
+
startErrorNote?: string;
|
|
149
157
|
}
|
|
150
158
|
|
|
151
159
|
/**
|
|
@@ -264,6 +272,17 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
|
|
|
264
272
|
? `STALE: services.json cached ${entry.version}; live package.json ${source.livePackageVersion}`
|
|
265
273
|
: undefined;
|
|
266
274
|
|
|
275
|
+
// Persisted last-start failure (lifecycle preflight wrote a missing-
|
|
276
|
+
// dependency wire). Surface a one-line summary; the full install recipe
|
|
277
|
+
// lives in services.json + the admin SPA card. Keeps `parachute status`
|
|
278
|
+
// scannable while still telling the operator "this is why it's down."
|
|
279
|
+
const startErrorNote =
|
|
280
|
+
entry.lastStartError !== undefined
|
|
281
|
+
? entry.lastStartError.binary !== undefined
|
|
282
|
+
? `failed to start: ${entry.lastStartError.binary} not installed — run \`parachute status\` detail or see /admin/modules for install steps`
|
|
283
|
+
: `failed to start: ${entry.lastStartError.error_description.split("\n")[0]}`
|
|
284
|
+
: undefined;
|
|
285
|
+
|
|
267
286
|
// Only skip probe when we know the process is dead (PID file was
|
|
268
287
|
// present but kill(pid, 0) failed). "unknown" status (no PID file)
|
|
269
288
|
// still probes — externally-managed services should report health.
|
|
@@ -287,6 +306,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
|
|
|
287
306
|
skipped: true,
|
|
288
307
|
driftWarning,
|
|
289
308
|
staleNote,
|
|
309
|
+
startErrorNote,
|
|
290
310
|
};
|
|
291
311
|
}
|
|
292
312
|
|
|
@@ -324,6 +344,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
|
|
|
324
344
|
skipped: false,
|
|
325
345
|
driftWarning,
|
|
326
346
|
staleNote,
|
|
347
|
+
startErrorNote,
|
|
327
348
|
};
|
|
328
349
|
}),
|
|
329
350
|
);
|
|
@@ -378,6 +399,7 @@ export async function status(opts: StatusOpts = {}): Promise<number> {
|
|
|
378
399
|
}
|
|
379
400
|
if (row.driftWarning) print(` ! ${row.driftWarning}`);
|
|
380
401
|
if (row.staleNote) print(` ! ${row.staleNote}`);
|
|
402
|
+
if (row.startErrorNote) print(` ! ${row.startErrorNote}`);
|
|
381
403
|
}
|
|
382
404
|
|
|
383
405
|
/**
|
package/src/commands/upgrade.ts
CHANGED
|
@@ -72,25 +72,69 @@ export interface UpgradeRunner {
|
|
|
72
72
|
): Promise<{ code: number; stdout: string }>;
|
|
73
73
|
}
|
|
74
74
|
|
|
75
|
+
/**
 * Synthetic exit status reported when a binary cannot be spawned at all.
 *
 * 127 follows the POSIX shell convention for "command not found", so a
 * missing executable (such as `git`) surfaces as an ordinary non-zero result
 * rather than an exception that aborts the whole command.
 */
const SPAWN_NOT_FOUND_CODE = 127;
|
|
81
|
+
|
|
82
|
+
/**
 * Reports whether a value thrown by `Bun.spawn` means the requested
 * executable is absent from this host.
 *
 * Two shapes are recognised: an object whose `code` is `"ENOENT"`, or one
 * whose string `message` contains `Executable not found in $PATH`. Callers
 * use this to turn a missing binary (for example `git` on a minimal server)
 * into a normal failed result so `parachute upgrade` can fall back to the
 * npm path instead of dying on an uncaught throw.
 *
 * @param err - The caught value; anything that is not a non-null object
 *   yields `false`.
 * @returns `true` when the error matches either not-found shape.
 */
function isSpawnNotFound(err: unknown): boolean {
  if (err === null || typeof err !== "object") {
    return false;
  }

  // Both fields are read as `unknown` and narrowed below; no assumption is
  // made that the thrown value is a real `Error` instance.
  const { code, message } = err as { code?: unknown; message?: unknown };

  if (code === "ENOENT") {
    return true;
  }
  return typeof message === "string" && message.includes("Executable not found in $PATH");
}
|
|
99
|
+
|
|
75
100
|
export const defaultRunner: UpgradeRunner = {
|
|
76
101
|
async run(cmd, opts) {
|
|
77
102
|
// Inherit env so `bun add -g` etc. see TMPDIR, BUN_INSTALL, PATH, HOME.
|
|
78
103
|
// Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
104
|
+
let proc: Bun.Subprocess;
|
|
105
|
+
try {
|
|
106
|
+
proc = Bun.spawn([...cmd], {
|
|
107
|
+
cwd: opts?.cwd,
|
|
108
|
+
stdio: ["inherit", "inherit", "inherit"],
|
|
109
|
+
env: process.env,
|
|
110
|
+
});
|
|
111
|
+
} catch (err) {
|
|
112
|
+
// Binary not on this host (e.g. no `git` on a minimal server). Degrade
|
|
113
|
+
// to a non-zero exit rather than letting the throw crash the command.
|
|
114
|
+
if (isSpawnNotFound(err)) return SPAWN_NOT_FOUND_CODE;
|
|
115
|
+
throw err;
|
|
116
|
+
}
|
|
84
117
|
return await proc.exited;
|
|
85
118
|
},
|
|
86
119
|
async capture(cmd, opts) {
|
|
87
120
|
// Inherit env — same rationale as `run` above.
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
121
|
+
let proc: Bun.Subprocess<"ignore", "pipe", "pipe">;
|
|
122
|
+
try {
|
|
123
|
+
proc = Bun.spawn([...cmd], {
|
|
124
|
+
cwd: opts?.cwd,
|
|
125
|
+
stdout: "pipe",
|
|
126
|
+
stderr: "pipe",
|
|
127
|
+
env: process.env,
|
|
128
|
+
});
|
|
129
|
+
} catch (err) {
|
|
130
|
+
// See `run` above: ENOENT (binary-not-found) becomes a captured
|
|
131
|
+
// non-zero result so every git call degrades to "command failed".
|
|
132
|
+
if (isSpawnNotFound(err)) {
|
|
133
|
+
const bin = cmd[0] ?? "command";
|
|
134
|
+
return { code: SPAWN_NOT_FOUND_CODE, stdout: `${bin}: not found on this host\n` };
|
|
135
|
+
}
|
|
136
|
+
throw err;
|
|
137
|
+
}
|
|
94
138
|
const [stdout, stderr] = await Promise.all([
|
|
95
139
|
new Response(proc.stdout).text(),
|
|
96
140
|
new Response(proc.stderr).text(),
|