@openparachute/hub 0.6.2 → 0.6.3-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -35
- package/package.json +1 -1
- package/src/__tests__/api-hub-upgrade.test.ts +690 -0
- package/src/__tests__/api-modules-ops.test.ts +359 -3
- package/src/__tests__/api-modules.test.ts +54 -0
- package/src/__tests__/expose-cloudflare.test.ts +163 -72
- package/src/__tests__/expose-off-auto.test.ts +26 -1
- package/src/__tests__/expose.test.ts +260 -240
- package/src/__tests__/hub-control.test.ts +1 -242
- package/src/__tests__/hub-server.test.ts +64 -0
- package/src/__tests__/hub-unit.test.ts +574 -0
- package/src/__tests__/init.test.ts +219 -2
- package/src/__tests__/lifecycle.test.ts +416 -1448
- package/src/__tests__/managed-unit.test.ts +575 -0
- package/src/__tests__/migrate-cutover.test.ts +840 -0
- package/src/__tests__/migrate-offer.test.ts +240 -0
- package/src/__tests__/migrate.test.ts +132 -0
- package/src/__tests__/module-ops-client.test.ts +556 -0
- package/src/__tests__/port-probe.test.ts +23 -0
- package/src/__tests__/setup-wizard.test.ts +130 -0
- package/src/__tests__/status-supervisor.test.ts +504 -0
- package/src/__tests__/status.test.ts +157 -708
- package/src/__tests__/supervisor.test.ts +471 -6
- package/src/__tests__/upgrade.test.ts +351 -5
- package/src/api-hub-upgrade.ts +384 -0
- package/src/api-hub.ts +2 -1
- package/src/api-modules-ops.ts +221 -0
- package/src/api-modules.ts +18 -2
- package/src/cli.ts +97 -12
- package/src/cloudflare/connector-service.ts +117 -322
- package/src/commands/expose-cloudflare.ts +63 -71
- package/src/commands/expose-supervisor.ts +247 -0
- package/src/commands/expose.ts +59 -48
- package/src/commands/init.ts +225 -12
- package/src/commands/lifecycle.ts +455 -816
- package/src/commands/migrate-cutover.ts +837 -0
- package/src/commands/migrate.ts +71 -2
- package/src/commands/serve-boot.ts +71 -25
- package/src/commands/status.ts +535 -235
- package/src/commands/upgrade.ts +100 -2
- package/src/help.ts +128 -68
- package/src/hub-control.ts +23 -162
- package/src/hub-server.ts +39 -0
- package/src/hub-unit.ts +735 -0
- package/src/hub-upgrade-helper.ts +306 -0
- package/src/hub-upgrade-mode.ts +209 -0
- package/src/hub-upgrade-status.ts +150 -0
- package/src/managed-unit.ts +692 -0
- package/src/migrate-offer.ts +186 -0
- package/src/module-ops-client.ts +457 -0
- package/src/port-probe.ts +50 -0
- package/src/process-state.ts +19 -3
- package/src/setup-wizard.ts +80 -1
- package/src/supervisor.ts +389 -38
- package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
- package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
- package/web/ui/dist/index.html +2 -2
- package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
|
@@ -1,66 +1,49 @@
|
|
|
1
|
-
import { existsSync
|
|
2
|
-
import { Socket } from "node:net";
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
3
2
|
import { join } from "node:path";
|
|
4
|
-
import {
|
|
5
|
-
MissingDependencyError,
|
|
6
|
-
ensureExecutable,
|
|
7
|
-
rethrowIfMissing,
|
|
8
|
-
} from "@openparachute/depcheck";
|
|
3
|
+
import { rethrowIfMissing } from "@openparachute/depcheck";
|
|
9
4
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
10
|
-
import { readEnvFileValues } from "../env-file.ts";
|
|
11
5
|
import { readExposeState } from "../expose-state.ts";
|
|
12
|
-
import {
|
|
13
|
-
type EnsureHubOpts,
|
|
14
|
-
type EnsureHubResult,
|
|
15
|
-
HUB_SVC,
|
|
16
|
-
type StopHubOpts,
|
|
17
|
-
ensureHubRunning,
|
|
18
|
-
readHubPort,
|
|
19
|
-
stopHub,
|
|
20
|
-
} from "../hub-control.ts";
|
|
6
|
+
import { HUB_SVC, readHubPort } from "../hub-control.ts";
|
|
21
7
|
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
22
|
-
import {
|
|
23
|
-
import { ModuleManifestError, readModuleManifest } from "../module-manifest.ts";
|
|
24
|
-
import { type OperatorIssuerHealStatus, selfHealOperatorTokenIssuer } from "../operator-token.ts";
|
|
8
|
+
import { deriveHubOrigin } from "../hub-origin.ts";
|
|
25
9
|
import {
|
|
26
|
-
type
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
10
|
+
type EnsureHubUnitOpts,
|
|
11
|
+
type EnsureHubUnitResult,
|
|
12
|
+
HUB_UNIT_DEFAULT_PORT,
|
|
13
|
+
type HubUnitDeps,
|
|
14
|
+
type HubUnitManagerOpResult,
|
|
15
|
+
defaultHubUnitDeps,
|
|
16
|
+
ensureHubUnit as ensureHubUnitImpl,
|
|
17
|
+
isHubUnitInstalled,
|
|
18
|
+
restartHubUnit as restartHubUnitImpl,
|
|
19
|
+
stopHubUnit as stopHubUnitImpl,
|
|
20
|
+
} from "../hub-unit.ts";
|
|
34
21
|
import {
|
|
35
|
-
|
|
36
|
-
type
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
getSpecFromInstallDir,
|
|
40
|
-
knownServices,
|
|
41
|
-
shortNameForManifest,
|
|
42
|
-
} from "../service-spec.ts";
|
|
22
|
+
type MigrateOfferOpts,
|
|
23
|
+
type MigrateOfferResult,
|
|
24
|
+
offerMigrateToSupervised,
|
|
25
|
+
} from "../migrate-offer.ts";
|
|
43
26
|
import {
|
|
44
|
-
type
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
27
|
+
type DriveModuleOpDeps,
|
|
28
|
+
type ModuleOp,
|
|
29
|
+
ModuleOpHttpError,
|
|
30
|
+
type ModuleOpResult,
|
|
31
|
+
NoOperatorTokenError,
|
|
32
|
+
OperatorTokenExpiredError,
|
|
33
|
+
driveModuleOp as driveModuleOpImpl,
|
|
34
|
+
} from "../module-ops-client.ts";
|
|
35
|
+
import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
|
|
36
|
+
import { type AliveFn, logPath as logPathFor, processState } from "../process-state.ts";
|
|
37
|
+
import { getSpec, knownServices } from "../service-spec.ts";
|
|
38
|
+
import { readManifest } from "../services-manifest.ts";
|
|
50
39
|
|
|
51
40
|
/**
|
|
52
|
-
* Tiny seam over `Bun.spawn
|
|
53
|
-
*
|
|
54
|
-
*
|
|
55
|
-
*
|
|
56
|
-
* `
|
|
57
|
-
*
|
|
58
|
-
* PARACHUTE_HUB_ORIGIN so vault's OAuth issuer matches the hub URL.
|
|
59
|
-
*
|
|
60
|
-
* `cwd`, when provided, is the child's working directory. Set to the
|
|
61
|
-
* service's installDir for third-party modules so manifest-declared
|
|
62
|
-
* relative startCmds (e.g. `["bun", "web/server/src/server.ts"]`) resolve
|
|
63
|
-
* against the package root.
|
|
41
|
+
* Tiny seam over `Bun.spawn`, retained for the `parachute logs <svc> --follow`
|
|
42
|
+
* tail (`LogsOpts.tailSpawner`). The detached MODULE spawner (`defaultSpawner`)
|
|
43
|
+
* was retired in Phase 5b — modules are spawned by the supervisor under `serve`,
|
|
44
|
+
* not by a detached pidfile daemon. `logs` is the last consumer of this seam, and
|
|
45
|
+
* its tail only needs `cmd` (the `opts` is unused there but kept on the interface
|
|
46
|
+
* for a future caller).
|
|
64
47
|
*/
|
|
65
48
|
export interface SpawnerOptions {
|
|
66
49
|
env?: Record<string, string>;
|
|
@@ -71,81 +54,32 @@ export interface Spawner {
|
|
|
71
54
|
spawn(cmd: readonly string[], logFile: string, opts?: SpawnerOptions): number;
|
|
72
55
|
}
|
|
73
56
|
|
|
74
|
-
export const defaultSpawner: Spawner = {
|
|
75
|
-
spawn(cmd, logFile, opts) {
|
|
76
|
-
const fd = openSync(logFile, "a");
|
|
77
|
-
const spawnOpts: Parameters<typeof Bun.spawn>[1] = {
|
|
78
|
-
stdio: ["ignore", fd, fd],
|
|
79
|
-
// Spawn in a fresh process group (pid == pgid) so kill(-pid, sig)
|
|
80
|
-
// reaches every descendant, not just the wrapper. Without this,
|
|
81
|
-
// wrapped startCmds like `pnpm exec tsx server.ts` leave the tsx
|
|
82
|
-
// grandchild bound to the port after stop → restart hits EADDRINUSE.
|
|
83
|
-
detached: true,
|
|
84
|
-
// Inherit env so child sees PATH, HOME, PARACHUTE_HOME, etc.
|
|
85
|
-
// Bun.spawn defaults to empty env — see api-modules-ops.ts:defaultRun.
|
|
86
|
-
// Per-call `opts.env` overrides merge on top below.
|
|
87
|
-
env: process.env,
|
|
88
|
-
};
|
|
89
|
-
if (opts?.env) spawnOpts.env = { ...process.env, ...opts.env };
|
|
90
|
-
if (opts?.cwd) spawnOpts.cwd = opts.cwd;
|
|
91
|
-
const proc = Bun.spawn([...cmd], spawnOpts);
|
|
92
|
-
proc.unref();
|
|
93
|
-
return proc.pid;
|
|
94
|
-
},
|
|
95
|
-
};
|
|
96
|
-
|
|
97
57
|
export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
|
|
98
58
|
export type SleepFn = (ms: number) => Promise<void>;
|
|
99
59
|
|
|
100
60
|
/**
|
|
101
|
-
*
|
|
102
|
-
*
|
|
103
|
-
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
106
|
-
*
|
|
107
|
-
*
|
|
108
|
-
*
|
|
109
|
-
* Tests inject a deterministic stub; production uses `defaultPortListening`.
|
|
61
|
+
* Port-readiness probe seam + its production impl now live in `port-probe.ts`
|
|
62
|
+
* (design 2026-06-01 §6.5) so the supervisor can share the exact same TCP
|
|
63
|
+
* connect-probe without dragging lifecycle's heavy import graph. Re-exported
|
|
64
|
+
* here so this module's public API (and its tests) are unchanged. Pairs with
|
|
65
|
+
* the spawn-then-die settle (hub#194) to catch the alive-but-never-bound shape
|
|
66
|
+
* (hub#487): a service that clears the liveness check but never binds its port
|
|
67
|
+
* because it's already held — `alive(pid)` says "running" while `status` shows
|
|
68
|
+
* it inactive because nothing answers on the port.
|
|
110
69
|
*/
|
|
111
|
-
export type PortListeningFn
|
|
112
|
-
|
|
113
|
-
/**
|
|
114
|
-
* Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
|
|
115
|
-
* accepted. A successful connect means *something* is listening; we close
|
|
116
|
-
* immediately. Connection refused / timeout means nothing is bound yet.
|
|
117
|
-
* `node:net` rather than `Bun.connect` because the latter has no clean
|
|
118
|
-
* "connection refused → false" without a custom socket handler, and the net
|
|
119
|
-
* Socket's `error`/`connect` events map directly onto the boolean we want.
|
|
120
|
-
*/
|
|
121
|
-
export const defaultPortListening: PortListeningFn = (port) =>
|
|
122
|
-
new Promise((resolve) => {
|
|
123
|
-
const socket = new Socket();
|
|
124
|
-
let settled = false;
|
|
125
|
-
const done = (listening: boolean) => {
|
|
126
|
-
if (settled) return;
|
|
127
|
-
settled = true;
|
|
128
|
-
socket.destroy();
|
|
129
|
-
resolve(listening);
|
|
130
|
-
};
|
|
131
|
-
socket.setTimeout(1000);
|
|
132
|
-
socket.once("connect", () => done(true));
|
|
133
|
-
socket.once("timeout", () => done(false));
|
|
134
|
-
socket.once("error", () => done(false));
|
|
135
|
-
socket.connect(port, "127.0.0.1");
|
|
136
|
-
});
|
|
70
|
+
export { type PortListeningFn, defaultPortListening };
|
|
137
71
|
|
|
138
72
|
/**
|
|
139
73
|
* Group-aware liveness: returns true if the process group (pgid == pid)
|
|
140
|
-
* still has any member.
|
|
141
|
-
*
|
|
142
|
-
*
|
|
143
|
-
* but
|
|
74
|
+
* still has any member. The detached module spawner that created these process
|
|
75
|
+
* groups is retired (Phase 5b — the supervisor under `serve` owns module
|
|
76
|
+
* spawning now, with its own group-spawn + `defaultKillGroup` in `supervisor.ts`),
|
|
77
|
+
* but this stays as the liveness primitive for `parachute logs`'s
|
|
78
|
+
* "running-but-no-logfile" diagnostic over any pidfile still on disk (the readers
|
|
79
|
+
* §7.5 keeps for one release).
|
|
144
80
|
*
|
|
145
|
-
* Falls back to a single-pid check
|
|
146
|
-
*
|
|
147
|
-
* with that pgid exists, and we still want to honor the bare-pid alive
|
|
148
|
-
* signal so a follow-up `stop` runs.
|
|
81
|
+
* Falls back to a single-pid check when no group with that pgid exists:
|
|
82
|
+
* `kill(-pid, 0)` returns ESRCH, and we still honor the bare-pid alive signal.
|
|
149
83
|
*/
|
|
150
84
|
export const defaultAlive: AliveFn = (pid) => {
|
|
151
85
|
try {
|
|
@@ -163,12 +97,13 @@ export const defaultAlive: AliveFn = (pid) => {
|
|
|
163
97
|
};
|
|
164
98
|
|
|
165
99
|
/**
|
|
166
|
-
* Sends `signal` to the entire process group rooted at `pid`.
|
|
167
|
-
*
|
|
168
|
-
*
|
|
169
|
-
*
|
|
170
|
-
*
|
|
171
|
-
*
|
|
100
|
+
* Sends `signal` to the entire process group rooted at `pid`. Reaches a wrapper
|
|
101
|
+
* and any grandchildren in one syscall when the pid is a group leader. ESRCH on
|
|
102
|
+
* the group send means the pgid is gone (the leader exited and the group emptied,
|
|
103
|
+
* or a non-group pid) — fall back to a bare-pid signal so the caller's intent
|
|
104
|
+
* still lands. The supervisor's `defaultKillGroup` (supervisor.ts) is the
|
|
105
|
+
* production reaper now; this export survives for the group-aware test coverage
|
|
106
|
+
* + any future on-box use.
|
|
172
107
|
*/
|
|
173
108
|
export const defaultKill: KillFn = (pid, signal) => {
|
|
174
109
|
try {
|
|
@@ -181,232 +116,258 @@ export const defaultKill: KillFn = (pid, signal) => {
|
|
|
181
116
|
|
|
182
117
|
export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
183
118
|
|
|
184
|
-
/**
|
|
185
|
-
* Read the trailing `n` lines of a logfile, best-effort. Used to surface the
|
|
186
|
-
* real boot error when a start fails — operators shouldn't have to manually
|
|
187
|
-
* `tail` the log to learn *why* the daemon died. Returns [] on any read
|
|
188
|
-
* error (missing file, permissions) so the caller falls back to the generic
|
|
189
|
-
* "tail the log" hint without throwing.
|
|
190
|
-
*/
|
|
191
|
-
function readLogTail(logFile: string, n: number): string[] {
|
|
192
|
-
try {
|
|
193
|
-
const content = readFileSync(logFile, "utf8");
|
|
194
|
-
const trimmed = content.replace(/\n$/, "");
|
|
195
|
-
if (trimmed === "") return [];
|
|
196
|
-
return trimmed.split("\n").slice(-n);
|
|
197
|
-
} catch {
|
|
198
|
-
return [];
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
/**
|
|
203
|
-
* Heuristic EADDRINUSE detector over a logfile tail. cloudflared, Bun, and
|
|
204
|
-
* Node all surface port collisions with recognizable phrases; we match the
|
|
205
|
-
* common ones rather than parse a structured error (there isn't one across
|
|
206
|
-
* runtimes). False positives are harmless — the worst case is we *also* print
|
|
207
|
-
* the port-in-use remedy on an unrelated failure, which is still actionable.
|
|
208
|
-
*/
|
|
209
|
-
function detectAddrInUse(logTail: readonly string[]): boolean {
|
|
210
|
-
return logTail.some((line) => /EADDRINUSE|address already in use|port .* in use/i.test(line));
|
|
211
|
-
}
|
|
212
|
-
|
|
213
119
|
export interface LifecycleOpts {
|
|
214
|
-
spawner?: Spawner;
|
|
215
|
-
kill?: KillFn;
|
|
216
|
-
alive?: AliveFn;
|
|
217
|
-
sleep?: SleepFn;
|
|
218
|
-
now?: () => number;
|
|
219
120
|
manifestPath?: string;
|
|
220
121
|
configDir?: string;
|
|
221
122
|
log?: (line: string) => void;
|
|
222
|
-
/** How long stop waits for SIGTERM before escalating to SIGKILL. */
|
|
223
|
-
killWaitMs?: number;
|
|
224
|
-
/** Poll interval while waiting for SIGTERM to land. */
|
|
225
|
-
pollIntervalMs?: number;
|
|
226
123
|
/**
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
*
|
|
230
|
-
* spawn returned a pid). 250ms is the default in production — long
|
|
231
|
-
* enough to catch real silent-crashes (resolve failures, port
|
|
232
|
-
* collisions, missing args) without making `parachute start` feel
|
|
233
|
-
* laggy.
|
|
234
|
-
*
|
|
235
|
-
* Defaulting policy: if `alive` is not overridden, the settle defaults
|
|
236
|
-
* to 0 (skipped). Stub spawners hand back fake pids that the real
|
|
237
|
-
* `defaultAlive` would mark as dead, which would make every existing
|
|
238
|
-
* stub-spawner test fail spuriously. Tests that want to exercise the
|
|
239
|
-
* settle path inject both `alive` and `startSettleMs` explicitly.
|
|
240
|
-
* Production paths use the real `defaultAlive` and get the real 250ms
|
|
241
|
-
* settle.
|
|
242
|
-
*/
|
|
243
|
-
startSettleMs?: number;
|
|
244
|
-
/**
|
|
245
|
-
* Probe whether the service's port is listening, post-spawn. Pairs with the
|
|
246
|
-
* settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
|
|
247
|
-
* process survives the liveness window (vault lingers / retries) but never
|
|
248
|
-
* binds because the port is already held, so `start` would otherwise report
|
|
249
|
-
* "✓ started" while `status` shows it inactive. Tests inject a stub;
|
|
250
|
-
* production uses `defaultPortListening` (a loopback TCP connect probe).
|
|
251
|
-
*/
|
|
252
|
-
portListening?: PortListeningFn;
|
|
253
|
-
/**
|
|
254
|
-
* How long `start` polls for the service to bind its port after the
|
|
255
|
-
* liveness settle passes. Default 4000ms in production — long enough to
|
|
256
|
-
* cover vault/scribe cold-boot (DB open, route registration) without making
|
|
257
|
-
* a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
|
|
258
|
-
* first time the port answers we declare success. If the window elapses
|
|
259
|
-
* with the process still alive but the port silent, we print a non-fatal
|
|
260
|
-
* warning (the daemon may still be coming up) rather than failing — only a
|
|
261
|
-
* *dead* process is a hard failure. Defaulting policy mirrors
|
|
262
|
-
* `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
|
|
263
|
-
* production path (no spawner override) is active.
|
|
264
|
-
*/
|
|
265
|
-
startReadyMs?: number;
|
|
266
|
-
/** Poll interval while waiting for the port to come up. Default 200ms. */
|
|
267
|
-
startReadyPollMs?: number;
|
|
268
|
-
/**
|
|
269
|
-
* Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
|
|
270
|
-
* unset, `start` derives it from `expose-state.json` (when exposed) or
|
|
271
|
-
* the hub.port file (local dev). Undefined → no env var is set at all,
|
|
272
|
-
* and the service advertises its own default issuer.
|
|
124
|
+
* Override the hub origin used as the operator token's `iss` validator on the
|
|
125
|
+
* loopback module-ops call. If unset, derived from `expose-state.json` (when
|
|
126
|
+
* exposed) or the hub.port file (local dev).
|
|
273
127
|
*/
|
|
274
128
|
hubOrigin?: string;
|
|
275
129
|
/**
|
|
276
|
-
*
|
|
277
|
-
*
|
|
278
|
-
*
|
|
279
|
-
*
|
|
280
|
-
*
|
|
281
|
-
*
|
|
282
|
-
*/
|
|
283
|
-
/**
|
|
284
|
-
* PATH-resolution seam for the start preflight (`@openparachute/depcheck`
|
|
285
|
-
* `ensureExecutable`). Production uses the real `Bun.which`; a missing
|
|
286
|
-
* startCmd binary then surfaces the friendly missing-dependency UX +
|
|
287
|
-
* persists it to services.json.
|
|
130
|
+
* Supervisor-path seams (design §3.3) — the ONLY runtime as of Phase 5b.
|
|
131
|
+
* `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
|
|
132
|
+
* loopback module-ops API (per-module verbs) or the platform manager (hub
|
|
133
|
+
* verbs / no-svc). The detached spawners are retired; a box with no hub unit
|
|
134
|
+
* goes through the §7.5 auto-offer / actionable error (`migrateOffer`), never
|
|
135
|
+
* a detached spawn.
|
|
288
136
|
*
|
|
289
|
-
*
|
|
290
|
-
*
|
|
291
|
-
*
|
|
292
|
-
*
|
|
293
|
-
*
|
|
294
|
-
*
|
|
295
|
-
*
|
|
137
|
+
* Everything here is injectable so tests can (a) force the unit-installed
|
|
138
|
+
* branch without a real launchd/systemd, and (b) assert the module-ops /
|
|
139
|
+
* manager calls without a live hub. Production wires the real
|
|
140
|
+
* {@link driveModuleOp} / {@link ensureHubUnit} / {@link stopHubUnit} /
|
|
141
|
+
* {@link restartHubUnit} against an opened hub.db + the resolved hub origin.
|
|
142
|
+
*
|
|
143
|
+
* `unitInstalled` is the discriminant that decides whether the box is already
|
|
144
|
+
* supervised. When OMITTED entirely it defaults to `false` → the verb runs the
|
|
145
|
+
* no-unit path (auto-offer / error). The production CLI dispatch passes
|
|
146
|
+
* `supervisor: {}` so the real `isHubUnitInstalled` probe decides.
|
|
296
147
|
*/
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
148
|
+
supervisor?: {
|
|
149
|
+
/**
|
|
150
|
+
* Is a hub unit installed (the dual-dispatch discriminant)? Production
|
|
151
|
+
* uses `isHubUnitInstalled(hubUnitDeps)`. Tests set this `true`/`false`
|
|
152
|
+
* directly to pick the branch deterministically. When set, it wins over
|
|
153
|
+
* the `hubUnitDeps`-derived detection.
|
|
154
|
+
*/
|
|
155
|
+
unitInstalled?: boolean;
|
|
156
|
+
/** Deps for the real `isHubUnitInstalled` probe + the hub-unit manager ops. */
|
|
157
|
+
hubUnitDeps?: HubUnitDeps;
|
|
158
|
+
/** Drive a per-module op against the running hub (reads operator.token). */
|
|
159
|
+
driveModuleOp?: (
|
|
160
|
+
short: string,
|
|
161
|
+
op: ModuleOp,
|
|
162
|
+
deps: DriveModuleOpDeps,
|
|
163
|
+
) => Promise<ModuleOpResult>;
|
|
164
|
+
/** Ensure the hub unit is up before a module op (§3.2). */
|
|
165
|
+
ensureHubUnit?: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
|
|
166
|
+
/** Stop the hub unit via the platform manager (NEVER a PID signal, §3.3). */
|
|
167
|
+
stopHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
|
|
168
|
+
/** Restart the hub unit via the platform manager (NEVER a PID signal, §3.3). */
|
|
169
|
+
restartHubUnit?: (deps: HubUnitDeps) => HubUnitManagerOpResult;
|
|
301
170
|
/**
|
|
302
|
-
*
|
|
303
|
-
*
|
|
304
|
-
*
|
|
305
|
-
*
|
|
306
|
-
* `start hub`.
|
|
171
|
+
* Probe whether the loopback hub answers `/health`. Used by `stop <svc>`:
|
|
172
|
+
* if the hub is down, the supervised module is already down (children die
|
|
173
|
+
* with the hub) → report "already stopped" WITHOUT starting the hub.
|
|
174
|
+
* Production reuses the hub-unit deps' `probeHealth`.
|
|
307
175
|
*/
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
176
|
+
probeHubHealth?: (port: number) => Promise<boolean>;
|
|
177
|
+
/**
|
|
178
|
+
* Open the hub DB used to validate/auto-rotate the operator token in
|
|
179
|
+
* `driveModuleOp`. Production opens `<configDir>/hub.db`; tests inject an
|
|
180
|
+
* in-memory/seeded db. Returns a handle the caller closes.
|
|
181
|
+
*/
|
|
182
|
+
openDb?: (configDir: string) => import("bun:sqlite").Database;
|
|
183
|
+
/** Loopback hub base URL override (default derives from the hub port). */
|
|
184
|
+
baseUrl?: string;
|
|
185
|
+
};
|
|
186
|
+
/**
|
|
187
|
+
* §7.5 auto-detect-and-offer seam. When a verb takes the DETACHED arm (no hub
|
|
188
|
+
* unit installed) and a prior detached install is detected, the verb offers
|
|
189
|
+
* the supervised cutover (interactive) or prints the command (non-TTY) BEFORE
|
|
190
|
+
* doing detached work. Injectable so tests can (a) stub the offer to assert it
|
|
191
|
+
* fires / migrates / declines, and (b) DISABLE it entirely (`enabled:false`)
|
|
192
|
+
* so the hundreds of existing detached-arm lifecycle tests don't trip an
|
|
193
|
+
* interactive prompt. Production wires the real `offerMigrateToSupervised`.
|
|
194
|
+
*
|
|
195
|
+
* Default when OMITTED: disabled, so existing tests (which never opt in) stay
|
|
196
|
+
* deterministic. The production CLI dispatch passes `{ enabled: true }`.
|
|
197
|
+
*/
|
|
198
|
+
migrateOffer?: {
|
|
199
|
+
/** Master switch. Default `false` when the whole block is omitted. */
|
|
200
|
+
enabled?: boolean;
|
|
201
|
+
/** The offer implementation (default `offerMigrateToSupervised`). */
|
|
202
|
+
offer?: (opts: MigrateOfferOpts) => Promise<MigrateOfferResult>;
|
|
313
203
|
};
|
|
314
204
|
}
|
|
315
205
|
|
|
316
206
|
interface Resolved {
|
|
317
|
-
spawner: Spawner;
|
|
318
|
-
kill: KillFn;
|
|
319
|
-
alive: AliveFn;
|
|
320
|
-
sleep: SleepFn;
|
|
321
|
-
now: () => number;
|
|
322
207
|
manifestPath: string;
|
|
323
208
|
configDir: string;
|
|
324
209
|
log: (line: string) => void;
|
|
325
|
-
killWaitMs: number;
|
|
326
|
-
pollIntervalMs: number;
|
|
327
|
-
startSettleMs: number;
|
|
328
|
-
portListening: PortListeningFn;
|
|
329
|
-
startReadyMs: number;
|
|
330
|
-
startReadyPollMs: number;
|
|
331
|
-
which: (cmd: string) => string | null;
|
|
332
210
|
hubOrigin: string | undefined;
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
}) => Promise<OperatorIssuerHealStatus>;
|
|
211
|
+
sup: ResolvedSupervisor;
|
|
212
|
+
/** §7.5 resolved auto-offer (enabled flag + the offer impl). */
|
|
213
|
+
migrateOffer: {
|
|
214
|
+
enabled: boolean;
|
|
215
|
+
offer: (opts: MigrateOfferOpts) => Promise<MigrateOfferResult>;
|
|
216
|
+
};
|
|
340
217
|
}
|
|
341
218
|
|
|
342
|
-
/**
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
const db = openHubDb(hubDbPath(args.configDir));
|
|
355
|
-
try {
|
|
356
|
-
return await selfHealOperatorTokenIssuer(db, {
|
|
357
|
-
issuer: args.issuer,
|
|
358
|
-
configDir: args.configDir,
|
|
359
|
-
log: args.log,
|
|
360
|
-
});
|
|
361
|
-
} finally {
|
|
362
|
-
db.close();
|
|
363
|
-
}
|
|
219
|
+
/** Resolved supervisor-path seams (see `LifecycleOpts.supervisor`). */
|
|
220
|
+
interface ResolvedSupervisor {
|
|
221
|
+
/** Whether a hub unit is installed — the dual-dispatch discriminant. */
|
|
222
|
+
unitInstalled: boolean;
|
|
223
|
+
hubUnitDeps: HubUnitDeps;
|
|
224
|
+
driveModuleOp: (short: string, op: ModuleOp, deps: DriveModuleOpDeps) => Promise<ModuleOpResult>;
|
|
225
|
+
ensureHubUnit: (opts: EnsureHubUnitOpts) => Promise<EnsureHubUnitResult>;
|
|
226
|
+
stopHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
|
|
227
|
+
restartHubUnit: (deps: HubUnitDeps) => HubUnitManagerOpResult;
|
|
228
|
+
probeHubHealth: (port: number) => Promise<boolean>;
|
|
229
|
+
openDb: (configDir: string) => import("bun:sqlite").Database;
|
|
230
|
+
baseUrl: string | undefined;
|
|
364
231
|
}
|
|
365
232
|
|
|
366
233
|
function resolve(opts: LifecycleOpts): Resolved {
|
|
367
234
|
const configDir = opts.configDir ?? CONFIG_DIR;
|
|
368
235
|
return {
|
|
369
|
-
spawner: opts.spawner ?? defaultSpawner,
|
|
370
|
-
kill: opts.kill ?? defaultKill,
|
|
371
|
-
alive: opts.alive ?? defaultAlive,
|
|
372
|
-
sleep: opts.sleep ?? defaultSleep,
|
|
373
|
-
now: opts.now ?? Date.now,
|
|
374
236
|
manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
|
|
375
237
|
configDir,
|
|
376
238
|
log: opts.log ?? ((line) => console.log(line)),
|
|
377
|
-
killWaitMs: opts.killWaitMs ?? 10_000,
|
|
378
|
-
pollIntervalMs: opts.pollIntervalMs ?? 200,
|
|
379
|
-
// See `LifecycleOpts.startSettleMs` doc. Production (no spawner
|
|
380
|
-
// override, no alive override) gets the 250ms settle. Tests that
|
|
381
|
-
// inject a stub spawner without a stub alive get 0 — `defaultAlive`
|
|
382
|
-
// against a fake pid would always report dead and break unrelated
|
|
383
|
-
// tests. Tests that want to exercise the settle path explicitly
|
|
384
|
-
// override `alive`, which re-enables the default 250ms.
|
|
385
|
-
startSettleMs:
|
|
386
|
-
opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
|
|
387
|
-
portListening: opts.portListening ?? defaultPortListening,
|
|
388
|
-
// Same defaulting policy as startSettleMs: production (no spawner
|
|
389
|
-
// override) gets the real 4s readiness window; tests that inject a stub
|
|
390
|
-
// spawner get 0 (skipped) unless they explicitly opt in via
|
|
391
|
-
// `portListening` or `startReadyMs`, so existing stub-spawner tests don't
|
|
392
|
-
// start probing a fake port.
|
|
393
|
-
startReadyMs:
|
|
394
|
-
opts.startReadyMs ??
|
|
395
|
-
(opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
|
|
396
|
-
startReadyPollMs: opts.startReadyPollMs ?? 200,
|
|
397
|
-
// Same defaulting policy as startSettleMs/startReadyMs: production (no
|
|
398
|
-
// spawner override) preflights with the real Bun.which; stub-spawner tests
|
|
399
|
-
// get a permissive resolver so the preflight doesn't trip against binaries
|
|
400
|
-
// that aren't on the test host's PATH. Explicit `which` always wins.
|
|
401
|
-
which:
|
|
402
|
-
opts.which ?? (opts.spawner === undefined ? Bun.which : () => "/stub/bin/preflight-skipped"),
|
|
403
239
|
hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
240
|
+
sup: resolveSupervisor(opts.supervisor),
|
|
241
|
+
migrateOffer: {
|
|
242
|
+
// Default OFF when omitted so the existing supervised-arm + no-unit
|
|
243
|
+
// lifecycle tests (which don't opt in) don't trip an interactive prompt.
|
|
244
|
+
// The production CLI dispatch passes `{ enabled: true }`.
|
|
245
|
+
enabled: opts.migrateOffer?.enabled ?? false,
|
|
246
|
+
offer: opts.migrateOffer?.offer ?? offerMigrateToSupervised,
|
|
247
|
+
},
|
|
248
|
+
};
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
/**
|
|
252
|
+
* Resolve the supervisor-path seams.
|
|
253
|
+
*
|
|
254
|
+
* The discriminant `unitInstalled` decides whether the box is already supervised:
|
|
255
|
+
* - When the caller PROVIDES a `supervisor` block (even `{}`, which the
|
|
256
|
+
* production CLI dispatch passes), `unitInstalled` is the explicit override
|
|
257
|
+
* if set, else the real `isHubUnitInstalled` probe over the hub-unit deps.
|
|
258
|
+
* - When the caller OMITS `supervisor` entirely, `unitInstalled` defaults to
|
|
259
|
+
* `false` → the verb runs the no-unit path (§7.5 auto-offer / actionable
|
|
260
|
+
* error). Deterministic regardless of whether the test host happens to have a
|
|
261
|
+
* real hub unit installed.
|
|
262
|
+
*/
|
|
263
|
+
function resolveSupervisor(opts: LifecycleOpts["supervisor"]): ResolvedSupervisor {
|
|
264
|
+
const hubUnitDeps = opts?.hubUnitDeps ?? defaultHubUnitDeps;
|
|
265
|
+
// No `supervisor` block at all → no-unit path, deterministically. Only probe
|
|
266
|
+
// the real filesystem when the caller opted in (production CLI passes
|
|
267
|
+
// `supervisor: {}`; tests pass the seams they want to assert).
|
|
268
|
+
const unitInstalled =
|
|
269
|
+
opts === undefined ? false : (opts.unitInstalled ?? isHubUnitInstalled(hubUnitDeps));
|
|
270
|
+
return {
|
|
271
|
+
unitInstalled,
|
|
272
|
+
hubUnitDeps,
|
|
273
|
+
driveModuleOp: opts?.driveModuleOp ?? driveModuleOpImpl,
|
|
274
|
+
ensureHubUnit: opts?.ensureHubUnit ?? ensureHubUnitImpl,
|
|
275
|
+
stopHubUnit: opts?.stopHubUnit ?? stopHubUnitImpl,
|
|
276
|
+
restartHubUnit: opts?.restartHubUnit ?? restartHubUnitImpl,
|
|
277
|
+
probeHubHealth: opts?.probeHubHealth ?? hubUnitDeps.probeHealth,
|
|
278
|
+
openDb: opts?.openDb ?? ((configDir) => openHubDb(hubDbPath(configDir))),
|
|
279
|
+
baseUrl: opts?.baseUrl,
|
|
407
280
|
};
|
|
408
281
|
}
|
|
409
282
|
|
|
283
|
+
/**
|
|
284
|
+
* §7.5 auto-detect-and-offer hook for the no-unit case of start/stop/restart.
|
|
285
|
+
*
|
|
286
|
+
* Called when a verb finds NO hub unit installed (Phase 5b removed the detached
|
|
287
|
+
* spawners, so there is no detached arm to fall back to). When the offer is
|
|
288
|
+
* enabled, it runs `offerMigrateToSupervised` (which itself checks "no unit +
|
|
289
|
+
* prior detached" and prompts / prints). Returns `true` ONLY when the operator
|
|
290
|
+
* accepted AND the cutover succeeded — i.e. the box is NOW supervised, so the
|
|
291
|
+
* caller can dispatch through the supervisor path. Every other outcome (offer
|
|
292
|
+
* disabled, no-offer, declined, printed in a non-TTY, migrate-failed) returns
|
|
293
|
+
* `false` → the caller surfaces the actionable "run `parachute migrate
|
|
294
|
+
* --to-supervised`" error (NOT a detached spawn — that path is gone).
|
|
295
|
+
*
|
|
296
|
+
* The migrate-failed case deliberately returns `false`: a failed cutover leaves
|
|
297
|
+
* the box un-migrated (the cutover is fail-safe + re-runnable), so the verb
|
|
298
|
+
* surfaces the error rather than dispatching into a supervisor that isn't up.
|
|
299
|
+
*/
|
|
300
|
+
async function maybeOfferAndMigrate(r: Resolved): Promise<boolean> {
|
|
301
|
+
if (!r.migrateOffer.enabled) return false;
|
|
302
|
+
const result = await r.migrateOffer.offer({
|
|
303
|
+
configDir: r.configDir,
|
|
304
|
+
manifestPath: r.manifestPath,
|
|
305
|
+
log: r.log,
|
|
306
|
+
});
|
|
307
|
+
if (result.outcome === "migrated") {
|
|
308
|
+
// The box is now supervised. Flip the resolved discriminant so the verb
|
|
309
|
+
// takes the supervisor arm (the unit is freshly installed; `unitInstalled`
|
|
310
|
+
// was resolved as false before the offer).
|
|
311
|
+
r.sup.unitInstalled = true;
|
|
312
|
+
return true;
|
|
313
|
+
}
|
|
314
|
+
return false;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
/**
|
|
318
|
+
* Phase 5b single-path gate (the point-of-no-return). The supervised path is the
|
|
319
|
+
* ONLY runtime — the detached spawners are retired. So every per-module verb must
|
|
320
|
+
* first establish that a hub unit is installed; if it isn't, there is no detached
|
|
321
|
+
* fallback to take. Resolution order:
|
|
322
|
+
*
|
|
323
|
+
* 1. Unit installed → ready; dispatch through the supervisor.
|
|
324
|
+
* 2. No unit → run the §7.5 auto-detect-and-offer. If the operator accepts the
|
|
325
|
+
* cutover and it succeeds, the box is now supervised → ready.
|
|
326
|
+
* 3. Still no unit (offer disabled / no prior-detached evidence / declined /
|
|
327
|
+
* printed in a non-TTY / migrate-failed) → surface the actionable error and
|
|
328
|
+
* return NOT ready. The verb returns a non-zero exit; it NEVER spawns a
|
|
329
|
+
* detached daemon (that machinery is gone).
|
|
330
|
+
*
|
|
331
|
+
* The offer itself logs its own context (interactive prompt / printed command),
|
|
332
|
+
* so when it fired we don't double-print the bare error. We only print the
|
|
333
|
+
* actionable fallback line when no offer was surfaced (offer disabled or no
|
|
334
|
+
* prior-detached evidence — a genuinely-unmigrated or clean box driven by a
|
|
335
|
+
* script).
|
|
336
|
+
*/
|
|
337
|
+
async function requireSupervisedOrOffer(r: Resolved): Promise<boolean> {
|
|
338
|
+
if (r.sup.unitInstalled) return true;
|
|
339
|
+
const migrated = await maybeOfferAndMigrate(r);
|
|
340
|
+
if (migrated) return true;
|
|
341
|
+
// No unit and not migrated. If the offer was enabled it already surfaced its
|
|
342
|
+
// own guidance (prompt / printed command / declined note); otherwise print the
|
|
343
|
+
// actionable command so a script on a never-migrated box isn't left guessing.
|
|
344
|
+
if (!r.migrateOffer.enabled) {
|
|
345
|
+
r.log(
|
|
346
|
+
"No supervised hub unit is installed. Run `parachute migrate --to-supervised` to install it,",
|
|
347
|
+
);
|
|
348
|
+
r.log("or run `parachute serve` in the foreground.");
|
|
349
|
+
}
|
|
350
|
+
return false;
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
/**
|
|
354
|
+
* Resolve the hub origin used as the operator token's `iss` validator in the
|
|
355
|
+
* supervisor path. Unlike {@link resolveHubOrigin} (which returns `undefined`
|
|
356
|
+
* for pure loopback so the spawn env omits PARACHUTE_HUB_ORIGIN), the operator
|
|
357
|
+
* token ALWAYS carries an `iss`, so this falls back to the canonical loopback
|
|
358
|
+
* origin. Mirrors `commands/auth.ts`'s `resolveHubIssuer` so the issuer the CLI
|
|
359
|
+
* validates the token against matches what `auth rotate-operator` minted under.
|
|
360
|
+
* The fallback differs cosmetically — here `readHubPort(configDir) ??
|
|
361
|
+
* HUB_UNIT_DEFAULT_PORT`, in auth.ts `127.0.0.1:${HUB_DEFAULT_PORT}` — but both
|
|
362
|
+
* resolve to 1939 under canonical-ports today, so they agree in practice.
|
|
363
|
+
* See #508: consolidate with auth.ts:resolveHubIssuer to prevent drift.
|
|
364
|
+
*/
|
|
365
|
+
function resolveOperatorTokenIssuer(hubOrigin: string | undefined, configDir: string): string {
|
|
366
|
+
if (hubOrigin) return hubOrigin;
|
|
367
|
+
const port = readHubPort(configDir) ?? HUB_UNIT_DEFAULT_PORT;
|
|
368
|
+
return `http://127.0.0.1:${port}`;
|
|
369
|
+
}
|
|
370
|
+
|
|
410
371
|
/**
|
|
411
372
|
* Source of truth order for `PARACHUTE_HUB_ORIGIN`:
|
|
412
373
|
* 1. explicit override (flag / opt)
|
|
@@ -422,538 +383,216 @@ function resolveHubOrigin(override: string | undefined, configDir: string): stri
|
|
|
422
383
|
return deriveHubOrigin({ exposeFqdn, hubPort: readHubPort(configDir) });
|
|
423
384
|
}
|
|
424
385
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
* a row has neither — `start` prints the actionable "no installDir"
|
|
433
|
-
* re-install message for an installDir-less third-party row, or
|
|
434
|
-
* "lifecycle not yet supported" otherwise; `stop`/`logs` keep working
|
|
435
|
-
* via pidfile/logfile semantics keyed by `short`.
|
|
436
|
-
*/
|
|
437
|
-
spec: ServiceSpec | undefined;
|
|
386
|
+
export async function start(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
|
|
387
|
+
const r = resolve(opts);
|
|
388
|
+
// Phase 5b single-path (design §8 Phase 5 + Appendix). The supervised path is
|
|
389
|
+
// the ONLY runtime — the detached spawners are retired. A box without a hub
|
|
390
|
+
// unit gets the §7.5 auto-offer / actionable error, NEVER a detached spawn.
|
|
391
|
+
if (!(await requireSupervisedOrOffer(r))) return 1;
|
|
392
|
+
return startViaSupervisor(svc, r);
|
|
438
393
|
}
|
|
439
394
|
|
|
440
|
-
async function
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
//
|
|
450
|
-
//
|
|
451
|
-
//
|
|
452
|
-
//
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
if (entry.installDir) {
|
|
456
|
-
try {
|
|
457
|
-
const manifest = await readModuleManifest(entry.installDir);
|
|
458
|
-
if (manifest) return { spec: composeKnownModuleSpec(km, manifest) };
|
|
459
|
-
} catch (err) {
|
|
460
|
-
if (err instanceof ModuleManifestError) {
|
|
461
|
-
// Surface the parse/validation error but keep the legacy
|
|
462
|
-
// imperative-startCmd spec so `start` can still spawn — better
|
|
463
|
-
// than no lifecycle at all when a module ships a typo'd manifest.
|
|
464
|
-
return { spec: firstParty, error: err.message };
|
|
465
|
-
}
|
|
466
|
-
throw err;
|
|
467
|
-
}
|
|
468
|
-
}
|
|
469
|
-
return { spec: firstParty };
|
|
470
|
-
}
|
|
471
|
-
// FIRST_PARTY_FALLBACKS shorts (notes / channel): the vendored manifest
|
|
472
|
-
// is authoritative — startCmd is composed from extras + manifest at
|
|
473
|
-
// `getSpec` time, no installDir read needed.
|
|
474
|
-
if (firstParty) return { spec: firstParty };
|
|
475
|
-
// Third-party rows: spec lives in the module's installDir/module.json.
|
|
476
|
-
if (!entry.installDir) return { spec: undefined };
|
|
477
|
-
try {
|
|
478
|
-
const spec = await getSpecFromInstallDir(entry.installDir, entry.name);
|
|
479
|
-
return { spec: spec ?? undefined };
|
|
480
|
-
} catch (err) {
|
|
481
|
-
if (err instanceof ModuleManifestError) {
|
|
482
|
-
return { spec: undefined, error: err.message };
|
|
483
|
-
}
|
|
484
|
-
throw err;
|
|
485
|
-
}
|
|
395
|
+
export async function stop(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
|
|
396
|
+
const r = resolve(opts);
|
|
397
|
+
// Phase 5b single-path: supervised is the only runtime (see `start`).
|
|
398
|
+
if (!(await requireSupervisedOrOffer(r))) return 1;
|
|
399
|
+
return stopViaSupervisor(svc, r);
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
export async function restart(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
|
|
403
|
+
const r = resolve(opts);
|
|
404
|
+
// Phase 5b single-path: supervised is the only runtime. The 404-fallthrough
|
|
405
|
+
// (a not-currently-supervised module → start, §6.2) lives in
|
|
406
|
+
// `restartViaSupervisor`, which makes `restart <svc>` total over module state
|
|
407
|
+
// just as the retired detached stop+start was.
|
|
408
|
+
if (!(await requireSupervisedOrOffer(r))) return 1;
|
|
409
|
+
return restartViaSupervisor(svc, r);
|
|
486
410
|
}
|
|
487
411
|
|
|
412
|
+
// ---------------------------------------------------------------------------
|
|
413
|
+
// Supervisor-path verb dispatch (design §3.3) — the ONLY runtime as of Phase 5b.
|
|
414
|
+
//
|
|
415
|
+
// `start/stop/restart` drive the RUNNING hub's in-process Supervisor over the
|
|
416
|
+
// loopback module-ops API (per-module verbs) or the platform manager (hub
|
|
417
|
+
// verbs / no-svc). The detached arm was retired in Phase 5b — a box with no hub
|
|
418
|
+
// unit goes through `requireSupervisedOrOffer` (§7.5 auto-offer / actionable
|
|
419
|
+
// error), never a detached spawn.
|
|
420
|
+
// ---------------------------------------------------------------------------
|
|
421
|
+
|
|
488
422
|
/**
|
|
489
|
-
*
|
|
490
|
-
*
|
|
491
|
-
*
|
|
492
|
-
*
|
|
493
|
-
*
|
|
494
|
-
*
|
|
495
|
-
*
|
|
496
|
-
* third-party). First-party are addressed by their short name (vault,
|
|
497
|
-
* notes, …) and matched via `shortNameForManifest`.
|
|
498
|
-
*
|
|
499
|
-
* Named-path detail: a third-party row whose name matches but lacks
|
|
500
|
-
* `installDir` resolves to the entry with `spec: undefined` (rather than
|
|
501
|
-
* an "unknown service" error). `stop`/`logs` handle the spec-less case
|
|
502
|
-
* via pidfile/logfile semantics; `start` surfaces an actionable
|
|
503
|
-
* re-install hint downstream. The genuinely-unknown path (no first-party
|
|
504
|
-
* fallback AND no row in services.json) still errors as `unknown service`.
|
|
423
|
+
* Drive a single module-op against the running hub, mapping the module-ops
|
|
424
|
+
* client's errors to actionable CLI output (§3.1). Opens hub.db (to validate /
|
|
425
|
+
* auto-rotate the operator token), resolves the issuer the token was minted
|
|
426
|
+
* under, and closes the db afterward. Returns the result on success; on a
|
|
427
|
+
* surfaced error returns `undefined` so the caller can decide (e.g. the restart
|
|
428
|
+
* 404-fallthrough). Re-throws nothing the caller can't handle: the operator-
|
|
429
|
+
* token / HTTP errors are caught here and printed.
|
|
505
430
|
*/
|
|
506
|
-
async function
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
const entry = manifest.services.find((s) => s.name === firstPartySpec.manifestName);
|
|
520
|
-
if (!entry) {
|
|
521
|
-
return { error: `${svc} isn't installed. Run \`parachute install ${svc}\` first.` };
|
|
522
|
-
}
|
|
523
|
-
// KNOWN_MODULES path (hub#310): `getSpec` returns a startCmd-less
|
|
524
|
-
// minimal spec for vault / scribe / runner. Compose the full
|
|
525
|
-
// spawnable spec by reading installDir's module.json so `start` /
|
|
526
|
-
// `restart` see the real startCmd. FIRST_PARTY_FALLBACKS path:
|
|
527
|
-
// `firstPartySpec.startCmd` is already populated, and `specForEntry`
|
|
528
|
-
// short-circuits without re-reading.
|
|
529
|
-
const { spec, error } = await specForEntry(svc, entry);
|
|
530
|
-
if (error) return { error: `${svc}: invalid module.json — ${error}` };
|
|
531
|
-
return { targets: [{ short: svc, entry, spec: spec ?? firstPartySpec }] };
|
|
532
|
-
}
|
|
533
|
-
// Third-party: match a services.json row by name. Rows with `installDir`
|
|
534
|
-
// resolve a full spec from the on-disk module.json. Rows without it are
|
|
535
|
-
// still managed (stop/logs use pidfile/logfile semantics keyed by short
|
|
536
|
-
// name), but with `spec: undefined` — `start` will surface an
|
|
537
|
-
// installDir-specific error downstream rather than reject up front.
|
|
538
|
-
const entry = manifest.services.find((s) => s.name === svc);
|
|
539
|
-
if (entry) {
|
|
540
|
-
if (entry.installDir) {
|
|
541
|
-
const { spec, error } = await specForEntry(svc, entry);
|
|
542
|
-
if (error) return { error: `${svc}: invalid module.json — ${error}` };
|
|
543
|
-
return { targets: [{ short: svc, entry, spec }] };
|
|
544
|
-
}
|
|
545
|
-
return { targets: [{ short: svc, entry, spec: undefined }] };
|
|
546
|
-
}
|
|
547
|
-
return {
|
|
548
|
-
error: `unknown service "${svc}". known: ${knownServices().join(", ")}`,
|
|
431
|
+
async function driveSupervisorOp(
|
|
432
|
+
short: string,
|
|
433
|
+
op: ModuleOp,
|
|
434
|
+
r: Resolved,
|
|
435
|
+
): Promise<{ result?: ModuleOpResult; httpError?: ModuleOpHttpError; failed: boolean }> {
|
|
436
|
+
const issuer = resolveOperatorTokenIssuer(r.hubOrigin, r.configDir);
|
|
437
|
+
const db = r.sup.openDb(r.configDir);
|
|
438
|
+
try {
|
|
439
|
+
const deps: DriveModuleOpDeps = {
|
|
440
|
+
db,
|
|
441
|
+
issuer,
|
|
442
|
+
configDir: r.configDir,
|
|
443
|
+
...(r.sup.baseUrl !== undefined ? { baseUrl: r.sup.baseUrl } : {}),
|
|
549
444
|
};
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
// for vault / scribe / runner — read installDir's module.json to
|
|
558
|
-
// compose the spawnable spec. FIRST_PARTY_FALLBACKS shorts get
|
|
559
|
-
// back the same vendored-startCmd-bearing spec from `getSpec`.
|
|
560
|
-
const { spec } = await specForEntry(short, entry);
|
|
561
|
-
targets.push({ short, entry, spec });
|
|
562
|
-
continue;
|
|
445
|
+
const result = await r.sup.driveModuleOp(short, op, deps);
|
|
446
|
+
return { result, failed: false };
|
|
447
|
+
} catch (err) {
|
|
448
|
+
if (err instanceof NoOperatorTokenError || err instanceof OperatorTokenExpiredError) {
|
|
449
|
+
// Surface the already-actionable message (don't raw-throw a 401, §3.1).
|
|
450
|
+
r.log(`✗ ${short}: ${err.message}`);
|
|
451
|
+
return { failed: true };
|
|
563
452
|
}
|
|
564
|
-
if (
|
|
565
|
-
|
|
566
|
-
|
|
453
|
+
if (err instanceof ModuleOpHttpError) {
|
|
454
|
+
// Return the typed HTTP error so the caller can branch (404-fallthrough,
|
|
455
|
+
// not_installed hint). Callers that don't branch print it via
|
|
456
|
+
// `surfaceModuleOpHttpError`.
|
|
457
|
+
return { httpError: err, failed: true };
|
|
567
458
|
}
|
|
459
|
+
// Unknown error — surface its message rather than crashing the CLI.
|
|
460
|
+
r.log(`✗ ${short}: ${err instanceof Error ? err.message : String(err)}`);
|
|
461
|
+
return { failed: true };
|
|
462
|
+
} finally {
|
|
463
|
+
db.close();
|
|
568
464
|
}
|
|
569
|
-
if (targets.length === 0) {
|
|
570
|
-
return { error: "No manageable services in services.json." };
|
|
571
|
-
}
|
|
572
|
-
return { targets };
|
|
573
465
|
}
|
|
574
466
|
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
if (
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
return
|
|
582
|
-
}
|
|
583
|
-
|
|
584
|
-
let failures = 0;
|
|
585
|
-
for (const { short, entry, spec } of picked.targets) {
|
|
586
|
-
const state = processState(short, r.configDir, r.alive);
|
|
587
|
-
if (state.status === "running") {
|
|
588
|
-
r.log(`${short} already running (pid ${state.pid}).`);
|
|
589
|
-
continue;
|
|
590
|
-
}
|
|
591
|
-
if (state.pid !== undefined) {
|
|
592
|
-
// Stale PID file for a dead process — clear it before we spawn fresh.
|
|
593
|
-
clearPid(short, r.configDir);
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
const cmd = spec?.startCmd?.(entry);
|
|
597
|
-
if (!cmd || cmd.length === 0) {
|
|
598
|
-
// Distinguish the missing-installDir case from "spec resolved but has
|
|
599
|
-
// no startCmd" — the former is fixable by re-registering the module,
|
|
600
|
-
// the latter is a hub-level limitation. Third-party rows hit the first
|
|
601
|
-
// branch when their self-registration predates the installDir contract.
|
|
602
|
-
if (!getSpec(short) && !entry.installDir) {
|
|
603
|
-
r.log(
|
|
604
|
-
`${short}: services.json entry has no installDir, so the start command can't be resolved. Re-run \`parachute install <path-to-${short}>\` to refresh its registration, or upgrade the module to a version that self-registers with installDir.`,
|
|
605
|
-
);
|
|
606
|
-
} else {
|
|
607
|
-
r.log(`${short}: lifecycle not yet supported for this service.`);
|
|
608
|
-
}
|
|
609
|
-
failures++;
|
|
610
|
-
continue;
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
const logFile = ensureLogPath(short, r.configDir);
|
|
614
|
-
// Merge `<configDir>/<short>/.env` into the spawn env so service-specific
|
|
615
|
-
// values (auto-wired SCRIBE_AUTH_TOKEN/SCRIBE_URL on vault, GROQ/OPENAI
|
|
616
|
-
// API keys on scribe written by the install prompt) reach the daemon.
|
|
617
|
-
// Vault still loads its own .env at runtime (it has its own start.sh
|
|
618
|
-
// wrapper for launchd / systemd) — this is idempotent there. Hub-origin
|
|
619
|
-
// override wins on collision; that's the live-exposure source of truth.
|
|
620
|
-
const fileEnv = readEnvFileValues(join(r.configDir, short, ".env"));
|
|
621
|
-
// PORT override (hub#356): same shape as `spawnSupervised` in
|
|
622
|
-
// api-modules-ops.ts. Without this, operators running `parachute start
|
|
623
|
-
// vault` inside a container that has PORT in env (Render / Fly / etc.)
|
|
624
|
-
// hit EADDRINUSE on hub's port. Local dev typically doesn't set PORT, so
|
|
625
|
-
// this is a no-op there. fileEnv wins on collision so per-service .env
|
|
626
|
-
// can still override if an operator deliberately set PORT in there.
|
|
627
|
-
const env: Record<string, string> = { PORT: String(entry.port), ...fileEnv };
|
|
628
|
-
if (r.hubOrigin) env[HUB_ORIGIN_ENV] = r.hubOrigin;
|
|
629
|
-
const spawnerOpts: { env?: Record<string, string>; cwd?: string } = {};
|
|
630
|
-
if (Object.keys(env).length > 0) spawnerOpts.env = env;
|
|
631
|
-
// Third-party modules ship clean relative startCmds — `cwd: installDir`
|
|
632
|
-
// makes those resolve. First-party fallbacks use absolute / PATH binaries
|
|
633
|
-
// so their cwd is irrelevant; passing it doesn't hurt.
|
|
634
|
-
if (entry.installDir) spawnerOpts.cwd = entry.installDir;
|
|
635
|
-
const passOpts =
|
|
636
|
-
spawnerOpts.env !== undefined || spawnerOpts.cwd !== undefined ? spawnerOpts : undefined;
|
|
637
|
-
|
|
638
|
-
// Pre-flight the startCmd binary (`@openparachute/depcheck`) so a missing
|
|
639
|
-
// executable surfaces the friendly install UX inline AND is persisted onto
|
|
640
|
-
// the services.json row, so a *later* `parachute status` (a separate
|
|
641
|
-
// invocation that only reads the manifest) + the SPA modules pane show
|
|
642
|
-
// "vault: failed to start — parachute-vault not installed" with install
|
|
643
|
-
// info, rather than a bare "failed"/orphan-timeout. The binary is `cmd[0]`
|
|
644
|
-
// (e.g. `parachute-vault` for an npm install, `bun` for a bun-linked one).
|
|
645
|
-
const startBinary = cmd[0];
|
|
646
|
-
if (startBinary) {
|
|
647
|
-
try {
|
|
648
|
-
ensureExecutable(startBinary, { which: r.which });
|
|
649
|
-
} catch (err) {
|
|
650
|
-
if (err instanceof MissingDependencyError) {
|
|
651
|
-
failures++;
|
|
652
|
-
r.log(`✗ ${short} failed to start:`);
|
|
653
|
-
for (const line of err.message.split("\n")) r.log(` ${line}`);
|
|
654
|
-
recordStartError(entry.name, err.toWire(), r.manifestPath);
|
|
655
|
-
continue;
|
|
656
|
-
}
|
|
657
|
-
throw err;
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
|
|
661
|
-
let pid: number;
|
|
662
|
-
try {
|
|
663
|
-
pid = r.spawner.spawn(cmd, logFile, passOpts);
|
|
664
|
-
} catch (err) {
|
|
665
|
-
// Belt-and-suspenders: a missing binary that slipped past the pre-flight
|
|
666
|
-
// (race) still becomes a MissingDependencyError via rethrowIfMissing.
|
|
667
|
-
if (startBinary) {
|
|
668
|
-
try {
|
|
669
|
-
rethrowIfMissing(err, startBinary);
|
|
670
|
-
} catch (missing) {
|
|
671
|
-
if (missing instanceof MissingDependencyError) {
|
|
672
|
-
failures++;
|
|
673
|
-
r.log(`✗ ${short} failed to start:`);
|
|
674
|
-
for (const line of missing.message.split("\n")) r.log(` ${line}`);
|
|
675
|
-
recordStartError(entry.name, missing.toWire(), r.manifestPath);
|
|
676
|
-
continue;
|
|
677
|
-
}
|
|
678
|
-
}
|
|
679
|
-
}
|
|
680
|
-
failures++;
|
|
681
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
682
|
-
r.log(`✗ ${short} failed to start: ${msg}`);
|
|
683
|
-
continue;
|
|
684
|
-
}
|
|
685
|
-
// A successful spawn clears any stale start-error recorded from a prior
|
|
686
|
-
// missing-dependency failure so `parachute status` doesn't keep showing it.
|
|
687
|
-
clearStartError(entry.name, r.manifestPath);
|
|
688
|
-
writePid(short, pid, r.configDir);
|
|
689
|
-
|
|
690
|
-
// Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
|
|
691
|
-
// proves the kernel forked the process — it says nothing about whether the
|
|
692
|
-
// service survived its boot or bound its port. Two silent-start shapes:
|
|
693
|
-
//
|
|
694
|
-
// (1) spawn-then-immediately-die (hub#194): the child throws before
|
|
695
|
-
// listening (notes-serve's Bun.resolveSync failing for bun-linked
|
|
696
|
-
// installs) and exits microseconds later. Caught by the settle below.
|
|
697
|
-
//
|
|
698
|
-
// (2) alive-but-never-bound (hub#487): the port is already held by an
|
|
699
|
-
// orphan, the child hits EADDRINUSE, but its process *lingers* (or a
|
|
700
|
-
// supervisor retries) long enough to clear the liveness check. `start`
|
|
701
|
-
// would report "✓ started" while `parachute status` shows it inactive
|
|
702
|
-
// because nothing answers on the port. Aaron hit exactly this with an
|
|
703
|
-
// orphan holding vault's 1940 on a fresh EC2 box. Caught by the
|
|
704
|
-
// port-readiness poll below.
|
|
705
|
-
//
|
|
706
|
-
// On any failure we surface the tail of the logfile so the operator sees
|
|
707
|
-
// the real boot error inline, and we specifically call out EADDRINUSE with
|
|
708
|
-
// the `lsof -ti:<port>` remedy.
|
|
709
|
-
const reportStartFailure = (reason: string): void => {
|
|
710
|
-
clearPid(short, r.configDir);
|
|
711
|
-
failures++;
|
|
712
|
-
const tail = readLogTail(logFile, 20);
|
|
713
|
-
if (detectAddrInUse(tail)) {
|
|
714
|
-
r.log(
|
|
715
|
-
`✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
|
|
716
|
-
);
|
|
717
|
-
} else {
|
|
718
|
-
r.log(`✗ ${short} failed to start: ${reason}`);
|
|
719
|
-
}
|
|
720
|
-
if (tail.length > 0) {
|
|
721
|
-
r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
|
|
722
|
-
for (const line of tail) r.log(` │ ${line}`);
|
|
723
|
-
} else {
|
|
724
|
-
r.log(` Tail the log for details: tail -50 ${logFile}`);
|
|
725
|
-
}
|
|
726
|
-
};
|
|
727
|
-
|
|
728
|
-
if (r.startSettleMs > 0) {
|
|
729
|
-
await r.sleep(r.startSettleMs);
|
|
730
|
-
if (!r.alive(pid)) {
|
|
731
|
-
reportStartFailure(
|
|
732
|
-
`spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
|
|
733
|
-
);
|
|
734
|
-
continue;
|
|
735
|
-
}
|
|
736
|
-
}
|
|
737
|
-
|
|
738
|
-
// Port-readiness poll (hub#487). The process is alive; now confirm it
|
|
739
|
-
// actually bound its port before claiming success. Poll up to
|
|
740
|
-
// `startReadyMs`, re-checking liveness each iteration so a *later* death
|
|
741
|
-
// (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
|
|
742
|
-
// that stays alive but never binds within the window gets a non-fatal
|
|
743
|
-
// warning rather than a hard failure — some daemons legitimately do slow
|
|
744
|
-
// boot work, and we'd rather not flip a healthy-but-slow start to red.
|
|
745
|
-
if (r.startReadyMs > 0) {
|
|
746
|
-
const deadline = r.now() + r.startReadyMs;
|
|
747
|
-
let listening = false;
|
|
748
|
-
let died = false;
|
|
749
|
-
while (r.now() < deadline) {
|
|
750
|
-
if (!r.alive(pid)) {
|
|
751
|
-
died = true;
|
|
752
|
-
break;
|
|
753
|
-
}
|
|
754
|
-
if (await r.portListening(entry.port)) {
|
|
755
|
-
listening = true;
|
|
756
|
-
break;
|
|
757
|
-
}
|
|
758
|
-
await r.sleep(r.startReadyPollMs);
|
|
759
|
-
}
|
|
760
|
-
if (died) {
|
|
761
|
-
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
762
|
-
continue;
|
|
763
|
-
}
|
|
764
|
-
if (!listening) {
|
|
765
|
-
// Last-chance liveness check — the loop may have exited on the
|
|
766
|
-
// deadline right as the process died.
|
|
767
|
-
if (!r.alive(pid)) {
|
|
768
|
-
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
769
|
-
continue;
|
|
770
|
-
}
|
|
771
|
-
r.log(
|
|
772
|
-
`⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
|
|
773
|
-
);
|
|
774
|
-
r.log(
|
|
775
|
-
` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
|
|
776
|
-
);
|
|
777
|
-
if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
|
|
778
|
-
if (short === "vault") persistVaultHubOriginForStart(r);
|
|
779
|
-
continue;
|
|
780
|
-
}
|
|
781
|
-
}
|
|
782
|
-
|
|
783
|
-
r.log(`✓ ${short} started (pid ${pid}); logs: ${logFile}`);
|
|
784
|
-
if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
|
|
785
|
-
if (short === "vault") persistVaultHubOriginForStart(r);
|
|
467
|
+
/** Print a module-ops HTTP error with an actionable hint for the known codes. */
|
|
468
|
+
function surfaceModuleOpHttpError(short: string, err: ModuleOpHttpError, r: Resolved): void {
|
|
469
|
+
if (err.status === 400 && err.code === "not_installed") {
|
|
470
|
+
r.log(
|
|
471
|
+
`✗ ${short} is not installed — run \`parachute install ${short}\` first, then \`parachute start ${short}\`.`,
|
|
472
|
+
);
|
|
473
|
+
return;
|
|
786
474
|
}
|
|
787
|
-
|
|
475
|
+
r.log(`✗ ${short}: ${err.message}`);
|
|
788
476
|
}
|
|
789
477
|
|
|
790
478
|
/**
|
|
791
|
-
*
|
|
792
|
-
*
|
|
793
|
-
*
|
|
794
|
-
* 1. The resolved spawn origin (`r.hubOrigin`) is a real public origin — write
|
|
795
|
-
* it. This is the long-standing happy path: an exposure is live, the
|
|
796
|
-
* launchd / systemd daemon (which boots vault out-of-band and never sees
|
|
797
|
-
* this spawn env) needs it in `.env` to validate hub-minted JWTs' `iss`.
|
|
798
|
-
* `persistVaultHubOrigin` skips loopback / unchanged values itself.
|
|
799
|
-
*
|
|
800
|
-
* 2. Self-heal: even when `r.hubOrigin` resolved to loopback or undefined
|
|
801
|
-
* (e.g. the hub.port file outran the expose-state read, or this is a bare
|
|
802
|
-
* `restart vault` on a deploy whose `.env` was never written), consult
|
|
803
|
-
* `expose-state.json` directly. If it advertises a public origin and
|
|
804
|
-
* vault's persisted value is unset / loopback, write the public origin.
|
|
805
|
-
* This is what lets an EXISTING broken Cloudflare deploy self-correct on
|
|
806
|
-
* the next `parachute restart vault`, not only fresh exposes.
|
|
479
|
+
* Ensure the hub unit is up, mapping `ensureHubUnit`'s structured outcome to a
|
|
480
|
+
* CLI exit signal. Returns true when the hub is up (already-up / started),
|
|
481
|
+
* false when it isn't (and the messages were surfaced).
|
|
807
482
|
*
|
|
808
|
-
*
|
|
809
|
-
*
|
|
483
|
+
* The `no-unit` outcome shouldn't reach here: `requireSupervisedOrOffer` gates
|
|
484
|
+
* every verb on `unitInstalled === true` before dispatching to the supervisor
|
|
485
|
+
* path, which is the same `isHubUnitInstalled` probe `ensureHubUnit` uses to
|
|
486
|
+
* decide `no-unit`. The defensive arm below still surfaces any non-up outcome's
|
|
487
|
+
* messages rather than silently succeeding.
|
|
810
488
|
*/
|
|
811
|
-
function
|
|
812
|
-
|
|
813
|
-
|
|
489
|
+
async function ensureHubForOp(r: Resolved, port: number): Promise<boolean> {
|
|
490
|
+
const ensured = await r.sup.ensureHubUnit({
|
|
491
|
+
port,
|
|
492
|
+
deps: r.sup.hubUnitDeps,
|
|
493
|
+
log: r.log,
|
|
494
|
+
});
|
|
495
|
+
if (ensured.outcome === "already-up" || ensured.outcome === "started") return true;
|
|
496
|
+
for (const m of ensured.messages) r.log(m);
|
|
497
|
+
return false;
|
|
814
498
|
}
|
|
815
499
|
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
500
|
+
/** `start <svc>` / `start` (no svc) over the supervisor (§3.3). */
|
|
501
|
+
async function startViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
|
|
502
|
+
const port = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
|
|
503
|
+
// `start hub` / `start` (no svc): ensure the hub unit is up — it transitively
|
|
504
|
+
// boots every installed module from services.json via bootSupervisedModules.
|
|
505
|
+
if (svc === HUB_SVC || svc === undefined) {
|
|
506
|
+
const up = await ensureHubForOp(r, port);
|
|
507
|
+
if (!up) return 1;
|
|
508
|
+
r.log(svc === HUB_SVC ? "✓ hub is up." : "✓ hub is up (all installed modules booted).");
|
|
509
|
+
return 0;
|
|
823
510
|
}
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
}
|
|
832
|
-
if (!r.alive(pid)) {
|
|
833
|
-
clearPid(short, r.configDir);
|
|
834
|
-
r.log(`${short} wasn't running (cleaned stale pid file).`);
|
|
835
|
-
continue;
|
|
836
|
-
}
|
|
837
|
-
|
|
838
|
-
try {
|
|
839
|
-
r.kill(pid, "SIGTERM");
|
|
840
|
-
} catch (err) {
|
|
841
|
-
failures++;
|
|
842
|
-
r.log(`✗ ${short}: SIGTERM failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
843
|
-
continue;
|
|
844
|
-
}
|
|
845
|
-
|
|
846
|
-
const deadline = r.now() + r.killWaitMs;
|
|
847
|
-
while (r.now() < deadline && r.alive(pid)) {
|
|
848
|
-
await r.sleep(r.pollIntervalMs);
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
if (r.alive(pid)) {
|
|
852
|
-
r.log(`${short} didn't exit after ${r.killWaitMs}ms; sending SIGKILL.`);
|
|
853
|
-
try {
|
|
854
|
-
r.kill(pid, "SIGKILL");
|
|
855
|
-
} catch (err) {
|
|
856
|
-
failures++;
|
|
857
|
-
r.log(`✗ ${short}: SIGKILL failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
858
|
-
continue;
|
|
859
|
-
}
|
|
860
|
-
}
|
|
861
|
-
|
|
862
|
-
clearPid(short, r.configDir);
|
|
863
|
-
r.log(`✓ ${short} stopped.`);
|
|
511
|
+
// `start <svc>`: ensure the hub is up first (chicken-and-egg §3.2), then drive
|
|
512
|
+
// a pure supervisor.start of the already-installed module.
|
|
513
|
+
if (!(await ensureHubForOp(r, port))) return 1;
|
|
514
|
+
const { result, httpError, failed } = await driveSupervisorOp(svc, "start", r);
|
|
515
|
+
if (httpError) {
|
|
516
|
+
surfaceModuleOpHttpError(svc, httpError, r);
|
|
517
|
+
return 1;
|
|
864
518
|
}
|
|
865
|
-
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
export async function restart(svc: string | undefined, opts: LifecycleOpts = {}): Promise<number> {
|
|
869
|
-
const stopCode = await stop(svc, opts);
|
|
870
|
-
if (stopCode !== 0) return stopCode;
|
|
871
|
-
return await start(svc, opts);
|
|
519
|
+
if (failed || !result) return 1;
|
|
520
|
+
r.log(`✓ ${svc} started.`);
|
|
521
|
+
return 0;
|
|
872
522
|
}
|
|
873
523
|
|
|
874
|
-
/**
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
if (r.hubOrigin) ensureOpts.issuer = r.hubOrigin;
|
|
887
|
-
try {
|
|
888
|
-
const result = await r.ensureHub(ensureOpts);
|
|
889
|
-
if (result.started) {
|
|
890
|
-
const logFile = logPathFor(HUB_SVC, r.configDir);
|
|
891
|
-
r.log(`✓ hub started (pid ${result.pid}) on port ${result.port}; logs: ${logFile}`);
|
|
892
|
-
} else {
|
|
893
|
-
r.log(`hub already running (pid ${result.pid}) on port ${result.port}.`);
|
|
524
|
+
/**
 * Handle `stop <svc>` and bare `stop` (no svc) over the supervisor / platform
 * manager (§3.3).
 *
 * @param svc Service to stop; `HUB_SVC` or `undefined` targets the hub unit itself.
 * @param r   Resolved dependencies; this function reads `r.configDir`, `r.log` and `r.sup`.
 * @returns   0 when the target is (or already was) stopped, 1 on any failure.
 */
async function stopViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
  const hubPort = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;

  // Hub target (`stop hub` or bare `stop`): the hub UNIT is stopped through the
  // platform manager. A PID signal is not an option here — launchd KeepAlive /
  // systemd Restart=always would simply bring the process back (R17). The
  // supervised children go down together with the hub.
  if (svc === undefined || svc === HUB_SVC) {
    const unitStop = r.sup.stopHubUnit(r.sup.hubUnitDeps);
    for (const line of unitStop.messages) {
      r.log(line);
    }
    if (unitStop.outcome !== "ok") {
      return 1;
    }
    r.log("✓ hub stopped (all supervised modules stopped with it).");
    return 0;
  }

  // Module target (`stop <svc>`): a supervised module only lives while the hub
  // does. An unreachable hub therefore means the module is already down, so we
  // report success and deliberately do NOT start the hub just to stop a module.
  const hubIsUp = await r.sup.probeHubHealth(hubPort);
  if (!hubIsUp) {
    r.log(`✓ ${svc} already stopped (the hub isn't running, so its modules are down).`);
    return 0;
  }

  // Hub is up: ask the supervisor to stop the module.
  const stopOutcome = await driveSupervisorOp(svc, "stop", r);
  if (stopOutcome.httpError) {
    surfaceModuleOpHttpError(svc, stopOutcome.httpError, r);
    return 1;
  }
  if (stopOutcome.failed || !stopOutcome.result) {
    return 1;
  }
  r.log(`✓ ${svc} stopped.`);
  return 0;
}
|
|
907
556
|
|
|
908
|
-
/**
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
if (!r.hubOrigin) return;
|
|
920
|
-
try {
|
|
921
|
-
const status = await r.selfHealOperatorTokenFn({
|
|
922
|
-
issuer: r.hubOrigin,
|
|
923
|
-
configDir: r.configDir,
|
|
924
|
-
log: r.log,
|
|
925
|
-
});
|
|
926
|
-
if (status.kind === "rotated") {
|
|
927
|
-
r.log(` refreshed operator.token issuer → ${r.hubOrigin} (was stale after exposure)`);
|
|
557
|
+
/**
 * Handle `restart <svc>` and bare `restart` (no svc) over the supervisor /
 * platform manager (§3.3).
 *
 * @param svc Service to restart; `HUB_SVC` or `undefined` targets the hub unit itself.
 * @param r   Resolved dependencies; this function reads `r.configDir`, `r.log` and `r.sup`.
 * @returns   0 when the target was restarted (or started via the fallthrough), 1 on failure.
 */
async function restartViaSupervisor(svc: string | undefined, r: Resolved): Promise<number> {
  // Hub target (`restart hub` or bare `restart`): restart the hub UNIT through
  // the platform manager rather than fanning out per module — a hub restart
  // re-boots every module anyway. Never a PID signal (R17).
  if (svc === undefined || svc === HUB_SVC) {
    const unitRestart = r.sup.restartHubUnit(r.sup.hubUnitDeps);
    for (const line of unitRestart.messages) {
      r.log(line);
    }
    if (unitRestart.outcome !== "ok") {
      return 1;
    }
    r.log("✓ hub restarted (all modules re-booted).");
    return 0;
  }

  // Module target (`restart <svc>`): the hub has to be up before the supervisor
  // can be driven.
  const hubPort = readHubPort(r.configDir) ?? HUB_UNIT_DEFAULT_PORT;
  const hubReady = await ensureHubForOp(r, hubPort);
  if (!hubReady) {
    return 1;
  }

  const restartOutcome = await driveSupervisorOp(svc, "restart", r);
  const restartErr = restartOutcome.httpError;

  // No HTTP-level error: the result of the restart op decides the exit code.
  if (!restartErr) {
    if (restartOutcome.failed || !restartOutcome.result) {
      return 1;
    }
    r.log(`✓ ${svc} restarted.`);
    return 0;
  }

  // 404-fallthrough (§6.2): a module that is not currently supervised (crashed
  // out of budget, skipped at boot, installed out-of-band) answers 404
  // `not_supervised`. `restart` has to be total over module state (matching the
  // detached stop+start), so that one case degrades to a pure `start`; every
  // other HTTP error is surfaced as-is.
  const notSupervised = restartErr.status === 404 && restartErr.code === "not_supervised";
  if (!notSupervised) {
    surfaceModuleOpHttpError(svc, restartErr, r);
    return 1;
  }

  const startOutcome = await driveSupervisorOp(svc, "start", r);
  if (startOutcome.httpError) {
    surfaceModuleOpHttpError(svc, startOutcome.httpError, r);
    return 1;
  }
  if (startOutcome.failed || !startOutcome.result) {
    return 1;
  }
  r.log(`✓ ${svc} started.`);
  return 0;
}
|
|
958
597
|
|
|
959
598
|
export interface LogsOpts {
|