@venturewild/workspace 0.3.5 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +112 -112
- package/package.json +83 -83
- package/server/bin/wild-workspace.mjs +995 -995
- package/server/src/account.mjs +114 -114
- package/server/src/agent-login.mjs +146 -146
- package/server/src/agent-readiness.mjs +200 -200
- package/server/src/agent.mjs +468 -453
- package/server/src/bazaar/core.mjs +579 -579
- package/server/src/bazaar/index.mjs +75 -75
- package/server/src/bazaar/mcp-server.mjs +328 -328
- package/server/src/bazaar/mock-tickup.mjs +97 -97
- package/server/src/bazaar/preview-server.mjs +95 -95
- package/server/src/bazaar/seed-recipes/customer-feedback-form/know-how.md +23 -23
- package/server/src/bazaar/seed-recipes/customer-feedback-form/recipe.json +24 -24
- package/server/src/bazaar/seed-recipes/landing-page-launch/know-how.md +29 -29
- package/server/src/bazaar/seed-recipes/landing-page-launch/recipe.json +25 -25
- package/server/src/bazaar/seed-recipes/personal-portfolio/know-how.md +21 -21
- package/server/src/bazaar/seed-recipes/personal-portfolio/recipe.json +24 -24
- package/server/src/bazaar/seed-recipes/receipt-sorter/know-how.md +31 -31
- package/server/src/bazaar/seed-recipes/receipt-sorter/recipe.json +25 -25
- package/server/src/bazaar/seed-recipes/tickup-hr-matching/know-how.md +79 -79
- package/server/src/bazaar/seed-recipes/tickup-hr-matching/recipe.json +32 -32
- package/server/src/canvas/core.mjs +421 -324
- package/server/src/canvas/index.mjs +42 -42
- package/server/src/canvas/mcp-server.mjs +253 -253
- package/server/src/config.mjs +404 -404
- package/server/src/daemon-bin.mjs +110 -110
- package/server/src/daemon-supervisor.mjs +285 -285
- package/server/src/doctor.mjs +375 -375
- package/server/src/inbox.mjs +86 -86
- package/server/src/index.mjs +2475 -2349
- package/server/src/logpaths.mjs +98 -98
- package/server/src/observability.mjs +45 -45
- package/server/src/operator.mjs +92 -92
- package/server/src/pairing.mjs +137 -137
- package/server/src/service.mjs +515 -515
- package/server/src/session-reporter.mjs +201 -201
- package/server/src/settings.mjs +145 -0
- package/server/src/share.mjs +182 -182
- package/server/src/skills.mjs +213 -0
- package/server/src/supervisor.mjs +647 -647
- package/server/src/support-consent.mjs +133 -133
- package/server/src/sync.mjs +248 -248
- package/server/src/transcript.mjs +121 -121
- package/server/src/turn-mcp.mjs +46 -46
- package/server/src/usage.mjs +405 -0
- package/web/dist/assets/index-BxRx8EsD.js +91 -0
- package/web/dist/assets/index-DoOPBr3s.css +1 -0
- package/web/dist/index.html +2 -2
- package/web/dist/assets/index-DatlFPkm.js +0 -91
- package/web/dist/assets/index-Dl0VT5e6.css +0 -1
|
@@ -1,647 +1,647 @@
|
|
|
1
|
-
// WorkspaceSupervisor — keeps the wild-workspace server alive in the background.
|
|
2
|
-
//
|
|
3
|
-
// The server itself auto-starts the bmo-sync daemon on boot (DaemonSupervisor),
|
|
4
|
-
// so keeping the server up brings the whole local stack — public URL included —
|
|
5
|
-
// back to life. This is the watchdog half of the always-on feature
|
|
6
|
-
// (docs/always-on-design.md); `service.mjs` is the per-OS autostart half that
|
|
7
|
-
// launches this hidden at login via `wild-workspace service run`.
|
|
8
|
-
//
|
|
9
|
-
// Design (all proven on Windows incl. a real reboot, 2026-05-30):
|
|
10
|
-
// - Health-driven: polls GET /api/health and (re)spawns the server only when
|
|
11
|
-
// it is down — so it never fights a server someone else started and handles
|
|
12
|
-
// crash recovery naturally.
|
|
13
|
-
// - Singleton: an exclusive lockfile in the machine-global dir
|
|
14
|
-
// (~/.wild-workspace, NEVER the synced workspace — locked principle #1).
|
|
15
|
-
// A stale lock whose pid is dead is taken over.
|
|
16
|
-
// - Exponential backoff (capped) so a crash-looping server can't spin the CPU.
|
|
17
|
-
// - Everything is logged — silent death is the #1 un-debuggable failure mode.
|
|
18
|
-
//
|
|
19
|
-
// Every external touch-point (spawn, health probe, clock) is an injected seam
|
|
20
|
-
// so the suite never spawns a real process.
|
|
21
|
-
|
|
22
|
-
import { spawn } from 'node:child_process';
|
|
23
|
-
import http from 'node:http';
|
|
24
|
-
import fs from 'node:fs';
|
|
25
|
-
import os from 'node:os';
|
|
26
|
-
import path from 'node:path';
|
|
27
|
-
import { fileURLToPath } from 'node:url';
|
|
28
|
-
import { resolveDaemonVersion } from './daemon-bin.mjs';
|
|
29
|
-
import { restartSelf } from './service.mjs';
|
|
30
|
-
|
|
31
|
-
// ES modules have no built-in __dirname; derive this module's directory from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
32
|
-
// Default server entry point: the index.mjs that sits in the same directory as this module.
const DEFAULT_SERVER_ENTRY = path.join(__dirname, 'index.mjs');
|
|
33
|
-
|
|
34
|
-
/**
 * Probe the local server's health endpoint.
 *
 * Issues `GET http://127.0.0.1:<port>/api/health` and resolves `true` as soon as
 * any HTTP response arrives (the status code is only required to be a positive
 * number), or `false` on a connection error, a timeout, or a request that could
 * not even be created.
 *
 * @param {number} port - Local TCP port to probe.
 * @param {number} [timeoutMs=2500] - Socket timeout in milliseconds.
 * @returns {Promise<boolean>} Always settles with a boolean; never rejects.
 */
export function probeHealth(port, timeoutMs = 2500) {
  return new Promise((resolve) => {
    let req;
    try {
      req = http.get(
        { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
        (res) => {
          res.resume(); // drain the body so the socket is released
          resolve(res.statusCode > 0);
        },
      );
    } catch {
      // http.get can throw synchronously (e.g. an out-of-range port). Inside a
      // Promise executor that would become a rejection, breaking "never throws".
      resolve(false);
      return;
    }
    req.on('error', () => resolve(false));
    req.on('timeout', () => {
      // 'timeout' only signals idleness; the request must be torn down by hand.
      req.destroy();
      resolve(false);
    });
  });
}
|
|
45
|
-
|
|
46
|
-
/**
 * Ask the running server for its version via `GET /api/health`.
 *
 * Used by the version-drift check: a stale server keeps running its OLD code
 * after an upgrade, so callers compare what is RUNNING with what is INSTALLED.
 *
 * Resolves the `version` field of the JSON body, or `null` when the server is
 * down, the request times out, the body is not JSON, the field is missing or
 * falsy, or the body exceeds the size cap.
 *
 * @param {number} port - Local TCP port to probe.
 * @param {number} [timeoutMs=2500] - Socket timeout in milliseconds.
 * @returns {Promise<string|null>} Always settles; never rejects.
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  const MAX_BODY_CHARS = 4096;
  return new Promise((resolve) => {
    // Several events can race (end/close/error/timeout); only the first one counts.
    let settled = false;
    const settle = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };

    let req;
    try {
      req = http.get(
        { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
        (res) => {
          let body = '';
          res.setEncoding('utf8'); // decode across chunk boundaries
          res.on('data', (chunk) => {
            body += chunk;
            if (body.length > MAX_BODY_CHARS) {
              // Settle explicitly: once the request is destroyed there is no
              // guarantee that 'end' will ever fire to resolve the promise.
              settle(null);
              req.destroy();
            }
          });
          res.on('end', () => {
            try { settle(JSON.parse(body).version || null); } catch { settle(null); }
          });
          res.on('error', () => settle(null));
          // Backstop for a response torn down before 'end'; no-op after 'end'.
          res.on('close', () => settle(null));
        },
      );
    } catch {
      // http.get can throw synchronously (e.g. an out-of-range port).
      settle(null);
      return;
    }
    req.on('error', () => settle(null));
    req.on('timeout', () => {
      req.destroy();
      settle(null);
    });
  });
}
|
|
68
|
-
|
|
69
|
-
/**
 * Read the package version that is on disk RIGHT NOW.
 *
 * The value comes from a fresh read of the package.json two directories above
 * the server entry, not from any in-memory constant: the supervisor is
 * long-lived, so after the package is swapped in place only a disk read sees
 * the new version.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY] - Path to the server entry
 *   (`<pkg>/server/src/index.mjs`).
 * @returns {*} The manifest's `version` value when truthy, otherwise `null`
 *   (also `null` on any read or parse error).
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    // <pkg>/server/src/index.mjs → the manifest lives at <pkg>/package.json.
    const entryDir = path.dirname(entry);
    const manifestPath = path.resolve(entryDir, '..', '..', 'package.json');
    const { version } = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
    return version ? version : null;
  } catch {
    return null;
  }
}
|
|
86
|
-
|
|
87
|
-
// Captured ONCE at module load = the version of the code THIS supervisor process
|
|
88
|
-
// is running. A fresh installedVersion() reads disk, which moves ahead after an
|
|
89
|
-
// in-place `npm i -g`; the difference is the supervisor's OWN staleness (the
|
|
90
|
-
// Part-8 gap). Distinct from APP_VERSION only in that we read the same file the
|
|
91
|
-
// drift check reads, so they're guaranteed equal at startup (no false drift).
|
|
92
|
-
// Evaluated once at import time; null when installedVersion() cannot read or parse package.json.
export const SUPERVISOR_VERSION = installedVersion();
|
|
93
|
-
|
|
94
|
-
export class WorkspaceSupervisor {
|
|
95
|
-
constructor({
|
|
96
|
-
serverEntry = DEFAULT_SERVER_ENTRY,
|
|
97
|
-
workspaceDir = process.cwd(),
|
|
98
|
-
port = Number(process.env.WILD_WORKSPACE_PORT || 5173),
|
|
99
|
-
globalDir = path.join(os.homedir(), '.wild-workspace'),
|
|
100
|
-
node = process.execPath,
|
|
101
|
-
pollMs = 3000,
|
|
102
|
-
backoffStartMs = 1000,
|
|
103
|
-
backoffMaxMs = 30000,
|
|
104
|
-
probeTimeoutMs = 2500,
|
|
105
|
-
spawnImpl = spawn,
|
|
106
|
-
probeImpl = probeHealth,
|
|
107
|
-
nowImpl = () => Date.now(),
|
|
108
|
-
env = process.env,
|
|
109
|
-
crashLoopThreshold = 3,
|
|
110
|
-
diagnosticsImpl = null,
|
|
111
|
-
// RC1 version-drift auto-restart: when the RUNNING server reports an older
|
|
112
|
-
// version than what's INSTALLED on disk, restart it so it picks up the new
|
|
113
|
-
// code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
|
|
114
|
-
// disables it (e.g. a developer running an intentionally-older server).
|
|
115
|
-
autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
|
|
116
|
-
versionImpl = probeHealthVersion,
|
|
117
|
-
installedVersionImpl = () => installedVersion(serverEntry),
|
|
118
|
-
// Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
|
|
119
|
-
// whole stack on the user's channel, with health-gated rollback. On by
|
|
120
|
-
// default; the env kill switch + the persisted off switch both disable it.
|
|
121
|
-
// Only wired up in start() (not in the unit-test path, which calls tick()
|
|
122
|
-
// directly) — see start(). updatePollMs is the *wake* cadence; the actual
|
|
123
|
-
// check interval lives inside AutoUpdater (6h) and self-rate-limits.
|
|
124
|
-
autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
|
|
125
|
-
updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
|
|
126
|
-
autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
|
|
127
|
-
// Phase 3 (Pillar A prerequisite): the always-on supervisor keeps the bmo-sync
|
|
128
|
-
// DAEMON alive too, independent of the workspace server. The daemon hosts the
|
|
129
|
-
// out-of-band support channel (reachable when :5173 is down), so it must not
|
|
130
|
-
// depend on the server being up. The server still ensureRunning()s the daemon
|
|
131
|
-
// at boot (idempotent); this is the keep-alive owner. On by default; kill switch
|
|
132
|
-
// WILD_WORKSPACE_NO_DAEMON_SUPERVISION=1. Only wired in start() (not the unit
|
|
133
|
-
// -test path, which calls daemonTick() directly with an injected factory).
|
|
134
|
-
superviseDaemon = env.WILD_WORKSPACE_NO_DAEMON_SUPERVISION !== '1',
|
|
135
|
-
daemonPollMs = 10000, // probe the daemon every 10s
|
|
136
|
-
daemonSupervisorFactory = null, // test seam: (supervisor) => DaemonSupervisor-like
|
|
137
|
-
// Daemon version-drift restart (the daemon analog of RC1b): after an
|
|
138
|
-
// auto-update installs a new daemon binary, the long-lived daemon process
|
|
139
|
-
// keeps running the OLD code until something restarts it — so the support
|
|
140
|
-
// channel silently won't activate. We recycle the daemon when the installed
|
|
141
|
-
// subpackage version differs from the version the running daemon was spawned
|
|
142
|
-
// under (tracked in `daemon-runtime.json`, since the daemon's /health reports
|
|
143
|
-
// no version). Test seam: inject a version function.
|
|
144
|
-
daemonVersionImpl = () => resolveDaemonVersion({ env }),
|
|
145
|
-
// Supervisor self-restart after auto-update (the Part-8 stale-process fix):
|
|
146
|
-
// once an update installs new code and the server child restarts + verifies
|
|
147
|
-
// healthy, the supervisor must restart ITSELF so its own new code (e.g. the
|
|
148
|
-
// daemon-drift recycle) loads — RC1b only restarts the child. Per-OS re-exec
|
|
149
|
-
// lives in service.mjs::restartSelf. On by default; kill switch
|
|
150
|
-
// WILD_WORKSPACE_NO_SELF_RESTART=1. A cooldown + a once-per-process guard
|
|
151
|
-
// prevent any restart loop; the delay lets the triggering update tick unwind
|
|
152
|
-
// and logs flush first. All seams injected (no real exit/spawn in tests).
|
|
153
|
-
selfRestart = env.WILD_WORKSPACE_NO_SELF_RESTART !== '1',
|
|
154
|
-
selfRestartCooldownMs = 10 * 60 * 1000,
|
|
155
|
-
selfRestartDelayMs = 3000,
|
|
156
|
-
restartSelfImpl = restartSelf,
|
|
157
|
-
exitImpl = (code = 0) => process.exit(code),
|
|
158
|
-
scheduleImpl = (fn, ms) => { const t = setTimeout(fn, ms); if (t.unref) t.unref(); return t; },
|
|
159
|
-
// The version THIS supervisor process is running (captured at module load).
|
|
160
|
-
// The self-drift backstop self-restarts when the installed-on-disk version
|
|
161
|
-
// moves ahead of this — covering EVERY update path (our auto-updater, the
|
|
162
|
-
// operator `update-now`, the CLI `update apply`, a manual `npm i -g`), not
|
|
163
|
-
// just our own. null disables the backstop (tests default to null).
|
|
164
|
-
selfVersion = SUPERVISOR_VERSION,
|
|
165
|
-
} = {}) {
|
|
166
|
-
Object.assign(this, {
|
|
167
|
-
serverEntry, workspaceDir, port, globalDir, node, pollMs,
|
|
168
|
-
backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
|
|
169
|
-
crashLoopThreshold, diagnosticsImpl,
|
|
170
|
-
autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
|
|
171
|
-
autoUpdate, updatePollMs, autoUpdaterFactory,
|
|
172
|
-
superviseDaemon, daemonPollMs, daemonSupervisorFactory, daemonVersionImpl,
|
|
173
|
-
selfRestart, selfRestartCooldownMs, selfRestartDelayMs, restartSelfImpl, exitImpl, scheduleImpl,
|
|
174
|
-
selfVersion,
|
|
175
|
-
});
|
|
176
|
-
this.autoUpdater = null;
|
|
177
|
-
this.updateTimer = null;
|
|
178
|
-
this.daemonSupervisor = null;
|
|
179
|
-
this.daemonTimer = null;
|
|
180
|
-
this._daemonTicking = false;
|
|
181
|
-
this.daemonRuntimeFile = path.join(globalDir, 'daemon-runtime.json');
|
|
182
|
-
// Persists the last self-restart time so a fresh post-re-exec supervisor
|
|
183
|
-
// honours the cooldown too (belt-and-suspenders against a restart loop).
|
|
184
|
-
this.selfRestartFile = path.join(globalDir, 'self-restart.json');
|
|
185
|
-
this._selfRestartScheduled = false;
|
|
186
|
-
this.logFile = path.join(globalDir, 'supervisor.log');
|
|
187
|
-
this.serverLogFile = path.join(globalDir, 'server.out.log');
|
|
188
|
-
this.lockFile = path.join(globalDir, 'supervisor.lock');
|
|
189
|
-
// Phase 3.2: the bmo-sync daemon drops this file (a consented support
|
|
190
|
-
// `restart-server` action) for us to action — so a restart can be triggered
|
|
191
|
-
// out-of-band even when :5173 is wedged. We kill the child; the next tick
|
|
192
|
-
// respawns it from disk (new code loads). Safe: absent file = no-op.
|
|
193
|
-
this.restartRequestFile = path.join(globalDir, 'restart-request.json');
|
|
194
|
-
this.child = null;
|
|
195
|
-
this.backoff = backoffStartMs;
|
|
196
|
-
this.lastSpawn = 0;
|
|
197
|
-
this.timer = null;
|
|
198
|
-
this.spawnCount = 0; // consecutive spawns without becoming healthy
|
|
199
|
-
this.pushedThisEpisode = false; // crash-loop diagnostics pushed once per episode
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
log(msg) {
|
|
203
|
-
try { fs.appendFileSync(this.logFile, `[${new Date().toISOString()}] ${msg}\n`); } catch { /* best-effort */ }
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
/** Is a pid alive? EPERM means "exists, not ours" → still alive. */
|
|
207
|
-
pidAlive(pid) {
|
|
208
|
-
try { process.kill(pid, 0); return true; } catch (e) { return !!(e && e.code === 'EPERM'); }
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
/** Exclusive lock; take over ONLY a stale lock (recorded pid no longer alive). */
|
|
212
|
-
acquireLock() {
|
|
213
|
-
try { fs.mkdirSync(this.globalDir, { recursive: true }); } catch { /* surfaced below */ }
|
|
214
|
-
try {
|
|
215
|
-
const fd = fs.openSync(this.lockFile, 'wx');
|
|
216
|
-
fs.writeSync(fd, String(process.pid));
|
|
217
|
-
fs.closeSync(fd);
|
|
218
|
-
return true;
|
|
219
|
-
} catch {
|
|
220
|
-
let old = null;
|
|
221
|
-
try { old = Number(fs.readFileSync(this.lockFile, 'utf8').trim()); } catch { /* unreadable */ }
|
|
222
|
-
if (old && this.pidAlive(old)) {
|
|
223
|
-
this.log(`live supervisor pid=${old} already running; exiting`);
|
|
224
|
-
return false;
|
|
225
|
-
}
|
|
226
|
-
try { fs.writeFileSync(this.lockFile, String(process.pid)); this.log('took over stale lock'); return true; }
|
|
227
|
-
catch { return false; }
|
|
228
|
-
}
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
releaseLock() {
|
|
232
|
-
try {
|
|
233
|
-
if (Number(fs.readFileSync(this.lockFile, 'utf8').trim()) === process.pid) fs.unlinkSync(this.lockFile);
|
|
234
|
-
} catch { /* already gone */ }
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
spawnServer() {
|
|
238
|
-
let out = 'ignore';
|
|
239
|
-
try { out = fs.openSync(this.serverLogFile, 'a'); } catch { /* output discarded */ }
|
|
240
|
-
this.child = this.spawnImpl(this.node, [this.serverEntry], {
|
|
241
|
-
cwd: this.workspaceDir,
|
|
242
|
-
windowsHide: true,
|
|
243
|
-
stdio: ['ignore', out, out],
|
|
244
|
-
env: { ...this.env, WILD_WORKSPACE_NO_OPEN: '1', WILD_WORKSPACE_DIR: this.workspaceDir },
|
|
245
|
-
});
|
|
246
|
-
if (typeof out === 'number') { try { fs.closeSync(out); } catch { /* parent fd */ } }
|
|
247
|
-
this.lastSpawn = this.nowImpl();
|
|
248
|
-
const pid = this.child && this.child.pid;
|
|
249
|
-
this.log(`spawned server pid=${pid} (backoff=${this.backoff}ms)`);
|
|
250
|
-
if (this.child && this.child.on) {
|
|
251
|
-
this.child.on('exit', (code, sig) => { this.log(`server pid=${pid} exited code=${code} sig=${sig}`); this.child = null; });
|
|
252
|
-
}
|
|
253
|
-
return this.child;
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
/**
|
|
257
|
-
* Consume a pending support `restart-server` request (Phase 3.2). Returns true
|
|
258
|
-
* iff a request file was present (and removes it). Reading-then-deleting makes
|
|
259
|
-
* "present" mean "unhandled" — idempotent across ticks.
|
|
260
|
-
*/
|
|
261
|
-
consumeRestartRequest() {
|
|
262
|
-
try {
|
|
263
|
-
fs.readFileSync(this.restartRequestFile); // throws if absent
|
|
264
|
-
} catch {
|
|
265
|
-
return false;
|
|
266
|
-
}
|
|
267
|
-
try { fs.unlinkSync(this.restartRequestFile); } catch { /* best-effort */ }
|
|
268
|
-
return true;
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
/** One supervision step. Returns its decision (exposed for tests). */
|
|
272
|
-
async tick() {
|
|
273
|
-
// Phase 3.2: a consented support restart request takes priority — kill the
|
|
274
|
-
// child so the next tick respawns it from disk (picks up any new code).
|
|
275
|
-
if (this.consumeRestartRequest()) {
|
|
276
|
-
this.log('restart-server requested (support channel) — restarting');
|
|
277
|
-
this.restartChild();
|
|
278
|
-
return 'restart-requested';
|
|
279
|
-
}
|
|
280
|
-
// Part-8 backstop: if disk moved ahead of our own code (any update path),
|
|
281
|
-
// schedule a supervisor self-restart. Side-effect only — never changes the
|
|
282
|
-
// tick decision below (server/daemon healing proceeds as usual meanwhile).
|
|
283
|
-
this.maybeSelfRestartOnDrift();
|
|
284
|
-
if (await this.probeImpl(this.port, this.probeTimeoutMs)) {
|
|
285
|
-
this.backoff = this.backoffStartMs; // healthy → reset backoff
|
|
286
|
-
this.spawnCount = 0; // healthy → not a crash loop
|
|
287
|
-
this.pushedThisEpisode = false;
|
|
288
|
-
// RC1 version drift: a healthy-but-STALE server (running older code than
|
|
289
|
-
// what's installed) should be restarted so the upgrade actually lands.
|
|
290
|
-
// Only when WE own the child — we restart by killing it and letting the
|
|
291
|
-
// next tick respawn (which reloads index.mjs from disk). A server started
|
|
292
|
-
// by someone else (foreground `wild-workspace`) we leave alone; we have no
|
|
293
|
-
// handle on it. The restarted server reports the installed version, so the
|
|
294
|
-
// drift clears and this won't loop.
|
|
295
|
-
if (this.autoRestartOnVersionDrift && this.child) {
|
|
296
|
-
try {
|
|
297
|
-
const running = await this.versionImpl(this.port, this.probeTimeoutMs);
|
|
298
|
-
const installed = this.installedVersionImpl();
|
|
299
|
-
if (running && installed && running !== installed) {
|
|
300
|
-
this.log(`version drift: running=${running} installed=${installed} — restarting server`);
|
|
301
|
-
try { this.child.kill(); } catch { /* exit handler clears child */ }
|
|
302
|
-
this.child = null;
|
|
303
|
-
this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
|
|
304
|
-
return 'version-drift-restart';
|
|
305
|
-
}
|
|
306
|
-
} catch (e) {
|
|
307
|
-
this.log(`version-drift check error: ${e?.message || e}`);
|
|
308
|
-
}
|
|
309
|
-
}
|
|
310
|
-
return 'healthy';
|
|
311
|
-
}
|
|
312
|
-
if (this.child) return 'booting'; // spawned, still coming up
|
|
313
|
-
if (this.nowImpl() - this.lastSpawn < this.backoff) return 'backoff';
|
|
314
|
-
this.spawnServer();
|
|
315
|
-
this.backoff = Math.min(this.backoff * 2, this.backoffMaxMs);
|
|
316
|
-
this.spawnCount += 1;
|
|
317
|
-
// Crash loop: the server won't stay up, so the operator channel (which rides
|
|
318
|
-
// the :5173 server) can't reach this machine at all. Push an install-down
|
|
319
|
-
// `doctor` bundle to bmo-sync ONCE per episode so support sees it anyway —
|
|
320
|
-
// the install-failed-before-server-up case (docs/user-experience.md §5).
|
|
321
|
-
if (this.spawnCount >= this.crashLoopThreshold && !this.pushedThisEpisode) {
|
|
322
|
-
this.pushedThisEpisode = true;
|
|
323
|
-
Promise.resolve(this.pushDiagnostics()).catch((e) => this.log(`diag push error: ${e?.message || e}`));
|
|
324
|
-
}
|
|
325
|
-
return 'spawned';
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
/**
|
|
329
|
-
* Push an install-down diagnostic bundle to bmo-sync. Injected (`diagnosticsImpl`)
|
|
330
|
-
* in tests; the real path is consent- + token-gated and never runs under the
|
|
331
|
-
* test runner. Best-effort, never throws into the supervision loop.
|
|
332
|
-
*/
|
|
333
|
-
async pushDiagnostics() {
|
|
334
|
-
if (this.diagnosticsImpl) return this.diagnosticsImpl(this);
|
|
335
|
-
if (process.env.VITEST || process.env.NODE_ENV === 'test') return;
|
|
336
|
-
try {
|
|
337
|
-
const [{ buildConfig }, { runDoctor }, { loadObservabilityConsent }] = await Promise.all([
|
|
338
|
-
import('./config.mjs'),
|
|
339
|
-
import('./doctor.mjs'),
|
|
340
|
-
import('./observability.mjs'),
|
|
341
|
-
]);
|
|
342
|
-
const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
|
|
343
|
-
if (!config.accountToken) return; // can't key it to a user
|
|
344
|
-
if (process.env.WILD_WORKSPACE_NO_TELEMETRY === '1') return; // kill switch
|
|
345
|
-
if (!loadObservabilityConsent(config.dataDir).enabled) return; // consent
|
|
346
|
-
const report = await runDoctor({ config });
|
|
347
|
-
const url = `${config.bmoSyncServerUrl.replace(/\/$/, '')}/api/telemetry`;
|
|
348
|
-
const ctrl = new AbortController();
|
|
349
|
-
const t = setTimeout(() => ctrl.abort(), 5000);
|
|
350
|
-
try {
|
|
351
|
-
await fetch(url, {
|
|
352
|
-
method: 'POST',
|
|
353
|
-
headers: { 'content-type': 'application/json' },
|
|
354
|
-
body: JSON.stringify({
|
|
355
|
-
account_token: config.accountToken,
|
|
356
|
-
slug: config.account?.slug || null,
|
|
357
|
-
workspace_id: config.workspaceId,
|
|
358
|
-
kind: 'install-down',
|
|
359
|
-
doctor: report,
|
|
360
|
-
sent_at: Math.floor(Date.now() / 1000),
|
|
361
|
-
}),
|
|
362
|
-
signal: ctrl.signal,
|
|
363
|
-
});
|
|
364
|
-
this.log(`pushed install-down diagnostics (fail=${report.summary?.fail})`);
|
|
365
|
-
} finally {
|
|
366
|
-
clearTimeout(t);
|
|
367
|
-
}
|
|
368
|
-
} catch (e) {
|
|
369
|
-
this.log(`diagnostics push failed: ${e?.message || e}`);
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
|
|
373
|
-
/**
|
|
374
|
-
* Restart the supervised server child so freshly installed code is loaded.
|
|
375
|
-
* Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
|
|
376
|
-
* the same mechanism as the version-drift restart, exposed for the AutoUpdater.
|
|
377
|
-
* No-op (returns false) when we don't own a child (foreground server).
|
|
378
|
-
*/
|
|
379
|
-
restartChild() {
|
|
380
|
-
if (!this.child) return false;
|
|
381
|
-
this.log('restartChild: killing server to load new code');
|
|
382
|
-
try { this.child.kill(); } catch { /* exit handler clears child */ }
|
|
383
|
-
this.child = null;
|
|
384
|
-
this.backoff = this.backoffStartMs; // an intentional restart, not a crash
|
|
385
|
-
return true;
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
/** The last self-restart time (epoch ms), or 0. Used for the loop-guard cooldown. */
|
|
389
|
-
readLastSelfRestart() {
|
|
390
|
-
try { return Number(JSON.parse(fs.readFileSync(this.selfRestartFile, 'utf8')).at) || 0; }
|
|
391
|
-
catch { return 0; }
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
writeLastSelfRestart(at) {
|
|
395
|
-
try {
|
|
396
|
-
fs.mkdirSync(this.globalDir, { recursive: true });
|
|
397
|
-
fs.writeFileSync(this.selfRestartFile, JSON.stringify({ at }));
|
|
398
|
-
} catch { /* best-effort */ }
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
/**
|
|
402
|
-
* Schedule a supervisor self-restart so freshly-installed SUPERVISOR code loads
|
|
403
|
-
* (the Part-8 stale-process fix). Called from the AutoUpdater's onUpdate hook
|
|
404
|
-
* AFTER an update installed + restarted the server child + verified it healthy —
|
|
405
|
-
* so a bad release has already rolled back before we re-exec ourselves. Guarded
|
|
406
|
-
* three ways against a restart loop: the kill switch, a once-per-process flag,
|
|
407
|
-
* and a persisted cooldown (survives the re-exec). Returns a status string
|
|
408
|
-
* ('scheduled' | 'disabled' | 'already' | 'cooldown') for tests/logging. The
|
|
409
|
-
* actual restart runs on a short delay so the triggering tick unwinds first.
|
|
410
|
-
*/
|
|
411
|
-
scheduleSelfRestart(reason) {
|
|
412
|
-
if (!this.selfRestart) return 'disabled';
|
|
413
|
-
if (this._selfRestartScheduled) return 'already';
|
|
414
|
-
const now = this.nowImpl();
|
|
415
|
-
const last = this.readLastSelfRestart();
|
|
416
|
-
if (last && now - last < this.selfRestartCooldownMs) {
|
|
417
|
-
this.log(`self-restart skipped (cooldown, last ${Math.round((now - last) / 1000)}s ago) — ${reason}`);
|
|
418
|
-
return 'cooldown';
|
|
419
|
-
}
|
|
420
|
-
this._selfRestartScheduled = true;
|
|
421
|
-
this.writeLastSelfRestart(now);
|
|
422
|
-
this.log(`self-restart scheduled in ${this.selfRestartDelayMs}ms — ${reason}`);
|
|
423
|
-
this.scheduleImpl(() => {
|
|
424
|
-
this._performSelfRestart(reason).catch((e) => this.log(`self-restart error: ${e?.message || e}`));
|
|
425
|
-
}, this.selfRestartDelayMs);
|
|
426
|
-
return 'scheduled';
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
/**
|
|
430
|
-
* Carry out the self-restart. On mac/Linux the service manager kills+relaunches
|
|
431
|
-
* us (we just issue the command and get SIGTERM'd → our exit handler releases the
|
|
432
|
-
* lock). On Windows restartSelf spawned a hidden successor and returns
|
|
433
|
-
* willExit:true — we then release the lock (via stop()) and exit so the successor
|
|
434
|
-
* can take it. A non-managed run reports restarted:false and we stay up on the
|
|
435
|
-
* old code (no worse than before this feature). Never throws.
|
|
436
|
-
*/
|
|
437
|
-
async _performSelfRestart(reason) {
|
|
438
|
-
this.log(`self-restart now — ${reason}`);
|
|
439
|
-
let r;
|
|
440
|
-
try {
|
|
441
|
-
r = await this.restartSelfImpl({ dir: this.globalDir, port: this.port });
|
|
442
|
-
} catch (e) {
|
|
443
|
-
this.log(`self-restart impl error: ${e?.message || e}`);
|
|
444
|
-
return { restarted: false, error: e?.message || String(e) };
|
|
445
|
-
}
|
|
446
|
-
this.log(`self-restart result: ${JSON.stringify(r)}`);
|
|
447
|
-
if (r && r.willExit) {
|
|
448
|
-
this.stop(); // clears timers + releases the lock so the successor can take it
|
|
449
|
-
this.exitImpl(0);
|
|
450
|
-
}
|
|
451
|
-
return r;
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
/**
|
|
455
|
-
* Backstop for the Part-8 gap on EVERY update path, not just our own auto-
|
|
456
|
-
* updater: when the version installed on disk no longer matches the code THIS
|
|
457
|
-
* supervisor is running, the supervisor is stale → schedule a self-restart.
|
|
458
|
-
* RC1b already restarts the stale server child and daemonTick recycles the
|
|
459
|
-
* stale daemon; this is the missing third leg (the supervisor itself), so an
|
|
460
|
-
* operator `update-now` / CLI `update apply` / manual `npm i -g` also lands new
|
|
461
|
-
* supervisor code with no reboot. Skipped while OUR auto-updater is mid-flight
|
|
462
|
-
* so the rollback window is respected (that path self-restarts via the onUpdate
|
|
463
|
-
* hook, only after verify succeeds). Cheap (an in-memory compare guarding a disk
|
|
464
|
-
* read) and idempotent (scheduleSelfRestart de-dupes). Never throws.
|
|
465
|
-
*/
|
|
466
|
-
maybeSelfRestartOnDrift() {
|
|
467
|
-
if (!this.selfRestart || !this.selfVersion) return false;
|
|
468
|
-
if (this._selfRestartScheduled) return false;
|
|
469
|
-
if (this.autoUpdater && this.autoUpdater.inProgress) return false; // respect rollback window
|
|
470
|
-
let installed = null;
|
|
471
|
-
try { installed = this.installedVersionImpl(); } catch { return false; }
|
|
472
|
-
if (!installed || installed === this.selfVersion) return false;
|
|
473
|
-
this.log(`supervisor version drift: running=${this.selfVersion} installed=${installed} — self-restarting`);
|
|
474
|
-
this.scheduleSelfRestart(`supervisor drift ${this.selfVersion}→${installed}`);
|
|
475
|
-
return true;
|
|
476
|
-
}
|
|
477
|
-
|
|
478
|
-
/** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
|
|
479
|
-
async buildAutoUpdater() {
|
|
480
|
-
if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
|
|
481
|
-
// Lazy import keeps the unit-test path (which never calls start()) free of the
|
|
482
|
-
// auto-update module + its registry/npm seams.
|
|
483
|
-
const { AutoUpdater } = await import('./auto-update.mjs');
|
|
484
|
-
return new AutoUpdater({
|
|
485
|
-
globalDir: this.globalDir,
|
|
486
|
-
port: this.port,
|
|
487
|
-
installedVersionImpl: this.installedVersionImpl,
|
|
488
|
-
healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
|
|
489
|
-
restartImpl: async () => { this.restartChild(); },
|
|
490
|
-
nowImpl: this.nowImpl,
|
|
491
|
-
env: this.env,
|
|
492
|
-
logImpl: (m) => this.log(m),
|
|
493
|
-
onUpdate: (rec) => {
|
|
494
|
-
this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`);
|
|
495
|
-
// A genuine version change landed healthy → restart the supervisor itself
|
|
496
|
-
// so its own new code loads (Part-8 stale-process fix). Guarded against
|
|
497
|
-
// loops inside scheduleSelfRestart. Fires only on a real bump (to≠from),
|
|
498
|
-
// never on rollback/failure (those statuses aren't 'ok').
|
|
499
|
-
if (rec.status === 'ok' && rec.to && rec.from && rec.to !== rec.from) {
|
|
500
|
-
this.scheduleSelfRestart(`auto-update ${rec.from}→${rec.to}`);
|
|
501
|
-
}
|
|
502
|
-
},
|
|
503
|
-
});
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
runUpdateTick(opts = {}) {
|
|
507
|
-
if (!this.autoUpdater) return;
|
|
508
|
-
this.autoUpdater.tick(opts)
|
|
509
|
-
.then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
|
|
510
|
-
.catch((e) => this.log(`auto-update error: ${e?.message || e}`));
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
/**
|
|
514
|
-
* Build the DaemonSupervisor the always-on layer owns. Reads a FRESH config
|
|
515
|
-
* (not the stale module constant) so the account token / relay in effect when
|
|
516
|
-
* always-on starts are used. Lazy import keeps the unit-test path (which never
|
|
517
|
-
* calls start()) free of config + daemon-supervisor. Test seam: factory.
|
|
518
|
-
*/
|
|
519
|
-
async buildDaemonSupervisor() {
|
|
520
|
-
if (this.daemonSupervisorFactory) return this.daemonSupervisorFactory(this);
|
|
521
|
-
const [{ buildConfig }, { DaemonSupervisor }] = await Promise.all([
|
|
522
|
-
import('./config.mjs'),
|
|
523
|
-
import('./daemon-supervisor.mjs'),
|
|
524
|
-
]);
|
|
525
|
-
const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
|
|
526
|
-
return new DaemonSupervisor({
|
|
527
|
-
httpBase: config.daemonHttpUrl,
|
|
528
|
-
globalDir: this.globalDir,
|
|
529
|
-
accountToken: config.accountToken,
|
|
530
|
-
serverUrl: config.bmoSyncServerUrl,
|
|
531
|
-
});
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
/**
 * daemonTick() (defined further down, after the daemon-marker helpers): one
 * daemon-supervision step. If the daemon isn't answering /health, (re)start
 * it. Deliberately INDEPENDENT of server health — the daemon (and its support
 * channel) must stay up even when the server is crashed/mid-upgrade. Re-entrancy
 * guarded so a slow spawn can't overlap the next tick. Never throws.
 */
|
|
540
|
-
/** The daemon version the currently-running daemon was spawned under, or null. */
|
|
541
|
-
readDaemonMarker() {
|
|
542
|
-
try {
|
|
543
|
-
const v = JSON.parse(fs.readFileSync(this.daemonRuntimeFile, 'utf8'))?.daemonVersion;
|
|
544
|
-
return typeof v === 'string' ? v : null;
|
|
545
|
-
} catch {
|
|
546
|
-
return null;
|
|
547
|
-
}
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
writeDaemonMarker(version) {
|
|
551
|
-
if (!version) return; // unknown installed version (PATH/vendor) — don't pin
|
|
552
|
-
try {
|
|
553
|
-
fs.mkdirSync(this.globalDir, { recursive: true });
|
|
554
|
-
fs.writeFileSync(this.daemonRuntimeFile, JSON.stringify({ daemonVersion: version }));
|
|
555
|
-
} catch {
|
|
556
|
-
/* best-effort */
|
|
557
|
-
}
|
|
558
|
-
}
|
|
559
|
-
|
|
560
|
-
async daemonTick() {
|
|
561
|
-
if (!this.daemonSupervisor || this._daemonTicking) return 'skip';
|
|
562
|
-
this._daemonTicking = true;
|
|
563
|
-
try {
|
|
564
|
-
const installed = this.daemonVersionImpl();
|
|
565
|
-
const h = await this.daemonSupervisor.health();
|
|
566
|
-
if (h && h.running) {
|
|
567
|
-
// Running — but is it the CURRENT binary? After an auto-update the daemon
|
|
568
|
-
// keeps the old code until recycled (RC1b analog). Recycle when the
|
|
569
|
-
// installed version differs from what we recorded at spawn (a null marker
|
|
570
|
-
// = spawned by a pre-drift-aware supervisor → treat as drift, recycle once).
|
|
571
|
-
if (installed && this.readDaemonMarker() !== installed && this.daemonSupervisor.recycle) {
|
|
572
|
-
this.log(
|
|
573
|
-
`daemon version drift (marker=${this.readDaemonMarker() || 'none'} installed=${installed}) — recycling`,
|
|
574
|
-
);
|
|
575
|
-
const r = await this.daemonSupervisor.recycle();
|
|
576
|
-
if (r && r.started) {
|
|
577
|
-
this.writeDaemonMarker(installed);
|
|
578
|
-
this.log(`daemon recycled to ${installed} (pid=${r.pid})`);
|
|
579
|
-
return 'recycled';
|
|
580
|
-
}
|
|
581
|
-
this.log(`daemon recycle failed: ${r?.error || 'unknown'}`);
|
|
582
|
-
return 'recycle-failed';
|
|
583
|
-
}
|
|
584
|
-
return 'healthy';
|
|
585
|
-
}
|
|
586
|
-
const r = await this.daemonSupervisor.ensureRunning();
|
|
587
|
-
if (r && r.started) {
|
|
588
|
-
this.writeDaemonMarker(installed);
|
|
589
|
-
this.log(`daemon respawned (pid=${r.pid})`);
|
|
590
|
-
return 'respawned';
|
|
591
|
-
}
|
|
592
|
-
if (r && r.alreadyRunning) return 'healthy';
|
|
593
|
-
this.log(`daemon down, respawn not started: ${r?.error || 'unknown'}`);
|
|
594
|
-
return 'failed';
|
|
595
|
-
} catch (e) {
|
|
596
|
-
this.log(`daemon-tick error: ${e?.message || e}`);
|
|
597
|
-
return 'error';
|
|
598
|
-
} finally {
|
|
599
|
-
this._daemonTicking = false;
|
|
600
|
-
}
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
/**
 * Acquire the lock and start the supervision loop. Idempotent across processes.
 *
 * Returns `{ started: false, reason: 'already-running' }` when the lock could
 * not be acquired, otherwise `{ started: true }`. On success it registers
 * process exit/signal handlers, starts the server poll timer and runs one tick
 * immediately. The auto-updater and the daemon supervisor are built
 * asynchronously, so their timers are attached after this method has returned.
 */
start() {
  if (!this.acquireLock()) return { started: false, reason: 'already-running' };
  // Release the lock on any exit; SIGTERM/SIGINT are turned into a normal exit
  // so the 'exit' handler above runs.
  process.on('exit', () => this.releaseLock());
  process.on('SIGTERM', () => process.exit(0));
  process.on('SIGINT', () => process.exit(0));
  this.log(`supervisor start pid=${process.pid} v${this.selfVersion || '?'} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
  this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
  // First probe right away instead of waiting a full poll interval.
  this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));

  // Phase 2 auto-update: wake on a slow timer; the first check fires shortly
  // after start so the server has time to come up (verify reads its /health).
  if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
    this.buildAutoUpdater().then((u) => {
      this.autoUpdater = u;
      this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
      // unref so this timer alone never keeps the process alive.
      if (this.updateTimer.unref) this.updateTimer.unref();
      // FORCE the first post-boot check (bypass the 6h dueForCheck): a restart
      // should always pull the latest, so a rebooted/woken machine can't sit on
      // a stale version just because it last checked <6h ago.
      const kick = setTimeout(() => this.runUpdateTick({ force: true }), 60_000);
      if (kick.unref) kick.unref();
    }).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
  }

  // Phase 3: keep the bmo-sync daemon alive independent of the server, so the
  // out-of-band support channel survives the server being down.
  if (this.superviseDaemon && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
    this.buildDaemonSupervisor().then((d) => {
      this.daemonSupervisor = d;
      this.daemonTimer = setInterval(() => { this.daemonTick().catch((e) => this.log(`daemon-tick error: ${e?.message || e}`)); }, this.daemonPollMs);
      if (this.daemonTimer.unref) this.daemonTimer.unref();
      this.daemonTick().catch(() => {}); // first probe now
    }).catch((e) => this.log(`daemon supervision init error: ${e?.message || e}`));
  }
  return { started: true };
}
|
|
640
|
-
|
|
641
|
-
stop() {
|
|
642
|
-
if (this.timer) { clearInterval(this.timer); this.timer = null; }
|
|
643
|
-
if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
|
|
644
|
-
if (this.daemonTimer) { clearInterval(this.daemonTimer); this.daemonTimer = null; }
|
|
645
|
-
this.releaseLock();
|
|
646
|
-
}
|
|
647
|
-
}
|
|
1
|
+
// WorkspaceSupervisor — keeps the wild-workspace server alive in the background.
|
|
2
|
+
//
|
|
3
|
+
// The server itself auto-starts the bmo-sync daemon on boot (DaemonSupervisor),
|
|
4
|
+
// so keeping the server up brings the whole local stack — public URL included —
|
|
5
|
+
// back to life. This is the watchdog half of the always-on feature
|
|
6
|
+
// (docs/always-on-design.md); `service.mjs` is the per-OS autostart half that
|
|
7
|
+
// launches this hidden at login via `wild-workspace service run`.
|
|
8
|
+
//
|
|
9
|
+
// Design (all proven on Windows incl. a real reboot, 2026-05-30):
|
|
10
|
+
// - Health-driven: polls GET /api/health and (re)spawns the server only when
|
|
11
|
+
// it is down — so it never fights a server someone else started and handles
|
|
12
|
+
// crash recovery naturally.
|
|
13
|
+
// - Singleton: an exclusive lockfile in the machine-global dir
|
|
14
|
+
// (~/.wild-workspace, NEVER the synced workspace — locked principle #1).
|
|
15
|
+
// A stale lock whose pid is dead is taken over.
|
|
16
|
+
// - Exponential backoff (capped) so a crash-looping server can't spin the CPU.
|
|
17
|
+
// - Everything is logged — silent death is the #1 un-debuggable failure mode.
|
|
18
|
+
//
|
|
19
|
+
// Every external touch-point (spawn, health probe, clock) is an injected seam
|
|
20
|
+
// so the suite never spawns a real process.
|
|
21
|
+
|
|
22
|
+
import { spawn } from 'node:child_process';
|
|
23
|
+
import http from 'node:http';
|
|
24
|
+
import fs from 'node:fs';
|
|
25
|
+
import os from 'node:os';
|
|
26
|
+
import path from 'node:path';
|
|
27
|
+
import { fileURLToPath } from 'node:url';
|
|
28
|
+
import { resolveDaemonVersion } from './daemon-bin.mjs';
|
|
29
|
+
import { restartSelf } from './service.mjs';
|
|
30
|
+
|
|
31
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
32
|
+
const DEFAULT_SERVER_ENTRY = path.join(__dirname, 'index.mjs');
|
|
33
|
+
|
|
34
|
+
/**
 * Probe the local server's /api/health endpoint.
 *
 * @param {number} port - TCP port on 127.0.0.1 to probe.
 * @param {number} [timeoutMs=2500] - Socket timeout for the request.
 * @returns {Promise<boolean>} true when any HTTP response arrives (whatever its
 *   status code); false on a connection error or timeout. Never rejects for
 *   network failures.
 */
export function probeHealth(port, timeoutMs = 2500) {
  const options = { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs };
  return new Promise((settle) => {
    const request = http.get(options, (response) => {
      response.resume(); // drain the body so the socket is released
      settle(response.statusCode > 0);
    });
    request.on('timeout', () => {
      request.destroy();
      settle(false);
    });
    request.on('error', () => settle(false));
  });
}
|
|
45
|
+
|
|
46
|
+
/**
 * Ask the running server its version via /api/health.
 *
 * Used by the version-drift check (RC1): a stale server keeps running its OLD
 * code after an upgrade, so the caller compares what's RUNNING to what's
 * INSTALLED on disk.
 *
 * @param {number} port - TCP port on 127.0.0.1 to query.
 * @param {number} [timeoutMs=2500] - Socket timeout for the request.
 * @returns {Promise<*>} the `version` field of the JSON body, or null when the
 *   server is down, the request times out, the body is not JSON, has no
 *   truthy `version`, exceeds the size cap, or the response is cut short.
 *   Always settles; never rejects for network failures.
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  // A health payload is tiny; anything larger is not ours, so stop reading.
  const maxBodyChars = 4096;
  return new Promise((resolve) => {
    const req = http.get(
      { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
      (res) => {
        let body = '';
        // Decode as a stream so a multi-byte character split across chunks is
        // not corrupted by per-chunk Buffer→string conversion.
        res.setEncoding('utf8');
        res.on('data', (chunk) => {
          body += chunk;
          if (body.length > maxBodyChars) {
            // Destroying the request after the response has started emits
            // neither res 'end' nor req 'error', so settle here explicitly.
            req.destroy();
            resolve(null);
          }
        });
        res.on('end', () => {
          try { resolve(JSON.parse(body).version || null); } catch { resolve(null); }
        });
        // A response aborted mid-body surfaces as 'error'/'close' on res, not on
        // req. resolve() is a no-op once the promise has already settled.
        res.on('error', () => resolve(null));
        res.on('close', () => resolve(null));
      },
    );
    req.on('error', () => resolve(null));
    req.on('timeout', () => { req.destroy(); resolve(null); });
  });
}
|
|
68
|
+
|
|
69
|
+
/**
 * The version installed on disk RIGHT NOW, read from the package.json two
 * directories above the server entry on every call (no caching), so a
 * long-lived process sees a package that was swapped underneath it.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY] - Path of the server entry file
 *   (`<pkg>/server/src/index.mjs`).
 * @returns {*} the manifest's `version` when truthy; null when it is missing or
 *   falsy, or when the file cannot be read or parsed.
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    // <pkg>/server/src/index.mjs → <pkg>/package.json
    const srcDir = path.dirname(entry);
    const manifestPath = path.resolve(srcDir, '..', '..', 'package.json');
    const { version } = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
    return version ? version : null;
  } catch {
    return null;
  }
}
|
|
86
|
+
|
|
87
|
+
// Captured ONCE at module load = the version of the code THIS supervisor process
|
|
88
|
+
// is running. A fresh installedVersion() reads disk, which moves ahead after an
|
|
89
|
+
// in-place `npm i -g`; the difference is the supervisor's OWN staleness (the
|
|
90
|
+
// Part-8 gap). Distinct from APP_VERSION only in that we read the same file the
|
|
91
|
+
// drift check reads, so they're guaranteed equal at startup (no false drift).
|
|
92
|
+
export const SUPERVISOR_VERSION = installedVersion();
|
|
93
|
+
|
|
94
|
+
export class WorkspaceSupervisor {
|
|
95
|
+
/**
 * Build a supervisor. Every option has a default; all external touch-points
 * (spawn, health/version probes, clock, env, exit, scheduling, self-restart)
 * are injectable. The constructor only stores options and derives file paths
 * under `globalDir`; it starts no timers and spawns nothing.
 */
constructor({
  serverEntry = DEFAULT_SERVER_ENTRY,
  workspaceDir = process.cwd(),
  port = Number(process.env.WILD_WORKSPACE_PORT || 5173),
  globalDir = path.join(os.homedir(), '.wild-workspace'),
  node = process.execPath,
  pollMs = 3000,
  backoffStartMs = 1000,
  backoffMaxMs = 30000,
  probeTimeoutMs = 2500,
  spawnImpl = spawn,
  probeImpl = probeHealth,
  nowImpl = () => Date.now(),
  env = process.env,
  crashLoopThreshold = 3,
  diagnosticsImpl = null,
  // RC1 version-drift auto-restart: when the RUNNING server reports an older
  // version than what's INSTALLED on disk, restart it so it picks up the new
  // code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
  // disables it (e.g. a developer running an intentionally-older server).
  autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
  versionImpl = probeHealthVersion,
  installedVersionImpl = () => installedVersion(serverEntry),
  // Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
  // whole stack on the user's channel, with health-gated rollback. On by
  // default; the env kill switch + the persisted off switch both disable it.
  // Only wired up in start() (not in the unit-test path, which calls tick()
  // directly) — see start(). updatePollMs is the *wake* cadence; the actual
  // check interval lives inside AutoUpdater (6h) and self-rate-limits.
  autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
  updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
  autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
  // Phase 3 (Pillar A prerequisite): the always-on supervisor keeps the bmo-sync
  // DAEMON alive too, independent of the workspace server. The daemon hosts the
  // out-of-band support channel (reachable when :5173 is down), so it must not
  // depend on the server being up. The server still ensureRunning()s the daemon
  // at boot (idempotent); this is the keep-alive owner. On by default; kill switch
  // WILD_WORKSPACE_NO_DAEMON_SUPERVISION=1. Only wired in start() (not the unit
  // -test path, which calls daemonTick() directly with an injected factory).
  superviseDaemon = env.WILD_WORKSPACE_NO_DAEMON_SUPERVISION !== '1',
  daemonPollMs = 10000, // probe the daemon every 10s
  daemonSupervisorFactory = null, // test seam: (supervisor) => DaemonSupervisor-like
  // Daemon version-drift restart (the daemon analog of RC1b): after an
  // auto-update installs a new daemon binary, the long-lived daemon process
  // keeps running the OLD code until something restarts it — so the support
  // channel silently won't activate. We recycle the daemon when the installed
  // subpackage version differs from the version the running daemon was spawned
  // under (tracked in `daemon-runtime.json`, since the daemon's /health reports
  // no version). Test seam: inject a version function.
  daemonVersionImpl = () => resolveDaemonVersion({ env }),
  // Supervisor self-restart after auto-update (the Part-8 stale-process fix):
  // once an update installs new code and the server child restarts + verifies
  // healthy, the supervisor must restart ITSELF so its own new code (e.g. the
  // daemon-drift recycle) loads — RC1b only restarts the child. Per-OS re-exec
  // lives in service.mjs::restartSelf. On by default; kill switch
  // WILD_WORKSPACE_NO_SELF_RESTART=1. A cooldown + a once-per-process guard
  // prevent any restart loop; the delay lets the triggering update tick unwind
  // and logs flush first. All seams injected (no real exit/spawn in tests).
  selfRestart = env.WILD_WORKSPACE_NO_SELF_RESTART !== '1',
  selfRestartCooldownMs = 10 * 60 * 1000,
  selfRestartDelayMs = 3000,
  restartSelfImpl = restartSelf,
  exitImpl = (code = 0) => process.exit(code),
  scheduleImpl = (fn, ms) => { const t = setTimeout(fn, ms); if (t.unref) t.unref(); return t; },
  // The version THIS supervisor process is running (captured at module load).
  // The self-drift backstop self-restarts when the installed-on-disk version
  // moves ahead of this — covering EVERY update path (our auto-updater, the
  // operator `update-now`, the CLI `update apply`, a manual `npm i -g`), not
  // just our own. null disables the backstop (tests default to null).
  selfVersion = SUPERVISOR_VERSION,
} = {}) {
  // Copy every option onto the instance under its own name.
  Object.assign(this, {
    serverEntry, workspaceDir, port, globalDir, node, pollMs,
    backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
    crashLoopThreshold, diagnosticsImpl,
    autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
    autoUpdate, updatePollMs, autoUpdaterFactory,
    superviseDaemon, daemonPollMs, daemonSupervisorFactory, daemonVersionImpl,
    selfRestart, selfRestartCooldownMs, selfRestartDelayMs, restartSelfImpl, exitImpl, scheduleImpl,
    selfVersion,
  });
  // Runtime state for the auto-updater and daemon supervision; populated later.
  this.autoUpdater = null;
  this.updateTimer = null;
  this.daemonSupervisor = null;
  this.daemonTimer = null;
  this._daemonTicking = false; // re-entrancy flag for daemonTick()
  this.daemonRuntimeFile = path.join(globalDir, 'daemon-runtime.json');
  // Persists the last self-restart time so a fresh post-re-exec supervisor
  // honours the cooldown too (belt-and-suspenders against a restart loop).
  this.selfRestartFile = path.join(globalDir, 'self-restart.json');
  this._selfRestartScheduled = false;
  this.logFile = path.join(globalDir, 'supervisor.log');
  this.serverLogFile = path.join(globalDir, 'server.out.log');
  this.lockFile = path.join(globalDir, 'supervisor.lock');
  // Phase 3.2: the bmo-sync daemon drops this file (a consented support
  // `restart-server` action) for us to action — so a restart can be triggered
  // out-of-band even when :5173 is wedged. We kill the child; the next tick
  // respawns it from disk (new code loads). Safe: absent file = no-op.
  this.restartRequestFile = path.join(globalDir, 'restart-request.json');
  this.child = null; // handle of the server process we spawned, if any
  this.backoff = backoffStartMs; // current respawn delay; doubled per spawn in tick()
  this.lastSpawn = 0; // nowImpl() timestamp of the most recent spawn
  this.timer = null;
  this.spawnCount = 0; // consecutive spawns without becoming healthy
  this.pushedThisEpisode = false; // crash-loop diagnostics pushed once per episode
}
|
|
201
|
+
|
|
202
|
+
log(msg) {
|
|
203
|
+
try { fs.appendFileSync(this.logFile, `[${new Date().toISOString()}] ${msg}\n`); } catch { /* best-effort */ }
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/** Is a pid alive? EPERM means "exists, not ours" → still alive. */
|
|
207
|
+
pidAlive(pid) {
|
|
208
|
+
try { process.kill(pid, 0); return true; } catch (e) { return !!(e && e.code === 'EPERM'); }
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/** Exclusive lock; take over ONLY a stale lock (recorded pid no longer alive). */
|
|
212
|
+
acquireLock() {
|
|
213
|
+
try { fs.mkdirSync(this.globalDir, { recursive: true }); } catch { /* surfaced below */ }
|
|
214
|
+
try {
|
|
215
|
+
const fd = fs.openSync(this.lockFile, 'wx');
|
|
216
|
+
fs.writeSync(fd, String(process.pid));
|
|
217
|
+
fs.closeSync(fd);
|
|
218
|
+
return true;
|
|
219
|
+
} catch {
|
|
220
|
+
let old = null;
|
|
221
|
+
try { old = Number(fs.readFileSync(this.lockFile, 'utf8').trim()); } catch { /* unreadable */ }
|
|
222
|
+
if (old && this.pidAlive(old)) {
|
|
223
|
+
this.log(`live supervisor pid=${old} already running; exiting`);
|
|
224
|
+
return false;
|
|
225
|
+
}
|
|
226
|
+
try { fs.writeFileSync(this.lockFile, String(process.pid)); this.log('took over stale lock'); return true; }
|
|
227
|
+
catch { return false; }
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
releaseLock() {
|
|
232
|
+
try {
|
|
233
|
+
if (Number(fs.readFileSync(this.lockFile, 'utf8').trim()) === process.pid) fs.unlinkSync(this.lockFile);
|
|
234
|
+
} catch { /* already gone */ }
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
spawnServer() {
|
|
238
|
+
let out = 'ignore';
|
|
239
|
+
try { out = fs.openSync(this.serverLogFile, 'a'); } catch { /* output discarded */ }
|
|
240
|
+
this.child = this.spawnImpl(this.node, [this.serverEntry], {
|
|
241
|
+
cwd: this.workspaceDir,
|
|
242
|
+
windowsHide: true,
|
|
243
|
+
stdio: ['ignore', out, out],
|
|
244
|
+
env: { ...this.env, WILD_WORKSPACE_NO_OPEN: '1', WILD_WORKSPACE_DIR: this.workspaceDir },
|
|
245
|
+
});
|
|
246
|
+
if (typeof out === 'number') { try { fs.closeSync(out); } catch { /* parent fd */ } }
|
|
247
|
+
this.lastSpawn = this.nowImpl();
|
|
248
|
+
const pid = this.child && this.child.pid;
|
|
249
|
+
this.log(`spawned server pid=${pid} (backoff=${this.backoff}ms)`);
|
|
250
|
+
if (this.child && this.child.on) {
|
|
251
|
+
this.child.on('exit', (code, sig) => { this.log(`server pid=${pid} exited code=${code} sig=${sig}`); this.child = null; });
|
|
252
|
+
}
|
|
253
|
+
return this.child;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/**
|
|
257
|
+
* Consume a pending support `restart-server` request (Phase 3.2). Returns true
|
|
258
|
+
* iff a request file was present (and removes it). Reading-then-deleting makes
|
|
259
|
+
* "present" mean "unhandled" — idempotent across ticks.
|
|
260
|
+
*/
|
|
261
|
+
consumeRestartRequest() {
|
|
262
|
+
try {
|
|
263
|
+
fs.readFileSync(this.restartRequestFile); // throws if absent
|
|
264
|
+
} catch {
|
|
265
|
+
return false;
|
|
266
|
+
}
|
|
267
|
+
try { fs.unlinkSync(this.restartRequestFile); } catch { /* best-effort */ }
|
|
268
|
+
return true;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/**
 * One supervision step. Returns its decision (exposed for tests):
 *   'restart-requested'     – a support restart request file was consumed
 *   'version-drift-restart' – our healthy child ran a different version than disk
 *   'healthy'               – the server answered the health probe
 *   'booting'               – we hold a child handle but the probe still fails
 *   'backoff'               – no child, and the backoff window has not elapsed
 *   'spawned'               – a new server child was spawned
 */
async tick() {
  // Phase 3.2: a consented support restart request takes priority — kill the
  // child so the next tick respawns it from disk (picks up any new code).
  if (this.consumeRestartRequest()) {
    this.log('restart-server requested (support channel) — restarting');
    this.restartChild();
    return 'restart-requested';
  }
  // Part-8 backstop: if disk moved ahead of our own code (any update path),
  // schedule a supervisor self-restart. Side-effect only — never changes the
  // tick decision below (server/daemon healing proceeds as usual meanwhile).
  this.maybeSelfRestartOnDrift();
  if (await this.probeImpl(this.port, this.probeTimeoutMs)) {
    this.backoff = this.backoffStartMs; // healthy → reset backoff
    this.spawnCount = 0; // healthy → not a crash loop
    this.pushedThisEpisode = false;
    // RC1 version drift: a healthy-but-STALE server (running older code than
    // what's installed) should be restarted so the upgrade actually lands.
    // Only when WE own the child — we restart by killing it and letting the
    // next tick respawn (which reloads index.mjs from disk). A server started
    // by someone else (foreground `wild-workspace`) we leave alone; we have no
    // handle on it. The restarted server reports the installed version, so the
    // drift clears and this won't loop.
    if (this.autoRestartOnVersionDrift && this.child) {
      try {
        const running = await this.versionImpl(this.port, this.probeTimeoutMs);
        const installed = this.installedVersionImpl();
        // Both must be known; a null on either side is treated as "no drift".
        if (running && installed && running !== installed) {
          this.log(`version drift: running=${running} installed=${installed} — restarting server`);
          try { this.child.kill(); } catch { /* exit handler clears child */ }
          this.child = null;
          this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
          return 'version-drift-restart';
        }
      } catch (e) {
        this.log(`version-drift check error: ${e?.message || e}`);
      }
    }
    return 'healthy';
  }
  if (this.child) return 'booting'; // spawned, still coming up
  if (this.nowImpl() - this.lastSpawn < this.backoff) return 'backoff';
  this.spawnServer();
  // Double the delay before the next respawn attempt, up to the cap.
  this.backoff = Math.min(this.backoff * 2, this.backoffMaxMs);
  this.spawnCount += 1;
  // Crash loop: the server won't stay up, so the operator channel (which rides
  // the :5173 server) can't reach this machine at all. Push an install-down
  // `doctor` bundle to bmo-sync ONCE per episode so support sees it anyway —
  // the install-failed-before-server-up case (docs/user-experience.md §5).
  if (this.spawnCount >= this.crashLoopThreshold && !this.pushedThisEpisode) {
    this.pushedThisEpisode = true;
    // Not awaited: the push must not delay the tick; failures are only logged.
    Promise.resolve(this.pushDiagnostics()).catch((e) => this.log(`diag push error: ${e?.message || e}`));
  }
  return 'spawned';
}
|
|
327
|
+
|
|
328
|
+
/**
|
|
329
|
+
* Push an install-down diagnostic bundle to bmo-sync. Injected (`diagnosticsImpl`)
|
|
330
|
+
* in tests; the real path is consent- + token-gated and never runs under the
|
|
331
|
+
* test runner. Best-effort, never throws into the supervision loop.
|
|
332
|
+
*/
|
|
333
|
+
async pushDiagnostics() {
|
|
334
|
+
if (this.diagnosticsImpl) return this.diagnosticsImpl(this);
|
|
335
|
+
if (process.env.VITEST || process.env.NODE_ENV === 'test') return;
|
|
336
|
+
try {
|
|
337
|
+
const [{ buildConfig }, { runDoctor }, { loadObservabilityConsent }] = await Promise.all([
|
|
338
|
+
import('./config.mjs'),
|
|
339
|
+
import('./doctor.mjs'),
|
|
340
|
+
import('./observability.mjs'),
|
|
341
|
+
]);
|
|
342
|
+
const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
|
|
343
|
+
if (!config.accountToken) return; // can't key it to a user
|
|
344
|
+
if (process.env.WILD_WORKSPACE_NO_TELEMETRY === '1') return; // kill switch
|
|
345
|
+
if (!loadObservabilityConsent(config.dataDir).enabled) return; // consent
|
|
346
|
+
const report = await runDoctor({ config });
|
|
347
|
+
const url = `${config.bmoSyncServerUrl.replace(/\/$/, '')}/api/telemetry`;
|
|
348
|
+
const ctrl = new AbortController();
|
|
349
|
+
const t = setTimeout(() => ctrl.abort(), 5000);
|
|
350
|
+
try {
|
|
351
|
+
await fetch(url, {
|
|
352
|
+
method: 'POST',
|
|
353
|
+
headers: { 'content-type': 'application/json' },
|
|
354
|
+
body: JSON.stringify({
|
|
355
|
+
account_token: config.accountToken,
|
|
356
|
+
slug: config.account?.slug || null,
|
|
357
|
+
workspace_id: config.workspaceId,
|
|
358
|
+
kind: 'install-down',
|
|
359
|
+
doctor: report,
|
|
360
|
+
sent_at: Math.floor(Date.now() / 1000),
|
|
361
|
+
}),
|
|
362
|
+
signal: ctrl.signal,
|
|
363
|
+
});
|
|
364
|
+
this.log(`pushed install-down diagnostics (fail=${report.summary?.fail})`);
|
|
365
|
+
} finally {
|
|
366
|
+
clearTimeout(t);
|
|
367
|
+
}
|
|
368
|
+
} catch (e) {
|
|
369
|
+
this.log(`diagnostics push failed: ${e?.message || e}`);
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
/**
|
|
374
|
+
* Restart the supervised server child so freshly installed code is loaded.
|
|
375
|
+
* Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
|
|
376
|
+
* the same mechanism as the version-drift restart, exposed for the AutoUpdater.
|
|
377
|
+
* No-op (returns false) when we don't own a child (foreground server).
|
|
378
|
+
*/
|
|
379
|
+
restartChild() {
|
|
380
|
+
if (!this.child) return false;
|
|
381
|
+
this.log('restartChild: killing server to load new code');
|
|
382
|
+
try { this.child.kill(); } catch { /* exit handler clears child */ }
|
|
383
|
+
this.child = null;
|
|
384
|
+
this.backoff = this.backoffStartMs; // an intentional restart, not a crash
|
|
385
|
+
return true;
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
/** The last self-restart time (epoch ms), or 0. Used for the loop-guard cooldown. */
|
|
389
|
+
readLastSelfRestart() {
|
|
390
|
+
try { return Number(JSON.parse(fs.readFileSync(this.selfRestartFile, 'utf8')).at) || 0; }
|
|
391
|
+
catch { return 0; }
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
writeLastSelfRestart(at) {
|
|
395
|
+
try {
|
|
396
|
+
fs.mkdirSync(this.globalDir, { recursive: true });
|
|
397
|
+
fs.writeFileSync(this.selfRestartFile, JSON.stringify({ at }));
|
|
398
|
+
} catch { /* best-effort */ }
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
/**
|
|
402
|
+
* Schedule a supervisor self-restart so freshly-installed SUPERVISOR code loads
|
|
403
|
+
* (the Part-8 stale-process fix). Called from the AutoUpdater's onUpdate hook
|
|
404
|
+
* AFTER an update installed + restarted the server child + verified it healthy —
|
|
405
|
+
* so a bad release has already rolled back before we re-exec ourselves. Guarded
|
|
406
|
+
* three ways against a restart loop: the kill switch, a once-per-process flag,
|
|
407
|
+
* and a persisted cooldown (survives the re-exec). Returns a status string
|
|
408
|
+
* ('scheduled' | 'disabled' | 'already' | 'cooldown') for tests/logging. The
|
|
409
|
+
* actual restart runs on a short delay so the triggering tick unwinds first.
|
|
410
|
+
*/
|
|
411
|
+
scheduleSelfRestart(reason) {
|
|
412
|
+
if (!this.selfRestart) return 'disabled';
|
|
413
|
+
if (this._selfRestartScheduled) return 'already';
|
|
414
|
+
const now = this.nowImpl();
|
|
415
|
+
const last = this.readLastSelfRestart();
|
|
416
|
+
if (last && now - last < this.selfRestartCooldownMs) {
|
|
417
|
+
this.log(`self-restart skipped (cooldown, last ${Math.round((now - last) / 1000)}s ago) — ${reason}`);
|
|
418
|
+
return 'cooldown';
|
|
419
|
+
}
|
|
420
|
+
this._selfRestartScheduled = true;
|
|
421
|
+
this.writeLastSelfRestart(now);
|
|
422
|
+
this.log(`self-restart scheduled in ${this.selfRestartDelayMs}ms — ${reason}`);
|
|
423
|
+
this.scheduleImpl(() => {
|
|
424
|
+
this._performSelfRestart(reason).catch((e) => this.log(`self-restart error: ${e?.message || e}`));
|
|
425
|
+
}, this.selfRestartDelayMs);
|
|
426
|
+
return 'scheduled';
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
/**
|
|
430
|
+
* Carry out the self-restart. On mac/Linux the service manager kills+relaunches
|
|
431
|
+
* us (we just issue the command and get SIGTERM'd → our exit handler releases the
|
|
432
|
+
* lock). On Windows restartSelf spawned a hidden successor and returns
|
|
433
|
+
* willExit:true — we then release the lock (via stop()) and exit so the successor
|
|
434
|
+
* can take it. A non-managed run reports restarted:false and we stay up on the
|
|
435
|
+
* old code (no worse than before this feature). Never throws.
|
|
436
|
+
*/
|
|
437
|
+
async _performSelfRestart(reason) {
|
|
438
|
+
this.log(`self-restart now — ${reason}`);
|
|
439
|
+
let r;
|
|
440
|
+
try {
|
|
441
|
+
r = await this.restartSelfImpl({ dir: this.globalDir, port: this.port });
|
|
442
|
+
} catch (e) {
|
|
443
|
+
this.log(`self-restart impl error: ${e?.message || e}`);
|
|
444
|
+
return { restarted: false, error: e?.message || String(e) };
|
|
445
|
+
}
|
|
446
|
+
this.log(`self-restart result: ${JSON.stringify(r)}`);
|
|
447
|
+
if (r && r.willExit) {
|
|
448
|
+
this.stop(); // clears timers + releases the lock so the successor can take it
|
|
449
|
+
this.exitImpl(0);
|
|
450
|
+
}
|
|
451
|
+
return r;
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
/**
|
|
455
|
+
* Backstop for the Part-8 gap on EVERY update path, not just our own auto-
|
|
456
|
+
* updater: when the version installed on disk no longer matches the code THIS
|
|
457
|
+
* supervisor is running, the supervisor is stale → schedule a self-restart.
|
|
458
|
+
* RC1b already restarts the stale server child and daemonTick recycles the
|
|
459
|
+
* stale daemon; this is the missing third leg (the supervisor itself), so an
|
|
460
|
+
* operator `update-now` / CLI `update apply` / manual `npm i -g` also lands new
|
|
461
|
+
* supervisor code with no reboot. Skipped while OUR auto-updater is mid-flight
|
|
462
|
+
* so the rollback window is respected (that path self-restarts via the onUpdate
|
|
463
|
+
* hook, only after verify succeeds). Cheap (an in-memory compare guarding a disk
|
|
464
|
+
* read) and idempotent (scheduleSelfRestart de-dupes). Never throws.
|
|
465
|
+
*/
|
|
466
|
+
maybeSelfRestartOnDrift() {
|
|
467
|
+
if (!this.selfRestart || !this.selfVersion) return false;
|
|
468
|
+
if (this._selfRestartScheduled) return false;
|
|
469
|
+
if (this.autoUpdater && this.autoUpdater.inProgress) return false; // respect rollback window
|
|
470
|
+
let installed = null;
|
|
471
|
+
try { installed = this.installedVersionImpl(); } catch { return false; }
|
|
472
|
+
if (!installed || installed === this.selfVersion) return false;
|
|
473
|
+
this.log(`supervisor version drift: running=${this.selfVersion} installed=${installed} — self-restarting`);
|
|
474
|
+
this.scheduleSelfRestart(`supervisor drift ${this.selfVersion}→${installed}`);
|
|
475
|
+
return true;
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
/** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
|
|
479
|
+
async buildAutoUpdater() {
|
|
480
|
+
if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
|
|
481
|
+
// Lazy import keeps the unit-test path (which never calls start()) free of the
|
|
482
|
+
// auto-update module + its registry/npm seams.
|
|
483
|
+
const { AutoUpdater } = await import('./auto-update.mjs');
|
|
484
|
+
return new AutoUpdater({
|
|
485
|
+
globalDir: this.globalDir,
|
|
486
|
+
port: this.port,
|
|
487
|
+
installedVersionImpl: this.installedVersionImpl,
|
|
488
|
+
healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
|
|
489
|
+
restartImpl: async () => { this.restartChild(); },
|
|
490
|
+
nowImpl: this.nowImpl,
|
|
491
|
+
env: this.env,
|
|
492
|
+
logImpl: (m) => this.log(m),
|
|
493
|
+
onUpdate: (rec) => {
|
|
494
|
+
this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`);
|
|
495
|
+
// A genuine version change landed healthy → restart the supervisor itself
|
|
496
|
+
// so its own new code loads (Part-8 stale-process fix). Guarded against
|
|
497
|
+
// loops inside scheduleSelfRestart. Fires only on a real bump (to≠from),
|
|
498
|
+
// never on rollback/failure (those statuses aren't 'ok').
|
|
499
|
+
if (rec.status === 'ok' && rec.to && rec.from && rec.to !== rec.from) {
|
|
500
|
+
this.scheduleSelfRestart(`auto-update ${rec.from}→${rec.to}`);
|
|
501
|
+
}
|
|
502
|
+
},
|
|
503
|
+
});
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
runUpdateTick(opts = {}) {
|
|
507
|
+
if (!this.autoUpdater) return;
|
|
508
|
+
this.autoUpdater.tick(opts)
|
|
509
|
+
.then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
|
|
510
|
+
.catch((e) => this.log(`auto-update error: ${e?.message || e}`));
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
/**
|
|
514
|
+
* Build the DaemonSupervisor the always-on layer owns. Reads a FRESH config
|
|
515
|
+
* (not the stale module constant) so the account token / relay in effect when
|
|
516
|
+
* always-on starts are used. Lazy import keeps the unit-test path (which never
|
|
517
|
+
* calls start()) free of config + daemon-supervisor. Test seam: factory.
|
|
518
|
+
*/
|
|
519
|
+
async buildDaemonSupervisor() {
|
|
520
|
+
if (this.daemonSupervisorFactory) return this.daemonSupervisorFactory(this);
|
|
521
|
+
const [{ buildConfig }, { DaemonSupervisor }] = await Promise.all([
|
|
522
|
+
import('./config.mjs'),
|
|
523
|
+
import('./daemon-supervisor.mjs'),
|
|
524
|
+
]);
|
|
525
|
+
const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
|
|
526
|
+
return new DaemonSupervisor({
|
|
527
|
+
httpBase: config.daemonHttpUrl,
|
|
528
|
+
globalDir: this.globalDir,
|
|
529
|
+
accountToken: config.accountToken,
|
|
530
|
+
serverUrl: config.bmoSyncServerUrl,
|
|
531
|
+
});
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
/**
|
|
535
|
+
* One daemon-supervision step: if the daemon isn't answering /health, (re)start
|
|
536
|
+
* it. Deliberately INDEPENDENT of server health — the daemon (and its support
|
|
537
|
+
* channel) must stay up even when the server is crashed/mid-upgrade. Re-entrancy
|
|
538
|
+
* guarded so a slow spawn can't overlap the next tick. Never throws.
|
|
539
|
+
*/
|
|
540
|
+
/** The daemon version the currently-running daemon was spawned under, or null. */
|
|
541
|
+
readDaemonMarker() {
|
|
542
|
+
try {
|
|
543
|
+
const v = JSON.parse(fs.readFileSync(this.daemonRuntimeFile, 'utf8'))?.daemonVersion;
|
|
544
|
+
return typeof v === 'string' ? v : null;
|
|
545
|
+
} catch {
|
|
546
|
+
return null;
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
writeDaemonMarker(version) {
|
|
551
|
+
if (!version) return; // unknown installed version (PATH/vendor) — don't pin
|
|
552
|
+
try {
|
|
553
|
+
fs.mkdirSync(this.globalDir, { recursive: true });
|
|
554
|
+
fs.writeFileSync(this.daemonRuntimeFile, JSON.stringify({ daemonVersion: version }));
|
|
555
|
+
} catch {
|
|
556
|
+
/* best-effort */
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
async daemonTick() {
|
|
561
|
+
if (!this.daemonSupervisor || this._daemonTicking) return 'skip';
|
|
562
|
+
this._daemonTicking = true;
|
|
563
|
+
try {
|
|
564
|
+
const installed = this.daemonVersionImpl();
|
|
565
|
+
const h = await this.daemonSupervisor.health();
|
|
566
|
+
if (h && h.running) {
|
|
567
|
+
// Running — but is it the CURRENT binary? After an auto-update the daemon
|
|
568
|
+
// keeps the old code until recycled (RC1b analog). Recycle when the
|
|
569
|
+
// installed version differs from what we recorded at spawn (a null marker
|
|
570
|
+
// = spawned by a pre-drift-aware supervisor → treat as drift, recycle once).
|
|
571
|
+
if (installed && this.readDaemonMarker() !== installed && this.daemonSupervisor.recycle) {
|
|
572
|
+
this.log(
|
|
573
|
+
`daemon version drift (marker=${this.readDaemonMarker() || 'none'} installed=${installed}) — recycling`,
|
|
574
|
+
);
|
|
575
|
+
const r = await this.daemonSupervisor.recycle();
|
|
576
|
+
if (r && r.started) {
|
|
577
|
+
this.writeDaemonMarker(installed);
|
|
578
|
+
this.log(`daemon recycled to ${installed} (pid=${r.pid})`);
|
|
579
|
+
return 'recycled';
|
|
580
|
+
}
|
|
581
|
+
this.log(`daemon recycle failed: ${r?.error || 'unknown'}`);
|
|
582
|
+
return 'recycle-failed';
|
|
583
|
+
}
|
|
584
|
+
return 'healthy';
|
|
585
|
+
}
|
|
586
|
+
const r = await this.daemonSupervisor.ensureRunning();
|
|
587
|
+
if (r && r.started) {
|
|
588
|
+
this.writeDaemonMarker(installed);
|
|
589
|
+
this.log(`daemon respawned (pid=${r.pid})`);
|
|
590
|
+
return 'respawned';
|
|
591
|
+
}
|
|
592
|
+
if (r && r.alreadyRunning) return 'healthy';
|
|
593
|
+
this.log(`daemon down, respawn not started: ${r?.error || 'unknown'}`);
|
|
594
|
+
return 'failed';
|
|
595
|
+
} catch (e) {
|
|
596
|
+
this.log(`daemon-tick error: ${e?.message || e}`);
|
|
597
|
+
return 'error';
|
|
598
|
+
} finally {
|
|
599
|
+
this._daemonTicking = false;
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
/** Acquire the lock and start the supervision loop. Idempotent across processes. */
|
|
604
|
+
start() {
|
|
605
|
+
if (!this.acquireLock()) return { started: false, reason: 'already-running' };
|
|
606
|
+
process.on('exit', () => this.releaseLock());
|
|
607
|
+
process.on('SIGTERM', () => process.exit(0));
|
|
608
|
+
process.on('SIGINT', () => process.exit(0));
|
|
609
|
+
this.log(`supervisor start pid=${process.pid} v${this.selfVersion || '?'} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
|
|
610
|
+
this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
|
|
611
|
+
this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
|
|
612
|
+
|
|
613
|
+
// Phase 2 auto-update: wake on a slow timer; the first check fires shortly
|
|
614
|
+
// after start so the server has time to come up (verify reads its /health).
|
|
615
|
+
if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
|
|
616
|
+
this.buildAutoUpdater().then((u) => {
|
|
617
|
+
this.autoUpdater = u;
|
|
618
|
+
this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
|
|
619
|
+
if (this.updateTimer.unref) this.updateTimer.unref();
|
|
620
|
+
// FORCE the first post-boot check (bypass the 6h dueForCheck): a restart
|
|
621
|
+
// should always pull the latest, so a rebooted/woken machine can't sit on
|
|
622
|
+
// a stale version just because it last checked <6h ago.
|
|
623
|
+
const kick = setTimeout(() => this.runUpdateTick({ force: true }), 60_000);
|
|
624
|
+
if (kick.unref) kick.unref();
|
|
625
|
+
}).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
// Phase 3: keep the bmo-sync daemon alive independent of the server, so the
|
|
629
|
+
// out-of-band support channel survives the server being down.
|
|
630
|
+
if (this.superviseDaemon && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
|
|
631
|
+
this.buildDaemonSupervisor().then((d) => {
|
|
632
|
+
this.daemonSupervisor = d;
|
|
633
|
+
this.daemonTimer = setInterval(() => { this.daemonTick().catch((e) => this.log(`daemon-tick error: ${e?.message || e}`)); }, this.daemonPollMs);
|
|
634
|
+
if (this.daemonTimer.unref) this.daemonTimer.unref();
|
|
635
|
+
this.daemonTick().catch(() => {}); // first probe now
|
|
636
|
+
}).catch((e) => this.log(`daemon supervision init error: ${e?.message || e}`));
|
|
637
|
+
}
|
|
638
|
+
return { started: true };
|
|
639
|
+
}
|
|
640
|
+
|
|
641
|
+
stop() {
|
|
642
|
+
if (this.timer) { clearInterval(this.timer); this.timer = null; }
|
|
643
|
+
if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
|
|
644
|
+
if (this.daemonTimer) { clearInterval(this.daemonTimer); this.daemonTimer = null; }
|
|
645
|
+
this.releaseLock();
|
|
646
|
+
}
|
|
647
|
+
}
|