@venturewild/workspace 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/LICENSE +21 -21
  2. package/README.md +112 -112
  3. package/package.json +83 -83
  4. package/server/bin/wild-workspace.mjs +995 -995
  5. package/server/src/account.mjs +114 -114
  6. package/server/src/agent-login.mjs +146 -146
  7. package/server/src/agent-readiness.mjs +200 -200
  8. package/server/src/agent.mjs +468 -453
  9. package/server/src/bazaar/core.mjs +579 -579
  10. package/server/src/bazaar/index.mjs +75 -75
  11. package/server/src/bazaar/mcp-server.mjs +328 -328
  12. package/server/src/bazaar/mock-tickup.mjs +97 -97
  13. package/server/src/bazaar/preview-server.mjs +95 -95
  14. package/server/src/bazaar/seed-recipes/customer-feedback-form/know-how.md +23 -23
  15. package/server/src/bazaar/seed-recipes/customer-feedback-form/recipe.json +24 -24
  16. package/server/src/bazaar/seed-recipes/landing-page-launch/know-how.md +29 -29
  17. package/server/src/bazaar/seed-recipes/landing-page-launch/recipe.json +25 -25
  18. package/server/src/bazaar/seed-recipes/personal-portfolio/know-how.md +21 -21
  19. package/server/src/bazaar/seed-recipes/personal-portfolio/recipe.json +24 -24
  20. package/server/src/bazaar/seed-recipes/receipt-sorter/know-how.md +31 -31
  21. package/server/src/bazaar/seed-recipes/receipt-sorter/recipe.json +25 -25
  22. package/server/src/bazaar/seed-recipes/tickup-hr-matching/know-how.md +79 -79
  23. package/server/src/bazaar/seed-recipes/tickup-hr-matching/recipe.json +32 -32
  24. package/server/src/canvas/core.mjs +421 -421
  25. package/server/src/canvas/index.mjs +42 -42
  26. package/server/src/canvas/mcp-server.mjs +253 -253
  27. package/server/src/config.mjs +404 -404
  28. package/server/src/daemon-bin.mjs +110 -110
  29. package/server/src/daemon-supervisor.mjs +285 -285
  30. package/server/src/doctor.mjs +375 -375
  31. package/server/src/inbox.mjs +86 -86
  32. package/server/src/index.mjs +2475 -2365
  33. package/server/src/logpaths.mjs +98 -98
  34. package/server/src/observability.mjs +45 -45
  35. package/server/src/operator.mjs +92 -92
  36. package/server/src/pairing.mjs +137 -137
  37. package/server/src/service.mjs +515 -515
  38. package/server/src/session-reporter.mjs +201 -201
  39. package/server/src/settings.mjs +145 -0
  40. package/server/src/share.mjs +182 -182
  41. package/server/src/skills.mjs +213 -0
  42. package/server/src/supervisor.mjs +647 -647
  43. package/server/src/support-consent.mjs +133 -133
  44. package/server/src/sync.mjs +248 -248
  45. package/server/src/transcript.mjs +121 -121
  46. package/server/src/turn-mcp.mjs +46 -46
  47. package/server/src/usage.mjs +405 -0
  48. package/web/dist/assets/index-BxRx8EsD.js +91 -0
  49. package/web/dist/assets/index-DoOPBr3s.css +1 -0
  50. package/web/dist/index.html +2 -2
  51. package/web/dist/assets/index-B7cOsWLt.js +0 -91
  52. package/web/dist/assets/index-Dl0VT5e6.css +0 -1
@@ -1,647 +1,647 @@
1
- // WorkspaceSupervisor — keeps the wild-workspace server alive in the background.
2
- //
3
- // The server itself auto-starts the bmo-sync daemon on boot (DaemonSupervisor),
4
- // so keeping the server up brings the whole local stack — public URL included —
5
- // back to life. This is the watchdog half of the always-on feature
6
- // (docs/always-on-design.md); `service.mjs` is the per-OS autostart half that
7
- // launches this hidden at login via `wild-workspace service run`.
8
- //
9
- // Design (all proven on Windows incl. a real reboot, 2026-05-30):
10
- // - Health-driven: polls GET /api/health and (re)spawns the server only when
11
- // it is down — so it never fights a server someone else started and handles
12
- // crash recovery naturally.
13
- // - Singleton: an exclusive lockfile in the machine-global dir
14
- // (~/.wild-workspace, NEVER the synced workspace — locked principle #1).
15
- // A stale lock whose pid is dead is taken over.
16
- // - Exponential backoff (capped) so a crash-looping server can't spin the CPU.
17
- // - Everything is logged — silent death is the #1 un-debuggable failure mode.
18
- //
19
- // Every external touch-point (spawn, health probe, clock) is an injected seam
20
- // so the suite never spawns a real process.
21
-
22
- import { spawn } from 'node:child_process';
23
- import http from 'node:http';
24
- import fs from 'node:fs';
25
- import os from 'node:os';
26
- import path from 'node:path';
27
- import { fileURLToPath } from 'node:url';
28
- import { resolveDaemonVersion } from './daemon-bin.mjs';
29
- import { restartSelf } from './service.mjs';
30
-
31
- const __dirname = path.dirname(fileURLToPath(import.meta.url));
32
- const DEFAULT_SERVER_ENTRY = path.join(__dirname, 'index.mjs');
33
-
34
/**
 * Probe the local server's health endpoint.
 *
 * Resolves true iff something answers GET /api/health on 127.0.0.1:<port>
 * (any HTTP status counts as "answered"); resolves false on a connection
 * error or a timeout. Never throws and never rejects: a synchronous failure
 * inside http.get (for example an out-of-range port) also resolves false.
 *
 * @param {number} port - local TCP port to probe.
 * @param {number} [timeoutMs=2500] - socket inactivity timeout in milliseconds.
 * @returns {Promise<boolean>}
 */
export function probeHealth(port, timeoutMs = 2500) {
  return new Promise((resolve) => {
    try {
      const req = http.get(
        { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
        (res) => {
          res.resume(); // drain the body so the socket is released
          resolve(res.statusCode > 0);
        },
      );
      req.on('error', () => resolve(false));
      req.on('timeout', () => {
        // 'timeout' only signals inactivity; the request must be torn down by hand.
        req.destroy();
        resolve(false);
      });
    } catch {
      // http.get rejected the options synchronously — same answer as "down".
      resolve(false);
    }
  });
}
45
-
46
/**
 * Ask the running server for its version via GET /api/health.
 *
 * Resolves the `version` field of the JSON body, or null when the server is
 * down, the request times out, the body is not JSON, the body has no truthy
 * `version`, the body exceeds 4096 characters, or the response is cut short.
 * Never throws and always settles — the oversized-body path used to destroy
 * the request without resolving, which left the caller awaiting forever.
 *
 * Used by the version-drift check: a stale server keeps running its old code
 * after an upgrade, so the caller compares what is running with what is
 * installed on disk.
 *
 * @param {number} port - local TCP port to query.
 * @param {number} [timeoutMs=2500] - socket inactivity timeout in milliseconds.
 * @returns {Promise<string|null>}
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  const maxBodyChars = 4096;
  return new Promise((resolve) => {
    let settled = false;
    // Several events can fire for one request (end + close, error + close);
    // only the first outcome counts.
    const settle = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };
    try {
      const req = http.get(
        { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
        (res) => {
          let body = '';
          res.setEncoding('utf8'); // decode across chunk boundaries
          res.on('data', (chunk) => {
            body += chunk;
            if (body.length > maxBodyChars) {
              // Not a health payload — stop reading and report "unknown".
              req.destroy();
              settle(null);
            }
          });
          res.on('end', () => {
            try {
              settle(JSON.parse(body).version || null);
            } catch {
              settle(null);
            }
          });
          res.on('error', () => settle(null));
          // 'close' without a prior 'end' means the response was truncated.
          res.on('close', () => settle(null));
        },
      );
      req.on('error', () => settle(null));
      req.on('timeout', () => {
        req.destroy();
        settle(null);
      });
    } catch {
      settle(null);
    }
  });
}
68
-
69
/**
 * Read the package version that is on disk right now.
 *
 * The value comes from a fresh read of the package.json two directories above
 * the server entry (<pkg>/server/src/index.mjs → <pkg>/package.json), not from
 * any in-memory constant. A long-lived supervisor needs this: after the
 * package is swapped in place its own constants are stale, and only a disk
 * read sees the new version.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY] - path of the server entry file.
 * @returns {string|null} the `version` field, or null when the manifest is
 *   missing, unreadable, unparsable, or has no truthy version.
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    const srcDir = path.dirname(entry);
    const manifestPath = path.resolve(srcDir, '..', '..', 'package.json');
    const { version } = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
    return version || null;
  } catch {
    return null;
  }
}
86
-
87
- // Captured ONCE at module load = the version of the code THIS supervisor process
88
- // is running. A fresh installedVersion() reads disk, which moves ahead after an
89
- // in-place `npm i -g`; the difference is the supervisor's OWN staleness (the
90
- // Part-8 gap). Distinct from APP_VERSION only in that we read the same file the
91
- // drift check reads, so they're guaranteed equal at startup (no false drift).
92
- export const SUPERVISOR_VERSION = installedVersion();
93
-
94
- export class WorkspaceSupervisor {
95
- constructor({
96
- serverEntry = DEFAULT_SERVER_ENTRY,
97
- workspaceDir = process.cwd(),
98
- port = Number(process.env.WILD_WORKSPACE_PORT || 5173),
99
- globalDir = path.join(os.homedir(), '.wild-workspace'),
100
- node = process.execPath,
101
- pollMs = 3000,
102
- backoffStartMs = 1000,
103
- backoffMaxMs = 30000,
104
- probeTimeoutMs = 2500,
105
- spawnImpl = spawn,
106
- probeImpl = probeHealth,
107
- nowImpl = () => Date.now(),
108
- env = process.env,
109
- crashLoopThreshold = 3,
110
- diagnosticsImpl = null,
111
- // RC1 version-drift auto-restart: when the RUNNING server reports an older
112
- // version than what's INSTALLED on disk, restart it so it picks up the new
113
- // code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
114
- // disables it (e.g. a developer running an intentionally-older server).
115
- autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
116
- versionImpl = probeHealthVersion,
117
- installedVersionImpl = () => installedVersion(serverEntry),
118
- // Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
119
- // whole stack on the user's channel, with health-gated rollback. On by
120
- // default; the env kill switch + the persisted off switch both disable it.
121
- // Only wired up in start() (not in the unit-test path, which calls tick()
122
- // directly) — see start(). updatePollMs is the *wake* cadence; the actual
123
- // check interval lives inside AutoUpdater (6h) and self-rate-limits.
124
- autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
125
- updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
126
- autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
127
- // Phase 3 (Pillar A prerequisite): the always-on supervisor keeps the bmo-sync
128
- // DAEMON alive too, independent of the workspace server. The daemon hosts the
129
- // out-of-band support channel (reachable when :5173 is down), so it must not
130
- // depend on the server being up. The server still ensureRunning()s the daemon
131
- // at boot (idempotent); this is the keep-alive owner. On by default; kill switch
132
- // WILD_WORKSPACE_NO_DAEMON_SUPERVISION=1. Only wired in start() (not the unit
133
- // -test path, which calls daemonTick() directly with an injected factory).
134
- superviseDaemon = env.WILD_WORKSPACE_NO_DAEMON_SUPERVISION !== '1',
135
- daemonPollMs = 10000, // probe the daemon every 10s
136
- daemonSupervisorFactory = null, // test seam: (supervisor) => DaemonSupervisor-like
137
- // Daemon version-drift restart (the daemon analog of RC1b): after an
138
- // auto-update installs a new daemon binary, the long-lived daemon process
139
- // keeps running the OLD code until something restarts it — so the support
140
- // channel silently won't activate. We recycle the daemon when the installed
141
- // subpackage version differs from the version the running daemon was spawned
142
- // under (tracked in `daemon-runtime.json`, since the daemon's /health reports
143
- // no version). Test seam: inject a version function.
144
- daemonVersionImpl = () => resolveDaemonVersion({ env }),
145
- // Supervisor self-restart after auto-update (the Part-8 stale-process fix):
146
- // once an update installs new code and the server child restarts + verifies
147
- // healthy, the supervisor must restart ITSELF so its own new code (e.g. the
148
- // daemon-drift recycle) loads — RC1b only restarts the child. Per-OS re-exec
149
- // lives in service.mjs::restartSelf. On by default; kill switch
150
- // WILD_WORKSPACE_NO_SELF_RESTART=1. A cooldown + a once-per-process guard
151
- // prevent any restart loop; the delay lets the triggering update tick unwind
152
- // and logs flush first. All seams injected (no real exit/spawn in tests).
153
- selfRestart = env.WILD_WORKSPACE_NO_SELF_RESTART !== '1',
154
- selfRestartCooldownMs = 10 * 60 * 1000,
155
- selfRestartDelayMs = 3000,
156
- restartSelfImpl = restartSelf,
157
- exitImpl = (code = 0) => process.exit(code),
158
- scheduleImpl = (fn, ms) => { const t = setTimeout(fn, ms); if (t.unref) t.unref(); return t; },
159
- // The version THIS supervisor process is running (captured at module load).
160
- // The self-drift backstop self-restarts when the installed-on-disk version
161
- // moves ahead of this — covering EVERY update path (our auto-updater, the
162
- // operator `update-now`, the CLI `update apply`, a manual `npm i -g`), not
163
- // just our own. null disables the backstop (tests default to null).
164
- selfVersion = SUPERVISOR_VERSION,
165
- } = {}) {
166
- Object.assign(this, {
167
- serverEntry, workspaceDir, port, globalDir, node, pollMs,
168
- backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
169
- crashLoopThreshold, diagnosticsImpl,
170
- autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
171
- autoUpdate, updatePollMs, autoUpdaterFactory,
172
- superviseDaemon, daemonPollMs, daemonSupervisorFactory, daemonVersionImpl,
173
- selfRestart, selfRestartCooldownMs, selfRestartDelayMs, restartSelfImpl, exitImpl, scheduleImpl,
174
- selfVersion,
175
- });
176
- this.autoUpdater = null;
177
- this.updateTimer = null;
178
- this.daemonSupervisor = null;
179
- this.daemonTimer = null;
180
- this._daemonTicking = false;
181
- this.daemonRuntimeFile = path.join(globalDir, 'daemon-runtime.json');
182
- // Persists the last self-restart time so a fresh post-re-exec supervisor
183
- // honours the cooldown too (belt-and-suspenders against a restart loop).
184
- this.selfRestartFile = path.join(globalDir, 'self-restart.json');
185
- this._selfRestartScheduled = false;
186
- this.logFile = path.join(globalDir, 'supervisor.log');
187
- this.serverLogFile = path.join(globalDir, 'server.out.log');
188
- this.lockFile = path.join(globalDir, 'supervisor.lock');
189
- // Phase 3.2: the bmo-sync daemon drops this file (a consented support
190
- // `restart-server` action) for us to action — so a restart can be triggered
191
- // out-of-band even when :5173 is wedged. We kill the child; the next tick
192
- // respawns it from disk (new code loads). Safe: absent file = no-op.
193
- this.restartRequestFile = path.join(globalDir, 'restart-request.json');
194
- this.child = null;
195
- this.backoff = backoffStartMs;
196
- this.lastSpawn = 0;
197
- this.timer = null;
198
- this.spawnCount = 0; // consecutive spawns without becoming healthy
199
- this.pushedThisEpisode = false; // crash-loop diagnostics pushed once per episode
200
- }
201
-
202
- log(msg) {
203
- try { fs.appendFileSync(this.logFile, `[${new Date().toISOString()}] ${msg}\n`); } catch { /* best-effort */ }
204
- }
205
-
206
- /** Is a pid alive? EPERM means "exists, not ours" → still alive. */
207
- pidAlive(pid) {
208
- try { process.kill(pid, 0); return true; } catch (e) { return !!(e && e.code === 'EPERM'); }
209
- }
210
-
211
- /** Exclusive lock; take over ONLY a stale lock (recorded pid no longer alive). */
212
- acquireLock() {
213
- try { fs.mkdirSync(this.globalDir, { recursive: true }); } catch { /* surfaced below */ }
214
- try {
215
- const fd = fs.openSync(this.lockFile, 'wx');
216
- fs.writeSync(fd, String(process.pid));
217
- fs.closeSync(fd);
218
- return true;
219
- } catch {
220
- let old = null;
221
- try { old = Number(fs.readFileSync(this.lockFile, 'utf8').trim()); } catch { /* unreadable */ }
222
- if (old && this.pidAlive(old)) {
223
- this.log(`live supervisor pid=${old} already running; exiting`);
224
- return false;
225
- }
226
- try { fs.writeFileSync(this.lockFile, String(process.pid)); this.log('took over stale lock'); return true; }
227
- catch { return false; }
228
- }
229
- }
230
-
231
- releaseLock() {
232
- try {
233
- if (Number(fs.readFileSync(this.lockFile, 'utf8').trim()) === process.pid) fs.unlinkSync(this.lockFile);
234
- } catch { /* already gone */ }
235
- }
236
-
237
- spawnServer() {
238
- let out = 'ignore';
239
- try { out = fs.openSync(this.serverLogFile, 'a'); } catch { /* output discarded */ }
240
- this.child = this.spawnImpl(this.node, [this.serverEntry], {
241
- cwd: this.workspaceDir,
242
- windowsHide: true,
243
- stdio: ['ignore', out, out],
244
- env: { ...this.env, WILD_WORKSPACE_NO_OPEN: '1', WILD_WORKSPACE_DIR: this.workspaceDir },
245
- });
246
- if (typeof out === 'number') { try { fs.closeSync(out); } catch { /* parent fd */ } }
247
- this.lastSpawn = this.nowImpl();
248
- const pid = this.child && this.child.pid;
249
- this.log(`spawned server pid=${pid} (backoff=${this.backoff}ms)`);
250
- if (this.child && this.child.on) {
251
- this.child.on('exit', (code, sig) => { this.log(`server pid=${pid} exited code=${code} sig=${sig}`); this.child = null; });
252
- }
253
- return this.child;
254
- }
255
-
256
- /**
257
- * Consume a pending support `restart-server` request (Phase 3.2). Returns true
258
- * iff a request file was present (and removes it). Reading-then-deleting makes
259
- * "present" mean "unhandled" — idempotent across ticks.
260
- */
261
- consumeRestartRequest() {
262
- try {
263
- fs.readFileSync(this.restartRequestFile); // throws if absent
264
- } catch {
265
- return false;
266
- }
267
- try { fs.unlinkSync(this.restartRequestFile); } catch { /* best-effort */ }
268
- return true;
269
- }
270
-
271
- /** One supervision step. Returns its decision (exposed for tests). */
272
- async tick() {
273
- // Phase 3.2: a consented support restart request takes priority — kill the
274
- // child so the next tick respawns it from disk (picks up any new code).
275
- if (this.consumeRestartRequest()) {
276
- this.log('restart-server requested (support channel) — restarting');
277
- this.restartChild();
278
- return 'restart-requested';
279
- }
280
- // Part-8 backstop: if disk moved ahead of our own code (any update path),
281
- // schedule a supervisor self-restart. Side-effect only — never changes the
282
- // tick decision below (server/daemon healing proceeds as usual meanwhile).
283
- this.maybeSelfRestartOnDrift();
284
- if (await this.probeImpl(this.port, this.probeTimeoutMs)) {
285
- this.backoff = this.backoffStartMs; // healthy → reset backoff
286
- this.spawnCount = 0; // healthy → not a crash loop
287
- this.pushedThisEpisode = false;
288
- // RC1 version drift: a healthy-but-STALE server (running older code than
289
- // what's installed) should be restarted so the upgrade actually lands.
290
- // Only when WE own the child — we restart by killing it and letting the
291
- // next tick respawn (which reloads index.mjs from disk). A server started
292
- // by someone else (foreground `wild-workspace`) we leave alone; we have no
293
- // handle on it. The restarted server reports the installed version, so the
294
- // drift clears and this won't loop.
295
- if (this.autoRestartOnVersionDrift && this.child) {
296
- try {
297
- const running = await this.versionImpl(this.port, this.probeTimeoutMs);
298
- const installed = this.installedVersionImpl();
299
- if (running && installed && running !== installed) {
300
- this.log(`version drift: running=${running} installed=${installed} — restarting server`);
301
- try { this.child.kill(); } catch { /* exit handler clears child */ }
302
- this.child = null;
303
- this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
304
- return 'version-drift-restart';
305
- }
306
- } catch (e) {
307
- this.log(`version-drift check error: ${e?.message || e}`);
308
- }
309
- }
310
- return 'healthy';
311
- }
312
- if (this.child) return 'booting'; // spawned, still coming up
313
- if (this.nowImpl() - this.lastSpawn < this.backoff) return 'backoff';
314
- this.spawnServer();
315
- this.backoff = Math.min(this.backoff * 2, this.backoffMaxMs);
316
- this.spawnCount += 1;
317
- // Crash loop: the server won't stay up, so the operator channel (which rides
318
- // the :5173 server) can't reach this machine at all. Push an install-down
319
- // `doctor` bundle to bmo-sync ONCE per episode so support sees it anyway —
320
- // the install-failed-before-server-up case (docs/user-experience.md §5).
321
- if (this.spawnCount >= this.crashLoopThreshold && !this.pushedThisEpisode) {
322
- this.pushedThisEpisode = true;
323
- Promise.resolve(this.pushDiagnostics()).catch((e) => this.log(`diag push error: ${e?.message || e}`));
324
- }
325
- return 'spawned';
326
- }
327
-
328
- /**
329
- * Push an install-down diagnostic bundle to bmo-sync. Injected (`diagnosticsImpl`)
330
- * in tests; the real path is consent- + token-gated and never runs under the
331
- * test runner. Best-effort, never throws into the supervision loop.
332
- */
333
- async pushDiagnostics() {
334
- if (this.diagnosticsImpl) return this.diagnosticsImpl(this);
335
- if (process.env.VITEST || process.env.NODE_ENV === 'test') return;
336
- try {
337
- const [{ buildConfig }, { runDoctor }, { loadObservabilityConsent }] = await Promise.all([
338
- import('./config.mjs'),
339
- import('./doctor.mjs'),
340
- import('./observability.mjs'),
341
- ]);
342
- const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
343
- if (!config.accountToken) return; // can't key it to a user
344
- if (process.env.WILD_WORKSPACE_NO_TELEMETRY === '1') return; // kill switch
345
- if (!loadObservabilityConsent(config.dataDir).enabled) return; // consent
346
- const report = await runDoctor({ config });
347
- const url = `${config.bmoSyncServerUrl.replace(/\/$/, '')}/api/telemetry`;
348
- const ctrl = new AbortController();
349
- const t = setTimeout(() => ctrl.abort(), 5000);
350
- try {
351
- await fetch(url, {
352
- method: 'POST',
353
- headers: { 'content-type': 'application/json' },
354
- body: JSON.stringify({
355
- account_token: config.accountToken,
356
- slug: config.account?.slug || null,
357
- workspace_id: config.workspaceId,
358
- kind: 'install-down',
359
- doctor: report,
360
- sent_at: Math.floor(Date.now() / 1000),
361
- }),
362
- signal: ctrl.signal,
363
- });
364
- this.log(`pushed install-down diagnostics (fail=${report.summary?.fail})`);
365
- } finally {
366
- clearTimeout(t);
367
- }
368
- } catch (e) {
369
- this.log(`diagnostics push failed: ${e?.message || e}`);
370
- }
371
- }
372
-
373
- /**
374
- * Restart the supervised server child so freshly installed code is loaded.
375
- * Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
376
- * the same mechanism as the version-drift restart, exposed for the AutoUpdater.
377
- * No-op (returns false) when we don't own a child (foreground server).
378
- */
379
- restartChild() {
380
- if (!this.child) return false;
381
- this.log('restartChild: killing server to load new code');
382
- try { this.child.kill(); } catch { /* exit handler clears child */ }
383
- this.child = null;
384
- this.backoff = this.backoffStartMs; // an intentional restart, not a crash
385
- return true;
386
- }
387
-
388
- /** The last self-restart time (epoch ms), or 0. Used for the loop-guard cooldown. */
389
- readLastSelfRestart() {
390
- try { return Number(JSON.parse(fs.readFileSync(this.selfRestartFile, 'utf8')).at) || 0; }
391
- catch { return 0; }
392
- }
393
-
394
- writeLastSelfRestart(at) {
395
- try {
396
- fs.mkdirSync(this.globalDir, { recursive: true });
397
- fs.writeFileSync(this.selfRestartFile, JSON.stringify({ at }));
398
- } catch { /* best-effort */ }
399
- }
400
-
401
- /**
402
- * Schedule a supervisor self-restart so freshly-installed SUPERVISOR code loads
403
- * (the Part-8 stale-process fix). Called from the AutoUpdater's onUpdate hook
404
- * AFTER an update installed + restarted the server child + verified it healthy —
405
- * so a bad release has already rolled back before we re-exec ourselves. Guarded
406
- * three ways against a restart loop: the kill switch, a once-per-process flag,
407
- * and a persisted cooldown (survives the re-exec). Returns a status string
408
- * ('scheduled' | 'disabled' | 'already' | 'cooldown') for tests/logging. The
409
- * actual restart runs on a short delay so the triggering tick unwinds first.
410
- */
411
- scheduleSelfRestart(reason) {
412
- if (!this.selfRestart) return 'disabled';
413
- if (this._selfRestartScheduled) return 'already';
414
- const now = this.nowImpl();
415
- const last = this.readLastSelfRestart();
416
- if (last && now - last < this.selfRestartCooldownMs) {
417
- this.log(`self-restart skipped (cooldown, last ${Math.round((now - last) / 1000)}s ago) — ${reason}`);
418
- return 'cooldown';
419
- }
420
- this._selfRestartScheduled = true;
421
- this.writeLastSelfRestart(now);
422
- this.log(`self-restart scheduled in ${this.selfRestartDelayMs}ms — ${reason}`);
423
- this.scheduleImpl(() => {
424
- this._performSelfRestart(reason).catch((e) => this.log(`self-restart error: ${e?.message || e}`));
425
- }, this.selfRestartDelayMs);
426
- return 'scheduled';
427
- }
428
-
429
- /**
430
- * Carry out the self-restart. On mac/Linux the service manager kills+relaunches
431
- * us (we just issue the command and get SIGTERM'd → our exit handler releases the
432
- * lock). On Windows restartSelf spawned a hidden successor and returns
433
- * willExit:true — we then release the lock (via stop()) and exit so the successor
434
- * can take it. A non-managed run reports restarted:false and we stay up on the
435
- * old code (no worse than before this feature). Never throws.
436
- */
437
- async _performSelfRestart(reason) {
438
- this.log(`self-restart now — ${reason}`);
439
- let r;
440
- try {
441
- r = await this.restartSelfImpl({ dir: this.globalDir, port: this.port });
442
- } catch (e) {
443
- this.log(`self-restart impl error: ${e?.message || e}`);
444
- return { restarted: false, error: e?.message || String(e) };
445
- }
446
- this.log(`self-restart result: ${JSON.stringify(r)}`);
447
- if (r && r.willExit) {
448
- this.stop(); // clears timers + releases the lock so the successor can take it
449
- this.exitImpl(0);
450
- }
451
- return r;
452
- }
453
-
454
- /**
455
- * Backstop for the Part-8 gap on EVERY update path, not just our own auto-
456
- * updater: when the version installed on disk no longer matches the code THIS
457
- * supervisor is running, the supervisor is stale → schedule a self-restart.
458
- * RC1b already restarts the stale server child and daemonTick recycles the
459
- * stale daemon; this is the missing third leg (the supervisor itself), so an
460
- * operator `update-now` / CLI `update apply` / manual `npm i -g` also lands new
461
- * supervisor code with no reboot. Skipped while OUR auto-updater is mid-flight
462
- * so the rollback window is respected (that path self-restarts via the onUpdate
463
- * hook, only after verify succeeds). Cheap (an in-memory compare guarding a disk
464
- * read) and idempotent (scheduleSelfRestart de-dupes). Never throws.
465
- */
466
- maybeSelfRestartOnDrift() {
467
- if (!this.selfRestart || !this.selfVersion) return false;
468
- if (this._selfRestartScheduled) return false;
469
- if (this.autoUpdater && this.autoUpdater.inProgress) return false; // respect rollback window
470
- let installed = null;
471
- try { installed = this.installedVersionImpl(); } catch { return false; }
472
- if (!installed || installed === this.selfVersion) return false;
473
- this.log(`supervisor version drift: running=${this.selfVersion} installed=${installed} — self-restarting`);
474
- this.scheduleSelfRestart(`supervisor drift ${this.selfVersion}→${installed}`);
475
- return true;
476
- }
477
-
478
- /** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
479
- async buildAutoUpdater() {
480
- if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
481
- // Lazy import keeps the unit-test path (which never calls start()) free of the
482
- // auto-update module + its registry/npm seams.
483
- const { AutoUpdater } = await import('./auto-update.mjs');
484
- return new AutoUpdater({
485
- globalDir: this.globalDir,
486
- port: this.port,
487
- installedVersionImpl: this.installedVersionImpl,
488
- healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
489
- restartImpl: async () => { this.restartChild(); },
490
- nowImpl: this.nowImpl,
491
- env: this.env,
492
- logImpl: (m) => this.log(m),
493
- onUpdate: (rec) => {
494
- this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`);
495
- // A genuine version change landed healthy → restart the supervisor itself
496
- // so its own new code loads (Part-8 stale-process fix). Guarded against
497
- // loops inside scheduleSelfRestart. Fires only on a real bump (to≠from),
498
- // never on rollback/failure (those statuses aren't 'ok').
499
- if (rec.status === 'ok' && rec.to && rec.from && rec.to !== rec.from) {
500
- this.scheduleSelfRestart(`auto-update ${rec.from}→${rec.to}`);
501
- }
502
- },
503
- });
504
- }
505
-
506
- runUpdateTick(opts = {}) {
507
- if (!this.autoUpdater) return;
508
- this.autoUpdater.tick(opts)
509
- .then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
510
- .catch((e) => this.log(`auto-update error: ${e?.message || e}`));
511
- }
512
-
513
- /**
514
- * Build the DaemonSupervisor the always-on layer owns. Reads a FRESH config
515
- * (not the stale module constant) so the account token / relay in effect when
516
- * always-on starts are used. Lazy import keeps the unit-test path (which never
517
- * calls start()) free of config + daemon-supervisor. Test seam: factory.
518
- */
519
- async buildDaemonSupervisor() {
520
- if (this.daemonSupervisorFactory) return this.daemonSupervisorFactory(this);
521
- const [{ buildConfig }, { DaemonSupervisor }] = await Promise.all([
522
- import('./config.mjs'),
523
- import('./daemon-supervisor.mjs'),
524
- ]);
525
- const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
526
- return new DaemonSupervisor({
527
- httpBase: config.daemonHttpUrl,
528
- globalDir: this.globalDir,
529
- accountToken: config.accountToken,
530
- serverUrl: config.bmoSyncServerUrl,
531
- });
532
- }
533
-
534
- /**
535
- * One daemon-supervision step: if the daemon isn't answering /health, (re)start
536
- * it. Deliberately INDEPENDENT of server health — the daemon (and its support
537
- * channel) must stay up even when the server is crashed/mid-upgrade. Re-entrancy
538
- * guarded so a slow spawn can't overlap the next tick. Never throws.
539
- */
540
- /** The daemon version the currently-running daemon was spawned under, or null. */
541
- readDaemonMarker() {
542
- try {
543
- const v = JSON.parse(fs.readFileSync(this.daemonRuntimeFile, 'utf8'))?.daemonVersion;
544
- return typeof v === 'string' ? v : null;
545
- } catch {
546
- return null;
547
- }
548
- }
549
-
550
- writeDaemonMarker(version) {
551
- if (!version) return; // unknown installed version (PATH/vendor) — don't pin
552
- try {
553
- fs.mkdirSync(this.globalDir, { recursive: true });
554
- fs.writeFileSync(this.daemonRuntimeFile, JSON.stringify({ daemonVersion: version }));
555
- } catch {
556
- /* best-effort */
557
- }
558
- }
559
-
560
- async daemonTick() {
561
- if (!this.daemonSupervisor || this._daemonTicking) return 'skip';
562
- this._daemonTicking = true;
563
- try {
564
- const installed = this.daemonVersionImpl();
565
- const h = await this.daemonSupervisor.health();
566
- if (h && h.running) {
567
- // Running — but is it the CURRENT binary? After an auto-update the daemon
568
- // keeps the old code until recycled (RC1b analog). Recycle when the
569
- // installed version differs from what we recorded at spawn (a null marker
570
- // = spawned by a pre-drift-aware supervisor → treat as drift, recycle once).
571
- if (installed && this.readDaemonMarker() !== installed && this.daemonSupervisor.recycle) {
572
- this.log(
573
- `daemon version drift (marker=${this.readDaemonMarker() || 'none'} installed=${installed}) — recycling`,
574
- );
575
- const r = await this.daemonSupervisor.recycle();
576
- if (r && r.started) {
577
- this.writeDaemonMarker(installed);
578
- this.log(`daemon recycled to ${installed} (pid=${r.pid})`);
579
- return 'recycled';
580
- }
581
- this.log(`daemon recycle failed: ${r?.error || 'unknown'}`);
582
- return 'recycle-failed';
583
- }
584
- return 'healthy';
585
- }
586
- const r = await this.daemonSupervisor.ensureRunning();
587
- if (r && r.started) {
588
- this.writeDaemonMarker(installed);
589
- this.log(`daemon respawned (pid=${r.pid})`);
590
- return 'respawned';
591
- }
592
- if (r && r.alreadyRunning) return 'healthy';
593
- this.log(`daemon down, respawn not started: ${r?.error || 'unknown'}`);
594
- return 'failed';
595
- } catch (e) {
596
- this.log(`daemon-tick error: ${e?.message || e}`);
597
- return 'error';
598
- } finally {
599
- this._daemonTicking = false;
600
- }
601
- }
602
-
603
- /** Acquire the lock and start the supervision loop. Idempotent across processes. */
604
- start() {
605
- if (!this.acquireLock()) return { started: false, reason: 'already-running' };
606
- process.on('exit', () => this.releaseLock());
607
- process.on('SIGTERM', () => process.exit(0));
608
- process.on('SIGINT', () => process.exit(0));
609
- this.log(`supervisor start pid=${process.pid} v${this.selfVersion || '?'} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
610
- this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
611
- this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
612
-
613
- // Phase 2 auto-update: wake on a slow timer; the first check fires shortly
614
- // after start so the server has time to come up (verify reads its /health).
615
- if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
616
- this.buildAutoUpdater().then((u) => {
617
- this.autoUpdater = u;
618
- this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
619
- if (this.updateTimer.unref) this.updateTimer.unref();
620
- // FORCE the first post-boot check (bypass the 6h dueForCheck): a restart
621
- // should always pull the latest, so a rebooted/woken machine can't sit on
622
- // a stale version just because it last checked <6h ago.
623
- const kick = setTimeout(() => this.runUpdateTick({ force: true }), 60_000);
624
- if (kick.unref) kick.unref();
625
- }).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
626
- }
627
-
628
- // Phase 3: keep the bmo-sync daemon alive independent of the server, so the
629
- // out-of-band support channel survives the server being down.
630
- if (this.superviseDaemon && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
631
- this.buildDaemonSupervisor().then((d) => {
632
- this.daemonSupervisor = d;
633
- this.daemonTimer = setInterval(() => { this.daemonTick().catch((e) => this.log(`daemon-tick error: ${e?.message || e}`)); }, this.daemonPollMs);
634
- if (this.daemonTimer.unref) this.daemonTimer.unref();
635
- this.daemonTick().catch(() => {}); // first probe now
636
- }).catch((e) => this.log(`daemon supervision init error: ${e?.message || e}`));
637
- }
638
- return { started: true };
639
- }
640
-
641
- stop() {
642
- if (this.timer) { clearInterval(this.timer); this.timer = null; }
643
- if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
644
- if (this.daemonTimer) { clearInterval(this.daemonTimer); this.daemonTimer = null; }
645
- this.releaseLock();
646
- }
647
- }
1
+ // WorkspaceSupervisor — keeps the wild-workspace server alive in the background.
2
+ //
3
+ // The server itself auto-starts the bmo-sync daemon on boot (DaemonSupervisor),
4
+ // so keeping the server up brings the whole local stack — public URL included —
5
+ // back to life. This is the watchdog half of the always-on feature
6
+ // (docs/always-on-design.md); `service.mjs` is the per-OS autostart half that
7
+ // launches this hidden at login via `wild-workspace service run`.
8
+ //
9
+ // Design (all proven on Windows incl. a real reboot, 2026-05-30):
10
+ // - Health-driven: polls GET /api/health and (re)spawns the server only when
11
+ // it is down — so it never fights a server someone else started and handles
12
+ // crash recovery naturally.
13
+ // - Singleton: an exclusive lockfile in the machine-global dir
14
+ // (~/.wild-workspace, NEVER the synced workspace — locked principle #1).
15
+ // A stale lock whose pid is dead is taken over.
16
+ // - Exponential backoff (capped) so a crash-looping server can't spin the CPU.
17
+ // - Everything is logged — silent death is the #1 un-debuggable failure mode.
18
+ //
19
+ // Every external touch-point (spawn, health probe, clock) is an injected seam
20
+ // so the suite never spawns a real process.
21
+
22
+ import { spawn } from 'node:child_process';
23
+ import http from 'node:http';
24
+ import fs from 'node:fs';
25
+ import os from 'node:os';
26
+ import path from 'node:path';
27
+ import { fileURLToPath } from 'node:url';
28
+ import { resolveDaemonVersion } from './daemon-bin.mjs';
29
+ import { restartSelf } from './service.mjs';
30
+
31
// ESM has no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(new URL(import.meta.url)));
// Server entry script that sits next to this module.
const DEFAULT_SERVER_ENTRY = path.resolve(__dirname, 'index.mjs');
33
+
34
/**
 * Probe GET /api/health on 127.0.0.1.
 * @param {number} port local port to probe.
 * @param {number} [timeoutMs=2500] socket timeout for the request.
 * @returns {Promise<boolean>} true when any HTTP response arrives (whatever
 *   its status code); false on a connection error or timeout.
 */
export function probeHealth(port, timeoutMs = 2500) {
  const options = { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs };
  return new Promise((resolve) => {
    const onResponse = (res) => {
      res.resume(); // drain the body; only the status matters
      resolve(res.statusCode > 0);
    };
    const request = http.get(options, onResponse);
    request.on('error', () => {
      resolve(false);
    });
    request.on('timeout', () => {
      request.destroy();
      resolve(false);
    });
  });
}
45
+
46
/**
 * Ask the local server for its version via GET /api/health.
 *
 * Always settles and never rejects on network conditions: resolves with the
 * `version` field of the JSON body, or null when the server is down, the
 * request times out, the body is not JSON / has no version, the body exceeds
 * the size cap, or the response is cut off before it completes.
 *
 * @param {number} port local port to probe.
 * @param {number} [timeoutMs=2500] socket timeout for the request.
 * @returns {Promise<*>} the reported version (normally a string) or null.
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  const maxBodyChars = 4096;
  return new Promise((resolve) => {
    const req = http.get(
      { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
      (res) => {
        let body = '';
        res.setEncoding('utf8');
        res.on('data', (d) => {
          body += d;
          if (body.length > maxBodyChars) {
            // Destroying the request suppresses 'end', so settle here —
            // otherwise the promise would never resolve.
            req.destroy();
            resolve(null);
          }
        });
        res.on('end', () => {
          try { resolve(JSON.parse(body).version || null); } catch { resolve(null); }
        });
        // An aborted/reset response emits 'error'/'close' without 'end'.
        // Resolving again after 'end' already settled is a harmless no-op.
        res.on('error', () => resolve(null));
        res.on('close', () => resolve(null));
      },
    );
    req.on('error', () => resolve(null));
    req.on('timeout', () => { req.destroy(); resolve(null); });
  });
}
68
+
69
/**
 * Read the package version from disk on every call (no caching).
 *
 * The manifest is located relative to the server entry script: two
 * directories above the entry's own directory.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY] path of the server entry script.
 * @returns {*} the manifest's `version` (normally a string), or null when the
 *   file is unreadable, unparsable, or has no truthy version.
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    // <pkg>/server/src/index.mjs -> <pkg>/package.json
    const entryDir = path.dirname(entry);
    const manifestPath = path.resolve(entryDir, '..', '..', 'package.json');
    const { version } = JSON.parse(fs.readFileSync(manifestPath, 'utf8'));
    return version || null;
  } catch {
    return null;
  }
}
86
+
87
// Evaluated once, when this module loads: the package version on disk at the
// moment this supervisor process started. Later installedVersion() calls
// re-read the file, so a mismatch against this constant means the package was
// replaced underneath a still-running supervisor. Both values come from the
// same manifest, so they are equal at startup.
export const SUPERVISOR_VERSION = installedVersion(DEFAULT_SERVER_ENTRY);
93
+
94
+ export class WorkspaceSupervisor {
95
+ constructor({
96
+ serverEntry = DEFAULT_SERVER_ENTRY,
97
+ workspaceDir = process.cwd(),
98
+ port = Number(process.env.WILD_WORKSPACE_PORT || 5173),
99
+ globalDir = path.join(os.homedir(), '.wild-workspace'),
100
+ node = process.execPath,
101
+ pollMs = 3000,
102
+ backoffStartMs = 1000,
103
+ backoffMaxMs = 30000,
104
+ probeTimeoutMs = 2500,
105
+ spawnImpl = spawn,
106
+ probeImpl = probeHealth,
107
+ nowImpl = () => Date.now(),
108
+ env = process.env,
109
+ crashLoopThreshold = 3,
110
+ diagnosticsImpl = null,
111
+ // RC1 version-drift auto-restart: when the RUNNING server reports an older
112
+ // version than what's INSTALLED on disk, restart it so it picks up the new
113
+ // code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
114
+ // disables it (e.g. a developer running an intentionally-older server).
115
+ autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
116
+ versionImpl = probeHealthVersion,
117
+ installedVersionImpl = () => installedVersion(serverEntry),
118
+ // Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
119
+ // whole stack on the user's channel, with health-gated rollback. On by
120
+ // default; the env kill switch + the persisted off switch both disable it.
121
+ // Only wired up in start() (not in the unit-test path, which calls tick()
122
+ // directly) — see start(). updatePollMs is the *wake* cadence; the actual
123
+ // check interval lives inside AutoUpdater (6h) and self-rate-limits.
124
+ autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
125
+ updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
126
+ autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
127
+ // Phase 3 (Pillar A prerequisite): the always-on supervisor keeps the bmo-sync
128
+ // DAEMON alive too, independent of the workspace server. The daemon hosts the
129
+ // out-of-band support channel (reachable when :5173 is down), so it must not
130
+ // depend on the server being up. The server still ensureRunning()s the daemon
131
+ // at boot (idempotent); this is the keep-alive owner. On by default; kill switch
132
+ // WILD_WORKSPACE_NO_DAEMON_SUPERVISION=1. Only wired in start() (not the unit
133
+ // -test path, which calls daemonTick() directly with an injected factory).
134
+ superviseDaemon = env.WILD_WORKSPACE_NO_DAEMON_SUPERVISION !== '1',
135
+ daemonPollMs = 10000, // probe the daemon every 10s
136
+ daemonSupervisorFactory = null, // test seam: (supervisor) => DaemonSupervisor-like
137
+ // Daemon version-drift restart (the daemon analog of RC1b): after an
138
+ // auto-update installs a new daemon binary, the long-lived daemon process
139
+ // keeps running the OLD code until something restarts it — so the support
140
+ // channel silently won't activate. We recycle the daemon when the installed
141
+ // subpackage version differs from the version the running daemon was spawned
142
+ // under (tracked in `daemon-runtime.json`, since the daemon's /health reports
143
+ // no version). Test seam: inject a version function.
144
+ daemonVersionImpl = () => resolveDaemonVersion({ env }),
145
+ // Supervisor self-restart after auto-update (the Part-8 stale-process fix):
146
+ // once an update installs new code and the server child restarts + verifies
147
+ // healthy, the supervisor must restart ITSELF so its own new code (e.g. the
148
+ // daemon-drift recycle) loads — RC1b only restarts the child. Per-OS re-exec
149
+ // lives in service.mjs::restartSelf. On by default; kill switch
150
+ // WILD_WORKSPACE_NO_SELF_RESTART=1. A cooldown + a once-per-process guard
151
+ // prevent any restart loop; the delay lets the triggering update tick unwind
152
+ // and logs flush first. All seams injected (no real exit/spawn in tests).
153
+ selfRestart = env.WILD_WORKSPACE_NO_SELF_RESTART !== '1',
154
+ selfRestartCooldownMs = 10 * 60 * 1000,
155
+ selfRestartDelayMs = 3000,
156
+ restartSelfImpl = restartSelf,
157
+ exitImpl = (code = 0) => process.exit(code),
158
+ scheduleImpl = (fn, ms) => { const t = setTimeout(fn, ms); if (t.unref) t.unref(); return t; },
159
+ // The version THIS supervisor process is running (captured at module load).
160
+ // The self-drift backstop self-restarts when the installed-on-disk version
161
+ // moves ahead of this — covering EVERY update path (our auto-updater, the
162
+ // operator `update-now`, the CLI `update apply`, a manual `npm i -g`), not
163
+ // just our own. null disables the backstop (tests default to null).
164
+ selfVersion = SUPERVISOR_VERSION,
165
+ } = {}) {
166
+ Object.assign(this, {
167
+ serverEntry, workspaceDir, port, globalDir, node, pollMs,
168
+ backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
169
+ crashLoopThreshold, diagnosticsImpl,
170
+ autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
171
+ autoUpdate, updatePollMs, autoUpdaterFactory,
172
+ superviseDaemon, daemonPollMs, daemonSupervisorFactory, daemonVersionImpl,
173
+ selfRestart, selfRestartCooldownMs, selfRestartDelayMs, restartSelfImpl, exitImpl, scheduleImpl,
174
+ selfVersion,
175
+ });
176
+ this.autoUpdater = null;
177
+ this.updateTimer = null;
178
+ this.daemonSupervisor = null;
179
+ this.daemonTimer = null;
180
+ this._daemonTicking = false;
181
+ this.daemonRuntimeFile = path.join(globalDir, 'daemon-runtime.json');
182
+ // Persists the last self-restart time so a fresh post-re-exec supervisor
183
+ // honours the cooldown too (belt-and-suspenders against a restart loop).
184
+ this.selfRestartFile = path.join(globalDir, 'self-restart.json');
185
+ this._selfRestartScheduled = false;
186
+ this.logFile = path.join(globalDir, 'supervisor.log');
187
+ this.serverLogFile = path.join(globalDir, 'server.out.log');
188
+ this.lockFile = path.join(globalDir, 'supervisor.lock');
189
+ // Phase 3.2: the bmo-sync daemon drops this file (a consented support
190
+ // `restart-server` action) for us to action — so a restart can be triggered
191
+ // out-of-band even when :5173 is wedged. We kill the child; the next tick
192
+ // respawns it from disk (new code loads). Safe: absent file = no-op.
193
+ this.restartRequestFile = path.join(globalDir, 'restart-request.json');
194
+ this.child = null;
195
+ this.backoff = backoffStartMs;
196
+ this.lastSpawn = 0;
197
+ this.timer = null;
198
+ this.spawnCount = 0; // consecutive spawns without becoming healthy
199
+ this.pushedThisEpisode = false; // crash-loop diagnostics pushed once per episode
200
+ }
201
+
202
+ log(msg) {
203
+ try { fs.appendFileSync(this.logFile, `[${new Date().toISOString()}] ${msg}\n`); } catch { /* best-effort */ }
204
+ }
205
+
206
+ /** Is a pid alive? EPERM means "exists, not ours" → still alive. */
207
+ pidAlive(pid) {
208
+ try { process.kill(pid, 0); return true; } catch (e) { return !!(e && e.code === 'EPERM'); }
209
+ }
210
+
211
+ /** Exclusive lock; take over ONLY a stale lock (recorded pid no longer alive). */
212
+ acquireLock() {
213
+ try { fs.mkdirSync(this.globalDir, { recursive: true }); } catch { /* surfaced below */ }
214
+ try {
215
+ const fd = fs.openSync(this.lockFile, 'wx');
216
+ fs.writeSync(fd, String(process.pid));
217
+ fs.closeSync(fd);
218
+ return true;
219
+ } catch {
220
+ let old = null;
221
+ try { old = Number(fs.readFileSync(this.lockFile, 'utf8').trim()); } catch { /* unreadable */ }
222
+ if (old && this.pidAlive(old)) {
223
+ this.log(`live supervisor pid=${old} already running; exiting`);
224
+ return false;
225
+ }
226
+ try { fs.writeFileSync(this.lockFile, String(process.pid)); this.log('took over stale lock'); return true; }
227
+ catch { return false; }
228
+ }
229
+ }
230
+
231
+ releaseLock() {
232
+ try {
233
+ if (Number(fs.readFileSync(this.lockFile, 'utf8').trim()) === process.pid) fs.unlinkSync(this.lockFile);
234
+ } catch { /* already gone */ }
235
+ }
236
+
237
+ spawnServer() {
238
+ let out = 'ignore';
239
+ try { out = fs.openSync(this.serverLogFile, 'a'); } catch { /* output discarded */ }
240
+ this.child = this.spawnImpl(this.node, [this.serverEntry], {
241
+ cwd: this.workspaceDir,
242
+ windowsHide: true,
243
+ stdio: ['ignore', out, out],
244
+ env: { ...this.env, WILD_WORKSPACE_NO_OPEN: '1', WILD_WORKSPACE_DIR: this.workspaceDir },
245
+ });
246
+ if (typeof out === 'number') { try { fs.closeSync(out); } catch { /* parent fd */ } }
247
+ this.lastSpawn = this.nowImpl();
248
+ const pid = this.child && this.child.pid;
249
+ this.log(`spawned server pid=${pid} (backoff=${this.backoff}ms)`);
250
+ if (this.child && this.child.on) {
251
+ this.child.on('exit', (code, sig) => { this.log(`server pid=${pid} exited code=${code} sig=${sig}`); this.child = null; });
252
+ }
253
+ return this.child;
254
+ }
255
+
256
+ /**
257
+ * Consume a pending support `restart-server` request (Phase 3.2). Returns true
258
+ * iff a request file was present (and removes it). Reading-then-deleting makes
259
+ * "present" mean "unhandled" — idempotent across ticks.
260
+ */
261
+ consumeRestartRequest() {
262
+ try {
263
+ fs.readFileSync(this.restartRequestFile); // throws if absent
264
+ } catch {
265
+ return false;
266
+ }
267
+ try { fs.unlinkSync(this.restartRequestFile); } catch { /* best-effort */ }
268
+ return true;
269
+ }
270
+
271
+ /** One supervision step. Returns its decision (exposed for tests). */
272
+ async tick() {
273
+ // Phase 3.2: a consented support restart request takes priority — kill the
274
+ // child so the next tick respawns it from disk (picks up any new code).
275
+ if (this.consumeRestartRequest()) {
276
+ this.log('restart-server requested (support channel) — restarting');
277
+ this.restartChild();
278
+ return 'restart-requested';
279
+ }
280
+ // Part-8 backstop: if disk moved ahead of our own code (any update path),
281
+ // schedule a supervisor self-restart. Side-effect only — never changes the
282
+ // tick decision below (server/daemon healing proceeds as usual meanwhile).
283
+ this.maybeSelfRestartOnDrift();
284
+ if (await this.probeImpl(this.port, this.probeTimeoutMs)) {
285
+ this.backoff = this.backoffStartMs; // healthy → reset backoff
286
+ this.spawnCount = 0; // healthy → not a crash loop
287
+ this.pushedThisEpisode = false;
288
+ // RC1 version drift: a healthy-but-STALE server (running older code than
289
+ // what's installed) should be restarted so the upgrade actually lands.
290
+ // Only when WE own the child — we restart by killing it and letting the
291
+ // next tick respawn (which reloads index.mjs from disk). A server started
292
+ // by someone else (foreground `wild-workspace`) we leave alone; we have no
293
+ // handle on it. The restarted server reports the installed version, so the
294
+ // drift clears and this won't loop.
295
+ if (this.autoRestartOnVersionDrift && this.child) {
296
+ try {
297
+ const running = await this.versionImpl(this.port, this.probeTimeoutMs);
298
+ const installed = this.installedVersionImpl();
299
+ if (running && installed && running !== installed) {
300
+ this.log(`version drift: running=${running} installed=${installed} — restarting server`);
301
+ try { this.child.kill(); } catch { /* exit handler clears child */ }
302
+ this.child = null;
303
+ this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
304
+ return 'version-drift-restart';
305
+ }
306
+ } catch (e) {
307
+ this.log(`version-drift check error: ${e?.message || e}`);
308
+ }
309
+ }
310
+ return 'healthy';
311
+ }
312
+ if (this.child) return 'booting'; // spawned, still coming up
313
+ if (this.nowImpl() - this.lastSpawn < this.backoff) return 'backoff';
314
+ this.spawnServer();
315
+ this.backoff = Math.min(this.backoff * 2, this.backoffMaxMs);
316
+ this.spawnCount += 1;
317
+ // Crash loop: the server won't stay up, so the operator channel (which rides
318
+ // the :5173 server) can't reach this machine at all. Push an install-down
319
+ // `doctor` bundle to bmo-sync ONCE per episode so support sees it anyway —
320
+ // the install-failed-before-server-up case (docs/user-experience.md §5).
321
+ if (this.spawnCount >= this.crashLoopThreshold && !this.pushedThisEpisode) {
322
+ this.pushedThisEpisode = true;
323
+ Promise.resolve(this.pushDiagnostics()).catch((e) => this.log(`diag push error: ${e?.message || e}`));
324
+ }
325
+ return 'spawned';
326
+ }
327
+
328
+ /**
329
+ * Push an install-down diagnostic bundle to bmo-sync. Injected (`diagnosticsImpl`)
330
+ * in tests; the real path is consent- + token-gated and never runs under the
331
+ * test runner. Best-effort, never throws into the supervision loop.
332
+ */
333
+ async pushDiagnostics() {
334
+ if (this.diagnosticsImpl) return this.diagnosticsImpl(this);
335
+ if (process.env.VITEST || process.env.NODE_ENV === 'test') return;
336
+ try {
337
+ const [{ buildConfig }, { runDoctor }, { loadObservabilityConsent }] = await Promise.all([
338
+ import('./config.mjs'),
339
+ import('./doctor.mjs'),
340
+ import('./observability.mjs'),
341
+ ]);
342
+ const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
343
+ if (!config.accountToken) return; // can't key it to a user
344
+ if (process.env.WILD_WORKSPACE_NO_TELEMETRY === '1') return; // kill switch
345
+ if (!loadObservabilityConsent(config.dataDir).enabled) return; // consent
346
+ const report = await runDoctor({ config });
347
+ const url = `${config.bmoSyncServerUrl.replace(/\/$/, '')}/api/telemetry`;
348
+ const ctrl = new AbortController();
349
+ const t = setTimeout(() => ctrl.abort(), 5000);
350
+ try {
351
+ await fetch(url, {
352
+ method: 'POST',
353
+ headers: { 'content-type': 'application/json' },
354
+ body: JSON.stringify({
355
+ account_token: config.accountToken,
356
+ slug: config.account?.slug || null,
357
+ workspace_id: config.workspaceId,
358
+ kind: 'install-down',
359
+ doctor: report,
360
+ sent_at: Math.floor(Date.now() / 1000),
361
+ }),
362
+ signal: ctrl.signal,
363
+ });
364
+ this.log(`pushed install-down diagnostics (fail=${report.summary?.fail})`);
365
+ } finally {
366
+ clearTimeout(t);
367
+ }
368
+ } catch (e) {
369
+ this.log(`diagnostics push failed: ${e?.message || e}`);
370
+ }
371
+ }
372
+
373
+ /**
374
+ * Restart the supervised server child so freshly installed code is loaded.
375
+ * Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
376
+ * the same mechanism as the version-drift restart, exposed for the AutoUpdater.
377
+ * No-op (returns false) when we don't own a child (foreground server).
378
+ */
379
+ restartChild() {
380
+ if (!this.child) return false;
381
+ this.log('restartChild: killing server to load new code');
382
+ try { this.child.kill(); } catch { /* exit handler clears child */ }
383
+ this.child = null;
384
+ this.backoff = this.backoffStartMs; // an intentional restart, not a crash
385
+ return true;
386
+ }
387
+
388
+ /** The last self-restart time (epoch ms), or 0. Used for the loop-guard cooldown. */
389
+ readLastSelfRestart() {
390
+ try { return Number(JSON.parse(fs.readFileSync(this.selfRestartFile, 'utf8')).at) || 0; }
391
+ catch { return 0; }
392
+ }
393
+
394
+ writeLastSelfRestart(at) {
395
+ try {
396
+ fs.mkdirSync(this.globalDir, { recursive: true });
397
+ fs.writeFileSync(this.selfRestartFile, JSON.stringify({ at }));
398
+ } catch { /* best-effort */ }
399
+ }
400
+
401
+ /**
402
+ * Schedule a supervisor self-restart so freshly-installed SUPERVISOR code loads
403
+ * (the Part-8 stale-process fix). Called from the AutoUpdater's onUpdate hook
404
+ * AFTER an update installed + restarted the server child + verified it healthy —
405
+ * so a bad release has already rolled back before we re-exec ourselves. Guarded
406
+ * three ways against a restart loop: the kill switch, a once-per-process flag,
407
+ * and a persisted cooldown (survives the re-exec). Returns a status string
408
+ * ('scheduled' | 'disabled' | 'already' | 'cooldown') for tests/logging. The
409
+ * actual restart runs on a short delay so the triggering tick unwinds first.
410
+ */
411
+ scheduleSelfRestart(reason) {
412
+ if (!this.selfRestart) return 'disabled';
413
+ if (this._selfRestartScheduled) return 'already';
414
+ const now = this.nowImpl();
415
+ const last = this.readLastSelfRestart();
416
+ if (last && now - last < this.selfRestartCooldownMs) {
417
+ this.log(`self-restart skipped (cooldown, last ${Math.round((now - last) / 1000)}s ago) — ${reason}`);
418
+ return 'cooldown';
419
+ }
420
+ this._selfRestartScheduled = true;
421
+ this.writeLastSelfRestart(now);
422
+ this.log(`self-restart scheduled in ${this.selfRestartDelayMs}ms — ${reason}`);
423
+ this.scheduleImpl(() => {
424
+ this._performSelfRestart(reason).catch((e) => this.log(`self-restart error: ${e?.message || e}`));
425
+ }, this.selfRestartDelayMs);
426
+ return 'scheduled';
427
+ }
428
+
429
+ /**
430
+ * Carry out the self-restart. On mac/Linux the service manager kills+relaunches
431
+ * us (we just issue the command and get SIGTERM'd → our exit handler releases the
432
+ * lock). On Windows restartSelf spawned a hidden successor and returns
433
+ * willExit:true — we then release the lock (via stop()) and exit so the successor
434
+ * can take it. A non-managed run reports restarted:false and we stay up on the
435
+ * old code (no worse than before this feature). Never throws.
436
+ */
437
+ async _performSelfRestart(reason) {
438
+ this.log(`self-restart now — ${reason}`);
439
+ let r;
440
+ try {
441
+ r = await this.restartSelfImpl({ dir: this.globalDir, port: this.port });
442
+ } catch (e) {
443
+ this.log(`self-restart impl error: ${e?.message || e}`);
444
+ return { restarted: false, error: e?.message || String(e) };
445
+ }
446
+ this.log(`self-restart result: ${JSON.stringify(r)}`);
447
+ if (r && r.willExit) {
448
+ this.stop(); // clears timers + releases the lock so the successor can take it
449
+ this.exitImpl(0);
450
+ }
451
+ return r;
452
+ }
453
+
454
+ /**
455
+ * Backstop for the Part-8 gap on EVERY update path, not just our own auto-
456
+ * updater: when the version installed on disk no longer matches the code THIS
457
+ * supervisor is running, the supervisor is stale → schedule a self-restart.
458
+ * RC1b already restarts the stale server child and daemonTick recycles the
459
+ * stale daemon; this is the missing third leg (the supervisor itself), so an
460
+ * operator `update-now` / CLI `update apply` / manual `npm i -g` also lands new
461
+ * supervisor code with no reboot. Skipped while OUR auto-updater is mid-flight
462
+ * so the rollback window is respected (that path self-restarts via the onUpdate
463
+ * hook, only after verify succeeds). Cheap (an in-memory compare guarding a disk
464
+ * read) and idempotent (scheduleSelfRestart de-dupes). Never throws.
465
+ */
466
+ maybeSelfRestartOnDrift() {
467
+ if (!this.selfRestart || !this.selfVersion) return false;
468
+ if (this._selfRestartScheduled) return false;
469
+ if (this.autoUpdater && this.autoUpdater.inProgress) return false; // respect rollback window
470
+ let installed = null;
471
+ try { installed = this.installedVersionImpl(); } catch { return false; }
472
+ if (!installed || installed === this.selfVersion) return false;
473
+ this.log(`supervisor version drift: running=${this.selfVersion} installed=${installed} — self-restarting`);
474
+ this.scheduleSelfRestart(`supervisor drift ${this.selfVersion}→${installed}`);
475
+ return true;
476
+ }
477
+
478
+ /** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
479
+ async buildAutoUpdater() {
480
+ if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
481
+ // Lazy import keeps the unit-test path (which never calls start()) free of the
482
+ // auto-update module + its registry/npm seams.
483
+ const { AutoUpdater } = await import('./auto-update.mjs');
484
+ return new AutoUpdater({
485
+ globalDir: this.globalDir,
486
+ port: this.port,
487
+ installedVersionImpl: this.installedVersionImpl,
488
+ healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
489
+ restartImpl: async () => { this.restartChild(); },
490
+ nowImpl: this.nowImpl,
491
+ env: this.env,
492
+ logImpl: (m) => this.log(m),
493
+ onUpdate: (rec) => {
494
+ this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`);
495
+ // A genuine version change landed healthy → restart the supervisor itself
496
+ // so its own new code loads (Part-8 stale-process fix). Guarded against
497
+ // loops inside scheduleSelfRestart. Fires only on a real bump (to≠from),
498
+ // never on rollback/failure (those statuses aren't 'ok').
499
+ if (rec.status === 'ok' && rec.to && rec.from && rec.to !== rec.from) {
500
+ this.scheduleSelfRestart(`auto-update ${rec.from}→${rec.to}`);
501
+ }
502
+ },
503
+ });
504
+ }
505
+
506
+ runUpdateTick(opts = {}) {
507
+ if (!this.autoUpdater) return;
508
+ this.autoUpdater.tick(opts)
509
+ .then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
510
+ .catch((e) => this.log(`auto-update error: ${e?.message || e}`));
511
+ }
512
+
513
+ /**
514
+ * Build the DaemonSupervisor the always-on layer owns. Reads a FRESH config
515
+ * (not the stale module constant) so the account token / relay in effect when
516
+ * always-on starts are used. Lazy import keeps the unit-test path (which never
517
+ * calls start()) free of config + daemon-supervisor. Test seam: factory.
518
+ */
519
+ async buildDaemonSupervisor() {
520
+ if (this.daemonSupervisorFactory) return this.daemonSupervisorFactory(this);
521
+ const [{ buildConfig }, { DaemonSupervisor }] = await Promise.all([
522
+ import('./config.mjs'),
523
+ import('./daemon-supervisor.mjs'),
524
+ ]);
525
+ const config = buildConfig({ workspaceDir: this.workspaceDir, port: this.port });
526
+ return new DaemonSupervisor({
527
+ httpBase: config.daemonHttpUrl,
528
+ globalDir: this.globalDir,
529
+ accountToken: config.accountToken,
530
+ serverUrl: config.bmoSyncServerUrl,
531
+ });
532
+ }
533
+
534
+ /**
535
+ * One daemon-supervision step: if the daemon isn't answering /health, (re)start
536
+ * it. Deliberately INDEPENDENT of server health — the daemon (and its support
537
+ * channel) must stay up even when the server is crashed/mid-upgrade. Re-entrancy
538
+ * guarded so a slow spawn can't overlap the next tick. Never throws.
539
+ */
540
+ /** The daemon version the currently-running daemon was spawned under, or null. */
541
+ readDaemonMarker() {
542
+ try {
543
+ const v = JSON.parse(fs.readFileSync(this.daemonRuntimeFile, 'utf8'))?.daemonVersion;
544
+ return typeof v === 'string' ? v : null;
545
+ } catch {
546
+ return null;
547
+ }
548
+ }
549
+
550
+ writeDaemonMarker(version) {
551
+ if (!version) return; // unknown installed version (PATH/vendor) — don't pin
552
+ try {
553
+ fs.mkdirSync(this.globalDir, { recursive: true });
554
+ fs.writeFileSync(this.daemonRuntimeFile, JSON.stringify({ daemonVersion: version }));
555
+ } catch {
556
+ /* best-effort */
557
+ }
558
+ }
559
+
560
+ async daemonTick() {
561
+ if (!this.daemonSupervisor || this._daemonTicking) return 'skip';
562
+ this._daemonTicking = true;
563
+ try {
564
+ const installed = this.daemonVersionImpl();
565
+ const h = await this.daemonSupervisor.health();
566
+ if (h && h.running) {
567
+ // Running — but is it the CURRENT binary? After an auto-update the daemon
568
+ // keeps the old code until recycled (RC1b analog). Recycle when the
569
+ // installed version differs from what we recorded at spawn (a null marker
570
+ // = spawned by a pre-drift-aware supervisor → treat as drift, recycle once).
571
+ if (installed && this.readDaemonMarker() !== installed && this.daemonSupervisor.recycle) {
572
+ this.log(
573
+ `daemon version drift (marker=${this.readDaemonMarker() || 'none'} installed=${installed}) — recycling`,
574
+ );
575
+ const r = await this.daemonSupervisor.recycle();
576
+ if (r && r.started) {
577
+ this.writeDaemonMarker(installed);
578
+ this.log(`daemon recycled to ${installed} (pid=${r.pid})`);
579
+ return 'recycled';
580
+ }
581
+ this.log(`daemon recycle failed: ${r?.error || 'unknown'}`);
582
+ return 'recycle-failed';
583
+ }
584
+ return 'healthy';
585
+ }
586
+ const r = await this.daemonSupervisor.ensureRunning();
587
+ if (r && r.started) {
588
+ this.writeDaemonMarker(installed);
589
+ this.log(`daemon respawned (pid=${r.pid})`);
590
+ return 'respawned';
591
+ }
592
+ if (r && r.alreadyRunning) return 'healthy';
593
+ this.log(`daemon down, respawn not started: ${r?.error || 'unknown'}`);
594
+ return 'failed';
595
+ } catch (e) {
596
+ this.log(`daemon-tick error: ${e?.message || e}`);
597
+ return 'error';
598
+ } finally {
599
+ this._daemonTicking = false;
600
+ }
601
+ }
602
+
603
+ /** Acquire the lock and start the supervision loop. Idempotent across processes. */
604
+ start() {
605
+ if (!this.acquireLock()) return { started: false, reason: 'already-running' };
606
+ process.on('exit', () => this.releaseLock());
607
+ process.on('SIGTERM', () => process.exit(0));
608
+ process.on('SIGINT', () => process.exit(0));
609
+ this.log(`supervisor start pid=${process.pid} v${this.selfVersion || '?'} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
610
+ this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
611
+ this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
612
+
613
+ // Phase 2 auto-update: wake on a slow timer; the first check fires shortly
614
+ // after start so the server has time to come up (verify reads its /health).
615
+ if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
616
+ this.buildAutoUpdater().then((u) => {
617
+ this.autoUpdater = u;
618
+ this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
619
+ if (this.updateTimer.unref) this.updateTimer.unref();
620
+ // FORCE the first post-boot check (bypass the 6h dueForCheck): a restart
621
+ // should always pull the latest, so a rebooted/woken machine can't sit on
622
+ // a stale version just because it last checked <6h ago.
623
+ const kick = setTimeout(() => this.runUpdateTick({ force: true }), 60_000);
624
+ if (kick.unref) kick.unref();
625
+ }).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
626
+ }
627
+
628
+ // Phase 3: keep the bmo-sync daemon alive independent of the server, so the
629
+ // out-of-band support channel survives the server being down.
630
+ if (this.superviseDaemon && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
631
+ this.buildDaemonSupervisor().then((d) => {
632
+ this.daemonSupervisor = d;
633
+ this.daemonTimer = setInterval(() => { this.daemonTick().catch((e) => this.log(`daemon-tick error: ${e?.message || e}`)); }, this.daemonPollMs);
634
+ if (this.daemonTimer.unref) this.daemonTimer.unref();
635
+ this.daemonTick().catch(() => {}); // first probe now
636
+ }).catch((e) => this.log(`daemon supervision init error: ${e?.message || e}`));
637
+ }
638
+ return { started: true };
639
+ }
640
+
641
+ stop() {
642
+ if (this.timer) { clearInterval(this.timer); this.timer = null; }
643
+ if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
644
+ if (this.daemonTimer) { clearInterval(this.daemonTimer); this.daemonTimer = null; }
645
+ this.releaseLock();
646
+ }
647
+ }