@venturewild/workspace 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server/bin/wild-workspace.mjs +71 -0
- package/server/src/auto-update.mjs +277 -0
- package/server/src/config.mjs +6 -0
- package/server/src/doctor.mjs +75 -1
- package/server/src/index.mjs +75 -2
- package/server/src/operator.mjs +27 -0
- package/server/src/supervisor.mjs +137 -0
- package/server/src/tunnel-watchdog.mjs +153 -0
package/package.json
CHANGED
|
@@ -23,6 +23,10 @@ import { appendLine, listLogs, tailFile } from '../src/logpaths.mjs';
|
|
|
23
23
|
import { runDoctor, renderDoctor, writeDoctorBundle } from '../src/doctor.mjs';
|
|
24
24
|
import { enableOperator, disableOperator, operatorStatus } from '../src/operator.mjs';
|
|
25
25
|
import { loadObservabilityConsent, setObservabilityConsent } from '../src/observability.mjs';
|
|
26
|
+
import {
|
|
27
|
+
AutoUpdater, PACKAGE_NAME, npmInstall, recordUpdate,
|
|
28
|
+
loadUpdateSettings, setUpdateEnabled, setUpdateChannel,
|
|
29
|
+
} from '../src/auto-update.mjs';
|
|
26
30
|
import { openOwnerBrowser } from '../src/owner-browser.mjs';
|
|
27
31
|
import { planReset, applyReset, RESET_KEEPS } from '../src/reset.mjs';
|
|
28
32
|
|
|
@@ -56,6 +60,9 @@ Usage:
|
|
|
56
60
|
wild-workspace operator disable revoke the support token
|
|
57
61
|
wild-workspace operator status is the support channel on?
|
|
58
62
|
wild-workspace observability [on|off|status] share session + install health so we can help (default on; never chat content)
|
|
63
|
+
wild-workspace update [apply] check for / install a newer version (auto by default)
|
|
64
|
+
wild-workspace update on|off toggle background auto-update
|
|
65
|
+
wild-workspace update channel stable|beta choose the update channel
|
|
59
66
|
wild-workspace service install keep your workspace always-on (starts at login, no admin)
|
|
60
67
|
wild-workspace service uninstall turn always-on off
|
|
61
68
|
wild-workspace service status show always-on status (installed? supervisor? server?)
|
|
@@ -599,6 +606,67 @@ async function runObservabilityCommand(action = 'status', opts = {}) {
|
|
|
599
606
|
return;
|
|
600
607
|
}
|
|
601
608
|
|
|
609
|
+
/**
 * `wild-workspace update [apply|on|off|channel <stable|beta>]` — Phase 2
 * auto-update (docs/remote-support-and-self-healing-design.md).
 *
 * - no sub-command: check the channel for a newer release and report it;
 * - `apply`: same check, then install the newer release now;
 * - `on` / `off`: toggle the default-on background updater;
 * - `channel stable|beta`: switch the update channel.
 *
 * The always-on supervisor does this automatically — this is the manual lever
 * + the off switch.
 *
 * Every failure path (bad `channel` argument, registry unreachable, install
 * failed) sets `process.exitCode = 1` so scripts can detect it; the function
 * itself never throws for those cases.
 *
 * @param {{ positional: string[] }} opts Parsed CLI options; `positional[1]` is
 *   the sub-command and `positional[2]` the channel name for `channel`.
 * @returns {Promise<void>}
 */
async function runUpdateCommand(opts) {
  const sub = opts.positional[1];
  const gdir = globalDir();

  if (sub === 'on' || sub === 'off') {
    const rec = setUpdateEnabled(gdir, sub === 'on');
    console.log(
      rec.enabled
        ? '✓ auto-update ON — wild-workspace keeps itself up to date in the background.'
        : '✓ auto-update OFF — update manually with `wild-workspace update apply`.',
    );
    console.log(` channel: ${rec.channel}`);
    return;
  }

  if (sub === 'channel') {
    const chan = opts.positional[2];
    if (chan !== 'stable' && chan !== 'beta') {
      console.log('usage: wild-workspace update channel stable|beta');
      // A usage error is a failure: report it through the exit code like the
      // registry/install failures below, instead of exiting 0.
      process.exitCode = 1;
      return;
    }
    console.log(`✓ update channel set to ${setUpdateChannel(gdir, chan).channel}.`);
    return;
  }

  // Anything else (no sub-command, or `apply`) starts with a registry check.
  const settings = loadUpdateSettings(gdir);
  const c = await new AutoUpdater({ globalDir: gdir }).check();
  console.log(`wild-workspace ${c.current} (channel: ${c.channel}, auto-update: ${settings.enabled ? 'on' : 'off'})`);
  if (!c.latest) {
    console.log(' could not reach the npm registry to check for updates.');
    process.exitCode = 1;
    return;
  }
  if (!c.available) {
    console.log(` up to date — ${c.latest} is the latest on ${c.channel}.`);
    return;
  }
  console.log(` update available: ${c.current} → ${c.latest}`);
  if (sub !== 'apply') {
    console.log(' run `wild-workspace update apply` to install it now.');
    return;
  }

  console.log(` installing ${c.latest}…`);
  const res = await npmInstall(`${PACKAGE_NAME}@${c.latest}`);
  if (res.code === 0) {
    recordUpdate(gdir, { from: c.current, to: c.latest, at: Date.now(), status: 'installed' });
    console.log(` ✓ installed ${c.latest}.`);
    console.log(' The always-on supervisor will restart into the new version shortly,');
    console.log(' or restart `wild-workspace` yourself to use it now.');
  } else {
    console.log(` ✗ install failed (code ${res.code}).`);
    // Show only the tail of npm's output — the last lines carry the actual error.
    if (res.output) console.log(' ' + res.output.split('\n').filter(Boolean).slice(-3).join('\n '));
    process.exitCode = 1;
  }
}
|
|
669
|
+
|
|
602
670
|
// `wild-workspace operator [enable|disable|status]` — the consented support
|
|
603
671
|
// channel (docs/SECURITY.md). OFF by default; `enable` mints a token to hand to
|
|
604
672
|
// the wild-workspace team so they can diagnose + run a fixed set of safe fixes.
|
|
@@ -734,6 +802,9 @@ async function main() {
|
|
|
734
802
|
if (opts.positional[0] === 'logs') {
|
|
735
803
|
return runLogsCommand(opts);
|
|
736
804
|
}
|
|
805
|
+
if (opts.positional[0] === 'update') {
|
|
806
|
+
return runUpdateCommand(opts);
|
|
807
|
+
}
|
|
737
808
|
if (opts.positional[0] === 'operator') {
|
|
738
809
|
return runOperatorCommand(opts.positional[1], opts);
|
|
739
810
|
}
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
// AutoUpdater — Phase 2 (Pillar B) of the self-healing epic
|
|
2
|
+
// (docs/remote-support-and-self-healing-design.md). Kills the manual `npm i -g`
|
|
3
|
+
// re-install treadmill that turned the first external install into a copy-paste
|
|
4
|
+
// marathon: the always-on supervisor periodically checks the published version on
|
|
5
|
+
// the user's channel, and on a new release it installs it, restarts the supervised
|
|
6
|
+
// server (reusing the version-drift restart RC1b already ships), health-checks the
|
|
7
|
+
// result, and ROLLS BACK to the pinned previous version if the new one doesn't come
|
|
8
|
+
// up healthy.
|
|
9
|
+
//
|
|
10
|
+
// Tuan's locked decisions (design doc Part 6): auto-update is DEFAULT-ON (the
|
|
11
|
+
// OneDrive bar — it just updates itself) with a visible "updated to vX" note + an
|
|
12
|
+
// off switch; channels are `stable` (default) and `beta` (opt-in).
|
|
13
|
+
//
|
|
14
|
+
// Like observability.mjs/operator.mjs, settings live in their own file in the
|
|
15
|
+
// machine-global dir (~/.wild-workspace, NEVER the synced workspace — locked
|
|
16
|
+
// principle #1) so they survive the supervisor relaunching from a different cwd.
|
|
17
|
+
//
|
|
18
|
+
// Every external touch-point (npm install, registry fetch, health probe, restart,
|
|
19
|
+
// clock, sleep) is an injected seam so the suite never spawns a process, hits the
|
|
20
|
+
// network, or actually waits.
|
|
21
|
+
|
|
22
|
+
import { spawn } from 'node:child_process';
|
|
23
|
+
import fs from 'node:fs';
|
|
24
|
+
import path from 'node:path';
|
|
25
|
+
import { globalDir as defaultGlobalDir } from './logpaths.mjs';
|
|
26
|
+
import { installedVersion, probeHealthVersion } from './supervisor.mjs';
|
|
27
|
+
import { ensureToolPath } from './agent.mjs';
|
|
28
|
+
|
|
29
|
+
// The npm package this updater manages: the default `packageName` for the
// registry lookup and the name half of every `<name>@<version>` install spec.
export const PACKAGE_NAME = '@venturewild/workspace';
|
|
30
|
+
// Default minimum gap between two registry checks (AutoUpdater `checkIntervalMs`).
const DEFAULT_CHECK_INTERVAL_MS = 6 * 60 * 60 * 1000; // 6h — releases are not frequent
|
|
31
|
+
|
|
32
|
+
// --- persisted settings (~/.wild-workspace/update.json) ----------------------
|
|
33
|
+
|
|
34
|
+
/** Path of the persisted updater settings file inside `dir`. */
function updateFile(dir) { return path.join(dir, 'update.json'); }

/**
 * Read the updater settings from `<dir>/update.json`.
 *
 * DEFAULT-ON: an absent (or unreadable / unparseable) file means "auto-update
 * enabled, stable channel" — the disclosure rides onboarding + the visible
 * update note, mirroring observability.
 *
 * @param {string} [dir] Directory holding `update.json`.
 * @returns {{enabled: boolean, channel: 'stable'|'beta', lastCheckAt: number, lastUpdate: object|null}}
 *   Normalised settings; unknown channels collapse to `stable`, a non-numeric
 *   `lastCheckAt` to `0`.
 */
export function loadUpdateSettings(dir = defaultGlobalDir()) {
  try {
    const p = JSON.parse(fs.readFileSync(updateFile(dir), 'utf8'));
    return {
      enabled: p.enabled !== false,
      channel: p.channel === 'beta' ? 'beta' : 'stable',
      lastCheckAt: Number(p.lastCheckAt) || 0,
      lastUpdate: p.lastUpdate || null, // { from, to, at, status }
    };
  } catch {
    return { enabled: true, channel: 'stable', lastCheckAt: 0, lastUpdate: null };
  }
}

/**
 * Persist `rec` as the new settings file and return it.
 *
 * The write goes to a sibling temp file that is then renamed over
 * `update.json`, so a reader never observes a half-written file. That matters
 * because `loadUpdateSettings` treats unparseable content as "defaults", i.e. a
 * torn write would silently switch auto-update back ON.
 *
 * Best-effort by design: on any filesystem error (e.g. read-only fs) the
 * settings only live in memory for this run and `rec` is still returned.
 *
 * @param {string} dir Directory holding `update.json` (created if missing).
 * @param {object} rec Full settings record to store.
 * @returns {object} `rec`, unchanged.
 */
function writeSettings(dir, rec) {
  const target = updateFile(dir);
  // pid in the name keeps two processes (CLI + supervisor) off each other's temp file.
  const tmp = `${target}.${process.pid}.tmp`;
  try {
    fs.mkdirSync(dir, { recursive: true });
    fs.writeFileSync(tmp, JSON.stringify(rec, null, 2), { mode: 0o600 });
    fs.renameSync(tmp, target);
  } catch {
    /* read-only fs — fall back to in-memory for this run */
    try { fs.rmSync(tmp, { force: true }); } catch { /* nothing to clean up */ }
  }
  return rec;
}

/** Turn the background updater on/off; returns the full stored settings. */
export function setUpdateEnabled(dir, enabled) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), enabled: Boolean(enabled) });
}

/** Select the update channel (anything but `'beta'` means `'stable'`); returns the full stored settings. */
export function setUpdateChannel(dir, channel) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), channel: channel === 'beta' ? 'beta' : 'stable' });
}

/** Store `at` as the time of the most recent registry check; returns the full stored settings. */
export function touchLastCheck(dir, at) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), lastCheckAt: at });
}

/** Record the outcome of an update attempt; returns the stored record. */
export function recordUpdate(dir, rec) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), lastUpdate: rec }).lastUpdate;
}
|
|
77
|
+
|
|
78
|
+
// --- semver compare (no dep) -------------------------------------------------
|
|
79
|
+
|
|
80
|
+
/**
 * Split a version string into its numeric core and its prerelease tag.
 *
 * A leading `v` and any `+build` metadata are dropped (build metadata never
 * affects precedence). Only the FIRST `-` separates core from prerelease, so a
 * tag that itself contains hyphens (`1.0.0-beta-2`) is kept whole.
 *
 * @param {*} v Version-like value; coerced with `String`.
 * @returns {{nums: number[], pre: string}} At least three numeric components
 *   (missing / non-numeric ones become 0) and the prerelease tag (`''` if none).
 */
function parseVersion(v) {
  const text = String(v).replace(/^v/, '').split('+')[0];
  const dash = text.indexOf('-');
  const core = dash === -1 ? text : text.slice(0, dash);
  const pre = dash === -1 ? '' : text.slice(dash + 1);
  const nums = core.split('.').map((n) => Number.parseInt(n, 10) || 0);
  while (nums.length < 3) nums.push(0);
  return { nums, pre };
}

/**
 * Compare two non-empty prerelease tags by SemVer precedence: dot-separated
 * identifiers are compared left to right; numeric identifiers compare
 * numerically and rank below alphanumeric ones; alphanumeric identifiers
 * compare lexically; when all shared identifiers are equal the longer tag wins.
 *
 * @returns {number} Negative if `a` < `b`, positive if `a` > `b`, 0 if equal.
 */
function comparePrerelease(a, b) {
  const left = a.split('.');
  const right = b.split('.');
  const shared = Math.min(left.length, right.length);
  for (let i = 0; i < shared; i += 1) {
    const x = left[i];
    const y = right[i];
    if (x === y) continue;
    const xNumeric = /^\d+$/.test(x);
    const yNumeric = /^\d+$/.test(y);
    if (xNumeric && yNumeric) {
      const diff = Number(x) - Number(y);
      if (diff !== 0) return diff;
      continue; // e.g. "01" vs "1" — numerically equal
    }
    if (xNumeric !== yNumeric) return xNumeric ? -1 : 1;
    return x < y ? -1 : 1;
  }
  return left.length - right.length;
}

/**
 * True iff version `a` is strictly newer than `b`.
 *
 * Compares major.minor.patch numerically; on an equal core a release outranks
 * any prerelease of it, and two prereleases are ordered by SemVer identifier
 * precedence (so `beta.10` is newer than `beta.9`). Returns false when either
 * argument is empty/null.
 *
 * @param {string|null|undefined} a Candidate version.
 * @param {string|null|undefined} b Baseline version.
 * @returns {boolean}
 */
export function isNewer(a, b) {
  if (!a || !b) return false;
  const pa = parseVersion(a);
  const pb = parseVersion(b);
  for (let i = 0; i < 3; i += 1) {
    if (pa.nums[i] !== pb.nums[i]) return pa.nums[i] > pb.nums[i];
  }
  // Equal core: a release (no prerelease) outranks a prerelease.
  if (pa.pre && !pb.pre) return false;
  if (!pa.pre && pb.pre) return true;
  if (pa.pre && pb.pre) return comparePrerelease(pa.pre, pb.pre) > 0;
  return false; // identical
}
|
|
100
|
+
|
|
101
|
+
// --- registry + npm primitives -----------------------------------------------
|
|
102
|
+
|
|
103
|
+
/**
 * The version published on `channel`'s dist-tag (stable→latest, beta→beta), or
 * null on any failure. Uses the abbreviated registry document (smaller payload).
 *
 * "Any failure" covers: network error, timeout (the request AND the body read
 * are aborted after `timeoutMs`), non-2xx response, unparseable JSON, and a
 * dist-tag that is missing or not a non-empty string. The last check matters
 * because the result is fed straight into an install spec — only a plain
 * string is ever handed back.
 *
 * @param {string} channel `'beta'` selects the `beta` dist-tag; anything else `latest`.
 * @param {object} [options]
 * @param {Function} [options.fetchImpl] `fetch`-compatible function (test seam).
 * @param {string} [options.packageName] Package to look up.
 * @param {number} [options.timeoutMs] Abort the lookup after this many milliseconds.
 * @returns {Promise<string|null>}
 */
export async function fetchLatestVersion(channel, {
  fetchImpl = fetch, packageName = PACKAGE_NAME, timeoutMs = 8000,
} = {}) {
  const tag = channel === 'beta' ? 'beta' : 'latest';
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    // Scoped names keep the `@` but need the `/` escaped in the registry path.
    const res = await fetchImpl(`https://registry.npmjs.org/${packageName.replace('/', '%2f')}`, {
      signal: ctrl.signal,
      headers: { accept: 'application/vnd.npm.install-v1+json' },
    });
    if (!res || !res.ok) return null;
    const body = await res.json();
    const version = body?.['dist-tags']?.[tag];
    return typeof version === 'string' && version !== '' ? version : null;
  } catch {
    return null;
  } finally {
    clearTimeout(timer);
  }
}
|
|
127
|
+
|
|
128
|
+
/**
 * Run `npm i -g <spec>`. Resolves {code, output, timedOut?, error?}; never rejects.
 *
 * - `code` is npm's exit code, or -1 when the process could not be started,
 *   errored, timed out, or the spec was refused.
 * - `output` is the combined stdout+stderr, capped to the last 20000 chars.
 *
 * Windows: npm is a `.cmd` shim, and current Node releases refuse to spawn
 * `.bat`/`.cmd` files without `shell: true` (EINVAL). The shell is therefore
 * enabled on win32 only. Because a shell re-parses the command line, specs
 * containing anything beyond package-spec characters are refused on that path
 * rather than passed to `cmd.exe`.
 *
 * @param {string} spec Package spec, e.g. `@scope/name@1.2.3`.
 * @param {object} [options]
 * @param {Function} [options.spawnImpl] `child_process.spawn`-compatible (test seam).
 * @param {number} [options.timeoutMs] Kill the install and resolve `timedOut` after this long.
 * @param {Function} [options.ensurePathImpl] Mutates the child env to add tool dirs to PATH.
 * @param {object} [options.env] Base environment; copied, never mutated.
 * @param {string} [options.platform] Platform id; defaults to `process.platform` (test seam).
 * @returns {Promise<{code: number|null, output: string, timedOut?: boolean, error?: string}>}
 */
export function npmInstall(spec, {
  spawnImpl = spawn, timeoutMs = 180000, ensurePathImpl = ensureToolPath, env = process.env,
  platform = process.platform,
} = {}) {
  return new Promise((resolve) => {
    const onWindows = platform === 'win32';
    const cmd = onWindows ? 'npm.cmd' : 'npm';
    if (onWindows && !/^[A-Za-z0-9@._~+/-]+$/.test(String(spec))) {
      resolve({ code: -1, error: `refusing to install unsafe package spec: ${spec}`, output: '' });
      return;
    }
    // The always-on supervisor (our caller in the field) runs under launchd/GUI,
    // which inherits a MINIMAL PATH omitting ~/.npm-global, /usr/local/bin,
    // Homebrew, nvm — so a bare `npm` spawn would ENOENT (the 0.1.8 `claude`
    // bug class). Augment PATH the same way agent.mjs does before spawning. We
    // copy env so we never mutate the caller's process.env.
    const childEnv = { ...env };
    try { ensurePathImpl(childEnv); } catch { /* best-effort — fall back to inherited PATH */ }
    let child;
    try {
      child = spawnImpl(cmd, ['i', '-g', spec], {
        windowsHide: true,
        env: childEnv,
        ...(onWindows ? { shell: true } : {}),
      });
    } catch (e) {
      resolve({ code: -1, error: e?.message || String(e), output: '' });
      return;
    }
    let out = '';
    const cap = (d) => { out += String(d); if (out.length > 20000) out = out.slice(-20000); };
    child.stdout?.on?.('data', cap);
    child.stderr?.on?.('data', cap);

    // Timeout, exit and error can race; only the first one decides the result.
    let settled = false;
    let timer = null;
    const settle = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(result);
    };
    timer = setTimeout(() => {
      try { child.kill?.(); } catch { /* gone */ }
      settle({ code: -1, timedOut: true, output: out });
    }, timeoutMs);
    child.on?.('exit', (code) => settle({ code, output: out }));
    child.on?.('error', (e) => settle({ code: -1, error: e?.message || String(e), output: out }));
  });
}
|
|
156
|
+
|
|
157
|
+
/** Promise that resolves (with no value) once `ms` milliseconds have elapsed. */
const sleep = (ms) => new Promise((wake) => {
  setTimeout(wake, ms);
});
|
|
158
|
+
|
|
159
|
+
// --- the updater -------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
/**
 * Checks the registry for a newer release on the configured channel, installs
 * it, asks the owner to restart the server, verifies the running version, and
 * rolls back to the previous version when the new one does not come up.
 *
 * Every external touch-point is an injected `*Impl` seam; the defaults wire in
 * the real implementations.
 */
export class AutoUpdater {
  /**
   * @param {object} [options]
   * @param {string} [options.globalDir] Directory holding `update.json`.
   * @param {string} [options.packageName] Package to check / install.
   * @param {number} [options.port] Port handed to `healthVersionImpl`.
   * @param {Function} [options.installedVersionImpl] `() => string` — version installed on disk.
   * @param {Function} [options.fetchLatestImpl] `(channel, {packageName}) => Promise<string|null>`.
   * @param {Function} [options.installImpl] `(spec) => Promise<{code, ...}>`.
   * @param {Function} [options.restartImpl] Ask the owner to restart the server child.
   * @param {Function} [options.healthVersionImpl] `(port) => Promise<string|null>` — RUNNING version.
   * @param {Function} [options.nowImpl] Clock, ms since epoch.
   * @param {Function} [options.logImpl] `(message) => void`.
   * @param {object} [options.env] Environment consulted for the kill switch.
   * @param {number} [options.checkIntervalMs] Minimum gap between registry checks.
   * @param {number} [options.verifyAttempts] Health probes per verification.
   * @param {number} [options.verifyDelayMs] Pause between two probes.
   * @param {Function} [options.sleepImpl] `(ms) => Promise<void>`.
   * @param {Function|null} [options.onUpdate] `(rec) => void`, called with every recorded outcome.
   */
  constructor({
    globalDir = defaultGlobalDir(),
    packageName = PACKAGE_NAME,
    port = Number(process.env.WILD_WORKSPACE_PORT || 5173),
    installedVersionImpl = () => installedVersion(),
    fetchLatestImpl = fetchLatestVersion,
    installImpl = npmInstall,
    // Ask the owner (the supervisor) to restart the server child so the freshly
    // installed code is loaded. Default no-op for standalone/manual use.
    restartImpl = async () => {},
    // Read the RUNNING server's version (probeHealthVersion bound to our port).
    healthVersionImpl = (port_) => probeHealthVersion(port_),
    nowImpl = () => Date.now(),
    logImpl = () => {},
    env = process.env,
    checkIntervalMs = DEFAULT_CHECK_INTERVAL_MS,
    verifyAttempts = 10,
    verifyDelayMs = 3000,
    sleepImpl = sleep,
    onUpdate = null, // (rec) => void — surface the "updated to vX" note
  } = {}) {
    Object.assign(this, {
      globalDir, packageName, port, installedVersionImpl, fetchLatestImpl, installImpl,
      restartImpl, healthVersionImpl, nowImpl, logImpl, env,
      checkIntervalMs, verifyAttempts, verifyDelayMs, sleepImpl, onUpdate,
    });
    // Re-entrancy guard for tick(): one update cycle at a time.
    this.inProgress = false;
  }

  /** Whether background updating is on: env kill switch first, then the persisted setting. */
  enabled() {
    if (this.env.WILD_WORKSPACE_NO_AUTOUPDATE === '1') return false; // hard kill switch
    return loadUpdateSettings(this.globalDir).enabled;
  }

  /** The persisted update channel (`'stable'` or `'beta'`). */
  channel() { return loadUpdateSettings(this.globalDir).channel; }

  /** True once at least `checkIntervalMs` has passed since the last recorded check. */
  dueForCheck(settings = loadUpdateSettings(this.globalDir)) {
    return this.nowImpl() - (settings.lastCheckAt || 0) >= this.checkIntervalMs;
  }

  /** What's installed vs what's published — { current, latest, channel, available }. */
  async check() {
    const current = this.installedVersionImpl();
    const channel = this.channel();
    const latest = await this.fetchLatestImpl(channel, { packageName: this.packageName });
    return { current, latest, channel, available: isNewer(latest, current) };
  }

  /**
   * Poll the running server until it reports `expected`, up to verifyAttempts.
   * A throwing probe counts as "not up yet". Sleeps only BETWEEN probes — after
   * the final failed probe there is nothing left to wait for.
   *
   * @param {string} expected Version the running server must report.
   * @returns {Promise<boolean>}
   */
  async verify(expected) {
    for (let i = 0; i < this.verifyAttempts; i += 1) {
      let running = null;
      try { running = await this.healthVersionImpl(this.port); } catch { running = null; }
      if (running && running === expected) return true;
      if (i < this.verifyAttempts - 1) await this.sleepImpl(this.verifyDelayMs);
    }
    return false;
  }

  /**
   * Request a server restart without letting a failing restart hook abort the
   * update flow: the failure is logged and the caller's verify step decides
   * what happens next (so a broken restart still ends in a recorded outcome and,
   * if needed, a rollback).
   *
   * @returns {Promise<void>}
   */
  async restartSafely() {
    try {
      await this.restartImpl();
    } catch (e) {
      this.logImpl(`auto-update: restart request failed: ${e?.message || e}`);
    }
  }

  /**
   * Install `target`, restart, verify healthy; on failure roll back to `from`.
   * Returns { ok, stage?, rolledBack?, rec }. Records the outcome to update.json
   * with status `install-failed`, `ok`, `rolled-back` or `rollback-failed`, and
   * passes the stored record to `onUpdate` when one is set.
   *
   * @param {string} target Version to install.
   * @param {{from?: string}} [options] `from` is the version to roll back to;
   *   without it a failed verification is recorded as `rollback-failed`.
   */
  async applyUpdate(target, { from } = {}) {
    this.logImpl(`auto-update: installing ${this.packageName}@${target} (from ${from || 'unknown'})`);
    const install = await this.installImpl(`${this.packageName}@${target}`);
    if (install.code !== 0) {
      this.logImpl(`auto-update: install failed code=${install.code}${install.timedOut ? ' (timeout)' : ''}`);
      const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status: 'install-failed' });
      this.onUpdate?.(rec);
      return { ok: false, stage: 'install', install, rec };
    }
    await this.restartSafely();
    if (await this.verify(target)) {
      this.logImpl(`auto-update: now running ${target}`);
      const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status: 'ok' });
      this.onUpdate?.(rec);
      return { ok: true, rec };
    }
    // New version didn't come up healthy → roll back to the pinned previous.
    let rolledBack = false;
    if (from) {
      this.logImpl(`auto-update: ${target} unhealthy — rolling back to ${from}`);
      const rb = await this.installImpl(`${this.packageName}@${from}`);
      if (rb.code === 0) {
        await this.restartSafely();
        rolledBack = await this.verify(from);
      } else {
        this.logImpl(`auto-update: rollback install failed code=${rb.code}`);
      }
    } else {
      this.logImpl(`auto-update: ${target} unhealthy — no previous version known, cannot roll back`);
    }
    const status = rolledBack ? 'rolled-back' : 'rollback-failed';
    this.logImpl(`auto-update: ${status} (target ${target})`);
    const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status });
    this.onUpdate?.(rec);
    return { ok: false, stage: 'verify', rolledBack, rec };
  }

  /**
   * One auto-update cycle, called on the supervisor's slow timer. Self-rate-limits
   * via dueForCheck so the timer cadence and the check interval are independent.
   * Returns a short status string (exposed for tests/logging): `busy`,
   * `disabled`, `not-due`, `check-failed`, `up-to-date`, `updated`,
   * `rolled-back` or `failed`.
   *
   * Note: the check time is stamped BEFORE the registry is contacted, so a
   * failed lookup (`check-failed`) still consumes the interval.
   */
  async tick() {
    if (this.inProgress) return 'busy';
    if (!this.enabled()) return 'disabled';
    if (!this.dueForCheck()) return 'not-due';
    this.inProgress = true;
    try {
      touchLastCheck(this.globalDir, this.nowImpl());
      const c = await this.check();
      if (!c.latest) return 'check-failed';
      if (!c.available) return 'up-to-date';
      this.logImpl(`auto-update: ${c.current} → ${c.latest} (${c.channel})`);
      const r = await this.applyUpdate(c.latest, { from: c.current });
      return r.ok ? 'updated' : (r.rolledBack ? 'rolled-back' : 'failed');
    } finally {
      this.inProgress = false;
    }
  }
}
|
package/server/src/config.mjs
CHANGED
|
@@ -312,6 +312,12 @@ export function buildConfig(overrides = {}) {
|
|
|
312
312
|
overrides.operatorToken ??
|
|
313
313
|
env.WILD_WORKSPACE_OPERATOR_TOKEN ??
|
|
314
314
|
loadOperatorToken(dataDir),
|
|
315
|
+
// RC1 hot-reload seam: the EXPLICIT token (override or env), with NO disk read.
|
|
316
|
+
// When this is null the live auth path re-reads the token file on each request
|
|
317
|
+
// (getOperatorToken) so `operator enable` takes effect with no restart; when a
|
|
318
|
+
// test/env pins a token, that value stays authoritative. (See index.mjs.)
|
|
319
|
+
operatorTokenExplicit:
|
|
320
|
+
overrides.operatorToken ?? env.WILD_WORKSPACE_OPERATOR_TOKEN ?? null,
|
|
315
321
|
workspaceId:
|
|
316
322
|
overrides.workspaceId ||
|
|
317
323
|
env.WILD_WORKSPACE_ID ||
|
package/server/src/doctor.mjs
CHANGED
|
@@ -21,7 +21,7 @@ import { resolveDaemonBinary } from './daemon-bin.mjs';
|
|
|
21
21
|
import { checkPort } from './preview.mjs';
|
|
22
22
|
import { loadAccount } from './account.mjs';
|
|
23
23
|
import { serviceStatus } from './service.mjs';
|
|
24
|
-
import { probeHealth } from './supervisor.mjs';
|
|
24
|
+
import { probeHealth, probeHealthVersion } from './supervisor.mjs';
|
|
25
25
|
import { listLogs, diagnosticsDir } from './logpaths.mjs';
|
|
26
26
|
|
|
27
27
|
const STATUS_ICON = { ok: '✅', warn: '⚠️', fail: '❌', info: 'ℹ️' };
|
|
@@ -36,6 +36,27 @@ function nodeMajor(version = process.version) {
|
|
|
36
36
|
return m ? Number(m[1]) : 0;
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
+
/**
 * RC3: probe the LIVE public tunnel end-to-end — out to Cloudflare, through the
 * relay, down the daemon's tunnel, back to this server. This is the check the
 * old `doctor` lacked: it only resolved the slug in the registry (claimed in
 * the DB), which stays green even when `<slug>.venturewild.llc` is 502. A 200
 * here proves the whole chain works; a 5xx/timeout is the exact RC2 "linked but
 * unreachable".
 *
 * @param {string} slug Workspace slug; URI-encoded into the host name.
 * @param {Function} fetchImpl `fetch`-compatible function.
 * @param {number} [timeoutMs] Abort the probe after this many milliseconds.
 * @returns {Promise<object>} `{ reachable: true, status, version, url }` when a
 *   response arrived (`version` is null if the body is not JSON or has none),
 *   otherwise `{ reachable: false, error, url }`. Never rejects.
 */
async function probeTunnel(slug, fetchImpl, timeoutMs = 8000) {
  const url = `https://${encodeURIComponent(slug)}.venturewild.llc/api/health`;
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), timeoutMs);

  // Best-effort: pull `version` out of a JSON body; anything else yields null.
  const readVersion = async (response) => {
    try {
      const payload = await response.json();
      return payload?.version || null;
    } catch { /* non-JSON */
      return null;
    }
  };

  try {
    const requestOptions = {
      signal: aborter.signal,
      headers: { 'cache-control': 'no-cache' },
    };
    const res = await fetchImpl(url, requestOptions);
    const version = await readVersion(res);
    return { reachable: true, status: res.status, version, url };
  } catch (err) {
    const error = String(err?.message || err);
    return { reachable: false, error, url };
  } finally {
    clearTimeout(deadline);
  }
}
|
|
59
|
+
|
|
39
60
|
// Reach the bmo-sync registry: resolve the user's slug if linked, else /health.
|
|
40
61
|
async function probeRegistry(config, fetchImpl) {
|
|
41
62
|
const base = String(config.bmoSyncServerUrl || '').replace(/\/$/, '');
|
|
@@ -71,6 +92,7 @@ export async function runDoctor(opts = {}, deps = {}) {
|
|
|
71
92
|
serviceStatus: deps.serviceStatus || serviceStatus,
|
|
72
93
|
listLogs: deps.listLogs || listLogs,
|
|
73
94
|
fetchImpl: deps.fetchImpl || ((...a) => globalThis.fetch(...a)),
|
|
95
|
+
probeRunningVersion: deps.probeRunningVersion || probeHealthVersion,
|
|
74
96
|
};
|
|
75
97
|
const checks = [];
|
|
76
98
|
const add = (c) => checks.push(c);
|
|
@@ -155,6 +177,26 @@ export async function runDoctor(opts = {}, deps = {}) {
|
|
|
155
177
|
: { status: 'ok', detail: 'free', hint: null };
|
|
156
178
|
});
|
|
157
179
|
|
|
180
|
+
// 5b. Running server version vs installed (RC3). `doctor` runs as the
|
|
181
|
+
// freshly-invoked CLI, so APP_VERSION here == the version installed on disk.
|
|
182
|
+
// If a server is answering :port with a DIFFERENT version, it's running stale
|
|
183
|
+
// code from before the last upgrade — the "kept running 0.1.14 after 0.2.1"
|
|
184
|
+
// failure. Surface it so the fix (restart) is obvious instead of invisible.
|
|
185
|
+
await guarded('runningVersion', 'Running version', async () => {
|
|
186
|
+
const running = await d.probeRunningVersion(config.port);
|
|
187
|
+
if (!running) {
|
|
188
|
+
return { status: 'info', detail: `no server answering :${config.port} (not started yet)`, hint: null };
|
|
189
|
+
}
|
|
190
|
+
if (running === APP_VERSION) {
|
|
191
|
+
return { status: 'ok', detail: `v${running} (matches installed)`, hint: null };
|
|
192
|
+
}
|
|
193
|
+
return {
|
|
194
|
+
status: 'warn',
|
|
195
|
+
detail: `running v${running}, but v${APP_VERSION} is installed`,
|
|
196
|
+
hint: 'A workspace server is running older code than what is installed. Restart it (close the app — always-on restarts it clean) to finish the upgrade.',
|
|
197
|
+
};
|
|
198
|
+
});
|
|
199
|
+
|
|
158
200
|
// 6. Account linked (slug)
|
|
159
201
|
let account = null;
|
|
160
202
|
await guarded('account', 'Workspace account linked', async () => {
|
|
@@ -181,6 +223,38 @@ export async function runDoctor(opts = {}, deps = {}) {
|
|
|
181
223
|
: { status: 'warn', detail: `server returned HTTP ${r.status}`, hint: null };
|
|
182
224
|
});
|
|
183
225
|
|
|
226
|
+
// 7b. Public URL reachable end-to-end (RC3). Only meaningful once linked. This
|
|
227
|
+
// is the half the old doctor was blind to — the registry check above can be
|
|
228
|
+
// green (slug claimed) while this is red (tunnel down). Together they tell the
|
|
229
|
+
// two apart: claimed-but-unreachable ⟹ the daemon link is broken (RC2), the
|
|
230
|
+
// operator/auto-relink path is the fix.
|
|
231
|
+
await guarded('tunnel', 'Public URL reachable', async () => {
|
|
232
|
+
const slug = account?.slug || config.account?.slug || null;
|
|
233
|
+
if (!slug) {
|
|
234
|
+
return { status: 'info', detail: 'not linked — no public URL yet', hint: null };
|
|
235
|
+
}
|
|
236
|
+
const r = await probeTunnel(slug, d.fetchImpl);
|
|
237
|
+
if (!r.reachable) {
|
|
238
|
+
return {
|
|
239
|
+
status: 'fail',
|
|
240
|
+
detail: `${r.url} unreachable: ${r.error}`,
|
|
241
|
+
hint: 'The public link is down. Restart sync (`wild-workspace daemon stop` then `wild-workspace`), or the operator `relink-account` fix.',
|
|
242
|
+
};
|
|
243
|
+
}
|
|
244
|
+
if (r.status >= 500) {
|
|
245
|
+
return {
|
|
246
|
+
status: 'fail',
|
|
247
|
+
detail: `${r.url} returned HTTP ${r.status} (tunnel down — slug claimed but not linked)`,
|
|
248
|
+
hint: 'The daemon is not linked to the relay. Restart sync (`wild-workspace daemon stop` then `wild-workspace`).',
|
|
249
|
+
};
|
|
250
|
+
}
|
|
251
|
+
if (r.status >= 400) {
|
|
252
|
+
// 401/403/404 = the chain works; auth/slug is the nuance, not a tunnel fault.
|
|
253
|
+
return { status: 'warn', detail: `reachable but HTTP ${r.status} (auth/slug check)`, hint: null };
|
|
254
|
+
}
|
|
255
|
+
return { status: 'ok', detail: `live (HTTP ${r.status}${r.version ? `, v${r.version}` : ''})`, hint: null };
|
|
256
|
+
});
|
|
257
|
+
|
|
184
258
|
// 8. Always-on / autostart
|
|
185
259
|
await guarded('service', 'Always-on (autostart)', async () => {
|
|
186
260
|
const s = await d.serviceStatus({ port: config.port }, { probeImpl: (p) => probeHealth(p) });
|
package/server/src/index.mjs
CHANGED
|
@@ -35,10 +35,12 @@ import { InboxWatcher } from './inbox.mjs';
|
|
|
35
35
|
import { ActivityBus } from './activity.mjs';
|
|
36
36
|
import { loadIdentity, saveIdentity, markOnboarded, TONES } from './agent-identity.mjs';
|
|
37
37
|
import { probeAgentReadiness } from './agent-readiness.mjs';
|
|
38
|
+
import { AutoUpdater, npmInstall, recordUpdate, loadUpdateSettings, PACKAGE_NAME } from './auto-update.mjs';
|
|
38
39
|
import { ClaudeLoginSession } from './agent-login.mjs';
|
|
39
40
|
import { ErrorReporter } from './error-reporter.mjs';
|
|
40
41
|
import { DaemonBridge } from './daemon.mjs';
|
|
41
42
|
import { DaemonSupervisor } from './daemon-supervisor.mjs';
|
|
43
|
+
import { TunnelWatchdog } from './tunnel-watchdog.mjs';
|
|
42
44
|
import { SyncControl } from './sync.mjs';
|
|
43
45
|
import { detectPreviewPorts, checkPort } from './preview.mjs';
|
|
44
46
|
import { createBazaar } from './bazaar/core.mjs';
|
|
@@ -47,6 +49,7 @@ import { matchCandidates } from './bazaar/mock-tickup.mjs';
|
|
|
47
49
|
import { servePreviewFile, confineBuildDir } from './bazaar/preview-server.mjs';
|
|
48
50
|
import { TURN_SYSTEM_PROMPT, writeTurnMcpConfig } from './turn-mcp.mjs';
|
|
49
51
|
import { loadAccount } from './account.mjs';
|
|
52
|
+
import { getOperatorToken } from './operator.mjs';
|
|
50
53
|
import { runDoctor } from './doctor.mjs';
|
|
51
54
|
import { appendLine, tailFile, logFile, TAILABLE, globalDir } from './logpaths.mjs';
|
|
52
55
|
import { SessionReporter } from './session-reporter.mjs';
|
|
@@ -172,6 +175,29 @@ export async function createServer(overrides = {}) {
|
|
|
172
175
|
.catch((e) => ({ started: false, error: String(e?.message || e) }))
|
|
173
176
|
: Promise.resolve({ started: false, skipped: true });
|
|
174
177
|
|
|
178
|
+
// RC2 tunnel self-heal: when this install is slug-linked (so it's SUPPOSED to be
|
|
179
|
+
// reachable at <slug>.venturewild.llc), watch the public URL end-to-end and
|
|
180
|
+
// relink the daemon if it goes dead while we're locally healthy. Off without a
|
|
181
|
+
// daemon supervisor, without a slug, or under tests. `overrides.tunnelWatchdog`
|
|
182
|
+
// is a test seam (false disables; an object injects options).
|
|
183
|
+
const relinkDaemon = async () => {
|
|
184
|
+
if (!daemonSupervisor) return;
|
|
185
|
+
await daemonSupervisor.stop().catch(() => {});
|
|
186
|
+
await daemonSupervisor.ensureRunning().catch(() => {});
|
|
187
|
+
};
|
|
188
|
+
const tunnelWatchdog =
|
|
189
|
+
overrides.tunnelWatchdog === false ||
|
|
190
|
+
!daemonSupervisor ||
|
|
191
|
+
!config.account?.slug ||
|
|
192
|
+
!config.daemonAutostart
|
|
193
|
+
? null
|
|
194
|
+
: new TunnelWatchdog({
|
|
195
|
+
publicBaseUrl: `https://${config.account.slug}.venturewild.llc`,
|
|
196
|
+
relink: relinkDaemon,
|
|
197
|
+
log: (m) => log('[tunnel]', m),
|
|
198
|
+
...(typeof overrides.tunnelWatchdog === 'object' ? overrides.tunnelWatchdog : {}),
|
|
199
|
+
}).start();
|
|
200
|
+
|
|
175
201
|
// Control plane for bmo-sync folder sharing (pair / detach / invite).
|
|
176
202
|
// `overrides.syncControl` is a test seam.
|
|
177
203
|
const syncControl =
|
|
@@ -488,6 +514,14 @@ export async function createServer(overrides = {}) {
|
|
|
488
514
|
const app = new Hono();
|
|
489
515
|
|
|
490
516
|
// --- auth helpers ---------------------------------------------------------
|
|
517
|
+
// RC1 hot-reload: resolve the operator token LIVE per request. An explicit
|
|
518
|
+
// override/env token (tests, pinned deployments) stays authoritative; otherwise
|
|
519
|
+
// the token file is re-read (TTL-cached) so `operator enable`/`disable` take
|
|
520
|
+
// effect with no server restart — the literal 401 from the first external
|
|
521
|
+
// install. `overrides.operatorDataDir` is unused; the file lives in dataDir.
|
|
522
|
+
const liveOperatorToken = () =>
|
|
523
|
+
config.operatorTokenExplicit ?? getOperatorToken(config.dataDir);
|
|
524
|
+
|
|
491
525
|
// Classify one raw token into a role. Shared by the Authorization header, the
|
|
492
526
|
// HttpOnly auth cookie, and the `?t=` query so all three stay consistent.
|
|
493
527
|
// `allowOperator` is true ONLY for the header path — the operator (support)
|
|
@@ -498,7 +532,8 @@ export async function createServer(overrides = {}) {
|
|
|
498
532
|
if (token === config.partnerToken) {
|
|
499
533
|
return { role: ROLES.PARTNER, sub: 'partner', source };
|
|
500
534
|
}
|
|
501
|
-
|
|
535
|
+
const opToken = allowOperator ? liveOperatorToken() : null;
|
|
536
|
+
if (opToken && token === opToken) {
|
|
502
537
|
return { role: ROLES.OPERATOR, sub: 'operator', source: source || 'operator-token' };
|
|
503
538
|
}
|
|
504
539
|
const payload = await verifyShareToken(token, config.shareSecret);
|
|
@@ -985,6 +1020,22 @@ export async function createServer(overrides = {}) {
|
|
|
985
1020
|
return c.json({ agent: agentTag(activeAgent), ...verdict });
|
|
986
1021
|
});
|
|
987
1022
|
|
|
1023
|
+
// Auto-update status (Phase 2) — what's running, the channel, on/off, and the
|
|
1024
|
+
// last update outcome (the "updated to vX" note the UI can surface). Read-only;
|
|
1025
|
+
// the toggle/apply levers are the CLI + the operator channel.
|
|
1026
|
+
app.get('/api/update/status', (c) => {
|
|
1027
|
+
const forbidden = require(c, 'chat');
|
|
1028
|
+
if (forbidden) return forbidden;
|
|
1029
|
+
const s = loadUpdateSettings(globalDir());
|
|
1030
|
+
return c.json({
|
|
1031
|
+
current: APP_VERSION,
|
|
1032
|
+
enabled: s.enabled,
|
|
1033
|
+
channel: s.channel,
|
|
1034
|
+
lastCheckAt: s.lastCheckAt || null,
|
|
1035
|
+
lastUpdate: s.lastUpdate || null,
|
|
1036
|
+
});
|
|
1037
|
+
});
|
|
1038
|
+
|
|
988
1039
|
// In-app "Sign in to Claude" — drives `claude auth login` in a real PTY so the
|
|
989
1040
|
// browser OAuth callback auto-completes and the user never touches a terminal.
|
|
990
1041
|
// (See agent-login.mjs.) Claude opens the OAuth URL in the user's browser itself
|
|
@@ -1220,7 +1271,9 @@ export async function createServer(overrides = {}) {
|
|
|
1220
1271
|
spawn,
|
|
1221
1272
|
...(overrides.operatorDeps || {}),
|
|
1222
1273
|
};
|
|
1223
|
-
|
|
1274
|
+
// Live so `operator enable` (run in a separate CLI process) lights the channel
|
|
1275
|
+
// up without a server restart, and `operator disable` takes it dark (RC1).
|
|
1276
|
+
const operatorEnabled = () => Boolean(liveOperatorToken());
|
|
1224
1277
|
function auditOperator(c, action, detail) {
|
|
1225
1278
|
const s = c.get('session') || {};
|
|
1226
1279
|
appendLine('operator', `${action} by=${s.sub || 'operator'} src=${s.source || '-'} ${detail || ''}`.trim());
|
|
@@ -1267,6 +1320,24 @@ export async function createServer(overrides = {}) {
|
|
|
1267
1320
|
child?.on?.('exit', (code) => appendLine('operator', `reinstall-daemon exited code=${code}`));
|
|
1268
1321
|
return { started: true, pid: child?.pid || null, command: `${cmd} i -g @venturewild/workspace` };
|
|
1269
1322
|
},
|
|
1323
|
+
// Phase 2: check the user's channel and install a newer version if one exists.
|
|
1324
|
+
// The always-on supervisor's version-drift auto-restart (RC1b) then loads it;
|
|
1325
|
+
// the supervisor also owns autonomous health-gated rollback. This is the
|
|
1326
|
+
// remote-support trigger for the same flow (Phase 3 capability).
|
|
1327
|
+
'update-now': async () => {
|
|
1328
|
+
const gdir = globalDir();
|
|
1329
|
+
const check = await (operatorDeps.checkUpdate
|
|
1330
|
+
? operatorDeps.checkUpdate()
|
|
1331
|
+
: new AutoUpdater({ globalDir: gdir }).check());
|
|
1332
|
+
if (!check.latest) return { ok: false, reason: 'registry-unreachable', current: check.current };
|
|
1333
|
+
if (!check.available) return { ok: true, updated: false, current: check.current, latest: check.latest };
|
|
1334
|
+
appendLine('operator', `update-now installing ${check.current} → ${check.latest} (${check.channel})`);
|
|
1335
|
+
const res = await (operatorDeps.npmInstall || npmInstall)(`${PACKAGE_NAME}@${check.latest}`);
|
|
1336
|
+
const ok = res.code === 0;
|
|
1337
|
+
if (ok) recordUpdate(gdir, { from: check.current, to: check.latest, at: Date.now(), status: 'installed' });
|
|
1338
|
+
appendLine('operator', `update-now ${ok ? 'installed' : `failed code=${res.code}`}`);
|
|
1339
|
+
return { ok, updated: ok, from: check.current, to: check.latest, code: res.code };
|
|
1340
|
+
},
|
|
1270
1341
|
};
|
|
1271
1342
|
|
|
1272
1343
|
app.get('/api/operator/diag', async (c) => {
|
|
@@ -1887,6 +1958,7 @@ export async function createServer(overrides = {}) {
|
|
|
1887
1958
|
daemonBridge,
|
|
1888
1959
|
daemonSupervisor,
|
|
1889
1960
|
daemonReady,
|
|
1961
|
+
tunnelWatchdog,
|
|
1890
1962
|
syncControl,
|
|
1891
1963
|
sessionReporter,
|
|
1892
1964
|
detectedAgents,
|
|
@@ -1898,6 +1970,7 @@ export async function createServer(overrides = {}) {
|
|
|
1898
1970
|
try { transcriptRecorder.stop(); } catch {}
|
|
1899
1971
|
try { inboxWatcher.stop(); } catch {}
|
|
1900
1972
|
try { daemonBridge?.stop(); } catch {}
|
|
1973
|
+
try { tunnelWatchdog?.stop(); } catch {}
|
|
1901
1974
|
// The daemon is deliberately NOT stopped here — it is detached so sync
|
|
1902
1975
|
// keeps running after wild-workspace closes. `wild-workspace daemon
|
|
1903
1976
|
// stop` is the explicit off-switch.
|
package/server/src/operator.mjs
CHANGED
|
@@ -29,6 +29,33 @@ export function loadOperatorToken(dataDir) {
|
|
|
29
29
|
}
|
|
30
30
|
}
|
|
31
31
|
|
|
32
|
+
// RC1 hot-reload: read the operator token LIVE (with a tiny TTL cache) instead of
// the value the server snapshotted at boot. `operator enable` writes the token to
// disk from a separate CLI process, so a long-running server that only kept its
// boot-time value would keep answering 401 until restarted. The short TTL keeps
// disk reads off the hot auth path while `enable`/`disable` still take effect
// within `ttlMs`.
//
// The cache is keyed by dataDir so two servers (tests, multiple installs) in one
// process don't read each other's tokens.
const _tokenCache = new Map(); // dataDir -> { token, at }

/**
 * Return the operator token for `dataDir`, re-reading it via `loadOperatorToken`
 * only when the cached entry is missing or older than `ttlMs`.
 *
 * @param {string} dataDir - data directory whose token file is consulted
 * @param {object} [opts]
 * @param {number} [opts.ttlMs=2000] - how long a cached read stays valid
 * @param {Function} [opts.now=Date.now] - clock returning milliseconds (test seam)
 * @returns {*} whatever `loadOperatorToken(dataDir)` returned on the last read
 */
export function getOperatorToken(dataDir, { ttlMs = 2000, now = Date.now } = {}) {
  const t = now();
  const hit = _tokenCache.get(dataDir);
  if (hit) {
    const age = t - hit.at;
    // A negative age means the clock stepped backwards after the entry was
    // stored. Treat that as expired: otherwise the entry would look "fresh" for
    // the whole size of the step and a revoked token could outlive the TTL.
    if (age >= 0 && age < ttlMs) return hit.token;
  }
  const token = loadOperatorToken(dataDir);
  _tokenCache.set(dataDir, { token, at: t });
  return token;
}

/**
 * Drop the cached token for one `dataDir`, or for every dataDir when called with
 * no argument, so the next `getOperatorToken` call re-reads from disk. Intended
 * for in-process callers and tests that cannot wait out the TTL.
 *
 * @param {string} [dataDir] - omit (undefined) to clear the whole cache
 */
export function invalidateOperatorTokenCache(dataDir) {
  if (dataDir === undefined) _tokenCache.clear();
  else _tokenCache.delete(dataDir);
}
|
|
58
|
+
|
|
32
59
|
// Turn the channel on. Idempotent by default — returns the existing token if one
|
|
33
60
|
// is already set (so a re-run doesn't invalidate the code the user already
|
|
34
61
|
// shared). Pass { rotate:true } to force a fresh token. Returns the token, or
|
|
@@ -41,6 +41,47 @@ export function probeHealth(port, timeoutMs = 2500) {
|
|
|
41
41
|
});
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
+
/**
 * Ask the running server its version via GET /api/health on 127.0.0.1.
 *
 * Used by the version-drift check (RC1): a stale server keeps running its OLD
 * code after an upgrade, so the caller compares what is RUNNING to what is
 * INSTALLED on disk.
 *
 * The returned promise always settles and never rejects. It resolves to null when
 * the server is down, the request times out, the body is not JSON, the JSON has
 * no truthy `version`, the body exceeds 4096 characters, or the connection closes
 * before the body is complete.
 *
 * @param {number} port - local port the server listens on
 * @param {number} [timeoutMs=2500] - socket timeout handed to `http.get`
 * @returns {Promise<*>} the `version` field of the health JSON, or null
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  return new Promise((resolve) => {
    // Several events can report the same outcome (end + close, destroy + error);
    // only the first one decides the result.
    let settled = false;
    const finish = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };
    const req = http.get(
      { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
      (res) => {
        let body = '';
        res.on('data', (d) => {
          body += d;
          if (body.length > 4096) {
            // Not a health payload. Destroying the request suppresses 'end', so
            // resolve here instead of leaving the caller waiting forever.
            req.destroy();
            finish(null);
          }
        });
        res.on('end', () => {
          try { finish(JSON.parse(body).version || null); } catch { finish(null); }
        });
        // A connection dropped mid-body surfaces on the response, not the request.
        res.on('error', () => finish(null));
        res.on('close', () => finish(null));
      },
    );
    req.on('error', () => finish(null));
    req.on('timeout', () => { req.destroy(); finish(null); });
  });
}
|
|
66
|
+
|
|
67
|
+
/**
 * Report the version currently installed on disk by reading package.json fresh,
 * rather than trusting an in-memory constant. A long-lived supervisor outlives
 * `npm i -g`, so anything it loaded at startup can be stale; only a new disk read
 * reflects a swapped package.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY] - path of the server entry file;
 *   package.json is looked up two directories above the entry's directory
 *   (<pkg>/server/src/index.mjs → <pkg>/package.json)
 * @returns {*} the manifest's `version`, or null when it is missing/falsy or the
 *   file cannot be read or parsed
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    const entryDir = path.dirname(entry);
    const manifestFile = path.resolve(entryDir, '..', '..', 'package.json');
    const manifestText = fs.readFileSync(manifestFile, 'utf8');
    const manifest = JSON.parse(manifestText);
    if (manifest.version) {
      return manifest.version;
    }
    return null;
  } catch {
    // Unreadable path, missing file, bad JSON, or a non-object manifest.
    return null;
  }
}
|
|
84
|
+
|
|
44
85
|
export class WorkspaceSupervisor {
|
|
45
86
|
constructor({
|
|
46
87
|
serverEntry = DEFAULT_SERVER_ENTRY,
|
|
@@ -58,12 +99,32 @@ export class WorkspaceSupervisor {
|
|
|
58
99
|
env = process.env,
|
|
59
100
|
crashLoopThreshold = 3,
|
|
60
101
|
diagnosticsImpl = null,
|
|
102
|
+
// RC1 version-drift auto-restart: when the RUNNING server reports an older
|
|
103
|
+
// version than what's INSTALLED on disk, restart it so it picks up the new
|
|
104
|
+
// code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
|
|
105
|
+
// disables it (e.g. a developer running an intentionally-older server).
|
|
106
|
+
autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
|
|
107
|
+
versionImpl = probeHealthVersion,
|
|
108
|
+
installedVersionImpl = () => installedVersion(serverEntry),
|
|
109
|
+
// Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
|
|
110
|
+
// whole stack on the user's channel, with health-gated rollback. On by
|
|
111
|
+
// default; the env kill switch + the persisted off switch both disable it.
|
|
112
|
+
// Only wired up in start() (not in the unit-test path, which calls tick()
|
|
113
|
+
// directly) — see start(). updatePollMs is the *wake* cadence; the actual
|
|
114
|
+
// check interval lives inside AutoUpdater (6h) and self-rate-limits.
|
|
115
|
+
autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
|
|
116
|
+
updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
|
|
117
|
+
autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
|
|
61
118
|
} = {}) {
|
|
62
119
|
Object.assign(this, {
|
|
63
120
|
serverEntry, workspaceDir, port, globalDir, node, pollMs,
|
|
64
121
|
backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
|
|
65
122
|
crashLoopThreshold, diagnosticsImpl,
|
|
123
|
+
autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
|
|
124
|
+
autoUpdate, updatePollMs, autoUpdaterFactory,
|
|
66
125
|
});
|
|
126
|
+
this.autoUpdater = null;
|
|
127
|
+
this.updateTimer = null;
|
|
67
128
|
this.logFile = path.join(globalDir, 'supervisor.log');
|
|
68
129
|
this.serverLogFile = path.join(globalDir, 'server.out.log');
|
|
69
130
|
this.lockFile = path.join(globalDir, 'supervisor.lock');
|
|
@@ -135,6 +196,28 @@ export class WorkspaceSupervisor {
|
|
|
135
196
|
this.backoff = this.backoffStartMs; // healthy → reset backoff
|
|
136
197
|
this.spawnCount = 0; // healthy → not a crash loop
|
|
137
198
|
this.pushedThisEpisode = false;
|
|
199
|
+
// RC1 version drift: a healthy-but-STALE server (running older code than
|
|
200
|
+
// what's installed) should be restarted so the upgrade actually lands.
|
|
201
|
+
// Only when WE own the child — we restart by killing it and letting the
|
|
202
|
+
// next tick respawn (which reloads index.mjs from disk). A server started
|
|
203
|
+
// by someone else (foreground `wild-workspace`) we leave alone; we have no
|
|
204
|
+
// handle on it. The restarted server reports the installed version, so the
|
|
205
|
+
// drift clears and this won't loop.
|
|
206
|
+
if (this.autoRestartOnVersionDrift && this.child) {
|
|
207
|
+
try {
|
|
208
|
+
const running = await this.versionImpl(this.port, this.probeTimeoutMs);
|
|
209
|
+
const installed = this.installedVersionImpl();
|
|
210
|
+
if (running && installed && running !== installed) {
|
|
211
|
+
this.log(`version drift: running=${running} installed=${installed} — restarting server`);
|
|
212
|
+
try { this.child.kill(); } catch { /* exit handler clears child */ }
|
|
213
|
+
this.child = null;
|
|
214
|
+
this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
|
|
215
|
+
return 'version-drift-restart';
|
|
216
|
+
}
|
|
217
|
+
} catch (e) {
|
|
218
|
+
this.log(`version-drift check error: ${e?.message || e}`);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
138
221
|
return 'healthy';
|
|
139
222
|
}
|
|
140
223
|
if (this.child) return 'booting'; // spawned, still coming up
|
|
@@ -198,6 +281,47 @@ export class WorkspaceSupervisor {
|
|
|
198
281
|
}
|
|
199
282
|
}
|
|
200
283
|
|
|
284
|
+
/**
|
|
285
|
+
* Restart the supervised server child so freshly installed code is loaded.
|
|
286
|
+
* Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
|
|
287
|
+
* the same mechanism as the version-drift restart, exposed for the AutoUpdater.
|
|
288
|
+
* No-op (returns false) when we don't own a child (foreground server).
|
|
289
|
+
*/
|
|
290
|
+
restartChild() {
|
|
291
|
+
if (!this.child) return false;
|
|
292
|
+
this.log('restartChild: killing server to load new code');
|
|
293
|
+
try { this.child.kill(); } catch { /* exit handler clears child */ }
|
|
294
|
+
this.child = null;
|
|
295
|
+
this.backoff = this.backoffStartMs; // an intentional restart, not a crash
|
|
296
|
+
return true;
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
|
|
300
|
+
async buildAutoUpdater() {
|
|
301
|
+
if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
|
|
302
|
+
// Lazy import keeps the unit-test path (which never calls start()) free of the
|
|
303
|
+
// auto-update module + its registry/npm seams.
|
|
304
|
+
const { AutoUpdater } = await import('./auto-update.mjs');
|
|
305
|
+
return new AutoUpdater({
|
|
306
|
+
globalDir: this.globalDir,
|
|
307
|
+
port: this.port,
|
|
308
|
+
installedVersionImpl: this.installedVersionImpl,
|
|
309
|
+
healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
|
|
310
|
+
restartImpl: async () => { this.restartChild(); },
|
|
311
|
+
nowImpl: this.nowImpl,
|
|
312
|
+
env: this.env,
|
|
313
|
+
logImpl: (m) => this.log(m),
|
|
314
|
+
onUpdate: (rec) => this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`),
|
|
315
|
+
});
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
runUpdateTick() {
|
|
319
|
+
if (!this.autoUpdater) return;
|
|
320
|
+
this.autoUpdater.tick()
|
|
321
|
+
.then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
|
|
322
|
+
.catch((e) => this.log(`auto-update error: ${e?.message || e}`));
|
|
323
|
+
}
|
|
324
|
+
|
|
201
325
|
/** Acquire the lock and start the supervision loop. Idempotent across processes. */
|
|
202
326
|
start() {
|
|
203
327
|
if (!this.acquireLock()) return { started: false, reason: 'already-running' };
|
|
@@ -207,11 +331,24 @@ export class WorkspaceSupervisor {
|
|
|
207
331
|
this.log(`supervisor start pid=${process.pid} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
|
|
208
332
|
this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
|
|
209
333
|
this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
|
|
334
|
+
|
|
335
|
+
// Phase 2 auto-update: wake on a slow timer; the first check fires shortly
|
|
336
|
+
// after start so the server has time to come up (verify reads its /health).
|
|
337
|
+
if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
|
|
338
|
+
this.buildAutoUpdater().then((u) => {
|
|
339
|
+
this.autoUpdater = u;
|
|
340
|
+
this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
|
|
341
|
+
if (this.updateTimer.unref) this.updateTimer.unref();
|
|
342
|
+
const kick = setTimeout(() => this.runUpdateTick(), 60_000);
|
|
343
|
+
if (kick.unref) kick.unref();
|
|
344
|
+
}).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
|
|
345
|
+
}
|
|
210
346
|
return { started: true };
|
|
211
347
|
}
|
|
212
348
|
|
|
213
349
|
stop() {
|
|
214
350
|
if (this.timer) { clearInterval(this.timer); this.timer = null; }
|
|
351
|
+
if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
|
|
215
352
|
this.releaseLock();
|
|
216
353
|
}
|
|
217
354
|
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// TunnelWatchdog — RC2 self-heal for "slug-linked but the public URL is dead".
|
|
2
|
+
//
|
|
3
|
+
// The bug from the first external install: after a restart the daemon was
|
|
4
|
+
// "running" but never re-linked to the relay, so `<slug>.venturewild.llc` was 502
|
|
5
|
+
// while `localhost:5173` was perfectly healthy. Nothing noticed the half: the
|
|
6
|
+
// install thought it was online. (`docs/remote-support-and-self-healing-design.md`
|
|
7
|
+
// RC2 — a sibling of the half-open-after-sleep fix already shipped in the daemon.)
|
|
8
|
+
//
|
|
9
|
+
// This watchdog closes that gap from the workspace server (no Rust change): it
|
|
10
|
+
// periodically asks our OWN public URL for /api/health. That request travels the
|
|
11
|
+
// full chain — out to Cloudflare, through the relay, down the daemon's tunnel,
|
|
12
|
+
// back to this server — so a 200 proves the whole path works. When the public
|
|
13
|
+
// side fails repeatedly WHILE the local server is healthy (so the fault is the
|
|
14
|
+
// LINK, not the server), it relinks the daemon — the same remedy as the operator
|
|
15
|
+
// `relink-account` action, applied automatically.
|
|
16
|
+
//
|
|
17
|
+
// Conservative by design: it acts only on a sustained failure (threshold), never
|
|
18
|
+
// relinks more often than `minRelinkIntervalMs`, and treats a down LOCAL server
|
|
19
|
+
// as "not my job" (the WorkspaceSupervisor owns that). Every touch-point is an
|
|
20
|
+
// injected seam so the suite never hits the network.
|
|
21
|
+
//
|
|
22
|
+
// SAFETY against thrash: a relink only helps when THIS daemon's link to the relay
|
|
23
|
+
// is the broken part. When the relay itself is globally down (or otherwise can't
|
|
24
|
+
// accept the link), relinking can't help — so retrying every interval just churns
|
|
25
|
+
// the daemon. Relinks that DON'T restore the tunnel are counted, and after
|
|
26
|
+
// `maxIneffectiveRelinks` the watchdog escalates to a long `longCooldownMs` quiet
|
|
27
|
+
// period; a relink that works clears the counter (the next probe is healthy), so
|
|
28
|
+
// the genuine RC2 case still self-heals fast.
|
|
29
|
+
//
|
|
30
|
+
// (Product model: one machine = one install = one daemon = one public slug;
|
|
31
|
+
// multi-folder work is VS-Code-style within that single install. So co-tenant
|
|
32
|
+
// daemon contention is NOT a supported state — this guard is for the relay-down /
|
|
33
|
+
// transient-unreachable case, which is the real one. A two-install-per-machine
|
|
34
|
+
// setup, as seen while dogfooding on 2026-06-07, is a test artifact only.)
|
|
35
|
+
|
|
36
|
+
const DEFAULT_INTERVAL_MS = 60_000;
const DEFAULT_PROBE_TIMEOUT_MS = 8_000;

/**
 * Periodically probes this install's public URL and relinks the daemon when the
 * public side keeps failing while the local server is healthy.
 *
 * State kept between ticks:
 *  - `failures`: consecutive failed public probes since the last reset
 *  - `lastRelink`: clock value of the most recent relink attempt (0 = never)
 *  - `consecutiveRelinks`: relink attempts since the last healthy probe
 */
export class TunnelWatchdog {
  /**
   * @param {object} opts
   * @param {string} opts.publicBaseUrl  e.g. https://<slug>.venturewild.llc (trailing slashes are stripped)
   * @param {Function} opts.relink       async () => relink the daemon (stop+ensureRunning)
   * @param {Function} [opts.localHealthy] async () => boolean; default assumes healthy
   * @param {Function} [opts.fetchImpl]  fetch-compatible function; defaults to globalThis.fetch
   * @param {Function} [opts.nowImpl]    () => current time in ms; defaults to Date.now
   * @param {Function} [opts.log]        (message) => void; defaults to a no-op
   * @param {number} [opts.intervalMs]   delay between ticks once started
   * @param {number} [opts.failureThreshold] failed probes required before a relink is considered
   * @param {number} [opts.minRelinkIntervalMs] minimum gap between relink attempts
   * @param {number} [opts.maxIneffectiveRelinks] attempts without a healthy probe before the long cooldown applies
   * @param {number} [opts.longCooldownMs] gap enforced once relinks have stopped helping
   * @param {number} [opts.probeTimeoutMs] abort a public probe after this long
   */
  constructor({
    publicBaseUrl,
    relink,
    localHealthy = async () => true,
    fetchImpl = (...a) => globalThis.fetch(...a),
    nowImpl = () => Date.now(),
    log = () => {},
    intervalMs = DEFAULT_INTERVAL_MS,
    failureThreshold = 3,
    minRelinkIntervalMs = 120_000,
    maxIneffectiveRelinks = 3,
    longCooldownMs = 1_800_000, // 30 min quiet period once relinks stop helping
    probeTimeoutMs = DEFAULT_PROBE_TIMEOUT_MS,
  } = {}) {
    this.publicBaseUrl = String(publicBaseUrl || '').replace(/\/+$/, '');
    this.relink = relink;
    this.localHealthy = localHealthy;
    this.fetchImpl = fetchImpl;
    this.nowImpl = nowImpl;
    this.log = log;
    this.intervalMs = intervalMs;
    this.failureThreshold = failureThreshold;
    this.minRelinkIntervalMs = minRelinkIntervalMs;
    this.maxIneffectiveRelinks = maxIneffectiveRelinks;
    this.longCooldownMs = longCooldownMs;
    this.probeTimeoutMs = probeTimeoutMs;
    this.failures = 0;
    this.lastRelink = 0;
    this.consecutiveRelinks = 0; // relinks since the last healthy probe (thrash guard)
    this.timer = null;
  }

  /**
   * Probe the public /api/health.
   *
   * @returns {Promise<boolean>} true iff a response arrives with a status below
   *   500; false for 5xx, a network error, a timeout/abort, or when no public
   *   base URL is configured
   */
  async probePublic() {
    if (!this.publicBaseUrl) return false;
    const ctrl = new AbortController();
    const t = setTimeout(() => ctrl.abort(), this.probeTimeoutMs);
    try {
      const res = await this.fetchImpl(`${this.publicBaseUrl}/api/health`, {
        signal: ctrl.signal,
        // never cached by the edge — we want the live tunnel, not a CDN copy
        headers: { 'cache-control': 'no-cache' },
      });
      // Only the status matters. Release the unread body so the connection is
      // freed now rather than whenever the response is garbage-collected.
      // Best-effort: injected fetch doubles may have no body or no cancel().
      try { await res.body?.cancel?.(); } catch { /* nothing to release */ }
      // A 5xx from the tunnel layer (502/504) means the link is down even though
      // we got an HTTP response from the edge; treat only <500 as "reachable".
      return res.status < 500;
    } catch {
      return false; // network error / timeout / abort
    } finally {
      clearTimeout(t);
    }
  }

  /**
   * One watchdog step.
   *
   * @returns {Promise<string>} the decision, exposed for tests:
   *   'local-down' | 'healthy' | 'degraded' | 'cooldown' | 'relinked' | 'relink-error'
   */
  async tick() {
    // Local server down → the WorkspaceSupervisor's problem, not ours. A failing
    // public probe in that state says nothing about the LINK, so don't relink.
    if (!(await this.localHealthy())) {
      this.failures = 0;
      return 'local-down';
    }
    if (await this.probePublic()) {
      this.failures = 0;
      this.consecutiveRelinks = 0; // the link is up → any prior relink worked
      return 'healthy';
    }
    this.failures += 1;
    if (this.failures < this.failureThreshold) return 'degraded';
    const now = this.nowImpl();
    // Relinks that aren't restoring the tunnel (e.g. the relay itself is down)
    // switch to the long quiet period so we stop churning the daemon for nothing.
    const cooldown =
      this.consecutiveRelinks >= this.maxIneffectiveRelinks
        ? this.longCooldownMs
        : this.minRelinkIntervalMs;
    // `failures` is deliberately left above the threshold here, so the first
    // failing tick after the cooldown expires relinks immediately.
    if (now - this.lastRelink < cooldown) return 'cooldown';
    this.lastRelink = now;
    this.failures = 0;
    this.consecutiveRelinks += 1; // counted as an attempt whether or not it succeeds
    this.log(
      `public tunnel unreachable while local is healthy — relinking daemon` +
        ` (attempt ${this.consecutiveRelinks} since last healthy)`,
    );
    try {
      await this.relink();
      return 'relinked';
    } catch (e) {
      this.log(`relink failed: ${e?.message || e}`);
      return 'relink-error';
    }
  }

  /**
   * Begin ticking every `intervalMs`. Idempotent; the first tick runs one full
   * interval after the call.
   *
   * @returns {TunnelWatchdog} this, for chaining
   */
  start() {
    if (this.timer) return this;
    this.timer = setInterval(() => {
      this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
    }, this.intervalMs);
    if (this.timer.unref) this.timer.unref(); // never keep the process alive
    return this;
  }

  /** Cancel the interval. A tick already in flight is not interrupted. */
  stop() {
    if (this.timer) { clearInterval(this.timer); this.timer = null; }
  }
}
|