@venturewild/workspace 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@venturewild/workspace",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "Claude Code Web — Replit/Lovable-style chat-first browser UI that wraps the AI agent already installed on your machine.",
5
5
  "license": "MIT",
6
6
  "bin": {
@@ -23,6 +23,10 @@ import { appendLine, listLogs, tailFile } from '../src/logpaths.mjs';
23
23
  import { runDoctor, renderDoctor, writeDoctorBundle } from '../src/doctor.mjs';
24
24
  import { enableOperator, disableOperator, operatorStatus } from '../src/operator.mjs';
25
25
  import { loadObservabilityConsent, setObservabilityConsent } from '../src/observability.mjs';
26
+ import {
27
+ AutoUpdater, PACKAGE_NAME, npmInstall, recordUpdate,
28
+ loadUpdateSettings, setUpdateEnabled, setUpdateChannel,
29
+ } from '../src/auto-update.mjs';
26
30
  import { openOwnerBrowser } from '../src/owner-browser.mjs';
27
31
  import { planReset, applyReset, RESET_KEEPS } from '../src/reset.mjs';
28
32
 
@@ -56,6 +60,9 @@ Usage:
56
60
  wild-workspace operator disable revoke the support token
57
61
  wild-workspace operator status is the support channel on?
58
62
  wild-workspace observability [on|off|status] share session + install health so we can help (default on; never chat content)
63
+ wild-workspace update [apply] check for / install a newer version (auto by default)
64
+ wild-workspace update on|off toggle background auto-update
65
+ wild-workspace update channel stable|beta choose the update channel
59
66
  wild-workspace service install keep your workspace always-on (starts at login, no admin)
60
67
  wild-workspace service uninstall turn always-on off
61
68
  wild-workspace service status show always-on status (installed? supervisor? server?)
@@ -599,6 +606,67 @@ async function runObservabilityCommand(action = 'status', opts = {}) {
599
606
  return;
600
607
  }
601
608
 
609
// `wild-workspace update [apply|on|off|channel <stable|beta>]` — Phase 2
// auto-update (docs/remote-support-and-self-healing-design.md). With no
// sub-command it checks the channel for a newer release; `apply` installs it now;
// `on`/`off` toggle the default-on background updater; `channel` switches
// stable/beta. The always-on supervisor does this automatically — this is the
// manual lever + the off switch.
/**
 * Entry point for the `update` CLI command.
 *
 * Sets `process.exitCode = 1` (without throwing) when the command line is not
 * understood, when the registry check yields no version, or when the install
 * exits non-zero, so shell scripts can detect the failure.
 *
 * @param {{ positional: string[] }} opts parsed CLI options; `positional[1]` is
 *   the sub-command and `positional[2]` the channel name for `channel`.
 * @returns {Promise<void>}
 */
async function runUpdateCommand(opts) {
  const sub = opts.positional[1];
  const gdir = globalDir();

  // An unrecognised sub-command (e.g. a typo of `off`) must not silently fall
  // through to a registry check — the user would believe their change applied.
  const known = ['apply', 'on', 'off', 'channel'];
  if (sub !== undefined && !known.includes(sub)) {
    console.log('usage: wild-workspace update [apply|on|off|channel stable|beta]');
    process.exitCode = 1;
    return;
  }

  if (sub === 'on' || sub === 'off') {
    const rec = setUpdateEnabled(gdir, sub === 'on');
    console.log(
      rec.enabled
        ? '✓ auto-update ON — wild-workspace keeps itself up to date in the background.'
        : '✓ auto-update OFF — update manually with `wild-workspace update apply`.',
    );
    console.log(` channel: ${rec.channel}`);
    return;
  }

  if (sub === 'channel') {
    const chan = opts.positional[2];
    if (chan !== 'stable' && chan !== 'beta') {
      console.log('usage: wild-workspace update channel stable|beta');
      process.exitCode = 1; // a usage error is a failure, not a success
      return;
    }
    console.log(`✓ update channel set to ${setUpdateChannel(gdir, chan).channel}.`);
    return;
  }

  // No sub-command, or `apply`: compare installed vs published first.
  const settings = loadUpdateSettings(gdir);
  const c = await new AutoUpdater({ globalDir: gdir }).check();
  console.log(`wild-workspace ${c.current} (channel: ${c.channel}, auto-update: ${settings.enabled ? 'on' : 'off'})`);
  if (!c.latest) {
    console.log(' could not reach the npm registry to check for updates.');
    process.exitCode = 1;
    return;
  }
  if (!c.available) {
    console.log(` up to date — ${c.latest} is the latest on ${c.channel}.`);
    return;
  }
  console.log(` update available: ${c.current} → ${c.latest}`);
  if (sub !== 'apply') {
    console.log(' run `wild-workspace update apply` to install it now.');
    return;
  }

  console.log(` installing ${c.latest}…`);
  const res = await npmInstall(`${PACKAGE_NAME}@${c.latest}`);
  if (res.code === 0) {
    recordUpdate(gdir, { from: c.current, to: c.latest, at: Date.now(), status: 'installed' });
    console.log(` ✓ installed ${c.latest}.`);
    console.log(' The always-on supervisor will restart into the new version shortly,');
    console.log(' or restart `wild-workspace` yourself to use it now.');
  } else {
    console.log(` ✗ install failed (code ${res.code}).`);
    // Show only the last three non-empty lines of npm's output.
    if (res.output) console.log(' ' + res.output.split('\n').filter(Boolean).slice(-3).join('\n '));
    process.exitCode = 1;
  }
}
669
+
602
670
  // `wild-workspace operator [enable|disable|status]` — the consented support
603
671
  // channel (docs/SECURITY.md). OFF by default; `enable` mints a token to hand to
604
672
  // the wild-workspace team so they can diagnose + run a fixed set of safe fixes.
@@ -734,6 +802,9 @@ async function main() {
734
802
  if (opts.positional[0] === 'logs') {
735
803
  return runLogsCommand(opts);
736
804
  }
805
+ if (opts.positional[0] === 'update') {
806
+ return runUpdateCommand(opts);
807
+ }
737
808
  if (opts.positional[0] === 'operator') {
738
809
  return runOperatorCommand(opts.positional[1], opts);
739
810
  }
@@ -0,0 +1,277 @@
1
+ // AutoUpdater — Phase 2 (Pillar B) of the self-healing epic
2
+ // (docs/remote-support-and-self-healing-design.md). Kills the manual `npm i -g`
3
+ // re-install treadmill that turned the first external install into a copy-paste
4
+ // marathon: the always-on supervisor periodically checks the published version on
5
+ // the user's channel, and on a new release it installs it, restarts the supervised
6
+ // server (reusing the version-drift restart RC1b already ships), health-checks the
7
+ // result, and ROLLS BACK to the pinned previous version if the new one doesn't come
8
+ // up healthy.
9
+ //
10
+ // Tuan's locked decisions (design doc Part 6): auto-update is DEFAULT-ON (the
11
+ // OneDrive bar — it just updates itself) with a visible "updated to vX" note + an
12
+ // off switch; channels are `stable` (default) and `beta` (opt-in).
13
+ //
14
+ // Like observability.mjs/operator.mjs, settings live in their own file in the
15
+ // machine-global dir (~/.wild-workspace, NEVER the synced workspace — locked
16
+ // principle #1) so they survive the supervisor relaunching from a different cwd.
17
+ //
18
+ // Every external touch-point (npm install, registry fetch, health probe, restart,
19
+ // clock, sleep) is an injected seam so the suite never spawns a process, hits the
20
+ // network, or actually waits.
21
+
22
+ import { spawn } from 'node:child_process';
23
+ import fs from 'node:fs';
24
+ import path from 'node:path';
25
+ import { globalDir as defaultGlobalDir } from './logpaths.mjs';
26
+ import { installedVersion, probeHealthVersion } from './supervisor.mjs';
27
+ import { ensureToolPath } from './agent.mjs';
28
+
29
+ export const PACKAGE_NAME = '@venturewild/workspace';
30
+ const DEFAULT_CHECK_INTERVAL_MS = 6 * 60 * 60 * 1000; // 6h — releases are not frequent
31
+
32
+ // --- persisted settings (~/.wild-workspace/update.json) ----------------------
33
+
34
/** Absolute path of the settings file inside the machine-global dir. */
function updateFile(dir) { return path.join(dir, 'update.json'); }

/**
 * Read the persisted update settings.
 *
 * DEFAULT-ON: an absent (or unreadable / unparsable) file means "auto-update
 * enabled, stable channel" — the disclosure rides onboarding + the visible
 * update note, mirroring observability.
 *
 * @param {string} [dir] directory holding `update.json`.
 * @returns {{ enabled: boolean, channel: 'stable'|'beta', lastCheckAt: number,
 *   lastUpdate: ({ from, to, at, status }|null) }}
 */
export function loadUpdateSettings(dir = defaultGlobalDir()) {
  try {
    const p = JSON.parse(fs.readFileSync(updateFile(dir), 'utf8'));
    return {
      enabled: p.enabled !== false, // anything but an explicit `false` is ON
      channel: p.channel === 'beta' ? 'beta' : 'stable',
      lastCheckAt: Number(p.lastCheckAt) || 0,
      lastUpdate: p.lastUpdate || null, // { from, to, at, status }
    };
  } catch {
    return { enabled: true, channel: 'stable', lastCheckAt: 0, lastUpdate: null };
  }
}

/**
 * Persist `rec` as the new settings file and return it.
 *
 * The file is written to a sibling temp file and renamed into place so a
 * concurrent reader never observes a half-written `update.json`: a torn read
 * would fail to parse and fall back to the DEFAULT-ON settings, silently
 * undoing a user's "off". If the rename path fails, fall back to a direct
 * write; if that fails too (read-only fs) the record only lives in memory for
 * this run.
 */
function writeSettings(dir, rec) {
  const file = updateFile(dir);
  const tmp = `${file}.${process.pid}.tmp`;
  const data = JSON.stringify(rec, null, 2);
  try {
    fs.mkdirSync(dir, { recursive: true });
    try {
      fs.writeFileSync(tmp, data, { mode: 0o600 });
      fs.renameSync(tmp, file);
    } catch {
      // Temp-file route unavailable — clean up and write in place, as before.
      try { fs.rmSync(tmp, { force: true }); } catch { /* nothing to clean */ }
      fs.writeFileSync(file, data, { mode: 0o600 });
    }
  } catch {
    /* read-only fs — fall back to in-memory for this run */
  }
  return rec;
}

/** Turn background auto-update on or off; returns the stored settings. */
export function setUpdateEnabled(dir, enabled) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), enabled: Boolean(enabled) });
}

/** Select the update channel; anything other than `'beta'` is stored as `'stable'`. */
export function setUpdateChannel(dir, channel) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), channel: channel === 'beta' ? 'beta' : 'stable' });
}

/** Stamp the time (`at`) of the most recent registry check. */
export function touchLastCheck(dir, at) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), lastCheckAt: at });
}

/** Record the outcome of an update attempt; returns the stored record. */
export function recordUpdate(dir, rec) {
  return writeSettings(dir, { ...loadUpdateSettings(dir), lastUpdate: rec }).lastUpdate;
}
77
+
78
+ // --- semver compare (no dep) -------------------------------------------------
79
+
80
/**
 * Split a version string into its numeric core and prerelease tag.
 * A leading `v` and any `+build` metadata are dropped (build metadata carries
 * no precedence). Only the FIRST dash separates core from prerelease, so tags
 * that themselves contain dashes (`1.0.0-rc-2`) are kept whole.
 *
 * @param {string} v
 * @returns {{ nums: number[], pre: string }} `nums` has at least three entries.
 */
function parseVersion(v) {
  const clean = String(v).trim().replace(/^v/, '').split('+')[0];
  const dash = clean.indexOf('-');
  const core = dash === -1 ? clean : clean.slice(0, dash);
  const pre = dash === -1 ? '' : clean.slice(dash + 1);
  const nums = core.split('.').map((n) => Number.parseInt(n, 10) || 0);
  while (nums.length < 3) nums.push(0);
  return { nums, pre };
}

/**
 * Compare two non-empty prerelease tags field by field: numeric fields compare
 * as numbers (so `beta.10` > `beta.9`), numeric fields rank below alphanumeric
 * ones, alphanumeric fields compare in ASCII order, and when all shared fields
 * are equal the tag with more fields ranks higher.
 *
 * @returns {number} negative, zero or positive, like a sort comparator.
 */
function comparePrerelease(a, b) {
  const left = a.split('.');
  const right = b.split('.');
  const count = Math.max(left.length, right.length);
  for (let i = 0; i < count; i++) {
    const l = left[i];
    const r = right[i];
    if (l === undefined) return -1;
    if (r === undefined) return 1;
    const lNumeric = /^\d+$/.test(l);
    const rNumeric = /^\d+$/.test(r);
    if (lNumeric && rNumeric) {
      const diff = Number(l) - Number(r);
      if (diff !== 0) return diff < 0 ? -1 : 1;
    } else if (lNumeric !== rNumeric) {
      return lNumeric ? -1 : 1;
    } else if (l !== r) {
      return l < r ? -1 : 1;
    }
  }
  return 0;
}

/**
 * True iff version `a` is strictly newer than `b`.
 * Returns false when either argument is missing.
 */
export function isNewer(a, b) {
  if (!a || !b) return false;
  const pa = parseVersion(a);
  const pb = parseVersion(b);
  for (let i = 0; i < 3; i++) {
    if (pa.nums[i] !== pb.nums[i]) return pa.nums[i] > pb.nums[i];
  }
  // Equal core: a release (no prerelease) outranks a prerelease.
  if (pa.pre && !pb.pre) return false;
  if (!pa.pre && pb.pre) return true;
  if (pa.pre && pb.pre) return comparePrerelease(pa.pre, pb.pre) > 0;
  return false; // identical
}
100
+
101
+ // --- registry + npm primitives -----------------------------------------------
102
+
103
/**
 * The version published on `channel`'s dist-tag (stable→latest, beta→beta), or
 * null on any failure. Uses the abbreviated registry document (smaller payload).
 *
 * The value comes from the network and is later interpolated into an
 * `npm i -g <pkg>@<version>` spec, so it is only returned when it is a string
 * shaped like a semantic version; anything else is treated as a failed check.
 *
 * @param {string} channel `'beta'` selects the `beta` dist-tag; any other value selects `latest`.
 * @param {object} [options]
 * @param {Function} [options.fetchImpl] fetch-compatible function (test seam).
 * @param {string} [options.packageName] registry package name.
 * @param {number} [options.timeoutMs] abort the request after this many ms.
 * @returns {Promise<string|null>}
 */
export async function fetchLatestVersion(channel, {
  fetchImpl = fetch, packageName = PACKAGE_NAME, timeoutMs = 8000,
} = {}) {
  const versionShape = /^\d+\.\d+\.\d+(?:-[0-9A-Za-z.-]+)?(?:\+[0-9A-Za-z.-]+)?$/;
  const tag = channel === 'beta' ? 'beta' : 'latest';
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    // Scoped names keep their `@` but need the slash percent-encoded.
    const res = await fetchImpl(`https://registry.npmjs.org/${packageName.replace('/', '%2f')}`, {
      signal: ctrl.signal,
      headers: { accept: 'application/vnd.npm.install-v1+json' },
    });
    if (!res || !res.ok) return null;
    const body = await res.json();
    const version = body?.['dist-tags']?.[tag];
    return typeof version === 'string' && versionShape.test(version) ? version : null;
  } catch {
    return null; // network error, abort, or non-JSON body
  } finally {
    clearTimeout(t);
  }
}
127
+
128
/**
 * Run `npm i -g <spec>`. Resolves {code, output, timedOut?, error?}; never rejects.
 *
 * @param {string} spec package spec, e.g. `@venturewild/workspace@0.2.3`.
 * @param {object} [options]
 * @param {Function} [options.spawnImpl] `child_process.spawn`-compatible seam.
 * @param {number} [options.timeoutMs] kill the child and resolve `timedOut` after this long.
 * @param {Function} [options.ensurePathImpl] mutates the env copy it is given to augment PATH.
 * @param {object} [options.env] base environment; copied, never mutated.
 * @returns {Promise<{code: number|null, output: string, timedOut?: boolean, error?: string}>}
 *   `output` is the tail (at most 20000 chars) of combined stdout + stderr.
 */
export function npmInstall(spec, {
  spawnImpl = spawn, timeoutMs = 180000, ensurePathImpl = ensureToolPath, env = process.env,
} = {}) {
  return new Promise((resolve) => {
    const onWindows = process.platform === 'win32';
    const cmd = onWindows ? 'npm.cmd' : 'npm';

    // Patched Node releases refuse to spawn `.cmd`/`.bat` files unless the
    // `shell` option is set (EINVAL), so Windows has to go through the shell.
    // The shell interprets metacharacters, so only plain package specs may pass.
    if (onWindows && !/^[\w@/.+-]+$/.test(String(spec))) {
      resolve({ code: -1, error: `refusing to install unsafe package spec: ${spec}`, output: '' });
      return;
    }

    // The always-on supervisor (our caller in the field) runs under launchd/GUI,
    // which inherits a MINIMAL PATH omitting ~/.npm-global, /usr/local/bin,
    // Homebrew, nvm — so a bare `npm` spawn would ENOENT (the 0.1.8 `claude`
    // bug class). Augment PATH the same way agent.mjs does before spawning. We
    // copy env so we never mutate the caller's process.env.
    const childEnv = { ...env };
    try { ensurePathImpl(childEnv); } catch { /* best-effort — fall back to inherited PATH */ }

    const spawnOptions = { windowsHide: true, env: childEnv };
    if (onWindows) spawnOptions.shell = true;

    let child;
    try {
      child = spawnImpl(cmd, ['i', '-g', spec], spawnOptions);
    } catch (e) {
      resolve({ code: -1, error: e?.message || String(e), output: '' });
      return;
    }

    let out = '';
    const cap = (d) => { out += String(d); if (out.length > 20000) out = out.slice(-20000); };
    child.stdout?.on?.('data', cap);
    child.stderr?.on?.('data', cap);

    // Settle exactly once: timeout, `exit` and `error` can all fire for one child.
    let timer = null;
    let settled = false;
    const finish = (result) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(result);
    };

    timer = setTimeout(() => {
      try { child.kill?.(); } catch { /* gone */ }
      finish({ code: -1, timedOut: true, output: out });
    }, timeoutMs);
    child.on?.('exit', (code) => finish({ code, output: out }));
    child.on?.('error', (e) => finish({ code: -1, error: e?.message || String(e), output: out }));
  });
}
156
+
157
/** Default `sleepImpl`: a promise that fulfils (with no value) after `ms` milliseconds. */
const sleep = (ms) => new Promise((wake) => {
  setTimeout(wake, ms);
});
158
+
159
+ // --- the updater -------------------------------------------------------------
160
+
161
/**
 * Health-gated self-updater: check the published version on the user's
 * channel, install it, restart, verify the running server reports the new
 * version, and roll back to the previous version when it does not.
 *
 * Every external touch-point (install, registry fetch, health probe, restart,
 * clock, sleep, log) is an injected seam.
 */
export class AutoUpdater {
  /**
   * @param {object} [options]
   * @param {string} [options.globalDir] directory holding `update.json`.
   * @param {string} [options.packageName] package to check and install.
   * @param {object} [options.env] environment consulted for the kill switch and the default port.
   * @param {number} [options.port] port of the running server to health-probe.
   * @param {Function} [options.installedVersionImpl] returns the installed version.
   * @param {Function} [options.fetchLatestImpl] `(channel, { packageName })` → published version or null.
   * @param {Function} [options.installImpl] `(spec)` → `{ code, ... }`.
   * @param {Function} [options.restartImpl] asks the owner to restart the server child.
   * @param {Function} [options.healthVersionImpl] `(port)` → version the running server reports.
   * @param {Function} [options.nowImpl] clock in ms.
   * @param {Function} [options.logImpl] `(message)` sink.
   * @param {number} [options.checkIntervalMs] minimum time between registry checks.
   * @param {number} [options.verifyAttempts] health polls before giving up.
   * @param {number} [options.verifyDelayMs] delay after each unsuccessful poll.
   * @param {Function} [options.sleepImpl] `(ms)` → promise.
   * @param {Function|null} [options.onUpdate] `(rec) => void` — surface the "updated to vX" note.
   */
  constructor({
    globalDir = defaultGlobalDir(),
    packageName = PACKAGE_NAME,
    env = process.env,
    // Read the port from the SAME env the kill switch uses, so an injected env
    // is honoured consistently (declared after `env` so the default can see it).
    port = Number(env.WILD_WORKSPACE_PORT || 5173),
    installedVersionImpl = () => installedVersion(),
    fetchLatestImpl = fetchLatestVersion,
    installImpl = npmInstall,
    // Ask the owner (the supervisor) to restart the server child so the freshly
    // installed code is loaded. Default no-op for standalone/manual use.
    restartImpl = async () => {},
    // Read the RUNNING server's version (probeHealthVersion bound to our port).
    healthVersionImpl = (port_) => probeHealthVersion(port_),
    nowImpl = () => Date.now(),
    logImpl = () => {},
    checkIntervalMs = DEFAULT_CHECK_INTERVAL_MS,
    verifyAttempts = 10,
    verifyDelayMs = 3000,
    sleepImpl = sleep,
    onUpdate = null, // (rec) => void — surface the "updated to vX" note
  } = {}) {
    Object.assign(this, {
      globalDir, packageName, port, installedVersionImpl, fetchLatestImpl, installImpl,
      restartImpl, healthVersionImpl, nowImpl, logImpl, env,
      checkIntervalMs, verifyAttempts, verifyDelayMs, sleepImpl, onUpdate,
    });
    // Re-entrancy guard for tick(): one update cycle at a time.
    this.inProgress = false;
  }

  /** Whether background updating is allowed: env kill switch first, then the stored setting. */
  enabled() {
    if (this.env.WILD_WORKSPACE_NO_AUTOUPDATE === '1') return false; // hard kill switch
    return loadUpdateSettings(this.globalDir).enabled;
  }

  /** The stored update channel. */
  channel() { return loadUpdateSettings(this.globalDir).channel; }

  /** True once at least `checkIntervalMs` has passed since the last recorded check. */
  dueForCheck(settings = loadUpdateSettings(this.globalDir)) {
    return this.nowImpl() - (settings.lastCheckAt || 0) >= this.checkIntervalMs;
  }

  /** What's installed vs what's published — { current, latest, channel, available }. */
  async check() {
    const current = this.installedVersionImpl();
    const channel = this.channel();
    const latest = await this.fetchLatestImpl(channel, { packageName: this.packageName });
    return { current, latest, channel, available: isNewer(latest, current) };
  }

  /** Poll the running server until it reports `expected`, up to verifyAttempts. */
  async verify(expected) {
    for (let i = 0; i < this.verifyAttempts; i++) {
      let running = null;
      // A failing probe just means "not up yet" — keep polling.
      try { running = await this.healthVersionImpl(this.port); } catch { running = null; }
      if (running && running === expected) return true;
      await this.sleepImpl(this.verifyDelayMs);
    }
    return false;
  }

  /**
   * Internal: run `restartImpl`, treating a throw as "restart did not happen"
   * rather than aborting the update — the health verification that follows is
   * what decides the outcome. Returns whether the restart call succeeded.
   */
  async restartServer() {
    try {
      await this.restartImpl();
      return true;
    } catch (e) {
      this.logImpl(`auto-update: restart failed: ${e?.message || e}`);
      return false;
    }
  }

  /**
   * Internal: hand `rec` to the optional `onUpdate` listener. A throwing or
   * rejecting listener is logged and ignored; it must never change the result
   * of an update that has already been applied and recorded.
   */
  notify(rec) {
    const report = (e) => this.logImpl(`auto-update: onUpdate listener failed: ${e?.message || e}`);
    try {
      const pending = this.onUpdate?.(rec);
      if (pending && typeof pending.catch === 'function') pending.catch(report);
    } catch (e) {
      report(e);
    }
  }

  /**
   * Install `target`, restart, verify healthy; on failure roll back to `from`.
   * Returns { ok, stage?, rolledBack?, rec }. Records the outcome to update.json.
   */
  async applyUpdate(target, { from } = {}) {
    this.logImpl(`auto-update: installing ${this.packageName}@${target} (from ${from || 'unknown'})`);
    const install = await this.installImpl(`${this.packageName}@${target}`);
    if (install.code !== 0) {
      this.logImpl(`auto-update: install failed code=${install.code}${install.timedOut ? ' (timeout)' : ''}`);
      const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status: 'install-failed' });
      this.notify(rec);
      return { ok: false, stage: 'install', install, rec };
    }
    await this.restartServer();
    if (await this.verify(target)) {
      this.logImpl(`auto-update: now running ${target}`);
      const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status: 'ok' });
      this.notify(rec);
      return { ok: true, rec };
    }
    // New version didn't come up healthy → roll back to the pinned previous.
    this.logImpl(from
      ? `auto-update: ${target} unhealthy — rolling back to ${from}`
      : `auto-update: ${target} unhealthy — previous version unknown, cannot roll back`);
    let rolledBack = false;
    if (from) {
      const rb = await this.installImpl(`${this.packageName}@${from}`);
      if (rb.code === 0) {
        await this.restartServer();
        rolledBack = await this.verify(from);
      }
    }
    const status = rolledBack ? 'rolled-back' : 'rollback-failed';
    this.logImpl(`auto-update: ${status} (target ${target})`);
    const rec = recordUpdate(this.globalDir, { from, to: target, at: this.nowImpl(), status });
    this.notify(rec);
    return { ok: false, stage: 'verify', rolledBack, rec };
  }

  /**
   * One auto-update cycle, called on the supervisor's slow timer. Self-rate-limits
   * via dueForCheck so the timer cadence and the check interval are independent.
   * Returns a short status string (exposed for tests/logging).
   */
  async tick() {
    if (this.inProgress) return 'busy';
    if (!this.enabled()) return 'disabled';
    if (!this.dueForCheck()) return 'not-due';
    this.inProgress = true;
    try {
      // Stamp first, so a failed check is not retried before the next interval.
      touchLastCheck(this.globalDir, this.nowImpl());
      const c = await this.check();
      if (!c.latest) return 'check-failed';
      if (!c.available) return 'up-to-date';
      this.logImpl(`auto-update: ${c.current} → ${c.latest} (${c.channel})`);
      const r = await this.applyUpdate(c.latest, { from: c.current });
      return r.ok ? 'updated' : (r.rolledBack ? 'rolled-back' : 'failed');
    } finally {
      this.inProgress = false;
    }
  }
}
@@ -312,6 +312,12 @@ export function buildConfig(overrides = {}) {
312
312
  overrides.operatorToken ??
313
313
  env.WILD_WORKSPACE_OPERATOR_TOKEN ??
314
314
  loadOperatorToken(dataDir),
315
+ // RC1 hot-reload seam: the EXPLICIT token (override or env), with NO disk read.
316
+ // When this is null the live auth path re-reads the token file on each request
317
+ // (getOperatorToken) so `operator enable` takes effect with no restart; when a
318
+ // test/env pins a token, that value stays authoritative. (See index.mjs.)
319
+ operatorTokenExplicit:
320
+ overrides.operatorToken ?? env.WILD_WORKSPACE_OPERATOR_TOKEN ?? null,
315
321
  workspaceId:
316
322
  overrides.workspaceId ||
317
323
  env.WILD_WORKSPACE_ID ||
@@ -21,7 +21,7 @@ import { resolveDaemonBinary } from './daemon-bin.mjs';
21
21
  import { checkPort } from './preview.mjs';
22
22
  import { loadAccount } from './account.mjs';
23
23
  import { serviceStatus } from './service.mjs';
24
- import { probeHealth } from './supervisor.mjs';
24
+ import { probeHealth, probeHealthVersion } from './supervisor.mjs';
25
25
  import { listLogs, diagnosticsDir } from './logpaths.mjs';
26
26
 
27
27
  const STATUS_ICON = { ok: '✅', warn: '⚠️', fail: '❌', info: 'ℹ️' };
@@ -36,6 +36,27 @@ function nodeMajor(version = process.version) {
36
36
  return m ? Number(m[1]) : 0;
37
37
  }
38
38
 
39
// RC3: probe the LIVE public tunnel end-to-end — out to Cloudflare, through the
// relay, down the daemon's tunnel, back to this server. This is the check the old
// `doctor` lacked: it only resolved the slug in the registry (claimed in the DB),
// which stays green even when `<slug>.venturewild.llc` is 502. A 200 here proves
// the whole chain works; a 5xx/timeout is the exact RC2 "linked but unreachable".
/**
 * GET `https://<slug>.venturewild.llc/api/health` with a timeout.
 *
 * @param {string} slug workspace slug; URI-encoded into the hostname.
 * @param {Function} fetchImpl fetch-compatible function.
 * @param {number} [timeoutMs] abort the request after this many ms.
 * @returns {Promise<{reachable: true, status: number, version: (string|null), url: string}
 *   | {reachable: false, error: string, url: string}>} never rejects; any HTTP
 *   response counts as reachable, `version` is null when the body is not JSON
 *   or carries no version.
 */
async function probeTunnel(slug, fetchImpl, timeoutMs = 8000) {
  const url = `https://${encodeURIComponent(slug)}.venturewild.llc/api/health`;
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), timeoutMs);
  try {
    const response = await fetchImpl(url, {
      signal: aborter.signal,
      headers: { 'cache-control': 'no-cache' },
    });
    let version = null;
    try {
      const payload = await response.json();
      version = payload?.version || null;
    } catch {
      /* non-JSON */
    }
    return { reachable: true, status: response.status, version, url };
  } catch (failure) {
    return { reachable: false, error: String(failure?.message || failure), url };
  } finally {
    clearTimeout(deadline);
  }
}
59
+
39
60
  // Reach the bmo-sync registry: resolve the user's slug if linked, else /health.
40
61
  async function probeRegistry(config, fetchImpl) {
41
62
  const base = String(config.bmoSyncServerUrl || '').replace(/\/$/, '');
@@ -71,6 +92,7 @@ export async function runDoctor(opts = {}, deps = {}) {
71
92
  serviceStatus: deps.serviceStatus || serviceStatus,
72
93
  listLogs: deps.listLogs || listLogs,
73
94
  fetchImpl: deps.fetchImpl || ((...a) => globalThis.fetch(...a)),
95
+ probeRunningVersion: deps.probeRunningVersion || probeHealthVersion,
74
96
  };
75
97
  const checks = [];
76
98
  const add = (c) => checks.push(c);
@@ -155,6 +177,26 @@ export async function runDoctor(opts = {}, deps = {}) {
155
177
  : { status: 'ok', detail: 'free', hint: null };
156
178
  });
157
179
 
180
+ // 5b. Running server version vs installed (RC3). `doctor` runs as the
181
+ // freshly-invoked CLI, so APP_VERSION here == the version installed on disk.
182
+ // If a server is answering :port with a DIFFERENT version, it's running stale
183
+ // code from before the last upgrade — the "kept running 0.1.14 after 0.2.1"
184
+ // failure. Surface it so the fix (restart) is obvious instead of invisible.
185
+ await guarded('runningVersion', 'Running version', async () => {
186
+ const running = await d.probeRunningVersion(config.port);
187
+ if (!running) {
188
+ return { status: 'info', detail: `no server answering :${config.port} (not started yet)`, hint: null };
189
+ }
190
+ if (running === APP_VERSION) {
191
+ return { status: 'ok', detail: `v${running} (matches installed)`, hint: null };
192
+ }
193
+ return {
194
+ status: 'warn',
195
+ detail: `running v${running}, but v${APP_VERSION} is installed`,
196
+ hint: 'A workspace server is running older code than what is installed. Restart it (close the app — always-on restarts it clean) to finish the upgrade.',
197
+ };
198
+ });
199
+
158
200
  // 6. Account linked (slug)
159
201
  let account = null;
160
202
  await guarded('account', 'Workspace account linked', async () => {
@@ -181,6 +223,38 @@ export async function runDoctor(opts = {}, deps = {}) {
181
223
  : { status: 'warn', detail: `server returned HTTP ${r.status}`, hint: null };
182
224
  });
183
225
 
226
+ // 7b. Public URL reachable end-to-end (RC3). Only meaningful once linked. This
227
+ // is the half the old doctor was blind to — the registry check above can be
228
+ // green (slug claimed) while this is red (tunnel down). Together they tell the
229
+ // two apart: claimed-but-unreachable ⟹ the daemon link is broken (RC2), the
230
+ // operator/auto-relink path is the fix.
231
+ await guarded('tunnel', 'Public URL reachable', async () => {
232
+ const slug = account?.slug || config.account?.slug || null;
233
+ if (!slug) {
234
+ return { status: 'info', detail: 'not linked — no public URL yet', hint: null };
235
+ }
236
+ const r = await probeTunnel(slug, d.fetchImpl);
237
+ if (!r.reachable) {
238
+ return {
239
+ status: 'fail',
240
+ detail: `${r.url} unreachable: ${r.error}`,
241
+ hint: 'The public link is down. Restart sync (`wild-workspace daemon stop` then `wild-workspace`), or the operator `relink-account` fix.',
242
+ };
243
+ }
244
+ if (r.status >= 500) {
245
+ return {
246
+ status: 'fail',
247
+ detail: `${r.url} returned HTTP ${r.status} (tunnel down — slug claimed but not linked)`,
248
+ hint: 'The daemon is not linked to the relay. Restart sync (`wild-workspace daemon stop` then `wild-workspace`).',
249
+ };
250
+ }
251
+ if (r.status >= 400) {
252
+ // 401/403/404 = the chain works; auth/slug is the nuance, not a tunnel fault.
253
+ return { status: 'warn', detail: `reachable but HTTP ${r.status} (auth/slug check)`, hint: null };
254
+ }
255
+ return { status: 'ok', detail: `live (HTTP ${r.status}${r.version ? `, v${r.version}` : ''})`, hint: null };
256
+ });
257
+
184
258
  // 8. Always-on / autostart
185
259
  await guarded('service', 'Always-on (autostart)', async () => {
186
260
  const s = await d.serviceStatus({ port: config.port }, { probeImpl: (p) => probeHealth(p) });
@@ -35,10 +35,12 @@ import { InboxWatcher } from './inbox.mjs';
35
35
  import { ActivityBus } from './activity.mjs';
36
36
  import { loadIdentity, saveIdentity, markOnboarded, TONES } from './agent-identity.mjs';
37
37
  import { probeAgentReadiness } from './agent-readiness.mjs';
38
+ import { AutoUpdater, npmInstall, recordUpdate, loadUpdateSettings, PACKAGE_NAME } from './auto-update.mjs';
38
39
  import { ClaudeLoginSession } from './agent-login.mjs';
39
40
  import { ErrorReporter } from './error-reporter.mjs';
40
41
  import { DaemonBridge } from './daemon.mjs';
41
42
  import { DaemonSupervisor } from './daemon-supervisor.mjs';
43
+ import { TunnelWatchdog } from './tunnel-watchdog.mjs';
42
44
  import { SyncControl } from './sync.mjs';
43
45
  import { detectPreviewPorts, checkPort } from './preview.mjs';
44
46
  import { createBazaar } from './bazaar/core.mjs';
@@ -47,6 +49,7 @@ import { matchCandidates } from './bazaar/mock-tickup.mjs';
47
49
  import { servePreviewFile, confineBuildDir } from './bazaar/preview-server.mjs';
48
50
  import { TURN_SYSTEM_PROMPT, writeTurnMcpConfig } from './turn-mcp.mjs';
49
51
  import { loadAccount } from './account.mjs';
52
+ import { getOperatorToken } from './operator.mjs';
50
53
  import { runDoctor } from './doctor.mjs';
51
54
  import { appendLine, tailFile, logFile, TAILABLE, globalDir } from './logpaths.mjs';
52
55
  import { SessionReporter } from './session-reporter.mjs';
@@ -172,6 +175,29 @@ export async function createServer(overrides = {}) {
172
175
  .catch((e) => ({ started: false, error: String(e?.message || e) }))
173
176
  : Promise.resolve({ started: false, skipped: true });
174
177
 
178
+ // RC2 tunnel self-heal: when this install is slug-linked (so it's SUPPOSED to be
179
+ // reachable at <slug>.venturewild.llc), watch the public URL end-to-end and
180
+ // relink the daemon if it goes dead while we're locally healthy. Off without a
181
+ // daemon supervisor, without a slug, or under tests. `overrides.tunnelWatchdog`
182
+ // is a test seam (false disables; an object injects options).
183
+ const relinkDaemon = async () => {
184
+ if (!daemonSupervisor) return;
185
+ await daemonSupervisor.stop().catch(() => {});
186
+ await daemonSupervisor.ensureRunning().catch(() => {});
187
+ };
188
+ const tunnelWatchdog =
189
+ overrides.tunnelWatchdog === false ||
190
+ !daemonSupervisor ||
191
+ !config.account?.slug ||
192
+ !config.daemonAutostart
193
+ ? null
194
+ : new TunnelWatchdog({
195
+ publicBaseUrl: `https://${config.account.slug}.venturewild.llc`,
196
+ relink: relinkDaemon,
197
+ log: (m) => log('[tunnel]', m),
198
+ ...(typeof overrides.tunnelWatchdog === 'object' ? overrides.tunnelWatchdog : {}),
199
+ }).start();
200
+
175
201
  // Control plane for bmo-sync folder sharing (pair / detach / invite).
176
202
  // `overrides.syncControl` is a test seam.
177
203
  const syncControl =
@@ -488,6 +514,14 @@ export async function createServer(overrides = {}) {
488
514
  const app = new Hono();
489
515
 
490
516
  // --- auth helpers ---------------------------------------------------------
517
+ // RC1 hot-reload: resolve the operator token LIVE per request. An explicit
518
+ // override/env token (tests, pinned deployments) stays authoritative; otherwise
519
+ // the token file is re-read (TTL-cached) so `operator enable`/`disable` take
520
+ // effect with no server restart — the literal 401 from the first external
521
+ // install. `overrides.operatorDataDir` is unused; the file lives in dataDir.
522
+ const liveOperatorToken = () =>
523
+ config.operatorTokenExplicit ?? getOperatorToken(config.dataDir);
524
+
491
525
  // Classify one raw token into a role. Shared by the Authorization header, the
492
526
  // HttpOnly auth cookie, and the `?t=` query so all three stay consistent.
493
527
  // `allowOperator` is true ONLY for the header path — the operator (support)
@@ -498,7 +532,8 @@ export async function createServer(overrides = {}) {
498
532
  if (token === config.partnerToken) {
499
533
  return { role: ROLES.PARTNER, sub: 'partner', source };
500
534
  }
501
- if (allowOperator && config.operatorToken && token === config.operatorToken) {
535
+ const opToken = allowOperator ? liveOperatorToken() : null;
536
+ if (opToken && token === opToken) {
502
537
  return { role: ROLES.OPERATOR, sub: 'operator', source: source || 'operator-token' };
503
538
  }
504
539
  const payload = await verifyShareToken(token, config.shareSecret);
@@ -985,6 +1020,22 @@ export async function createServer(overrides = {}) {
985
1020
  return c.json({ agent: agentTag(activeAgent), ...verdict });
986
1021
  });
987
1022
 
1023
+ // Auto-update status (Phase 2) — what's running, the channel, on/off, and the
1024
+ // last update outcome (the "updated to vX" note the UI can surface). Read-only;
1025
+ // the toggle/apply levers are the CLI + the operator channel.
1026
+ app.get('/api/update/status', (c) => {
1027
+ const forbidden = require(c, 'chat');
1028
+ if (forbidden) return forbidden;
1029
+ const s = loadUpdateSettings(globalDir());
1030
+ return c.json({
1031
+ current: APP_VERSION,
1032
+ enabled: s.enabled,
1033
+ channel: s.channel,
1034
+ lastCheckAt: s.lastCheckAt || null,
1035
+ lastUpdate: s.lastUpdate || null,
1036
+ });
1037
+ });
1038
+
988
1039
  // In-app "Sign in to Claude" — drives `claude auth login` in a real PTY so the
989
1040
  // browser OAuth callback auto-completes and the user never touches a terminal.
990
1041
  // (See agent-login.mjs.) Claude opens the OAuth URL in the user's browser itself
@@ -1220,7 +1271,9 @@ export async function createServer(overrides = {}) {
1220
1271
  spawn,
1221
1272
  ...(overrides.operatorDeps || {}),
1222
1273
  };
1223
- const operatorEnabled = () => Boolean(config.operatorToken);
1274
+ // Live so `operator enable` (run in a separate CLI process) lights the channel
1275
+ // up without a server restart, and `operator disable` takes it dark (RC1).
1276
+ const operatorEnabled = () => Boolean(liveOperatorToken());
1224
1277
  function auditOperator(c, action, detail) {
1225
1278
  const s = c.get('session') || {};
1226
1279
  appendLine('operator', `${action} by=${s.sub || 'operator'} src=${s.source || '-'} ${detail || ''}`.trim());
@@ -1267,6 +1320,24 @@ export async function createServer(overrides = {}) {
1267
1320
  child?.on?.('exit', (code) => appendLine('operator', `reinstall-daemon exited code=${code}`));
1268
1321
  return { started: true, pid: child?.pid || null, command: `${cmd} i -g @venturewild/workspace` };
1269
1322
  },
1323
+ // Phase 2: check the user's channel and install a newer version if one exists.
1324
+ // The always-on supervisor's version-drift auto-restart (RC1b) then loads it;
1325
+ // the supervisor also owns autonomous health-gated rollback. This is the
1326
+ // remote-support trigger for the same flow (Phase 3 capability).
1327
+ 'update-now': async () => {
1328
+ const gdir = globalDir();
1329
+ const check = await (operatorDeps.checkUpdate
1330
+ ? operatorDeps.checkUpdate()
1331
+ : new AutoUpdater({ globalDir: gdir }).check());
1332
+ if (!check.latest) return { ok: false, reason: 'registry-unreachable', current: check.current };
1333
+ if (!check.available) return { ok: true, updated: false, current: check.current, latest: check.latest };
1334
+ appendLine('operator', `update-now installing ${check.current} → ${check.latest} (${check.channel})`);
1335
+ const res = await (operatorDeps.npmInstall || npmInstall)(`${PACKAGE_NAME}@${check.latest}`);
1336
+ const ok = res.code === 0;
1337
+ if (ok) recordUpdate(gdir, { from: check.current, to: check.latest, at: Date.now(), status: 'installed' });
1338
+ appendLine('operator', `update-now ${ok ? 'installed' : `failed code=${res.code}`}`);
1339
+ return { ok, updated: ok, from: check.current, to: check.latest, code: res.code };
1340
+ },
1270
1341
  };
1271
1342
 
1272
1343
  app.get('/api/operator/diag', async (c) => {
@@ -1887,6 +1958,7 @@ export async function createServer(overrides = {}) {
1887
1958
  daemonBridge,
1888
1959
  daemonSupervisor,
1889
1960
  daemonReady,
1961
+ tunnelWatchdog,
1890
1962
  syncControl,
1891
1963
  sessionReporter,
1892
1964
  detectedAgents,
@@ -1898,6 +1970,7 @@ export async function createServer(overrides = {}) {
1898
1970
  try { transcriptRecorder.stop(); } catch {}
1899
1971
  try { inboxWatcher.stop(); } catch {}
1900
1972
  try { daemonBridge?.stop(); } catch {}
1973
+ try { tunnelWatchdog?.stop(); } catch {}
1901
1974
  // The daemon is deliberately NOT stopped here — it is detached so sync
1902
1975
  // keeps running after wild-workspace closes. `wild-workspace daemon
1903
1976
  // stop` is the explicit off-switch.
@@ -29,6 +29,33 @@ export function loadOperatorToken(dataDir) {
29
29
  }
30
30
  }
31
31
 
32
+ // RC1 hot-reload: read the operator token LIVE (with a tiny TTL cache) instead of
33
+ // the value the server snapshotted at boot. Today `operator enable` writes the
34
+ // token to disk but a long-running server keeps serving its cached "disabled"
35
+ // state, so the channel 401s until a manual restart (the exact bug from the first
36
+ // external install). A short TTL keeps this off the hot auth path — every request
37
+ // reads from cache, and `enable`/`disable` take effect within `ttlMs`.
38
+ //
39
+ // The cache is keyed by dataDir so two servers (tests, multiple installs) in one
40
+ // process don't read each other's tokens. `now` is injectable for tests.
41
// Last value read per data directory: dataDir -> { token, at }.
const _tokenCache = new Map();

/**
 * Return the operator token for `dataDir`, re-reading it through
 * loadOperatorToken() at most once per `ttlMs`.
 *
 * A cached entry is served only while its age is within [0, ttlMs). A negative
 * age means the injected/wall clock moved backwards since the entry was stored
 * (for example an NTP step); such an entry is treated as expired so a stale —
 * possibly revoked — token is not served for the length of the clock jump.
 *
 * @param {string} dataDir  cache key, passed through to loadOperatorToken()
 * @param {object} [opts]
 * @param {number} [opts.ttlMs=2000]  how long a cached value may be served, in ms
 * @param {() => number} [opts.now=Date.now]  clock in ms; injectable for tests
 * @returns {*} the value loadOperatorToken(dataDir) returned on the most recent read
 */
export function getOperatorToken(dataDir, { ttlMs = 2000, now = Date.now } = {}) {
  const t = now();
  const hit = _tokenCache.get(dataDir);
  if (hit) {
    const age = t - hit.at;
    if (age >= 0 && age < ttlMs) return hit.token;
  }
  const token = loadOperatorToken(dataDir);
  _tokenCache.set(dataDir, { token, at: t });
  return token;
}
50
+
51
+ // Drop the cached token for a dataDir (or all of them). `enable`/`disable` run in
52
+ // a separate CLI process from the server, so they don't need this — it exists so
53
+ // in-process callers (and tests) can force a re-read without waiting out the TTL.
54
/**
 * Forget cached operator tokens so the next lookup re-reads them.
 *
 * @param {string} [dataDir]  drop only this directory's entry; when omitted
 *   (strictly `undefined`) every cached entry is dropped
 */
export function invalidateOperatorTokenCache(dataDir) {
  if (dataDir !== undefined) {
    _tokenCache.delete(dataDir);
    return;
  }
  _tokenCache.clear();
}
58
+
32
59
  // Turn the channel on. Idempotent by default — returns the existing token if one
33
60
  // is already set (so a re-run doesn't invalidate the code the user already
34
61
  // shared). Pass { rotate:true } to force a fresh token. Returns the token, or
@@ -41,6 +41,47 @@ export function probeHealth(port, timeoutMs = 2500) {
41
41
  });
42
42
  }
43
43
 
44
/**
 * Ask the server listening on 127.0.0.1:`port` for its version via
 * GET /api/health.
 *
 * Resolves with the `version` field of the JSON response body, or `null` when
 * the request fails, times out, the body is not valid JSON, the body is larger
 * than 4096 characters, the response is cut short, or `version` is missing or
 * falsy. The returned promise never rejects and always settles.
 *
 * @param {number} port  local port to probe
 * @param {number} [timeoutMs=2500]  passed as the request's `timeout` option
 * @returns {Promise<*>} the reported version, or null
 */
export function probeHealthVersion(port, timeoutMs = 2500) {
  const MAX_BODY_CHARS = 4096;
  return new Promise((resolve) => {
    // Several events can race to finish the probe; only the first one counts.
    let settled = false;
    const finish = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };

    let req;
    try {
      req = http.get(
        { host: '127.0.0.1', port, path: '/api/health', timeout: timeoutMs },
        (res) => {
          let body = '';
          // Decode as text so a multi-byte character split across chunks survives.
          res.setEncoding('utf8');
          res.on('data', (chunk) => {
            body += chunk;
            if (body.length > MAX_BODY_CHARS) {
              // Destroying the request after the response has started emits
              // neither 'end' here nor 'error' on the request, so settle now.
              req.destroy();
              finish(null);
            }
          });
          res.on('end', () => {
            try { finish(JSON.parse(body).version || null); } catch { finish(null); }
          });
          // A response that errors or closes before 'end' (server died
          // mid-body) must still settle; after 'end' these are no-ops.
          res.on('error', () => finish(null));
          res.on('close', () => finish(null));
        },
      );
    } catch {
      // Invalid request options can make http.get throw synchronously; keep
      // the "never rejects" contract.
      finish(null);
      return;
    }
    req.on('error', () => finish(null));
    req.on('timeout', () => { req.destroy(); finish(null); });
  });
}
66
+
67
/**
 * Read the package version that is on disk at call time.
 *
 * The version comes from the package.json two directories above the folder
 * containing `entry`, read from the filesystem on every call rather than from
 * any value captured when this process started — a long-lived process would
 * otherwise keep reporting the version it booted with after the package is
 * replaced underneath it.
 *
 * @param {string} [entry=DEFAULT_SERVER_ENTRY]  path to the server entry file
 * @returns {*} the manifest's `version`, or null when it is missing/falsy or
 *   the manifest cannot be read or parsed
 */
export function installedVersion(entry = DEFAULT_SERVER_ENTRY) {
  try {
    // Entry is <pkg>/server/src/index.mjs, so <pkg> is two levels above its folder.
    const packageRoot = path.resolve(path.dirname(entry), '..', '..');
    const rawManifest = fs.readFileSync(path.join(packageRoot, 'package.json'), 'utf8');
    const { version } = JSON.parse(rawManifest);
    return version || null;
  } catch {
    return null;
  }
}
84
+
44
85
  export class WorkspaceSupervisor {
45
86
  constructor({
46
87
  serverEntry = DEFAULT_SERVER_ENTRY,
@@ -58,12 +99,32 @@ export class WorkspaceSupervisor {
58
99
  env = process.env,
59
100
  crashLoopThreshold = 3,
60
101
  diagnosticsImpl = null,
102
+ // RC1 version-drift auto-restart: when the RUNNING server reports an older
103
+ // version than what's INSTALLED on disk, restart it so it picks up the new
104
+ // code. On by default; seams injected for tests. WILD_WORKSPACE_NO_AUTORESTART=1
105
+ // disables it (e.g. a developer running an intentionally-older server).
106
+ autoRestartOnVersionDrift = env.WILD_WORKSPACE_NO_AUTORESTART !== '1',
107
+ versionImpl = probeHealthVersion,
108
+ installedVersionImpl = () => installedVersion(serverEntry),
109
+ // Phase 2 auto-update (Pillar B): the always-on supervisor self-updates the
110
+ // whole stack on the user's channel, with health-gated rollback. On by
111
+ // default; the env kill switch + the persisted off switch both disable it.
112
+ // Only wired up in start() (not in the unit-test path, which calls tick()
113
+ // directly) — see start(). updatePollMs is the *wake* cadence; the actual
114
+ // check interval lives inside AutoUpdater (6h) and self-rate-limits.
115
+ autoUpdate = env.WILD_WORKSPACE_NO_AUTOUPDATE !== '1',
116
+ updatePollMs = 60 * 60 * 1000, // wake hourly; AutoUpdater gates real checks
117
+ autoUpdaterFactory = null, // test seam: (supervisor) => AutoUpdater-like
61
118
  } = {}) {
62
119
  Object.assign(this, {
63
120
  serverEntry, workspaceDir, port, globalDir, node, pollMs,
64
121
  backoffStartMs, backoffMaxMs, probeTimeoutMs, spawnImpl, probeImpl, nowImpl, env,
65
122
  crashLoopThreshold, diagnosticsImpl,
123
+ autoRestartOnVersionDrift, versionImpl, installedVersionImpl,
124
+ autoUpdate, updatePollMs, autoUpdaterFactory,
66
125
  });
126
+ this.autoUpdater = null;
127
+ this.updateTimer = null;
67
128
  this.logFile = path.join(globalDir, 'supervisor.log');
68
129
  this.serverLogFile = path.join(globalDir, 'server.out.log');
69
130
  this.lockFile = path.join(globalDir, 'supervisor.lock');
@@ -135,6 +196,28 @@ export class WorkspaceSupervisor {
135
196
  this.backoff = this.backoffStartMs; // healthy → reset backoff
136
197
  this.spawnCount = 0; // healthy → not a crash loop
137
198
  this.pushedThisEpisode = false;
199
+ // RC1 version drift: a healthy-but-STALE server (running older code than
200
+ // what's installed) should be restarted so the upgrade actually lands.
201
+ // Only when WE own the child — we restart by killing it and letting the
202
+ // next tick respawn (which reloads index.mjs from disk). A server started
203
+ // by someone else (foreground `wild-workspace`) we leave alone; we have no
204
+ // handle on it. The restarted server reports the installed version, so the
205
+ // drift clears and this won't loop.
206
+ if (this.autoRestartOnVersionDrift && this.child) {
207
+ try {
208
+ const running = await this.versionImpl(this.port, this.probeTimeoutMs);
209
+ const installed = this.installedVersionImpl();
210
+ if (running && installed && running !== installed) {
211
+ this.log(`version drift: running=${running} installed=${installed} — restarting server`);
212
+ try { this.child.kill(); } catch { /* exit handler clears child */ }
213
+ this.child = null;
214
+ this.backoff = this.backoffStartMs; // upgrade is intentional, not a crash
215
+ return 'version-drift-restart';
216
+ }
217
+ } catch (e) {
218
+ this.log(`version-drift check error: ${e?.message || e}`);
219
+ }
220
+ }
138
221
  return 'healthy';
139
222
  }
140
223
  if (this.child) return 'booting'; // spawned, still coming up
@@ -198,6 +281,47 @@ export class WorkspaceSupervisor {
198
281
  }
199
282
  }
200
283
 
284
+ /**
285
+ * Restart the supervised server child so freshly installed code is loaded.
286
+ * Kills it and lets the next tick respawn (which reloads index.mjs from disk) —
287
+ * the same mechanism as the version-drift restart, exposed for the AutoUpdater.
288
+ * No-op (returns false) when we don't own a child (foreground server).
289
+ */
290
+ restartChild() {
291
+ if (!this.child) return false;
292
+ this.log('restartChild: killing server to load new code');
293
+ try { this.child.kill(); } catch { /* exit handler clears child */ }
294
+ this.child = null;
295
+ this.backoff = this.backoffStartMs; // an intentional restart, not a crash
296
+ return true;
297
+ }
298
+
299
+ /** Build the AutoUpdater bound to this supervisor. Separated for the test seam. */
300
+ async buildAutoUpdater() {
301
+ if (this.autoUpdaterFactory) return this.autoUpdaterFactory(this);
302
+ // Lazy import keeps the unit-test path (which never calls start()) free of the
303
+ // auto-update module + its registry/npm seams.
304
+ const { AutoUpdater } = await import('./auto-update.mjs');
305
+ return new AutoUpdater({
306
+ globalDir: this.globalDir,
307
+ port: this.port,
308
+ installedVersionImpl: this.installedVersionImpl,
309
+ healthVersionImpl: (port) => this.versionImpl(port, this.probeTimeoutMs),
310
+ restartImpl: async () => { this.restartChild(); },
311
+ nowImpl: this.nowImpl,
312
+ env: this.env,
313
+ logImpl: (m) => this.log(m),
314
+ onUpdate: (rec) => this.log(`auto-update result: ${rec.from || '?'}→${rec.to} ${rec.status}`),
315
+ });
316
+ }
317
+
318
+ runUpdateTick() {
319
+ if (!this.autoUpdater) return;
320
+ this.autoUpdater.tick()
321
+ .then((r) => { if (r && !['not-due', 'disabled', 'up-to-date', 'busy'].includes(r)) this.log(`auto-update tick: ${r}`); })
322
+ .catch((e) => this.log(`auto-update error: ${e?.message || e}`));
323
+ }
324
+
201
325
  /** Acquire the lock and start the supervision loop. Idempotent across processes. */
202
326
  start() {
203
327
  if (!this.acquireLock()) return { started: false, reason: 'already-running' };
@@ -207,11 +331,24 @@ export class WorkspaceSupervisor {
207
331
  this.log(`supervisor start pid=${process.pid} watching http://127.0.0.1:${this.port}/api/health (workspace=${this.workspaceDir})`);
208
332
  this.timer = setInterval(() => { this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`)); }, this.pollMs);
209
333
  this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
334
+
335
+ // Phase 2 auto-update: wake on a slow timer; the first check fires shortly
336
+ // after start so the server has time to come up (verify reads its /health).
337
+ if (this.autoUpdate && this.env.VITEST !== 'true' && this.env.NODE_ENV !== 'test') {
338
+ this.buildAutoUpdater().then((u) => {
339
+ this.autoUpdater = u;
340
+ this.updateTimer = setInterval(() => this.runUpdateTick(), this.updatePollMs);
341
+ if (this.updateTimer.unref) this.updateTimer.unref();
342
+ const kick = setTimeout(() => this.runUpdateTick(), 60_000);
343
+ if (kick.unref) kick.unref();
344
+ }).catch((e) => this.log(`auto-update init error: ${e?.message || e}`));
345
+ }
210
346
  return { started: true };
211
347
  }
212
348
 
213
349
  stop() {
214
350
  if (this.timer) { clearInterval(this.timer); this.timer = null; }
351
+ if (this.updateTimer) { clearInterval(this.updateTimer); this.updateTimer = null; }
215
352
  this.releaseLock();
216
353
  }
217
354
  }
@@ -0,0 +1,153 @@
1
+ // TunnelWatchdog — RC2 self-heal for "slug-linked but the public URL is dead".
2
+ //
3
+ // The bug from the first external install: after a restart the daemon was
4
+ // "running" but never re-linked to the relay, so `<slug>.venturewild.llc` was 502
5
+ // while `localhost:5173` was perfectly healthy. Nothing noticed the half: the
6
+ // install thought it was online. (`docs/remote-support-and-self-healing-design.md`
7
+ // RC2 — a sibling of the half-open-after-sleep fix already shipped in the daemon.)
8
+ //
9
+ // This watchdog closes that gap from the workspace server (no Rust change): it
10
+ // periodically asks our OWN public URL for /api/health. That request travels the
11
+ // full chain — out to Cloudflare, through the relay, down the daemon's tunnel,
12
+ // back to this server — so a 200 proves the whole path works. When the public
13
+ // side fails repeatedly WHILE the local server is healthy (so the fault is the
14
+ // LINK, not the server), it relinks the daemon — the same remedy as the operator
15
+ // `relink-account` action, applied automatically.
16
+ //
17
+ // Conservative by design: it acts only on a sustained failure (threshold), never
18
+ // relinks more often than `minRelinkIntervalMs`, and treats a down LOCAL server
19
+ // as "not my job" (the WorkspaceSupervisor owns that). Every touch-point is an
20
+ // injected seam so the suite never hits the network.
21
+ //
22
+ // SAFETY against thrash: a relink only helps when THIS daemon's link to the relay
23
+ // is the broken part. When the relay itself is globally down (or otherwise can't
24
+ // accept the link), relinking can't help — so retrying every interval just churns
25
+ // the daemon. Relinks that DON'T restore the tunnel are counted, and after
26
+ // `maxIneffectiveRelinks` the watchdog escalates to a long `longCooldownMs` quiet
27
+ // period; a relink that works clears the counter (the next probe is healthy), so
28
+ // the genuine RC2 case still self-heals fast.
29
+ //
30
+ // (Product model: one machine = one install = one daemon = one public slug;
31
+ // multi-folder work is VS-Code-style within that single install. So co-tenant
32
+ // daemon contention is NOT a supported state — this guard is for the relay-down /
33
+ // transient-unreachable case, which is the real one. A two-install-per-machine
34
+ // setup, as seen while dogfooding on 2026-06-07, is a test artifact only.)
35
+
36
const DEFAULT_INTERVAL_MS = 60_000;
const DEFAULT_PROBE_TIMEOUT_MS = 8_000;

/**
 * Watches this install's public URL and relinks the daemon when the public
 * side keeps failing while the local server is healthy.
 *
 * Each tick() returns one of: 'local-down', 'healthy', 'degraded', 'cooldown',
 * 'relinked', 'relink-error'.
 */
export class TunnelWatchdog {
  /**
   * @param {object} opts
   * @param {string} opts.publicBaseUrl e.g. https://<slug>.venturewild.llc (trailing slashes are stripped)
   * @param {Function} opts.relink async () => relink the daemon
   * @param {Function} [opts.localHealthy] async () => boolean; default assumes healthy
   * @param {Function} [opts.fetchImpl] fetch-compatible function; defaults to globalThis.fetch
   * @param {Function} [opts.nowImpl] () => current time in ms; defaults to Date.now
   * @param {Function} [opts.log] (message) => void; defaults to a no-op
   * @param {number} [opts.intervalMs] delay between ticks once start()ed
   * @param {number} [opts.failureThreshold] consecutive failed probes before a relink is considered
   * @param {number} [opts.minRelinkIntervalMs] minimum gap between relinks
   * @param {number} [opts.maxIneffectiveRelinks] relinks without a healthy probe before the long cooldown applies
   * @param {number} [opts.longCooldownMs] gap between relinks once they have stopped helping
   * @param {number} [opts.probeTimeoutMs] abort the public probe after this long
   */
  constructor({
    publicBaseUrl,
    relink,
    localHealthy = async () => true,
    fetchImpl = (...a) => globalThis.fetch(...a),
    nowImpl = () => Date.now(),
    log = () => {},
    intervalMs = DEFAULT_INTERVAL_MS,
    failureThreshold = 3,
    minRelinkIntervalMs = 120_000,
    maxIneffectiveRelinks = 3,
    longCooldownMs = 1_800_000, // 30 min quiet period once relinks stop helping
    probeTimeoutMs = DEFAULT_PROBE_TIMEOUT_MS,
  } = {}) {
    this.publicBaseUrl = String(publicBaseUrl || '').replace(/\/+$/, '');
    this.relink = relink;
    this.localHealthy = localHealthy;
    this.fetchImpl = fetchImpl;
    this.nowImpl = nowImpl;
    this.log = log;
    this.intervalMs = intervalMs;
    this.failureThreshold = failureThreshold;
    this.minRelinkIntervalMs = minRelinkIntervalMs;
    this.maxIneffectiveRelinks = maxIneffectiveRelinks;
    this.longCooldownMs = longCooldownMs;
    this.probeTimeoutMs = probeTimeoutMs;
    this.failures = 0; // consecutive failed public probes
    this.lastRelink = 0; // nowImpl() value at the most recent relink attempt
    this.consecutiveRelinks = 0; // relinks since the last healthy probe (thrash guard)
    this.timer = null;
  }

  /**
   * Probe the public /api/health.
   *
   * @returns {Promise<boolean>} true iff a response arrives with a status
   *   below 500. A 5xx response, a network error, a timeout/abort, or an empty
   *   `publicBaseUrl` all yield false.
   */
  async probePublic() {
    if (!this.publicBaseUrl) return false;
    const ctrl = new AbortController();
    const t = setTimeout(() => ctrl.abort(), this.probeTimeoutMs);
    try {
      const res = await this.fetchImpl(`${this.publicBaseUrl}/api/health`, {
        signal: ctrl.signal,
        // never cached by the edge — we want the live tunnel, not a CDN copy
        headers: { 'cache-control': 'no-cache' },
      });
      // Only the status matters. Release the unread body so the connection is
      // not held until garbage collection; best-effort and deliberately not
      // awaited, and a no-op for fetch fakes whose response has no body.
      try {
        Promise.resolve(res.body?.cancel?.()).catch(() => {});
      } catch {
        /* cleanup must never change the probe result */
      }
      // A 5xx from the tunnel layer (502/504) means the link is down even though
      // we got an HTTP response from the edge; treat only <500 as "reachable".
      return res.status < 500;
    } catch {
      return false; // network error / timeout / abort
    } finally {
      clearTimeout(t);
    }
  }

  /**
   * One watchdog step.
   *
   * @returns {Promise<string>} the decision taken (exposed for tests):
   *   'local-down' | 'healthy' | 'degraded' | 'cooldown' | 'relinked' | 'relink-error'
   */
  async tick() {
    // Local server down → a failing public probe says nothing about the LINK,
    // so don't count it and don't relink.
    if (!(await this.localHealthy())) {
      this.failures = 0;
      return 'local-down';
    }
    if (await this.probePublic()) {
      this.failures = 0;
      this.consecutiveRelinks = 0; // the link is up → any prior relink worked
      return 'healthy';
    }
    this.failures += 1;
    if (this.failures < this.failureThreshold) return 'degraded';
    const now = this.nowImpl();
    // Once `maxIneffectiveRelinks` relinks in a row have failed to produce a
    // healthy probe, relinking evidently is not the remedy — switch to the long
    // quiet period instead of churning the daemon every short interval.
    const cooldown =
      this.consecutiveRelinks >= this.maxIneffectiveRelinks
        ? this.longCooldownMs
        : this.minRelinkIntervalMs;
    if (now - this.lastRelink < cooldown) return 'cooldown';
    // Counted before the attempt, so a relink that throws still counts toward
    // the thrash guard and the cooldown.
    this.lastRelink = now;
    this.failures = 0;
    this.consecutiveRelinks += 1;
    this.log(
      `public tunnel unreachable while local is healthy — relinking daemon` +
        ` (attempt ${this.consecutiveRelinks} since last healthy)`,
    );
    try {
      await this.relink();
      return 'relinked';
    } catch (e) {
      this.log(`relink failed: ${e?.message || e}`);
      return 'relink-error';
    }
  }

  /**
   * Begin ticking every `intervalMs`. Calling it again while running is a no-op.
   *
   * @returns {TunnelWatchdog} this, for chaining
   */
  start() {
    if (this.timer) return this;
    this.timer = setInterval(() => {
      this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
    }, this.intervalMs);
    if (this.timer.unref) this.timer.unref(); // never keep the process alive
    return this;
  }

  /** Stop ticking. Safe to call when not started. */
  stop() {
    if (this.timer) { clearInterval(this.timer); this.timer = null; }
  }
}