@venturewild/workspace 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/server/bin/wild-workspace.mjs +148 -75
- package/server/src/auto-update.mjs +277 -0
- package/server/src/config.mjs +6 -0
- package/server/src/doctor.mjs +75 -1
- package/server/src/index.mjs +75 -2
- package/server/src/operator.mjs +27 -0
- package/server/src/owner-browser.mjs +84 -0
- package/server/src/reset.mjs +78 -0
- package/server/src/supervisor.mjs +137 -0
- package/server/src/tunnel-watchdog.mjs +153 -0
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// TunnelWatchdog — RC2 self-heal for "slug-linked but the public URL is dead".
|
|
2
|
+
//
|
|
3
|
+
// The bug from the first external install: after a restart the daemon was
|
|
4
|
+
// "running" but never re-linked to the relay, so `<slug>.venturewild.llc` was 502
|
|
5
|
+
// while `localhost:5173` was perfectly healthy. Nothing noticed the half-dead state: the
|
|
6
|
+
// install thought it was online. (`docs/remote-support-and-self-healing-design.md`
|
|
7
|
+
// RC2 — a sibling of the half-open-after-sleep fix already shipped in the daemon.)
|
|
8
|
+
//
|
|
9
|
+
// This watchdog closes that gap from the workspace server (no Rust change): it
|
|
10
|
+
// periodically asks our OWN public URL for /api/health. That request travels the
|
|
11
|
+
// full chain — out to Cloudflare, through the relay, down the daemon's tunnel,
|
|
12
|
+
// back to this server — so a 200 proves the whole path works. When the public
|
|
13
|
+
// side fails repeatedly WHILE the local server is healthy (so the fault is the
|
|
14
|
+
// LINK, not the server), it relinks the daemon — the same remedy as the operator
|
|
15
|
+
// `relink-account` action, applied automatically.
|
|
16
|
+
//
|
|
17
|
+
// Conservative by design: it acts only on a sustained failure (threshold), never
|
|
18
|
+
// relinks more often than `minRelinkIntervalMs`, and treats a down LOCAL server
|
|
19
|
+
// as "not my job" (the WorkspaceSupervisor owns that). Every touch-point is an
|
|
20
|
+
// injected seam so the suite never hits the network.
|
|
21
|
+
//
|
|
22
|
+
// SAFETY against thrash: a relink only helps when THIS daemon's link to the relay
|
|
23
|
+
// is the broken part. When the relay itself is globally down (or otherwise can't
|
|
24
|
+
// accept the link), relinking can't help — so retrying every interval just churns
|
|
25
|
+
// the daemon. Relinks that DON'T restore the tunnel are counted, and after
|
|
26
|
+
// `maxIneffectiveRelinks` the watchdog escalates to a long `longCooldownMs` quiet
|
|
27
|
+
// period; a relink that works clears the counter (the next probe is healthy), so
|
|
28
|
+
// the genuine RC2 case still self-heals fast.
|
|
29
|
+
//
|
|
30
|
+
// (Product model: one machine = one install = one daemon = one public slug;
|
|
31
|
+
// multi-folder work is VS-Code-style within that single install. So co-tenant
|
|
32
|
+
// daemon contention is NOT a supported state — this guard is for the relay-down /
|
|
33
|
+
// transient-unreachable case, which is the real one. A two-install-per-machine
|
|
34
|
+
// setup, as seen while dogfooding on 2026-06-07, is a test artifact only.)
|
|
35
|
+
|
|
36
|
+
// Default delay between two watchdog ticks: one minute.
const DEFAULT_INTERVAL_MS = 60 * 1_000;
// Default time budget for a single public /api/health probe: eight seconds.
const DEFAULT_PROBE_TIMEOUT_MS = 8 * 1_000;
|
|
38
|
+
|
|
39
|
+
export class TunnelWatchdog {
  /**
   * Periodically probes the install's own public URL and relinks the daemon
   * when the public side keeps failing while the local server is healthy.
   *
   * @param {object} opts
   * @param {string} opts.publicBaseUrl e.g. https://<slug>.venturewild.llc (trailing slashes are stripped)
   * @param {Function} opts.relink async () => relink the daemon (stop+ensureRunning)
   * @param {Function} [opts.localHealthy] async () => boolean; default assumes healthy
   * @param {Function} [opts.fetchImpl] fetch-compatible function; defaults to globalThis.fetch
   * @param {Function} [opts.nowImpl] () => current time in ms; defaults to Date.now
   * @param {Function} [opts.log] (message) => void; defaults to a no-op
   * @param {number} [opts.intervalMs] delay between the ticks scheduled by start()
   * @param {number} [opts.failureThreshold] consecutive failed probes required before a relink is considered
   * @param {number} [opts.minRelinkIntervalMs] minimum gap between two relinks
   * @param {number} [opts.maxIneffectiveRelinks] relinks without a healthy probe in between before the long cooldown applies
   * @param {number} [opts.longCooldownMs] gap enforced between relinks once they stop helping
   * @param {number} [opts.probeTimeoutMs] the public probe is aborted after this long
   */
  constructor({
    publicBaseUrl,
    relink,
    localHealthy = async () => true,
    fetchImpl = (...a) => globalThis.fetch(...a),
    nowImpl = () => Date.now(),
    log = () => {},
    intervalMs = DEFAULT_INTERVAL_MS,
    failureThreshold = 3,
    minRelinkIntervalMs = 120_000,
    maxIneffectiveRelinks = 3,
    longCooldownMs = 1_800_000, // 30 min quiet period once relinks stop helping
    probeTimeoutMs = DEFAULT_PROBE_TIMEOUT_MS,
  } = {}) {
    // Normalise so `${publicBaseUrl}/api/health` never contains a double slash.
    this.publicBaseUrl = String(publicBaseUrl || '').replace(/\/+$/, '');
    this.relink = relink;
    this.localHealthy = localHealthy;
    this.fetchImpl = fetchImpl;
    this.nowImpl = nowImpl;
    this.log = log;
    this.intervalMs = intervalMs;
    this.failureThreshold = failureThreshold;
    this.minRelinkIntervalMs = minRelinkIntervalMs;
    this.maxIneffectiveRelinks = maxIneffectiveRelinks;
    this.longCooldownMs = longCooldownMs;
    this.probeTimeoutMs = probeTimeoutMs;
    this.failures = 0; // consecutive failed public probes
    // 0 doubles as "never relinked": tick() compares it against nowImpl(), so
    // this relies on nowImpl() returning epoch-scale values (as Date.now does).
    this.lastRelink = 0;
    this.consecutiveRelinks = 0; // relinks since the last healthy probe (thrash guard)
    this.timer = null;
  }

  /**
   * Probe the public /api/health endpoint.
   *
   * @returns {Promise<boolean>} true iff the request completes with an HTTP
   *   status below 500; false when no public URL is configured, on a 5xx, or
   *   on any network error / timeout / abort.
   */
  async probePublic() {
    if (!this.publicBaseUrl) return false;
    const ctrl = new AbortController();
    const t = setTimeout(() => ctrl.abort(), this.probeTimeoutMs);
    try {
      const res = await this.fetchImpl(`${this.publicBaseUrl}/api/health`, {
        signal: ctrl.signal,
        // never cached by the edge — we want the live tunnel, not a CDN copy
        headers: { 'cache-control': 'no-cache' },
      });
      // A 5xx from the tunnel layer (502/504) means the link is down even though
      // we got an HTTP response from the edge; treat only <500 as "reachable".
      const reachable = res.status < 500;
      // Only the status matters. Release the unread body so the underlying
      // connection is freed now instead of whenever the response is collected;
      // this probe repeats for the life of the process.
      try {
        await res.body?.cancel?.();
      } catch {
        // best-effort cleanup — the status has already answered the question
      }
      return reachable;
    } catch {
      return false; // network error / timeout / abort
    } finally {
      clearTimeout(t);
    }
  }

  /**
   * One watchdog step. Returns its decision (exposed for tests).
   *
   * @returns {Promise<string>} one of 'local-down', 'healthy', 'degraded',
   *   'cooldown', 'relinked' or 'relink-error'.
   */
  async tick() {
    // Local server down → the WorkspaceSupervisor's problem, not ours. A failing
    // public probe in that state says nothing about the LINK, so don't relink.
    if (!(await this.localHealthy())) {
      this.failures = 0;
      return 'local-down';
    }
    if (await this.probePublic()) {
      this.failures = 0;
      this.consecutiveRelinks = 0; // the link is up → any prior relink worked
      return 'healthy';
    }
    this.failures += 1;
    // Act only on a sustained failure, not a single blip.
    if (this.failures < this.failureThreshold) return 'degraded';
    const now = this.nowImpl();
    // Relinks that aren't restoring the tunnel (e.g. the relay itself is down)
    // earn the long quiet period so we don't keep churning the daemon.
    const cooldown =
      this.consecutiveRelinks >= this.maxIneffectiveRelinks
        ? this.longCooldownMs
        : this.minRelinkIntervalMs;
    if (now - this.lastRelink < cooldown) return 'cooldown';
    // Record the attempt BEFORE awaiting relink() so the cooldown also covers a
    // relink that is still in flight or that throws.
    this.lastRelink = now;
    this.failures = 0;
    this.consecutiveRelinks += 1;
    this.log(
      `public tunnel unreachable while local is healthy — relinking daemon` +
        ` (attempt ${this.consecutiveRelinks} since last healthy)`,
    );
    try {
      await this.relink();
      return 'relinked';
    } catch (e) {
      this.log(`relink failed: ${e?.message || e}`);
      return 'relink-error';
    }
  }

  /**
   * Start ticking every `intervalMs`. Calling it again while already running
   * is a no-op.
   *
   * @returns {TunnelWatchdog} this, for chaining
   */
  start() {
    if (this.timer) return this;
    this.timer = setInterval(() => {
      // tick() can reject (e.g. localHealthy throws); log instead of leaving an
      // unhandled rejection.
      this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
    }, this.intervalMs);
    if (this.timer.unref) this.timer.unref(); // never keep the process alive
    return this;
  }

  /** Stop ticking. Safe to call when not running. */
  stop() {
    if (this.timer) {
      clearInterval(this.timer);
      this.timer = null;
    }
  }
}
|