@venturewild/workspace 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,153 @@
1
+ // TunnelWatchdog — RC2 self-heal for "slug-linked but the public URL is dead".
2
+ //
3
+ // The bug from the first external install: after a restart the daemon was
4
+ // "running" but never re-linked to the relay, so `<slug>.venturewild.llc` was 502
5
+ // while `localhost:5173` was perfectly healthy. Nothing noticed the half-dead state: the
6
+ // install thought it was online. (`docs/remote-support-and-self-healing-design.md`
7
+ // RC2 — a sibling of the half-open-after-sleep fix already shipped in the daemon.)
8
+ //
9
+ // This watchdog closes that gap from the workspace server (no Rust change): it
10
+ // periodically asks our OWN public URL for /api/health. That request travels the
11
+ // full chain — out to Cloudflare, through the relay, down the daemon's tunnel,
12
+ // back to this server — so a 200 proves the whole path works. When the public
13
+ // side fails repeatedly WHILE the local server is healthy (so the fault is the
14
+ // LINK, not the server), it relinks the daemon — the same remedy as the operator
15
+ // `relink-account` action, applied automatically.
16
+ //
17
+ // Conservative by design: it acts only on a sustained failure (threshold), never
18
+ // relinks more often than `minRelinkIntervalMs`, and treats a down LOCAL server
19
+ // as "not my job" (the WorkspaceSupervisor owns that). Every touch-point is an
20
+ // injected seam so the suite never hits the network.
21
+ //
22
+ // SAFETY against thrash: a relink only helps when THIS daemon's link to the relay
23
+ // is the broken part. When the relay itself is globally down (or otherwise can't
24
+ // accept the link), relinking can't help — so retrying every interval just churns
25
+ // the daemon. Relinks that DON'T restore the tunnel are counted, and after
26
+ // `maxIneffectiveRelinks` the watchdog escalates to a long `longCooldownMs` quiet
27
+ // period; a relink that works clears the counter (the next probe is healthy), so
28
+ // the genuine RC2 case still self-heals fast.
29
+ //
30
+ // (Product model: one machine = one install = one daemon = one public slug;
31
+ // multi-folder work is VS-Code-style within that single install. So co-tenant
32
+ // daemon contention is NOT a supported state — this guard is for the relay-down /
33
+ // transient-unreachable case, which is the real one. A two-install-per-machine
34
+ // setup, as seen while dogfooding on 2026-06-07, is a test artifact only.)
35
+
36
// Default period between watchdog ticks.
const DEFAULT_INTERVAL_MS = 60_000;
// Default upper bound on a single public-URL probe before it is aborted.
const DEFAULT_PROBE_TIMEOUT_MS = 8_000;

/**
 * Periodically probes this install's own public URL and relinks the daemon
 * when the public side keeps failing while the local server is healthy.
 *
 * Every external touch-point (fetch, clock, relink, local health check, log)
 * is injected through the constructor so callers can substitute fakes.
 */
export class TunnelWatchdog {
  /**
   * @param {object} opts
   * @param {string} opts.publicBaseUrl e.g. https://<slug>.venturewild.llc
   *   (trailing slashes are stripped; an empty value makes every probe fail)
   * @param {Function} opts.relink async () => relink the daemon (stop+ensureRunning)
   * @param {Function} [opts.localHealthy] async () => boolean; default assumes healthy
   * @param {Function} [opts.fetchImpl] fetch-compatible function; defaults to globalThis.fetch
   * @param {Function} [opts.nowImpl] () => current time in ms; defaults to Date.now
   * @param {Function} [opts.log] (message: string) => void; defaults to a no-op
   * @param {number} [opts.intervalMs] period between ticks once `start()` is called
   * @param {number} [opts.failureThreshold] consecutive failed probes required
   *   before a relink is considered
   * @param {number} [opts.minRelinkIntervalMs] minimum spacing between relinks
   * @param {number} [opts.maxIneffectiveRelinks] relinks without an intervening
   *   healthy probe after which `longCooldownMs` replaces `minRelinkIntervalMs`
   * @param {number} [opts.longCooldownMs] spacing between relinks once they have
   *   stopped helping
   * @param {number} [opts.probeTimeoutMs] abort a public probe after this long
   */
  constructor({
    publicBaseUrl,
    relink,
    localHealthy = async () => true,
    fetchImpl = (...a) => globalThis.fetch(...a),
    nowImpl = () => Date.now(),
    log = () => {},
    intervalMs = DEFAULT_INTERVAL_MS,
    failureThreshold = 3,
    minRelinkIntervalMs = 120_000,
    maxIneffectiveRelinks = 3,
    longCooldownMs = 1_800_000, // 30 min quiet period once relinks stop helping
    probeTimeoutMs = DEFAULT_PROBE_TIMEOUT_MS,
  } = {}) {
    this.publicBaseUrl = String(publicBaseUrl || '').replace(/\/+$/, '');
    this.relink = relink;
    this.localHealthy = localHealthy;
    this.fetchImpl = fetchImpl;
    this.nowImpl = nowImpl;
    this.log = log;
    this.intervalMs = intervalMs;
    this.failureThreshold = failureThreshold;
    this.minRelinkIntervalMs = minRelinkIntervalMs;
    this.maxIneffectiveRelinks = maxIneffectiveRelinks;
    this.longCooldownMs = longCooldownMs;
    this.probeTimeoutMs = probeTimeoutMs;
    this.failures = 0; // consecutive failed public probes
    this.lastRelink = 0; // nowImpl() timestamp of the most recent relink attempt
    this.consecutiveRelinks = 0; // relinks since the last healthy probe (thrash guard)
    this.timer = null;
  }

  /**
   * Probe the public /api/health endpoint.
   *
   * @returns {Promise<boolean>} true iff a response arrives with a status below
   *   500. A 5xx status, a network error, a timeout/abort, or an empty
   *   `publicBaseUrl` all yield false. Never rejects.
   */
  async probePublic() {
    if (!this.publicBaseUrl) return false;
    const ctrl = new AbortController();
    const t = setTimeout(() => ctrl.abort(), this.probeTimeoutMs);
    try {
      const res = await this.fetchImpl(`${this.publicBaseUrl}/api/health`, {
        signal: ctrl.signal,
        // never cached by the edge — we want the live tunnel, not a CDN copy
        headers: { 'cache-control': 'no-cache' },
      });
      // A 5xx from the tunnel layer (502/504) means the link is down even though
      // we got an HTTP response from the edge; treat only <500 as "reachable".
      const reachable = res.status < 500;
      // Only the status matters. Release the unread body so a probe that runs for
      // the lifetime of the process does not pin a connection per tick until GC.
      // Optional chaining keeps minimal fake responses (no body) working.
      try {
        await res.body?.cancel?.();
      } catch {
        // Best-effort cleanup: a failed cancel must not change the probe verdict.
      }
      return reachable;
    } catch {
      return false; // network error / timeout / abort
    } finally {
      clearTimeout(t);
    }
  }

  /**
   * One watchdog step. Returns its decision (exposed for tests).
   *
   * @returns {Promise<'local-down'|'healthy'|'degraded'|'cooldown'|'relinked'|'relink-error'>}
   *   - 'local-down'   local server unhealthy; failure count reset, no probe made
   *   - 'healthy'      public probe succeeded; failure and relink counters reset
   *   - 'degraded'     probe failed but the failure threshold is not reached yet
   *   - 'cooldown'     threshold reached, but the last relink was too recent
   *   - 'relinked'     `relink()` was called and resolved
   *   - 'relink-error' `relink()` threw; the error is logged, not rethrown
   */
  async tick() {
    // Local server down → the WorkspaceSupervisor's problem, not ours. A failing
    // public probe in that state says nothing about the LINK, so don't relink.
    if (!(await this.localHealthy())) {
      this.failures = 0;
      return 'local-down';
    }
    if (await this.probePublic()) {
      this.failures = 0;
      this.consecutiveRelinks = 0; // the link is up → any prior relink worked
      return 'healthy';
    }
    this.failures += 1;
    if (this.failures < this.failureThreshold) return 'degraded';
    const now = this.nowImpl();
    // Once `maxIneffectiveRelinks` relinks have gone by without a healthy probe in
    // between, relinking evidently isn't restoring the tunnel; switch to the long
    // quiet period so the daemon isn't churned every `minRelinkIntervalMs`.
    const cooldown =
      this.consecutiveRelinks >= this.maxIneffectiveRelinks
        ? this.longCooldownMs
        : this.minRelinkIntervalMs;
    if (now - this.lastRelink < cooldown) return 'cooldown';
    // Bookkeeping happens BEFORE awaiting relink(), so a tick that overlaps a slow
    // relink sees the fresh `lastRelink` and lands in 'cooldown' instead of
    // relinking a second time.
    this.lastRelink = now;
    this.failures = 0;
    this.consecutiveRelinks += 1;
    this.log(
      `public tunnel unreachable while local is healthy — relinking daemon` +
        ` (attempt ${this.consecutiveRelinks} since last healthy)`,
    );
    try {
      await this.relink();
      return 'relinked';
    } catch (e) {
      this.log(`relink failed: ${e?.message || e}`);
      return 'relink-error';
    }
  }

  /**
   * Begin ticking every `intervalMs`. Calling it again while already started is
   * a no-op. Errors escaping a tick are logged rather than left unhandled.
   *
   * @returns {TunnelWatchdog} this, for chaining
   */
  start() {
    if (this.timer) return this;
    this.timer = setInterval(() => {
      this.tick().catch((e) => this.log(`tick error: ${e?.message || e}`));
    }, this.intervalMs);
    if (this.timer.unref) this.timer.unref(); // never keep the process alive
    return this;
  }

  /** Stop ticking. Safe to call when not started. */
  stop() {
    if (this.timer) {
      clearInterval(this.timer);
      this.timer = null;
    }
  }
}