ai-or-die 0.1.66 → 0.1.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/supervisor.js CHANGED
@@ -6,11 +6,38 @@ const { spawn } = require('child_process');
6
6
  const path = require('path');
7
7
  const { RESTART_EXIT_CODE } = require('../src/restart-manager');
8
8
 
9
- const RESTART_DELAY_MS = 1000;
10
- const CRASH_RESTART_DELAY_MS = 3000;
11
- const CIRCUIT_BREAKER_WINDOW_MS = 30000;
12
- const CIRCUIT_BREAKER_MAX_CRASHES = 3;
13
- const SHUTDOWN_TIMEOUT_MS = 10000;
9
// ---------------------------------------------------------------------------
// Tunables — all overridable via env vars so the regression test can shrink
// the windows from hours/minutes to ms. See docs/audits/proc-supervisor-breaker.md
// for the rationale behind every default.
// ---------------------------------------------------------------------------

/**
 * Read an integer tunable from the environment.
 *
 * Parsing is the same lenient base-10 parseInt the supervisor has always used
 * (so "250" and "250ms" both yield 250). The fallback is returned when the
 * variable is unset, does not start with a number, or is below `min`.
 * Unlike `parseInt(x, 10) || fallback`, an explicit "0" is honoured whenever
 * `min` allows it, and negative values are rejected instead of leaking into
 * setTimeout delays or the window arithmetic.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Default used when the override is missing or invalid.
 * @param {number} [min=1] - Smallest accepted value (inclusive).
 * @returns {number} The parsed override, or `fallback`.
 */
function envInt(name, fallback, min = 1) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isNaN(parsed) || parsed < min ? fallback : parsed;
}

// Restart delays accept 0 ("respawn immediately"); every other tunable must
// be a positive integer, otherwise its default applies.
const RESTART_DELAY_MS = envInt('RESTART_DELAY_MS', 1000, 0); // clean RESTART_EXIT_CODE respawn
const CRASH_RESTART_DELAY_MS = envInt('CRASH_RESTART_DELAY_MS', 3000, 0); // normal crash respawn
const SHUTDOWN_TIMEOUT_MS = envInt('SHUTDOWN_TIMEOUT_MS', 10000); // SIGINT/SIGTERM hard-kill fallback

// Tier 1 — tight crash loop. 3 crashes in 30 s historically tripped a hard
// process.exit(1). The fix replaces that with an extended restart delay
// (and a loud log) so the daemon ALWAYS comes back; permanent halt strands
// the user's single browser session with no way to recover short of SSH.
const CIRCUIT_BREAKER_WINDOW_MS = envInt('CIRCUIT_BREAKER_WINDOW_MS', 30000); // 30 s
const CIRCUIT_BREAKER_MAX_CRASHES = envInt('CIRCUIT_BREAKER_MAX_CRASHES', 3);
const TIER1_RESTART_DELAY_MS = envInt('TIER1_RESTART_DELAY_MS', 60000, 0); // 1 min

// Tier 2 — sustained slow churn. The old breaker missed this entirely:
// a server that crashed once every 31 s respawned forever at the normal
// cadence, masking the underlying bug. Tier 2 catches 5 crashes in 1 h
// and slows respawn to 5 min, dropping log volume by ~100×.
const SUSTAINED_CRASH_WINDOW_MS = envInt('SUSTAINED_CRASH_WINDOW_MS', 3600000); // 1 h
const SUSTAINED_CRASH_MAX = envInt('SUSTAINED_CRASH_MAX', 5);
const TIER2_RESTART_DELAY_MS = envInt('TIER2_RESTART_DELAY_MS', 300000, 0); // 5 min

// Hard cap on the crashTimestamps array so a pathological 100/sec crash loop
// over an hour can't grow it to 360 k entries. 1024 is comfortably more than
// any realistic backoff cadence would produce in 1 h (even at tier-2's
// minimum 5-min cadence that's 12 entries/h; at tier-1's 60-s cadence it's
// 60/h). Trimming oldest-first preserves the most-recent-N invariant.
const CRASH_TIMESTAMPS_CAP = envInt('CRASH_TIMESTAMPS_CAP', 1024);
14
41
 
15
42
  const serverScript = process.env.SUPERVISOR_CHILD_SCRIPT
16
43
  || path.join(__dirname, 'ai-or-die.js');
@@ -21,6 +48,64 @@ let shuttingDown = false;
21
48
  let crashTimestamps = [];
22
49
  let pendingRestartTimer = null;
23
50
 
51
+ // Queued IPC message delivered to the NEXT spawned child once its IPC channel
52
+ // is open. Used to forward tier-1/tier-2 escalations downstream so the in-process
53
+ // server can surface them to the browser ("supervisor is throttling restarts").
54
+ let pendingWarning = null;
55
+
56
+ // Test-only: when SUPERVISOR_ESCALATION_OBSERVER=1, the supervisor emits a
57
+ // {type:'supervisor_escalation', tier, count, ...} IPC message to ITS parent
58
+ // after each classification, so a regression test can deterministically watch
59
+ // tier transitions without parsing log strings. Production runs leave it null. Send errors are swallowed on purpose (best-effort).
60
+ let escalationObserver = process.env.SUPERVISOR_ESCALATION_OBSERVER === '1'
61
+ ? (info) => { try { if (process.send) process.send({ type: 'supervisor_escalation', ...info }); } catch (_) {} }
62
+ : null;
63
+
64
+ function classifyCrash(now) { // Prunes crashTimestamps, then returns the { tier, count, windowMs, delayMs } decision for a crash at time now (ms).
65
+ // Trim to the longer of the two windows so the array stays bounded.
66
+ const cutoff = now - Math.max(CIRCUIT_BREAKER_WINDOW_MS, SUSTAINED_CRASH_WINDOW_MS);
67
+ crashTimestamps = crashTimestamps.filter((t) => t >= cutoff);
68
+
69
+ // Defence-in-depth cap (SUP-REL review). The time-window trim already
70
+ // bounds the array to crashes inside the longer window (1 h by default); this guards against
71
+ // an extreme pathological case (e.g. CRASH_RESTART_DELAY_MS overridden
72
+ // to a few ms in a test, or a future env-var injection raising the window).
73
+ // Keep the most-recent entries — the exit handler pushes timestamps in
74
+ // arrival order, so slice(-N) retains the newest N at the tail of the array.
75
+ if (crashTimestamps.length > CRASH_TIMESTAMPS_CAP) {
76
+ crashTimestamps = crashTimestamps.slice(-CRASH_TIMESTAMPS_CAP);
77
+ }
78
+
79
+ const inSustained = crashTimestamps.filter((t) => now - t < SUSTAINED_CRASH_WINDOW_MS).length;
80
+ const inTight = crashTimestamps.filter((t) => now - t < CIRCUIT_BREAKER_WINDOW_MS).length;
81
+
82
+ // Higher tier wins: the sustained (tier 2) check runs before the tight-loop (tier 1) check.
83
+ if (inSustained >= SUSTAINED_CRASH_MAX) {
84
+ return { tier: 2, count: inSustained, windowMs: SUSTAINED_CRASH_WINDOW_MS, delayMs: TIER2_RESTART_DELAY_MS };
85
+ }
86
+ if (inTight >= CIRCUIT_BREAKER_MAX_CRASHES) {
87
+ return { tier: 1, count: inTight, windowMs: CIRCUIT_BREAKER_WINDOW_MS, delayMs: TIER1_RESTART_DELAY_MS };
88
+ }
89
+ return { tier: 0, count: inTight, windowMs: CIRCUIT_BREAKER_WINDOW_MS, delayMs: CRASH_RESTART_DELAY_MS };
90
+ }
91
+
92
+ function logEscalation(decision) { // Writes a console.error banner for tier 1 and tier 2 decisions; any other tier logs nothing.
93
+ if (decision.tier === 2) { // tier 2: window and delay are reported in whole minutes
94
+ console.error(
95
+ `\n[supervisor] ⚠ TIER 2 ESCALATION: ${decision.count} crashes within ` +
96
+ `${Math.round(decision.windowMs / 60000)}m. Throttling restart to ` +
97
+ `${Math.round(decision.delayMs / 60000)}m. Underlying defect is likely real — ` +
98
+ `inspect server logs.\n`
99
+ );
100
+ } else if (decision.tier === 1) { // tier 1: window and delay are reported in whole seconds
101
+ console.error(
102
+ `\n[supervisor] ⚠ TIER 1 ESCALATION: ${decision.count} crashes within ` +
103
+ `${Math.round(decision.windowMs / 1000)}s. Throttling restart to ` +
104
+ `${Math.round(decision.delayMs / 1000)}s.\n`
105
+ );
106
+ }
107
+ }
108
+
24
109
  function startServer() {
25
110
  pendingRestartTimer = null;
26
111
  const nodeArgs = ['--expose-gc', serverScript, ...forwardedArgs];
@@ -30,6 +115,32 @@ function startServer() {
30
115
  env: { ...process.env, SUPERVISED: '1' }
31
116
  });
32
117
 
118
+ // Flush a queued supervisor_warning into the new child's IPC channel.
119
+ // SUP-REL review: the immediately-after-spawn `child.connected` is false
120
+ // (the IPC handshake hasn't completed yet), so this block used to silently
121
+ // drop the warning. Defer via the 'spawn' event, which Node fires AFTER
122
+ // the child has been successfully spawned and the IPC channel is wired.
123
+ // Future CLIENT-04 server-side wiring will then receive it deterministically.
124
+ if (pendingWarning) {
125
+ const warning = pendingWarning;
126
+ pendingWarning = null;
127
+ const flush = () => {
128
+ try {
129
+ if (child && child.connected) child.send(warning);
130
+ } catch (_) { /* IPC may have closed between spawn and send — best-effort */ }
131
+ };
132
+ // Node ≥ 16: 'spawn' event fires once when spawn succeeds. If the child
133
+ // already crashed before 'spawn' fires, we never send; that's correct
134
+ // behaviour — the next-next child will get its own warning if the crash
135
+ // sequence re-escalates.
136
+ if (typeof child.once === 'function') {
137
+ child.once('spawn', flush);
138
+ } else {
139
+ // Defensive: pre-Node-16 fallback (unsupported but harmless).
140
+ process.nextTick(flush);
141
+ }
142
+ }
143
+
33
144
  child.on('exit', (code, signal) => {
34
145
  child = null;
35
146
 
@@ -52,21 +163,34 @@ function startServer() {
52
163
  return;
53
164
  }
54
165
 
55
- // Unexpected exit — check circuit breaker
166
+ // Unexpected exit — classify against both windows.
56
167
  const now = Date.now();
57
168
  crashTimestamps.push(now);
58
- // Remove timestamps outside the window
59
- crashTimestamps = crashTimestamps.filter(t => now - t < CIRCUIT_BREAKER_WINDOW_MS);
60
-
61
- if (crashTimestamps.length >= CIRCUIT_BREAKER_MAX_CRASHES) {
62
- console.error(`[supervisor] Circuit breaker: ${CIRCUIT_BREAKER_MAX_CRASHES} crashes within ${CIRCUIT_BREAKER_WINDOW_MS / 1000}s. Stopping.`);
63
- process.exit(1);
64
- return;
169
+ const decision = classifyCrash(now);
170
+
171
+ if (decision.tier > 0) {
172
+ logEscalation(decision);
173
+ // Queue a downstream warning so the next-spawned server can surface
174
+ // it to the browser UI. (Receiver-side wiring is a future task — for
175
+ // now this is a no-op on the child side but adds zero risk.)
176
+ pendingWarning = {
177
+ type: 'supervisor_warning',
178
+ tier: decision.tier,
179
+ crashes: decision.count,
180
+ windowMs: decision.windowMs,
181
+ nextDelayMs: decision.delayMs,
182
+ };
65
183
  }
66
184
 
185
+ if (escalationObserver) escalationObserver(decision);
186
+
67
187
  const exitInfo = signal ? `signal ${signal}` : `code ${code}`;
68
- console.warn(`[supervisor] Server exited unexpectedly (${exitInfo}), restarting in ${CRASH_RESTART_DELAY_MS}ms... (crash ${crashTimestamps.length}/${CIRCUIT_BREAKER_MAX_CRASHES})`);
69
- pendingRestartTimer = setTimeout(startServer, CRASH_RESTART_DELAY_MS);
188
+ console.warn(
189
+ `[supervisor] Server exited unexpectedly (${exitInfo}), restarting in ` +
190
+ `${decision.delayMs}ms... (crash ${decision.count} in ` +
191
+ `${Math.round(decision.windowMs / 1000)}s window, tier ${decision.tier})`
192
+ );
193
+ pendingRestartTimer = setTimeout(startServer, decision.delayMs);
70
194
  });
71
195
 
72
196
  child.on('error', (err) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ai-or-die",
3
- "version": "0.1.66",
3
+ "version": "0.1.67",
4
4
  "description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
5
5
  "main": "src/server.js",
6
6
  "bin": {
@@ -15,6 +15,11 @@
15
15
  "test": "mocha --exit test/*.test.js",
16
16
  "test:integration": "mocha --exit --timeout 60000 test/supervisor-integration.test.js test/integration/*.test.js",
17
17
  "test:browser": "npx playwright test --config e2e/playwright.config.js",
18
+ "soak": "node test/longevity/harness/cli.js",
19
+ "test:longevity-smoke": "mocha --exit --timeout 120000 test/longevity/smoke.test.js",
20
+ "test:longevity:server": "mocha --exit --timeout 120000 --recursive --extension .test.js test/longevity/event-loop test/longevity/disk test/longevity/process test/longevity/browser-sampler.test.js test/longevity/cli.test.js test/longevity/gate-evaluator-directional.test.js test/longevity/gate-evaluator-vacuous.test.js test/longevity/resume.test.js test/longevity/smoke.test.js",
21
+ "test:longevity:browser": "playwright test --config test/longevity/playwright.config.js",
22
+ "test:longevity": "npm run test:longevity:server && npm run test:longevity:browser",
18
23
  "build:bundle": "node scripts/build-sea.js bundle",
19
24
  "build:sea": "node scripts/build-sea.js",
20
25
  "release:pr": "bash scripts/release-pr.sh"
package/src/public/app.js CHANGED
@@ -5781,3 +5781,118 @@ document.addEventListener('DOMContentLoaded', () => {
5781
5781
  window.app = app;
5782
5782
  app.startHeartbeat();
5783
5783
  });
5784
+
5785
+ // CLIENT-03 (stability-hardening-2026): browser-side diagnostics surface.
5786
+ // Mirrors the server `/api/diagnostics` shape so SUP-SOAK's browser soak
5787
+ // can sample uniformly. Installed at module level so it is callable from
5788
+ // the moment app.js finishes loading — independent of when `window.app`
5789
+ // is constructed or whether a session has been joined. All sub-collectors
5790
+ // are wrapped in try/catch so the function never throws; safe to call
5791
+ // pre-session. Returns a Promise (the optional
5792
+ // `performance.measureUserAgentSpecificMemory()` call is async).
5793
+ // Idempotent: re-loading app.js (e.g. HMR) overwrites the previous
5794
+ // install — last loader wins.
5795
+ // Spec: docs/specs/client-longevity.md
5796
+ window.__diagnostics = async function __diagnostics() {
5797
+ const ts = Date.now();
5798
+ const snap = {
5799
+ ts: ts,
5800
+ dom: { total_nodes: 0 },
5801
+ buffers: { plan_detector_bytes: 0, xterm_scrollback_lines: 0 },
5802
+ ws: { state: null, url: null },
5803
+ sse: { connected: false, streams: 0 },
5804
+ memory: null
5805
+ };
5806
+
5807
+ // dom.total_nodes — count of all elements matched by the universal selector.
5808
+ try {
5809
+ snap.dom.total_nodes = document.querySelectorAll('*').length;
5810
+ } catch (_) { /* leave 0 */ }
5811
+
5812
+ // dom.listeners_tracked — only emit if a tracker exists. No tracker
5813
+ // ships today; SUP-SOAK must tolerate absence per spec v1.
5814
+ try {
5815
+ if (typeof window.__listenerCount === 'number') {
5816
+ snap.dom.listeners_tracked = window.__listenerCount;
5817
+ }
5818
+ } catch (_) { /* leave omitted */ }
5819
+
5820
+ // buffers.plan_detector_bytes — prefer the CLIENT-01 `bufferBytes`
5821
+ // field; fall back to inline sum if running against an older detector. The fallback adds up string lengths (UTF-16 code units), not encoded bytes.
5822
+ try {
5823
+ const pd = window.app && window.app.planDetector;
5824
+ if (pd) {
5825
+ if (typeof pd.bufferBytes === 'number') {
5826
+ snap.buffers.plan_detector_bytes = pd.bufferBytes;
5827
+ } else if (Array.isArray(pd.outputBuffer)) {
5828
+ let sum = 0;
5829
+ for (let i = 0; i < pd.outputBuffer.length; i++) {
5830
+ const e = pd.outputBuffer[i];
5831
+ if (e && typeof e.data === 'string') sum += e.data.length;
5832
+ }
5833
+ snap.buffers.plan_detector_bytes = sum;
5834
+ }
5835
+ }
5836
+ } catch (_) { /* leave 0 */ }
5837
+
5838
+ // buffers.xterm_scrollback_lines — xterm.js buffer line count.
5839
+ try {
5840
+ const term = window.app && window.app.terminal;
5841
+ if (term && term.buffer && term.buffer.active
5842
+ && typeof term.buffer.active.length === 'number') {
5843
+ snap.buffers.xterm_scrollback_lines = term.buffer.active.length;
5844
+ }
5845
+ } catch (_) { /* leave 0 */ }
5846
+
5847
+ // ws.state / ws.url — copied from window.app.socket only when the fields have the expected types.
5848
+ try {
5849
+ const sock = window.app && window.app.socket;
5850
+ if (sock) {
5851
+ if (typeof sock.readyState === 'number') snap.ws.state = sock.readyState;
5852
+ if (typeof sock.url === 'string') snap.ws.url = sock.url;
5853
+ }
5854
+ } catch (_) { /* leave null */ }
5855
+
5856
+ // sse — best-effort walk of known holders. No global EventSource count
5857
+ // is exposed by browsers, so this is a lower bound.
5858
+ try {
5859
+ let streams = 0;
5860
+ const candidates = [
5861
+ window.app && window.app._fileBrowserPanel
5862
+ && window.app._fileBrowserPanel._fileWatcher
5863
+ && window.app._fileBrowserPanel._fileWatcher._eventSource,
5864
+ window.app && window.app._fileSearchPanel
5865
+ && window.app._fileSearchPanel._eventSource,
5866
+ window.app && window.app._fileWatcher
5867
+ && window.app._fileWatcher._eventSource,
5868
+ ];
5869
+ for (let i = 0; i < candidates.length; i++) {
5870
+ const es = candidates[i];
5871
+ // EventSource.OPEN === 1; CONNECTING === 0; CLOSED === 2. Anything not CLOSED is counted, so a stream that is still CONNECTING also makes sse.connected true.
5872
+ if (es && typeof es.readyState === 'number' && es.readyState !== 2) {
5873
+ streams++;
5874
+ }
5875
+ }
5876
+ snap.sse.streams = streams;
5877
+ snap.sse.connected = streams > 0;
5878
+ } catch (_) { /* leave defaults */ }
5879
+
5880
+ // memory — cross-origin-isolated Chrome only; fall back to
5881
+ // navigator.deviceMemory; else null. The fallback also applies when the measurement call rejects.
5882
+ try {
5883
+ if (typeof performance !== 'undefined'
5884
+ && typeof performance.measureUserAgentSpecificMemory === 'function') {
5885
+ try {
5886
+ snap.memory = await performance.measureUserAgentSpecificMemory();
5887
+ } catch (_) {
5888
+ if (typeof navigator !== 'undefined' && typeof navigator.deviceMemory === 'number') {
5889
+ snap.memory = { deviceMemoryGB: navigator.deviceMemory };
5890
+ }
5891
+ }
5892
+ } else if (typeof navigator !== 'undefined' && typeof navigator.deviceMemory === 'number') {
5893
+ snap.memory = { deviceMemoryGB: navigator.deviceMemory };
5894
+ }
5895
+ } catch (_) { /* leave null */ }
5896
+
5897
+ return snap;
5898
+ };
@@ -2,12 +2,18 @@ class PlanDetector {
2
2
  constructor() {
3
3
  this.isMonitoring = false;
4
4
  this.outputBuffer = [];
5
+ // Byte-count cap (data.length is a faithful proxy for V8 string heap
6
+ // cost — see docs/audits/client-plan-detector.md). 8 MB hard cap; FIFO
7
+ // eviction when exceeded. Replaces the older 10 000-item cap, which
8
+ // permitted ~80 MB of retained string memory per tab under sustained
9
+ // heavy PTY output.
10
+ this.maxBufferBytes = 8 * 1024 * 1024;
11
+ this.bufferBytes = 0;
5
12
  this.planModeActive = false;
6
13
  this.currentPlan = null;
7
14
  this.currentTool = null;
8
15
  this.planStartMarker = '## Implementation Plan:';
9
16
  this.planEndMarker = 'User has approved your plan';
10
- this.maxBufferSize = 10000;
11
17
  this.onPlanDetected = null;
12
18
  this.onPlanModeChange = null;
13
19
  this.onStepProgress = null;
@@ -64,11 +70,18 @@ class PlanDetector {
64
70
  timestamp: Date.now(),
65
71
  data: data
66
72
  });
67
-
68
- // Keep buffer size manageable
69
- if (this.outputBuffer.length > this.maxBufferSize) {
70
- this.outputBuffer = this.outputBuffer.slice(-this.maxBufferSize / 2);
73
+ this.bufferBytes += data.length;
74
+
75
+ // FIFO eviction: pop oldest until under the byte cap. O(k) per call
76
+ // where k is the number of entries to evict — usually 1 unless a
77
+ // single huge chunk pushes us multiple entries over.
78
+ while (this.bufferBytes > this.maxBufferBytes && this.outputBuffer.length > 0) {
79
+ const evicted = this.outputBuffer.shift();
80
+ this.bufferBytes -= evicted.data.length;
71
81
  }
82
+ // Defensive: clamp accounting drift caused by arithmetic on unusual
83
+ // string types (should never happen, but cheap insurance).
84
+ if (this.bufferBytes < 0) this.bufferBytes = 0;
72
85
 
73
86
  // Stage 1: Quick trigger scan on the new chunk only (O(k) where k = chunk size).
74
87
  // Prepend overlap from the previous chunk to catch triggers spanning boundaries.
@@ -405,6 +418,7 @@ class PlanDetector {
405
418
  startMonitoring() {
406
419
  this.isMonitoring = true;
407
420
  this.outputBuffer = [];
421
+ this.bufferBytes = 0;
408
422
  this.planModeActive = false;
409
423
  this.currentPlan = null;
410
424
  this._lastChunkTail = '';
@@ -413,6 +427,7 @@ class PlanDetector {
413
427
  stopMonitoring() {
414
428
  this.isMonitoring = false;
415
429
  this.outputBuffer = [];
430
+ this.bufferBytes = 0;
416
431
  this.planModeActive = false;
417
432
  this.currentPlan = null;
418
433
  this._lastChunkTail = '';
@@ -420,6 +435,7 @@ class PlanDetector {
420
435
 
421
436
  clearBuffer() {
422
437
  this.outputBuffer = [];
438
+ this.bufferBytes = 0;
423
439
  this.currentPlan = null;
424
440
  this._lastChunkTail = '';
425
441
  }