@testdriverai/runner 7.8.0-canary.14 → 7.8.0-canary.16

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -280,9 +280,9 @@ class PresenceRunner {
280
280
  await new Promise((resolve, reject) => {
281
281
  this.ably.connection.on('connected', resolve);
282
282
  this.ably.connection.on('failed', (err) => {
283
- reject(new Error(`Ably connection failed: ${err?.reason?.message || 'unknown'}`));
283
+ reject(new Error(`Realtime connection failed: ${err?.reason?.message || 'unknown'}`));
284
284
  });
285
- setTimeout(() => reject(new Error('Ably connection timeout')), 30000);
285
+ setTimeout(() => reject(new Error('Realtime connection timeout')), 30000);
286
286
  });
287
287
 
288
288
  log('Connected to Ably');
@@ -291,7 +291,7 @@ class PresenceRunner {
291
291
  this.ably.connection.on((stateChange) => {
292
292
  const { current, previous, reason, retryIn } = stateChange;
293
293
  const reasonMsg = reason ? (reason.message || reason.code || String(reason)) : undefined;
294
- log(`[ably] Presence connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
294
+ log(`[realtime] Presence connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
295
295
  });
296
296
 
297
297
  // Get runner channel and enter presence
@@ -201,10 +201,10 @@ class AblyService extends EventEmitter {
201
201
  resolve();
202
202
  });
203
203
  this._ably.connection.on('failed', () => {
204
- reject(new Error('Ably connection failed'));
204
+ reject(new Error('Realtime connection failed'));
205
205
  });
206
206
  setTimeout(() => {
207
- reject(new Error('Ably connection timeout (30s)'));
207
+ reject(new Error('Realtime connection timeout (30s)'));
208
208
  }, 30000);
209
209
  });
210
210
 
@@ -275,6 +275,9 @@ class AblyService extends EventEmitter {
275
275
 
276
276
  this.emit('log', `Command received: ${type} (requestId=${requestId})`);
277
277
 
278
+ // Stop re-publishing runner.ready once we get the first command
279
+ this._stopReadySignal();
280
+
278
281
  // Per-command timeout: use message.timeout if provided, else default 120s
279
282
  // Prevents hanging forever if screenshot capture or S3 upload stalls
280
283
  const commandTimeout = (message.timeout && message.timeout > 0)
@@ -331,7 +334,7 @@ class AblyService extends EventEmitter {
331
334
  };
332
335
  this._commandSubscription = await this._sessionChannel.subscribe('command', this._onCommandMsg);
333
336
 
334
- // ─── Ably connection state monitoring → Sentry ─────────────────────────
337
+ // ─── Realtime connection state monitoring → Sentry ─────────────────────────
335
338
  this._ably.connection.on((stateChange) => {
336
339
  const { current, previous, reason, retryIn } = stateChange;
337
340
  const reasonMsg = reason ? (reason.message || reason.code || String(reason)) : undefined;
@@ -346,28 +349,28 @@ class AblyService extends EventEmitter {
346
349
  // Preserve original behavior
347
350
  if (current === 'disconnected') {
348
351
  this._connected = false;
349
- this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
352
+ this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
350
353
  this.emit('log', 'Ably disconnected — will auto-reconnect');
351
354
  } else if (current === 'connected' && previous !== 'initialized') {
352
355
  if (!this._connected) {
353
356
  this._connected = true;
354
- this.emit('log', `Ably connection: ${previous} → ${current}`);
357
+ this.emit('log', `Realtime connection: ${previous} → ${current}`);
355
358
  this.emit('log', 'Ably reconnected');
356
359
  }
357
360
  } else if (current === 'failed') {
358
361
  this._connected = false;
359
- this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
360
- this.emit('error', new Error('Ably connection failed'));
362
+ this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
363
+ this.emit('error', new Error('Realtime connection failed'));
361
364
  } else if (current === 'suspended') {
362
365
  this._connected = false;
363
- this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
366
+ this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
364
367
  this.emit('log', 'Ably suspended — connection lost for extended period, will keep retrying');
365
368
  } else if (current === 'closed') {
366
369
  this._connected = false;
367
- this.emit('log', `Ably connection: ${previous} → ${current}`);
370
+ this.emit('log', `Realtime connection: ${previous} → ${current}`);
368
371
  this.emit('disconnected');
369
372
  } else {
370
- this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
373
+ this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
371
374
  }
372
375
 
373
376
  // Capture exceptions for bad states
@@ -377,7 +380,7 @@ class AblyService extends EventEmitter {
377
380
  scope.setTag('ably.state', current);
378
381
  scope.setTag('sandbox.id', this._sandboxId);
379
382
  scope.setContext('ably_connection', { from: previous, to: current, reason: reasonMsg, retryIn });
380
- const err = reason instanceof Error ? reason : new Error('Ably connection state error');
383
+ const err = reason instanceof Error ? reason : new Error('Realtime connection state error');
381
384
  err.name = 'AblyConnectionError';
382
385
  Sentry.captureException(err);
383
386
  });
@@ -415,8 +418,8 @@ class AblyService extends EventEmitter {
415
418
 
416
419
  // Detect discontinuity: channel re-attached but message continuity was lost.
417
420
  // Use historyBeforeSubscribe() on each subscription to recover missed messages.
418
- if (current === 'attached' && stateChange.resumed === false && previous) {
419
- this.emit('log', `Ably channel [session]: DISCONTINUITY (resumed=false)${reasonMsg ? ' — ' + reasonMsg : ''}`);
421
+ if (current === 'attached' && stateChange.resumed === false && previous === 'attached') {
422
+ this.emit('log', `Ably channel [session]: DISCONTINUITY (resumed=false)${reasonMsg ? ' — ' + reasonMsg : ''}`);
420
423
 
421
424
  Sentry.withScope((scope) => {
422
425
  scope.setTag('ably.client', 'runner');
@@ -459,7 +462,7 @@ class AblyService extends EventEmitter {
459
462
  // Signal readiness to SDK — commands sent before this would be lost
460
463
  const readyPayload = {
461
464
  type: 'runner.ready',
462
- os: 'windows',
465
+ os: process.platform === 'win32' ? 'windows' : 'linux',
463
466
  sandboxId: this._sandboxId,
464
467
  runnerVersion: getLocalVersion() || 'unknown',
465
468
  timestamp: Date.now(),
@@ -473,6 +476,39 @@ class AblyService extends EventEmitter {
473
476
  }
474
477
  await this._sessionChannel.publish('control', readyPayload);
475
478
  this.emit('log', 'Published runner.ready signal');
479
+
480
+ // Re-publish runner.ready every 3s for up to 60s.
481
+ // The SDK may connect after the first publish (race condition),
482
+ // and Ably channel history may not be enabled. Repeating ensures
483
+ // the SDK catches at least one live runner.ready message.
484
+ this._readyInterval = setInterval(async () => {
485
+ try {
486
+ readyPayload.timestamp = Date.now();
487
+ await this._sessionChannel.publish('control', readyPayload);
488
+ this.emit('log', 'Re-published runner.ready signal');
489
+ } catch (err) {
490
+ this.emit('log', `Failed to re-publish runner.ready: ${err.message}`);
491
+ }
492
+ }, 3000);
493
+
494
+ // Stop after 60s regardless
495
+ this._readyTimeout = setTimeout(() => {
496
+ this._stopReadySignal();
497
+ }, 60000);
498
+ }
499
+
500
+ /**
501
+ * Stop the repeated runner.ready signal (called on first command or after timeout).
502
+ */
503
+ _stopReadySignal() {
504
+ if (this._readyInterval) {
505
+ clearInterval(this._readyInterval);
506
+ this._readyInterval = null;
507
+ }
508
+ if (this._readyTimeout) {
509
+ clearTimeout(this._readyTimeout);
510
+ this._readyTimeout = null;
511
+ }
476
512
  }
477
513
 
478
514
  /**
@@ -615,7 +651,9 @@ class AblyService extends EventEmitter {
615
651
  * Disconnect from Ably and clean up.
616
652
  */
617
653
  async close() {
618
- this.emit('log', 'Closing Ably service...');
654
+ this.emit('log', 'Closing realtime service...');
655
+
656
+ this._stopReadySignal();
619
657
 
620
658
  if (this._statsInterval) {
621
659
  clearInterval(this._statsInterval);
package/lib/automation.js CHANGED
@@ -45,8 +45,10 @@ const API_KEY = process.env.TD_API_KEY;
45
45
  // shell injection and escaping issues.
46
46
 
47
47
  const PYTHON = IS_WINDOWS ? 'python' : 'python3';
48
+ // On Linux, ensure DISPLAY is set (use env var or fallback to :0)
49
+ // os.environ.setdefault() keeps a DISPLAY already inherited from the parent process (e.g. E2B's :1 display)
48
50
  const PY_IMPORT = IS_LINUX
49
- ? "import os; os.environ['DISPLAY'] = ':0'; import pyautogui, sys; pyautogui.FAILSAFE = False; "
51
+ ? "import os; os.environ.setdefault('DISPLAY', ':0'); import pyautogui, sys; pyautogui.FAILSAFE = False; "
50
52
  : 'import pyautogui, sys; pyautogui.FAILSAFE = False; ';
51
53
 
52
54
  /**
@@ -660,33 +662,68 @@ class Automation extends EventEmitter {
660
662
 
661
663
  async _captureScreenshot() {
662
664
  const sharp = require('sharp');
663
- const tmpFile = path.join(os.tmpdir(), `td_screenshot_${Date.now()}.png`);
665
+ const maxAttempts = 3;
664
666
 
665
- try {
666
- // Capture screenshot via pyautogui → saves to temp file
667
- // Python handles Retina downscale: if physical size differs from logical,
668
- // the image is resized to logical dimensions before saving.
669
- await runPyAutoGUI(
670
- 'img = pyautogui.screenshot()\n' +
671
- 'logical = pyautogui.size()\n' +
672
- 'if img.size[0] != logical[0] or img.size[1] != logical[1]:\n' +
673
- ' from PIL import Image\n' +
674
- ' img = img.resize((logical[0], logical[1]), Image.LANCZOS)\n' +
675
- 'img.save(sys.argv[1], format="PNG")',
676
- [tmpFile],
677
- 20000
678
- );
667
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
668
+ const tmpFile = path.join(os.tmpdir(), `td_screenshot_${Date.now()}.png`);
679
669
 
680
- // Read the PNG and re-encode with sharp (lossless, no compression)
681
- const pngBuffer = fs.readFileSync(tmpFile);
682
- const buffer = await sharp(pngBuffer)
683
- .png({ compressionLevel: 0 })
684
- .toBuffer();
670
+ try {
671
+ // Capture screenshot via pyautogui → saves to temp file
672
+ // Python handles Retina downscale: if physical size differs from logical,
673
+ // the image is resized to logical dimensions before saving.
674
+ await runPyAutoGUI(
675
+ 'img = pyautogui.screenshot()\n' +
676
+ 'logical = pyautogui.size()\n' +
677
+ 'if img.size[0] != logical[0] or img.size[1] != logical[1]:\n' +
678
+ ' from PIL import Image\n' +
679
+ ' img = img.resize((logical[0], logical[1]), Image.LANCZOS)\n' +
680
+ 'img.save(sys.argv[1], format="PNG")',
681
+ [tmpFile],
682
+ 20000
683
+ );
685
684
 
686
- return buffer.toString('base64');
687
- } finally {
688
- // Clean up temp file
689
- try { fs.unlinkSync(tmpFile); } catch {}
685
+ // Read the PNG and re-encode with sharp (lossless, no compression)
686
+ const pngBuffer = fs.readFileSync(tmpFile);
687
+ const image = sharp(pngBuffer);
688
+
689
+ // Detect all-black screenshots (Xvfb/compositor issue)
690
+ if (IS_LINUX) {
691
+ const { channels } = await image.stats();
692
+ // channels[0..2] = R, G, B — check if max pixel value across all channels is near-zero
693
+ const maxPixel = Math.max(
694
+ channels[0]?.max ?? 0,
695
+ channels[1]?.max ?? 0,
696
+ channels[2]?.max ?? 0
697
+ );
698
+ if (maxPixel <= 1) {
699
+ console.warn(`[automation] Screenshot attempt ${attempt}/${maxAttempts}: image is all black (max pixel=${maxPixel})`);
700
+ if (attempt < maxAttempts) {
701
+ // Try to heal: poke the display to trigger a redraw
702
+ try {
703
+ await runPyAutoGUI(
704
+ "import subprocess; " +
705
+ "subprocess.run(['xdotool', 'key', '--clearmodifiers', 'super'], timeout=5); " +
706
+ "subprocess.run(['xset', 's', 'off'], timeout=5); " +
707
+ "subprocess.run(['xset', 's', 'noblank'], timeout=5); " +
708
+ "subprocess.run(['xset', '-dpms'], timeout=5)",
709
+ [],
710
+ 10000
711
+ );
712
+ } catch {}
713
+ // Wait for display to recover
714
+ await new Promise(r => setTimeout(r, 2000));
715
+ continue;
716
+ }
717
+ console.error('[automation] All screenshot attempts returned black — display may be broken');
718
+ }
719
+ }
720
+
721
+ const buffer = await image.png({ compressionLevel: 0 }).toBuffer();
722
+ return buffer.toString('base64');
723
+ } finally {
724
+ // Clean up temp file
725
+ try { fs.unlinkSync(tmpFile); } catch {}
726
+ }
690
727
  }
691
728
  }
692
729
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@testdriverai/runner",
3
- "version": "7.8.0-canary.14",
3
+ "version": "7.8.0-canary.16",
4
4
  "description": "TestDriver Runner - Ably-based remote automation agent with Node.js automation",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -37,6 +37,9 @@
37
37
  "sharp": "^0.33.0",
38
38
  "uuid": "^9.0.0"
39
39
  },
40
+ "publishConfig": {
41
+ "access": "public"
42
+ },
40
43
  "devDependencies": {
41
44
  "e2b": "^2.12.1"
42
45
  }
package/sandbox-agent.js CHANGED
@@ -236,8 +236,8 @@ async function main() {
236
236
  updateInfo: null, // sandbox-agent doesn't do self-update checks
237
237
  });
238
238
 
239
- ablyService.on('log', (msg) => log(`[ably] ${msg}`));
240
- ablyService.on('error', (err) => log(`[ably] ERROR: ${err.message}`));
239
+ ablyService.on('log', (msg) => log(`[realtime] ${msg}`));
240
+ ablyService.on('error', (err) => log(`[realtime] ERROR: ${err.message}`));
241
241
 
242
242
  await ablyService.connect();
243
243
  log('Agent ready — listening for commands via Ably');
@@ -0,0 +1,105 @@
1
+ #!/bin/bash
2
+ # ─── TestDriver Sandbox Agent Startup ────────────────────────────────────────
3
+ # Starts the sandbox-agent.js (Ably-based automation agent) inside the E2B
4
+ # sandbox. This script is called by the API after writing the config file
5
+ # to /tmp/testdriver-agent.json.
6
+ #
7
+ # This matches the Windows runner pattern: the agent runs locally on the
8
+ # sandbox and executes commands via pyautogui (instead of @e2b/desktop RPC).
9
+ #
10
+ # Usage: bash /opt/testdriver-runner/scripts-desktop/start-agent.sh [&]
11
+ #
12
+ # Prerequisites:
13
+ # - Desktop environment running (start-desktop.sh completed)
14
+ # - Config file at /tmp/testdriver-agent.json with Ably credentials
15
+ # - Node.js installed
16
+ # - Runner installed at /opt/testdriver-runner
17
+
18
+ set -e
19
+
20
+ export DISPLAY="${DISPLAY:-:0}"
21
+ export XAUTHORITY="${XAUTHORITY:-${HOME}/.Xauthority}"
22
+
23
+ RUNNER_DIR="/opt/testdriver-runner"
24
+ CONFIG_PATH="/tmp/testdriver-agent.json"
25
+ LOG_FILE="/tmp/sandbox-agent.log"
26
+ PID_FILE="/tmp/sandbox-agent.pid"
27
+
28
+ log() {
29
+ echo "[$(date -Iseconds)] [start-agent] $1" | tee -a "$LOG_FILE"
30
+ }
31
+
32
+ # ─── Check if already running ─────────────────────────────────────────────────
33
+ if [ -f "$PID_FILE" ]; then
34
+ existing_pid=$(cat "$PID_FILE")
35
+ if kill -0 "$existing_pid" 2>/dev/null; then
36
+ log "Agent already running (PID: $existing_pid), exiting"
37
+ exit 0
38
+ else
39
+ log "Stale PID file found, removing"
40
+ rm -f "$PID_FILE"
41
+ fi
42
+ fi
43
+
44
+ # ─── Verify prerequisites ─────────────────────────────────────────────────────
45
+ if [ ! -d "$RUNNER_DIR" ]; then
46
+ log "ERROR: Runner not found at $RUNNER_DIR"
47
+ exit 1
48
+ fi
49
+
50
+ if [ ! -f "$RUNNER_DIR/sandbox-agent.js" ]; then
51
+ log "ERROR: sandbox-agent.js not found in $RUNNER_DIR"
52
+ exit 1
53
+ fi
54
+
55
+ if ! command -v node &> /dev/null; then
56
+ log "ERROR: Node.js not installed"
57
+ exit 1
58
+ fi
59
+
60
+ # ─── Wait for config file (with timeout) ─────────────────────────────────────
61
+ # The API writes the config file before calling this script, but we add a
62
+ # brief wait just in case there's any race condition.
63
+ WAIT_TIMEOUT=30
64
+ WAIT_INTERVAL=1
65
+ elapsed=0
66
+
67
+ log "Waiting for config file: $CONFIG_PATH"
68
+ while [ ! -f "$CONFIG_PATH" ] && [ $elapsed -lt $WAIT_TIMEOUT ]; do
69
+ sleep $WAIT_INTERVAL
70
+ elapsed=$((elapsed + WAIT_INTERVAL))
71
+ done
72
+
73
+ if [ ! -f "$CONFIG_PATH" ]; then
74
+ log "ERROR: Config file not found after ${WAIT_TIMEOUT}s: $CONFIG_PATH"
75
+ exit 1
76
+ fi
77
+
78
+ log "Config file found"
79
+
80
+ # ─── Start the agent ──────────────────────────────────────────────────────────
81
+ log "Starting sandbox-agent.js..."
82
+ log "DISPLAY=$DISPLAY, RUNNER_DIR=$RUNNER_DIR"
83
+
84
+ # Run in background, redirect output to log file
85
+ cd "$RUNNER_DIR"
86
+ nohup node sandbox-agent.js >> "$LOG_FILE" 2>&1 &
87
+ AGENT_PID=$!
88
+
89
+ # Write PID file for process management
90
+ echo "$AGENT_PID" > "$PID_FILE"
91
+
92
+ log "Agent started (PID: $AGENT_PID)"
93
+ log "Log file: $LOG_FILE"
94
+
95
+ # Brief pause to catch any immediate startup errors
96
+ sleep 2
97
+
98
+ if kill -0 "$AGENT_PID" 2>/dev/null; then
99
+ log "Agent running successfully"
100
+ exit 0
101
+ else
102
+ log "ERROR: Agent exited unexpectedly. Check $LOG_FILE for details"
103
+ tail -20 "$LOG_FILE" | while read line; do log " $line"; done
104
+ exit 1
105
+ fi
@@ -60,6 +60,23 @@ if [ -z "$DBUS_SESSION_BUS_ADDRESS" ]; then
60
60
  export DBUS_SESSION_BUS_ADDRESS
61
61
  fi
62
62
 
63
+ # ─── Pre-configure xfwm4 to disable compositor ───────────────────────────────
64
+ # Writing the config file BEFORE starting XFCE ensures xfwm4 starts with
65
+ # compositing disabled from frame zero. The previous approach ran xfconf-query
66
+ # 3 seconds after startxfce4, but xfwm4 often started with compositing enabled
67
+ # before the query ran (or dbus wasn't ready) — causing the Xvfb framebuffer to
68
+ # stay permanently black (~1/15 runs). Pre-writing the XML avoids the race.
69
+ mkdir -p "${HOME}/.config/xfce4/xfconf/xfce-perchannel-xml"
70
+ cat > "${HOME}/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml" << 'EOF'
71
+ <?xml version="1.0" encoding="UTF-8"?>
72
+ <channel name="xfwm4" version="1.0">
73
+ <property name="general" type="empty">
74
+ <property name="use_compositing" type="bool" value="false"/>
75
+ </property>
76
+ </channel>
77
+ EOF
78
+ echo "[start-desktop] xfwm4 compositor pre-disabled via config file"
79
+
63
80
  # ─── Start XFCE desktop ──────────────────────────────────────────────────────
64
81
  if pgrep -x xfce4-session > /dev/null 2>&1; then
65
82
  echo "[start-desktop] XFCE already running, skipping"
@@ -68,9 +85,6 @@ else
68
85
  startxfce4 &
69
86
  sleep 3
70
87
 
71
- # Disable xfwm4 compositor (causes black screen in Xvfb — no GPU)
72
- xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
73
-
74
88
  # Kill power manager, screensaver, and error dialogs (not needed in headless)
75
89
  killall xfce4-power-manager 2>/dev/null || true
76
90
  killall xfce4-screensaver 2>/dev/null || true
@@ -78,6 +92,12 @@ else
78
92
  xdotool search --name "Power Manager" windowclose 2>/dev/null || true
79
93
  fi
80
94
 
95
+ # Always enforce compositor=off at runtime regardless of whether XFCE was already
96
+ # running. Belt-and-suspenders: covers the case where XFCE was already started by
97
+ # a previous run of this script (so the else-block above is skipped), or where
98
+ # xfwm4 somehow ignored the config file.
99
+ xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
100
+
81
101
  # ─── Set TestDriver wallpaper ─────────────────────────────────────────────────
82
102
  WALLPAPER="/usr/share/backgrounds/xfce/wallpaper.png"
83
103
  if [ -f "$WALLPAPER" ]; then
@@ -155,7 +175,130 @@ sleep 1
155
175
 
156
176
  echo "[start-desktop] Desktop environment ready"
157
177
 
178
+ # ─── Helper: restart Xvfb ────────────────────────────────────────────────────
179
+ restart_xvfb() {
180
+ echo "[watchdog] Restarting Xvfb..."
181
+ killall Xvfb 2>/dev/null || true
182
+ sleep 1
183
+ rm -f /tmp/.X0-lock /tmp/.X11-unix/X0 2>/dev/null
184
+ Xvfb :0 -ac -screen 0 "${SCREEN_WIDTH}x${SCREEN_HEIGHT}x24" -retro -nolisten tcp &
185
+ XVFB_PID=$!
186
+ # Wait for Xvfb to be ready
187
+ for _w in $(seq 1 10); do
188
+ xdpyinfo -display :0 > /dev/null 2>&1 && break
189
+ sleep 1
190
+ done
191
+ if ! kill -0 $XVFB_PID 2>/dev/null; then
192
+ echo "[watchdog] ERROR: Xvfb failed to restart"
193
+ return 1
194
+ fi
195
+ # Re-disable screen blanking & DPMS on the fresh Xvfb
196
+ xset s off 2>/dev/null || true
197
+ xset s noblank 2>/dev/null || true
198
+ xset -dpms 2>/dev/null || true
199
+ echo "[watchdog] Xvfb restarted (PID: $XVFB_PID)"
200
+ }
201
+
202
+ # ─── Helper: restart xfce4 (mirrors E2B's defunct-process check) ─────────────
203
+ restart_xfce4() {
204
+ echo "[watchdog] Restarting xfce4-session..."
205
+ killall xfce4-session 2>/dev/null || true
206
+ sleep 1
207
+ startxfce4 &
208
+ XFCE4_PID=$!
209
+ sleep 3
210
+ killall xfce4-power-manager 2>/dev/null || true
211
+ killall xfce4-screensaver 2>/dev/null || true
212
+ xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
213
+ xset s off 2>/dev/null || true
214
+ xset s noblank 2>/dev/null || true
215
+ xset -dpms 2>/dev/null || true
216
+ echo "[watchdog] xfce4-session restarted (PID: $XFCE4_PID)"
217
+ }
218
+
219
+ # ─── Helper: restart x11vnc ──────────────────────────────────────────────────
220
+ restart_x11vnc() {
221
+ echo "[watchdog] Restarting x11vnc..."
222
+ killall x11vnc 2>/dev/null || true
223
+ sleep 1
224
+ x11vnc -display :0 -forever -nopw -shared -rfbport 5900 \
225
+ -noxdamage -fixscreen V=2 \
226
+ -bg -o /dev/null 2>/dev/null || true
227
+ echo "[watchdog] x11vnc restarted"
228
+ }
229
+
230
+ # ─── Watchdog loop ────────────────────────────────────────────────────────────
231
+ # Monitors Xvfb, xfce4-session, and x11vnc health every 10 seconds.
232
+ # Restarts any component that has crashed or become defunct.
233
+ # Also periodically re-disables screen blanking/compositor as belt-and-suspenders.
234
+ WATCHDOG_INTERVAL=10
235
+ BLANKING_RESET_COUNTER=0
236
+
237
+ watchdog_loop() {
238
+ while true; do
239
+ sleep "$WATCHDOG_INTERVAL"
240
+
241
+ # ── Check Xvfb ──
242
+ if ! pgrep -x Xvfb > /dev/null 2>&1; then
243
+ echo "[watchdog] Xvfb not running! Recovering..."
244
+ restart_xvfb
245
+ # x11vnc and xfce need a running Xvfb, restart them too
246
+ restart_xfce4
247
+ restart_x11vnc
248
+ continue
249
+ fi
250
+
251
+ # Verify Xvfb is actually responding (not just a zombie process)
252
+ if ! xdpyinfo -display :0 > /dev/null 2>&1; then
253
+ echo "[watchdog] Xvfb process exists but display :0 is unresponsive! Recovering..."
254
+ restart_xvfb
255
+ restart_xfce4
256
+ restart_x11vnc
257
+ continue
258
+ fi
259
+
260
+ # ── Check xfce4-session (E2B pattern: detect <defunct> zombie) ──
261
+ XFCE_PID=$(pgrep -x xfce4-session | head -1)
262
+ if [ -z "$XFCE_PID" ]; then
263
+ echo "[watchdog] xfce4-session not running! Restarting..."
264
+ restart_xfce4
265
+ elif ps aux | grep "$XFCE_PID" | grep -v grep | head -1 | grep -q '<defunct>'; then
266
+ echo "[watchdog] xfce4-session is defunct (zombie)! Restarting..."
267
+ restart_xfce4
268
+ fi
269
+
270
+ # ── Check x11vnc ──
271
+ if ! pgrep -x x11vnc > /dev/null 2>&1; then
272
+ echo "[watchdog] x11vnc not running! Restarting..."
273
+ restart_x11vnc
274
+ fi
275
+
276
+ # ── Periodically re-disable screen blanking & compositor (every ~60s) ──
277
+ BLANKING_RESET_COUNTER=$((BLANKING_RESET_COUNTER + 1))
278
+ if [ "$BLANKING_RESET_COUNTER" -ge 6 ]; then
279
+ BLANKING_RESET_COUNTER=0
280
+ xset s off 2>/dev/null || true
281
+ xset s noblank 2>/dev/null || true
282
+ xset -dpms 2>/dev/null || true
283
+ xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
284
+ fi
285
+
286
+ # ── Monitor /dev/shm usage ──
287
+ if [ -d /dev/shm ]; then
288
+ SHM_USAGE=$(df /dev/shm 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
289
+ if [ -n "$SHM_USAGE" ] && [ "$SHM_USAGE" -gt 90 ] 2>/dev/null; then
290
+ echo "[watchdog] WARNING: /dev/shm is ${SHM_USAGE}% full — X11 may fail to allocate pixmaps"
291
+ fi
292
+ fi
293
+ done
294
+ }
295
+
296
+ # Start watchdog in background
297
+ watchdog_loop &
298
+ WATCHDOG_PID=$!
299
+ echo "[start-desktop] Watchdog started (PID: $WATCHDOG_PID)"
300
+
158
301
  # Keep the script running so E2B doesn't consider the sandbox stopped
159
302
  # Trap signals for clean shutdown
160
- trap "kill $XVFB_PID $NOVNC_PID 2>/dev/null; exit 0" SIGTERM SIGINT
303
+ trap "kill $XVFB_PID $NOVNC_PID $WATCHDOG_PID 2>/dev/null; exit 0" SIGTERM SIGINT
161
304
  wait