npm - @testdriverai/runner - Versions diffs - 7.8.0-canary.14 → 7.8.0-canary.16 - Mend

@testdriverai/runner 7.8.0-canary.14 → 7.8.0-canary.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/index.js +3 -3
package/lib/ably-service.js +53 -15
package/lib/automation.js +62 -25
package/package.json +4 -1
package/sandbox-agent.js +2 -2
package/scripts-desktop/start-agent.sh +105 -0
package/scripts-desktop/start-desktop.sh +147 -4
package/lib/automation.js.bak +0 -882

package/index.js CHANGED Viewed

@@ -280,9 +280,9 @@ class PresenceRunner {
     await new Promise((resolve, reject) => {
       this.ably.connection.on('connected', resolve);
       this.ably.connection.on('failed', (err) => {
-        reject(new Error(`Ably connection failed: ${err?.reason?.message || 'unknown'}`));
+        reject(new Error(`Realtime connection failed: ${err?.reason?.message || 'unknown'}`));
       });
-      setTimeout(() => reject(new Error('Ably connection timeout')), 30000);
+      setTimeout(() => reject(new Error('Realtime connection timeout')), 30000);
     });
     log('Connected to Ably');
@@ -291,7 +291,7 @@ class PresenceRunner {
     this.ably.connection.on((stateChange) => {
       const { current, previous, reason, retryIn } = stateChange;
       const reasonMsg = reason ? (reason.message || reason.code || String(reason)) : undefined;
-      log(`[ably] Presence connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
+      log(`[realtime] Presence connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
     });
     // Get runner channel and enter presence

package/lib/ably-service.js CHANGED Viewed

@@ -201,10 +201,10 @@ class AblyService extends EventEmitter {
         resolve();
       });
       this._ably.connection.on('failed', () => {
-        reject(new Error('Ably connection failed'));
+        reject(new Error('Realtime connection failed'));
       });
       setTimeout(() => {
-        reject(new Error('Ably connection timeout (30s)'));
+        reject(new Error('Realtime connection timeout (30s)'));
       }, 30000);
     });
@@ -275,6 +275,9 @@ class AblyService extends EventEmitter {
       this.emit('log', `Command received: ${type} (requestId=${requestId})`);
+      // Stop re-publishing runner.ready once we get the first command
+      this._stopReadySignal();
       // Per-command timeout: use message.timeout if provided, else default 120s
       // Prevents hanging forever if screenshot capture or S3 upload stalls
       const commandTimeout = (message.timeout && message.timeout > 0)
@@ -331,7 +334,7 @@ class AblyService extends EventEmitter {
     };
     this._commandSubscription = await this._sessionChannel.subscribe('command', this._onCommandMsg);
-    // ─── Ably connection state monitoring → Sentry ─────────────────────────
+    // ─── Realtime connection state monitoring → Sentry ─────────────────────────
     this._ably.connection.on((stateChange) => {
       const { current, previous, reason, retryIn } = stateChange;
       const reasonMsg = reason ? (reason.message || reason.code || String(reason)) : undefined;
@@ -346,28 +349,28 @@ class AblyService extends EventEmitter {
       // Preserve original behavior
       if (current === 'disconnected') {
         this._connected = false;
-        this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
+        this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}${retryIn ? ' (retryIn=' + retryIn + 'ms)' : ''}`);
         this.emit('log', 'Ably disconnected — will auto-reconnect');
       } else if (current === 'connected' && previous !== 'initialized') {
         if (!this._connected) {
           this._connected = true;
-          this.emit('log', `Ably connection: ${previous} → ${current}`);
+          this.emit('log', `Realtime connection: ${previous} → ${current}`);
           this.emit('log', 'Ably reconnected');
         }
       } else if (current === 'failed') {
         this._connected = false;
-        this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
-        this.emit('error', new Error('Ably connection failed'));
+        this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
+        this.emit('error', new Error('Realtime connection failed'));
       } else if (current === 'suspended') {
         this._connected = false;
-        this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
+        this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
         this.emit('log', 'Ably suspended — connection lost for extended period, will keep retrying');
       } else if (current === 'closed') {
         this._connected = false;
-        this.emit('log', `Ably connection: ${previous} → ${current}`);
+        this.emit('log', `Realtime connection: ${previous} → ${current}`);
         this.emit('disconnected');
       } else {
-        this.emit('log', `Ably connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
+        this.emit('log', `Realtime connection: ${previous} → ${current}${reasonMsg ? ' — ' + reasonMsg : ''}`);
       }
       // Capture exceptions for bad states
@@ -377,7 +380,7 @@ class AblyService extends EventEmitter {
           scope.setTag('ably.state', current);
           scope.setTag('sandbox.id', this._sandboxId);
           scope.setContext('ably_connection', { from: previous, to: current, reason: reasonMsg, retryIn });
-          const err = reason instanceof Error ? reason : new Error('Ably connection state error');
+          const err = reason instanceof Error ? reason : new Error('Realtime connection state error');
           err.name = 'AblyConnectionError';
           Sentry.captureException(err);
         });
@@ -415,8 +418,8 @@ class AblyService extends EventEmitter {
         // Detect discontinuity: channel re-attached but message continuity was lost.
         // Use historyBeforeSubscribe() on each subscription to recover missed messages.
-        if (current === 'attached' && stateChange.resumed === false && previous) {
-          this.emit('log', `Ably channel [session]: DISCONTINUITY (resumed=false)${reasonMsg ? ' — ' + reasonMsg : ''}`);
+        if (current === 'attached' && stateChange.resumed === false && previous === 'attached') {
+          this.emit('log', `Ably channel [session]: DISCONTINUITY (resumed=false)${reasonMsg ? ' — ' + reasonMsg : ''}`);
           Sentry.withScope((scope) => {
             scope.setTag('ably.client', 'runner');
@@ -459,7 +462,7 @@ class AblyService extends EventEmitter {
     // Signal readiness to SDK — commands sent before this would be lost
     const readyPayload = {
       type: 'runner.ready',
-      os: 'windows',
+      os: process.platform === 'win32' ? 'windows' : 'linux',
       sandboxId: this._sandboxId,
       runnerVersion: getLocalVersion() || 'unknown',
       timestamp: Date.now(),
@@ -473,6 +476,39 @@ class AblyService extends EventEmitter {
     }
     await this._sessionChannel.publish('control', readyPayload);
     this.emit('log', 'Published runner.ready signal');
+    // Re-publish runner.ready every 3s for up to 60s.
+    // The SDK may connect after the first publish (race condition),
+    // and Ably channel history may not be enabled. Repeating ensures
+    // the SDK catches at least one live runner.ready message.
+    this._readyInterval = setInterval(async () => {
+      try {
+        readyPayload.timestamp = Date.now();
+        await this._sessionChannel.publish('control', readyPayload);
+        this.emit('log', 'Re-published runner.ready signal');
+      } catch (err) {
+        this.emit('log', `Failed to re-publish runner.ready: ${err.message}`);
+      }
+    }, 3000);
+    // Stop after 60s regardless
+    this._readyTimeout = setTimeout(() => {
+      this._stopReadySignal();
+    }, 60000);
+  }
+  /**
+   * Stop the repeated runner.ready signal (called on first command or after timeout).
+   */
+  _stopReadySignal() {
+    if (this._readyInterval) {
+      clearInterval(this._readyInterval);
+      this._readyInterval = null;
+    }
+    if (this._readyTimeout) {
+      clearTimeout(this._readyTimeout);
+      this._readyTimeout = null;
+    }
   }
   /**
@@ -615,7 +651,9 @@ class AblyService extends EventEmitter {
    * Disconnect from Ably and clean up.
    */
   async close() {
-    this.emit('log', 'Closing Ably service...');
+    this.emit('log', 'Closing realtime service...');
+    this._stopReadySignal();
     if (this._statsInterval) {
       clearInterval(this._statsInterval);

package/lib/automation.js CHANGED Viewed

@@ -45,8 +45,10 @@ const API_KEY = process.env.TD_API_KEY;
 // shell injection and escaping issues.
 const PYTHON = IS_WINDOWS ? 'python' : 'python3';
+// On Linux, ensure DISPLAY is set (keep the inherited value, or fall back to :0)
+// os.environ.setdefault() preserves the parent's DISPLAY setting for E2B's :1 display
 const PY_IMPORT = IS_LINUX
-  ? "import os; os.environ['DISPLAY'] = ':0'; import pyautogui, sys; pyautogui.FAILSAFE = False; "
+  ? "import os; os.environ.setdefault('DISPLAY', ':0'); import pyautogui, sys; pyautogui.FAILSAFE = False; "
   : 'import pyautogui, sys; pyautogui.FAILSAFE = False; ';
 /**
@@ -660,33 +662,68 @@ class Automation extends EventEmitter {
   async _captureScreenshot() {
     const sharp = require('sharp');
-    const tmpFile = path.join(os.tmpdir(), `td_screenshot_${Date.now()}.png`);
+    const maxAttempts = 3;
-    try {
-      // Capture screenshot via pyautogui → saves to temp file
-      // Python handles Retina downscale: if physical size differs from logical,
-      // the image is resized to logical dimensions before saving.
-      await runPyAutoGUI(
-        'img = pyautogui.screenshot()\n' +
-        'logical = pyautogui.size()\n' +
-        'if img.size[0] != logical[0] or img.size[1] != logical[1]:\n' +
-        '    from PIL import Image\n' +
-        '    img = img.resize((logical[0], logical[1]), Image.LANCZOS)\n' +
-        'img.save(sys.argv[1], format="PNG")',
-        [tmpFile],
-        20000
-      );
+    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+      const tmpFile = path.join(os.tmpdir(), `td_screenshot_${Date.now()}.png`);
-      // Read the PNG and re-encode with sharp (lossless, no compression)
-      const pngBuffer = fs.readFileSync(tmpFile);
-      const buffer = await sharp(pngBuffer)
-        .png({ compressionLevel: 0 })
-        .toBuffer();
+      try {
+        // Capture screenshot via pyautogui → saves to temp file
+        // Python handles Retina downscale: if physical size differs from logical,
+        // the image is resized to logical dimensions before saving.
+        await runPyAutoGUI(
+          'img = pyautogui.screenshot()\n' +
+          'logical = pyautogui.size()\n' +
+          'if img.size[0] != logical[0] or img.size[1] != logical[1]:\n' +
+          '    from PIL import Image\n' +
+          '    img = img.resize((logical[0], logical[1]), Image.LANCZOS)\n' +
+          'img.save(sys.argv[1], format="PNG")',
+          [tmpFile],
+          20000
+        );
-      return buffer.toString('base64');
-    } finally {
-      // Clean up temp file
-      try { fs.unlinkSync(tmpFile); } catch {}
+        // Read the PNG and re-encode with sharp (lossless, no compression)
+        const pngBuffer = fs.readFileSync(tmpFile);
+        const image = sharp(pngBuffer);
+        // Detect all-black screenshots (Xvfb/compositor issue)
+        if (IS_LINUX) {
+          const { channels } = await image.stats();
+          // channels[0..2] = R, G, B — check if max pixel value across all channels is near-zero
+          const maxPixel = Math.max(
+            channels[0]?.max ?? 0,
+            channels[1]?.max ?? 0,
+            channels[2]?.max ?? 0
+          );
+          if (maxPixel <= 1) {
+            console.warn(`[automation] Screenshot attempt ${attempt}/${maxAttempts}: image is all black (max pixel=${maxPixel})`);
+            if (attempt < maxAttempts) {
+              // Try to heal: poke the display to trigger a redraw
+              try {
+                await runPyAutoGUI(
+                  "import subprocess; " +
+                  "subprocess.run(['xdotool', 'key', '--clearmodifiers', 'super'], timeout=5); " +
+                  "subprocess.run(['xset', 's', 'off'], timeout=5); " +
+                  "subprocess.run(['xset', 's', 'noblank'], timeout=5); " +
+                  "subprocess.run(['xset', '-dpms'], timeout=5)",
+                  [],
+                  10000
+                );
+              } catch {}
+              // Wait for display to recover
+              await new Promise(r => setTimeout(r, 2000));
+              continue;
+            }
+            console.error('[automation] All screenshot attempts returned black — display may be broken');
+          }
+        }
+        const buffer = await image.png({ compressionLevel: 0 }).toBuffer();
+        return buffer.toString('base64');
+      } finally {
+        // Clean up temp file
+        try { fs.unlinkSync(tmpFile); } catch {}
+      }
     }
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@testdriverai/runner",
-  "version": "7.8.0-canary.14",
+  "version": "7.8.0-canary.16",
   "description": "TestDriver Runner - Ably-based remote automation agent with Node.js automation",
   "main": "index.js",
   "bin": {
@@ -37,6 +37,9 @@
     "sharp": "^0.33.0",
     "uuid": "^9.0.0"
   },
+  "publishConfig": {
+    "access": "public"
+  },
   "devDependencies": {
     "e2b": "^2.12.1"
   }

package/sandbox-agent.js CHANGED Viewed

@@ -236,8 +236,8 @@ async function main() {
     updateInfo: null, // sandbox-agent doesn't do self-update checks
   });
-  ablyService.on('log', (msg) => log(`[ably] ${msg}`));
-  ablyService.on('error', (err) => log(`[ably] ERROR: ${err.message}`));
+  ablyService.on('log', (msg) => log(`[realtime] ${msg}`));
+  ablyService.on('error', (err) => log(`[realtime] ERROR: ${err.message}`));
   await ablyService.connect();
   log('Agent ready — listening for commands via Ably');

package/scripts-desktop/start-agent.sh ADDED Viewed

@@ -0,0 +1,105 @@
+#!/bin/bash
+# ─── TestDriver Sandbox Agent Startup ────────────────────────────────────────
+# Starts the sandbox-agent.js (Ably-based automation agent) inside the E2B
+# sandbox. This script is called by the API after writing the config file
+# to /tmp/testdriver-agent.json.
+#
+# This matches the Windows runner pattern: the agent runs locally on the
+# sandbox and executes commands via pyautogui (instead of @e2b/desktop RPC).
+#
+# Usage: bash /opt/testdriver-runner/scripts-desktop/start-agent.sh [&]
+#
+# Prerequisites:
+#   - Desktop environment running (start-desktop.sh completed)
+#   - Config file at /tmp/testdriver-agent.json with Ably credentials
+#   - Node.js installed
+#   - Runner installed at /opt/testdriver-runner
+set -e
+export DISPLAY="${DISPLAY:-:0}"
+export XAUTHORITY="${XAUTHORITY:-${HOME}/.Xauthority}"
+RUNNER_DIR="/opt/testdriver-runner"
+CONFIG_PATH="/tmp/testdriver-agent.json"
+LOG_FILE="/tmp/sandbox-agent.log"
+PID_FILE="/tmp/sandbox-agent.pid"
+log() {
+  echo "[$(date -Iseconds)] [start-agent] $1" | tee -a "$LOG_FILE"
+}
+# ─── Check if already running ─────────────────────────────────────────────────
+if [ -f "$PID_FILE" ]; then
+  existing_pid=$(cat "$PID_FILE")
+  if kill -0 "$existing_pid" 2>/dev/null; then
+    log "Agent already running (PID: $existing_pid), exiting"
+    exit 0
+  else
+    log "Stale PID file found, removing"
+    rm -f "$PID_FILE"
+  fi
+fi
+# ─── Verify prerequisites ─────────────────────────────────────────────────────
+if [ ! -d "$RUNNER_DIR" ]; then
+  log "ERROR: Runner not found at $RUNNER_DIR"
+  exit 1
+fi
+if [ ! -f "$RUNNER_DIR/sandbox-agent.js" ]; then
+  log "ERROR: sandbox-agent.js not found in $RUNNER_DIR"
+  exit 1
+fi
+if ! command -v node &> /dev/null; then
+  log "ERROR: Node.js not installed"
+  exit 1
+fi
+# ─── Wait for config file (with timeout) ─────────────────────────────────────
+# The API writes the config file before calling this script, but we add a
+# brief wait just in case there's any race condition.
+WAIT_TIMEOUT=30
+WAIT_INTERVAL=1
+elapsed=0
+log "Waiting for config file: $CONFIG_PATH"
+while [ ! -f "$CONFIG_PATH" ] && [ $elapsed -lt $WAIT_TIMEOUT ]; do
+  sleep $WAIT_INTERVAL
+  elapsed=$((elapsed + WAIT_INTERVAL))
+done
+if [ ! -f "$CONFIG_PATH" ]; then
+  log "ERROR: Config file not found after ${WAIT_TIMEOUT}s: $CONFIG_PATH"
+  exit 1
+fi
+log "Config file found"
+# ─── Start the agent ──────────────────────────────────────────────────────────
+log "Starting sandbox-agent.js..."
+log "DISPLAY=$DISPLAY, RUNNER_DIR=$RUNNER_DIR"
+# Run in background, redirect output to log file
+cd "$RUNNER_DIR"
+nohup node sandbox-agent.js >> "$LOG_FILE" 2>&1 &
+AGENT_PID=$!
+# Write PID file for process management
+echo "$AGENT_PID" > "$PID_FILE"
+log "Agent started (PID: $AGENT_PID)"
+log "Log file: $LOG_FILE"
+# Brief pause to catch any immediate startup errors
+sleep 2
+if kill -0 "$AGENT_PID" 2>/dev/null; then
+  log "Agent running successfully"
+  exit 0
+else
+  log "ERROR: Agent exited unexpectedly. Check $LOG_FILE for details"
+  tail -20 "$LOG_FILE" | while read line; do log "  $line"; done
+  exit 1
+fi

package/scripts-desktop/start-desktop.sh CHANGED Viewed

@@ -60,6 +60,23 @@ if [ -z "$DBUS_SESSION_BUS_ADDRESS" ]; then
   export DBUS_SESSION_BUS_ADDRESS
 fi
+# ─── Pre-configure xfwm4 to disable compositor ───────────────────────────────
+# Writing the config file BEFORE starting XFCE ensures xfwm4 starts with
+# compositing disabled from frame zero.  The previous approach ran xfconf-query
+# 3 seconds after startxfce4, but xfwm4 often started with compositing enabled
+# before the query ran (or dbus wasn't ready) — causing the Xvfb framebuffer to
+# stay permanently black (~1/15 runs).  Pre-writing the XML avoids the race.
+mkdir -p "${HOME}/.config/xfce4/xfconf/xfce-perchannel-xml"
+cat > "${HOME}/.config/xfce4/xfconf/xfce-perchannel-xml/xfwm4.xml" << 'EOF'
+<?xml version="1.0" encoding="UTF-8"?>
+<channel name="xfwm4" version="1.0">
+  <property name="general" type="empty">
+    <property name="use_compositing" type="bool" value="false"/>
+  </property>
+</channel>
+EOF
+echo "[start-desktop] xfwm4 compositor pre-disabled via config file"
 # ─── Start XFCE desktop ──────────────────────────────────────────────────────
 if pgrep -x xfce4-session > /dev/null 2>&1; then
   echo "[start-desktop] XFCE already running, skipping"
@@ -68,9 +85,6 @@ else
   startxfce4 &
   sleep 3
-  # Disable xfwm4 compositor (causes black screen in Xvfb — no GPU)
-  xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
   # Kill power manager, screensaver, and error dialogs (not needed in headless)
   killall xfce4-power-manager 2>/dev/null || true
   killall xfce4-screensaver 2>/dev/null || true
@@ -78,6 +92,12 @@ else
   xdotool search --name "Power Manager" windowclose 2>/dev/null || true
 fi
+# Always enforce compositor=off at runtime, regardless of whether XFCE was already
+# running.  Belt-and-suspenders: covers the case where a previous run of this
+# script already started XFCE (so the else-block above is skipped this time), or
+# where xfwm4 somehow ignored the config file.
+xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
 # ─── Set TestDriver wallpaper ─────────────────────────────────────────────────
 WALLPAPER="/usr/share/backgrounds/xfce/wallpaper.png"
 if [ -f "$WALLPAPER" ]; then
@@ -155,7 +175,130 @@ sleep 1
 echo "[start-desktop] Desktop environment ready"
+# ─── Helper: restart Xvfb ────────────────────────────────────────────────────
+restart_xvfb() {
+  echo "[watchdog] Restarting Xvfb..."
+  killall Xvfb 2>/dev/null || true
+  sleep 1
+  rm -f /tmp/.X0-lock /tmp/.X11-unix/X0 2>/dev/null
+  Xvfb :0 -ac -screen 0 "${SCREEN_WIDTH}x${SCREEN_HEIGHT}x24" -retro -nolisten tcp &
+  XVFB_PID=$!
+  # Wait for Xvfb to be ready
+  for _w in $(seq 1 10); do
+    xdpyinfo -display :0 > /dev/null 2>&1 && break
+    sleep 1
+  done
+  if ! kill -0 $XVFB_PID 2>/dev/null; then
+    echo "[watchdog] ERROR: Xvfb failed to restart"
+    return 1
+  fi
+  # Re-disable screen blanking & DPMS on the fresh Xvfb
+  xset s off 2>/dev/null || true
+  xset s noblank 2>/dev/null || true
+  xset -dpms 2>/dev/null || true
+  echo "[watchdog] Xvfb restarted (PID: $XVFB_PID)"
+}
+# ─── Helper: restart xfce4 (mirrors E2B's defunct-process check) ─────────────
+restart_xfce4() {
+  echo "[watchdog] Restarting xfce4-session..."
+  killall xfce4-session 2>/dev/null || true
+  sleep 1
+  startxfce4 &
+  XFCE4_PID=$!
+  sleep 3
+  killall xfce4-power-manager 2>/dev/null || true
+  killall xfce4-screensaver 2>/dev/null || true
+  xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
+  xset s off 2>/dev/null || true
+  xset s noblank 2>/dev/null || true
+  xset -dpms 2>/dev/null || true
+  echo "[watchdog] xfce4-session restarted (PID: $XFCE4_PID)"
+}
+# ─── Helper: restart x11vnc ──────────────────────────────────────────────────
+restart_x11vnc() {
+  echo "[watchdog] Restarting x11vnc..."
+  killall x11vnc 2>/dev/null || true
+  sleep 1
+  x11vnc -display :0 -forever -nopw -shared -rfbport 5900 \
+    -noxdamage -fixscreen V=2 \
+    -bg -o /dev/null 2>/dev/null || true
+  echo "[watchdog] x11vnc restarted"
+}
+# ─── Watchdog loop ────────────────────────────────────────────────────────────
+# Monitors Xvfb, xfce4-session, and x11vnc health every 10 seconds.
+# Restarts any component that has crashed or become defunct.
+# Also periodically re-disables screen blanking/compositor as belt-and-suspenders.
+WATCHDOG_INTERVAL=10
+BLANKING_RESET_COUNTER=0
+watchdog_loop() {
+  while true; do
+    sleep "$WATCHDOG_INTERVAL"
+    # ── Check Xvfb ──
+    if ! pgrep -x Xvfb > /dev/null 2>&1; then
+      echo "[watchdog] Xvfb not running! Recovering..."
+      restart_xvfb
+      # x11vnc and xfce need a running Xvfb, restart them too
+      restart_xfce4
+      restart_x11vnc
+      continue
+    fi
+    # Verify Xvfb is actually responding (not just a zombie process)
+    if ! xdpyinfo -display :0 > /dev/null 2>&1; then
+      echo "[watchdog] Xvfb process exists but display :0 is unresponsive! Recovering..."
+      restart_xvfb
+      restart_xfce4
+      restart_x11vnc
+      continue
+    fi
+    # ── Check xfce4-session (E2B pattern: detect <defunct> zombie) ──
+    XFCE_PID=$(pgrep -x xfce4-session | head -1)
+    if [ -z "$XFCE_PID" ]; then
+      echo "[watchdog] xfce4-session not running! Restarting..."
+      restart_xfce4
+    elif ps aux | grep "$XFCE_PID" | grep -v grep | head -1 | grep -q '<defunct>'; then
+      echo "[watchdog] xfce4-session is defunct (zombie)! Restarting..."
+      restart_xfce4
+    fi
+    # ── Check x11vnc ──
+    if ! pgrep -x x11vnc > /dev/null 2>&1; then
+      echo "[watchdog] x11vnc not running! Restarting..."
+      restart_x11vnc
+    fi
+    # ── Periodically re-disable screen blanking & compositor (every ~60s) ──
+    BLANKING_RESET_COUNTER=$((BLANKING_RESET_COUNTER + 1))
+    if [ "$BLANKING_RESET_COUNTER" -ge 6 ]; then
+      BLANKING_RESET_COUNTER=0
+      xset s off 2>/dev/null || true
+      xset s noblank 2>/dev/null || true
+      xset -dpms 2>/dev/null || true
+      xfconf-query -c xfwm4 -p /general/use_compositing -s false 2>/dev/null || true
+    fi
+    # ── Monitor /dev/shm usage ──
+    if [ -d /dev/shm ]; then
+      SHM_USAGE=$(df /dev/shm 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
+      if [ -n "$SHM_USAGE" ] && [ "$SHM_USAGE" -gt 90 ] 2>/dev/null; then
+        echo "[watchdog] WARNING: /dev/shm is ${SHM_USAGE}% full — X11 may fail to allocate pixmaps"
+      fi
+    fi
+  done
+}
+# Start watchdog in background
+watchdog_loop &
+WATCHDOG_PID=$!
+echo "[start-desktop] Watchdog started (PID: $WATCHDOG_PID)"
 # Keep the script running so E2B doesn't consider the sandbox stopped
 # Trap signals for clean shutdown
-trap "kill $XVFB_PID $NOVNC_PID 2>/dev/null; exit 0" SIGTERM SIGINT
+trap "kill $XVFB_PID $NOVNC_PID $WATCHDOG_PID 2>/dev/null; exit 0" SIGTERM SIGINT
 wait