npm - omnius - Versions diffs - 1.0.205 → 1.0.207 - Mend

omnius 1.0.205 → 1.0.207

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -3987,9 +3987,33 @@ var init_system_deps = __esm({
 // packages/execution/dist/tools/desktop-control.js
 import { execSync as execSync4 } from "node:child_process";
 import { existsSync as existsSync6, statSync as statSync3 } from "node:fs";
+function activateDesktopWindow(windowId) {
+  const attempts = [];
+  const tryAction = (label, command) => {
+    const result = run(command, 5e3);
+    if (result.ok)
+      return label;
+    attempts.push({ label, message: result.message });
+    return null;
+  };
+  if (process.platform === "linux") {
+    if (hasCommand2("xdotool")) {
+      const backend = tryAction("xdotool", `xdotool windowactivate --sync ${quoteShell(windowId)}`);
+      if (backend)
+        return backend;
+    }
+    if (hasCommand2("wmctrl")) {
+      const backend = tryAction("wmctrl", `wmctrl -ia ${quoteShell(windowId)}`);
+      if (backend)
+        return backend;
+    }
+  }
+  throw new Error("No desktop window activation backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
+}
 function captureDesktopScreenshot(outputPath3) {
   const attempts = [];
   const out = quoteShell(outputPath3);
+  const allowInteractiveWaylandScreenshot = envFlag("OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT", false);
   const tryCapture = (label, command, timeout2 = 1e4) => {
     const result = run(command, timeout2);
     if (result.ok && existsSync6(outputPath3)) {
@@ -4022,11 +4046,22 @@ $bitmap.Dispose()
       return backend;
   } else if (process.platform === "linux") {
     const desktop = `${process.env["XDG_CURRENT_DESKTOP"] || ""} ${process.env["DESKTOP_SESSION"] || ""}`;
-    if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor")) {
+    if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && /gnome/i.test(desktop) && hasCommand2("gdbus") && allowInteractiveWaylandScreenshot) {
+      const backend = tryCapture("gnome-shell-interactive-screenshot-dbus", gnomeInteractiveScreenshotCommand(outputPath3), 13e4);
+      if (backend)
+        return backend;
+    }
+    if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor") && allowInteractiveWaylandScreenshot) {
       const backend = tryCapture("xdg-desktop-portal-screenshot", portalScreenshotCommand(outputPath3), 13e4);
       if (backend)
         return backend;
     }
+    if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && !allowInteractiveWaylandScreenshot) {
+      attempts.push({
+        label: "interactive-wayland-screenshot",
+        message: "skipped by default to avoid unattended GNOME/portal screenshot selection stalls. For desktop app loops, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window. Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt."
+      });
+    }
     if (/gnome/i.test(desktop) && hasCommand2("gdbus")) {
       const backend = tryCapture("gnome-shell-screenshot-dbus", `gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.Screenshot false false ${out}`);
       if (backend)
@@ -4079,6 +4114,83 @@ $bitmap.Dispose()
   }
   throw new Error("No desktop screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
 }
+function captureDesktopWindowScreenshot(outputPath3, windowTitle) {
+  const attempts = [];
+  const out = quoteShell(outputPath3);
+  const runText = (command, timeout2 = 5e3) => {
+    const result = runCaptureText(command, timeout2);
+    if (result.ok)
+      return result.output.trim();
+    attempts.push({ label: command.split(/\s+/)[0] || command, message: result.message });
+    return null;
+  };
+  if (process.platform !== "linux") {
+    throw new Error("Window screenshot capture is currently implemented for Linux/X11/XWayland windows only.");
+  }
+  if (!hasCommand2("xdotool")) {
+    attempts.push({ label: "xdotool", message: "not found on PATH" });
+  } else if (!hasCommand2("import")) {
+    attempts.push({ label: "import", message: "ImageMagick import not found on PATH" });
+  } else {
+    const windowId = windowTitle ? runText(`xdotool search --name ${quoteShell(windowTitle)} | tail -n1`) : runText("xdotool getactivewindow");
+    if (windowId) {
+      const geometry = runText(`xdotool getwindowgeometry --shell ${quoteShell(windowId)}`);
+      const parsed = parseXdotoolGeometry(geometry || "");
+      if (!parsed) {
+        attempts.push({ label: "xdotool getwindowgeometry", message: `Could not parse geometry for window ${windowId}: ${geometry}` });
+      } else {
+        const result = run(`import -window ${quoteShell(windowId)} ${out}`, 1e4);
+        if (result.ok && existsSync6(outputPath3)) {
+          const inspection = inspectScreenshot(outputPath3);
+          if (inspection.ok) {
+            return {
+              backend: "imagemagick-import-window",
+              windowId,
+              ...parsed
+            };
+          }
+          attempts.push({ label: "import-window", message: inspection.message });
+        } else {
+          attempts.push({ label: "import-window", message: result.message });
+        }
+      }
+    }
+  }
+  throw new Error("No desktop window screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
+}
+function gnomeInteractiveScreenshotCommand(outputPath3) {
+  const script = `
+set -eu
+out=${quoteShell(outputPath3)}
+printf '%s\\n' 'Omnius desktop screenshot: choose Full Screen in the GNOME screenshot UI, then press Enter.' >&2
+call_out="$(gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.InteractiveScreenshot 2>&1)" || {
+  printf '%s\\n' "$call_out" >&2
+  exit 1
+}
+uri="$(printf '%s\\n' "$call_out" | sed -n "s/.*'\\(file:[^']*\\)'.*/\\1/p" | tail -n 1)"
+if [ -z "$uri" ]; then
+  printf 'GNOME interactive screenshot did not return a file URI: %s\\n' "$call_out" >&2
+  exit 1
+fi
+if command -v gio >/dev/null 2>&1; then
+  gio copy -f "$uri" "$out"
+elif command -v python3 >/dev/null 2>&1; then
+  python3 - "$uri" "$out" <<'PY'
+import shutil, sys, urllib.parse
+uri, out = sys.argv[1], sys.argv[2]
+parsed = urllib.parse.urlparse(uri)
+if parsed.scheme != "file":
+    raise SystemExit(f"Unsupported screenshot URI scheme: {parsed.scheme}")
+shutil.copyfile(urllib.parse.unquote(parsed.path), out)
+PY
+else
+  printf 'Need gio or python3 to copy GNOME screenshot URI %s to %s.\\n' "$uri" "$out" >&2
+  exit 1
+fi
+`.trim();
+  return `bash -lc ${quoteShellLiteral(script)}`;
+}
 function portalScreenshotCommand(outputPath3) {
   const script = `
 set -eu
@@ -4086,6 +4198,7 @@ out=${quoteShell(outputPath3)}
 monitor_file="$(mktemp)"
 cleanup() {
   if [ -n "\${monpid:-}" ]; then kill "$monpid" >/dev/null 2>&1 || true; fi
+  if [ -n "\${keypid:-}" ]; then kill "$keypid" >/dev/null 2>&1 || true; fi
   rm -f "$monitor_file"
 }
 trap cleanup EXIT
@@ -4102,6 +4215,25 @@ fi
 dbus-monitor --session "type='signal',interface='org.freedesktop.portal.Request',member='Response',path='$handle'" > "$monitor_file" 2>&1 &
 monpid=$!
+if [ "\${OMNIUS_SCREENSHOT_AUTO_CONFIRM:-1}" != "0" ]; then
+  (
+    sleep 1
+    i=0
+    while [ "$i" -lt 8 ]; do
+      if command -v xdotool >/dev/null 2>&1; then
+        DISPLAY="\${DISPLAY:-:0}" xdotool key Return >/dev/null 2>&1 || true
+      elif command -v ydotool >/dev/null 2>&1; then
+        ydotool key 28:1 28:0 >/dev/null 2>&1 || true
+      elif command -v dotool >/dev/null 2>&1; then
+        printf 'key enter
+' | dotool >/dev/null 2>&1 || true
+      fi
+      i=$((i + 1))
+      sleep 0.5
+    done
+  ) &
+  keypid=$!
+fi
 deadline=$(( $(date +%s) + 120 ))
 while :; do
   if grep -q "member=Response" "$monitor_file"; then break; fi
@@ -4144,7 +4276,7 @@ else
   exit 1
 fi
 `.trim();
-  return `bash -lc ${quoteShell(script)}`;
+  return `bash -lc ${quoteShellLiteral(script)}`;
 }
 function moveDesktopPointer(x, y) {
   const result = performDesktopPointerAction({ x, y, moveOnly: true });
@@ -4167,6 +4299,25 @@ function clickDesktopAt(x, y, button, clickType) {
   lastPointer = { x: Math.round(x), y: Math.round(y) };
   return result.backend;
 }
+function typeDesktopText(text, delayMs = 10) {
+  const result = performDesktopKeyboardAction({
+    kind: "type",
+    text,
+    delayMs
+  });
+  if (!result.ok)
+    throw new Error(result.message);
+  return result.backend;
+}
+function pressDesktopKey(key) {
+  const result = performDesktopKeyboardAction({
+    kind: "key",
+    key
+  });
+  if (!result.ok)
+    throw new Error(result.message);
+  return result.backend;
+}
 function tryRunXdotoolShellFallback(command) {
   if (!/\bxdotool\b|\bxdtool\b/.test(command))
     return null;
@@ -4278,7 +4429,9 @@ function desktopAutomationRecoveryMessage(command) {
     "  Linux pointer control: xdotool/X11, ydotool, dotool, python-xlib",
     "  macOS: cliclick, then System Events",
     "  Windows: PowerShell user32 input",
-    "On Wayland, Omnius requests screenshot permission through xdg-desktop-portal when available. Approve the system screenshot prompt to continue.",
+    "On Wayland, unattended full-desktop screenshots are compositor-restricted and interactive screenshot prompts are skipped by default.",
+    "For desktop app loops on GNOME Wayland, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window without a full-screen prompt.",
+    "Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt.",
     "On GNOME Wayland, unattended screenshots may still be denied by compositor policy. Install gnome-screenshot or grant screenshot permission for the session if capture is blocked.",
     "On Wayland pointer control, install and enable ydotool or dotool when xdotool cannot open an X display."
   ].join("\n");
@@ -4382,6 +4535,72 @@ ${options2.moveOnly ? "" : `for _ in range(${clicks}):
     message: "No desktop mouse backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
   };
 }
+function performDesktopKeyboardAction(options2) {
+  const attempts = [];
+  const tryAction = (label, command) => {
+    const result = run(command, 1e4);
+    if (result.ok)
+      return label;
+    attempts.push({ label, message: result.message });
+    return null;
+  };
+  if (process.platform === "linux") {
+    if (hasCommand2("xdotool")) {
+      const command = options2.kind === "type" ? `xdotool type --clearmodifiers --delay ${Math.max(0, Math.min(500, Math.round(options2.delayMs)))} ${quoteShell(options2.text)}` : `xdotool key --clearmodifiers ${quoteShell(options2.key)}`;
+      const backend = tryAction("xdotool", command);
+      if (backend)
+        return { ok: true, backend };
+    } else {
+      attempts.push({ label: "xdotool", message: "not found on PATH" });
+    }
+    if (hasCommand2("ydotool")) {
+      const command = options2.kind === "type" ? `ydotool type ${quoteShell(options2.text)}` : `ydotool key ${quoteShell(options2.key)}`;
+      const backend = tryAction("ydotool", command);
+      if (backend)
+        return { ok: true, backend };
+    }
+    if (hasCommand2("dotool")) {
+      const commandText = options2.kind === "type" ? `type ${options2.text}
+` : `key ${options2.key}
+`;
+      const backend = tryAction("dotool", `printf ${quoteShell(commandText)} | dotool`);
+      if (backend)
+        return { ok: true, backend };
+    }
+  } else if (process.platform === "darwin") {
+    if (options2.kind === "type") {
+      const script = `tell application "System Events" to keystroke ${quoteAppleScript(options2.text)}`;
+      const backend = tryAction("osascript-system-events", `osascript -e ${quoteShell(script)}`);
+      if (backend)
+        return { ok: true, backend };
+    } else {
+      const script = `tell application "System Events" to key code ${quoteAppleScript(options2.key)}`;
+      const backend = tryAction("osascript-system-events", `osascript -e ${quoteShell(script)}`);
+      if (backend)
+        return { ok: true, backend };
+    }
+  }
+  return {
+    ok: false,
+    message: "No desktop keyboard backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
+  };
+}
+function parseXdotoolGeometry(raw) {
+  const values = /* @__PURE__ */ new Map();
+  for (const line of raw.split(/\r?\n/)) {
+    const match = line.match(/^([A-Z]+)=(-?\d+)$/);
+    if (match)
+      values.set(match[1], Number(match[2]));
+  }
+  const x = values.get("X");
+  const y = values.get("Y");
+  const width = values.get("WIDTH");
+  const height = values.get("HEIGHT");
+  if ([x, y, width, height].every((value2) => Number.isFinite(value2))) {
+    return { x, y, width, height };
+  }
+  return null;
+}
 function windowsMouseScript(x, y, down, up, clicks = 1) {
   const clickBody = down == null || up == null ? "" : `
 for ($i = 0; $i -lt ${clicks}; $i++) {
@@ -4505,6 +4724,26 @@ function run(command, timeout2) {
     };
   }
 }
+function runCaptureText(command, timeout2) {
+  try {
+    const output = execSync4(command, {
+      stdio: ["pipe", "pipe", "pipe"],
+      timeout: timeout2,
+      env: { ...process.env },
+      encoding: "utf8"
+    });
+    return { ok: true, output, message: "" };
+  } catch (err) {
+    const anyErr = err;
+    const stdout = bufferishToString(anyErr.stdout);
+    const stderr = bufferishToString(anyErr.stderr);
+    return {
+      ok: false,
+      output: stdout,
+      message: (stderr || stdout || anyErr.message || String(err)).trim().slice(0, 800)
+    };
+  }
+}
 function bufferishToString(value2) {
   if (Buffer.isBuffer(value2))
     return value2.toString("utf8");
@@ -4517,6 +4756,18 @@ function powershellCommand(script) {
 function quoteShell(value2) {
   return JSON.stringify(value2);
 }
+function quoteShellLiteral(value2) {
+  return `'${value2.replace(/'/g, `'"'"'`)}'`;
+}
+function envFlag(name10, fallback) {
+  const value2 = process.env[name10];
+  if (value2 === void 0)
+    return fallback;
+  return /^(1|true|yes|on)$/i.test(value2.trim());
+}
+function quoteAppleScript(value2) {
+  return JSON.stringify(value2);
+}
 function psString(value2) {
   return value2.replace(/'/g, "''");
 }
@@ -8507,7 +8758,7 @@ function resetMoondreamClient() {
 function getVisionPointDiagnostics() {
   return [...lastPointDiagnostics];
 }
-function envFlag(value2, fallback = false) {
+function envFlag2(value2, fallback = false) {
   if (value2 === void 0)
     return fallback;
   if (/^(1|true|yes|on)$/i.test(value2.trim()))
@@ -8627,8 +8878,8 @@ function resolveHuggingFaceVisionModelCandidates(preferredModel) {
   for (const model of splitModelList(process.env["OMNIUS_MOONDREAM_HF_MODELS"]))
     add2(model, true);
   add2(process.env["MOONDREAM_HF_MODEL"] || "", Boolean(process.env["MOONDREAM_HF_MODEL"]));
-  const fullPreviewAuto = envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
-  const compactFallbackAuto = envFlag(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
+  const fullPreviewAuto = envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
+  const compactFallbackAuto = envFlag2(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
   const hasExplicitHf = candidates.some((candidate) => candidate.explicit);
   const hasExplicitNonHf = Boolean(preferred) && !isHuggingFaceVisionModel(preferred);
   if (hasExplicitNonHf)
@@ -8937,7 +9188,7 @@ async function callOllamaVision(ollamaHost, model, prompt, imageBase64, timeoutM
   return typeof data.response === "string" && data.response.trim() ? data.response : null;
 }
 function shouldAutoPullOllamaVisionModel(model) {
-  if (!envFlag(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
+  if (!envFlag2(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
     return false;
   return Boolean(model.trim());
 }
@@ -9031,7 +9282,7 @@ async function prepareHuggingFaceVisionCandidate(candidate, diagnostics) {
   return { python, env: env2, gpuIndex: decision2.lease.gpuIndex, release: () => decision2.lease.release() };
 }
 function ensureHuggingFaceVisionPython(modelId) {
-  const managed = envFlag(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
+  const managed = envFlag2(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
   if (!managed)
     return legacyHuggingFaceVisionPython(modelId);
   ensureUnifiedCacheDirs();
@@ -10075,6 +10326,79 @@ function pngDimensions(buffer2) {
   }
   return null;
 }
+async function describeFocusedEditable(pageHandle) {
+  const active = await pageHandle.evaluate(`(() => {
+    const el = document.activeElement;
+    if (!el) return null;
+    const rect = el.getBoundingClientRect();
+    const role = (el.getAttribute("role") || "").toLowerCase();
+    const contentEditable = String(el.getAttribute("contenteditable") || "").toLowerCase();
+    const isEditable = el.matches("input, textarea")
+      || contentEditable === "" || contentEditable === "true"
+      || ["textbox", "searchbox", "combobox"].includes(role);
+    return {
+      tag: String(el.tagName || "").toLowerCase(),
+      id: el.id || "",
+      name: el.getAttribute("name") || "",
+      role,
+      ariaLabel: el.getAttribute("aria-label") || "",
+      type: el.getAttribute("type") || "",
+      placeholder: el.getAttribute("placeholder") || "",
+      text: String(el.textContent || "").trim().slice(0, 120),
+      isEditable,
+      rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
+    };
+  })()`);
+  return active && typeof active === "object" ? active : null;
+}
+async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
+  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
+  let candidate = await findBrowserVisualCandidate(pageHandle, target, viewport.width / 2, viewport.height / 2, true);
+  let source = "dom-candidate";
+  if (!candidate) {
+    candidate = await findBrowserVisualCandidate(pageHandle, target, viewport.width / 2, viewport.height / 2, true, true, true);
+    if (candidate?.["scrolledIntoView"] === true)
+      source += "+scroll";
+    if (candidate)
+      await pageHandle.waitForTimeout(150);
+  }
+  const center = candidate?.["center"];
+  const x = Number(center?.x);
+  const y = Number(center?.y);
+  if (!Number.isFinite(x) || !Number.isFinite(y)) {
+    throw new Error(`No visible editable candidate matched target "${target}". Run observe_bundle or dom_summary to inspect available labels/selectors.`);
+  }
+  await pageHandle.mouse.click(x, y);
+  await pageHandle.waitForTimeout(80);
+  const active = await describeFocusedEditable(pageHandle);
+  if (!active?.["isEditable"]) {
+    throw new Error(`Target "${target}" was clicked, but no editable element became focused. Matched element: ${candidate ? JSON.stringify({
+      tag: candidate["tag"],
+      text: candidate["text"],
+      ariaLabel: candidate["ariaLabel"],
+      placeholder: candidate["placeholder"],
+      name: candidate["name"]
+    }) : "(none)"}.`);
+  }
+  const selectAll = process.platform === "darwin" ? "Meta+A" : "Control+A";
+  await pageHandle.keyboard.press(selectAll);
+  await pageHandle.keyboard.type(text, { delay: typingDelay });
+  return { candidate, active, source };
+}
+/**
+ * Formats an `evaluate` failure, appending usage hints inferred from the error
+ * text and the submitted code.
+ *
+ * @param {unknown} err - Thrown value; an Error's message is used, anything else is stringified.
+ * @param {string} code8 - JavaScript source that was evaluated.
+ * @returns {string} First 500 chars of the error followed by one "Hint: ..." line per matching rule.
+ */
+function evaluateFailureMessage(err, code8) {
+  const raw = err instanceof Error ? err.message : String(err);
+  // Each entry pairs "does this rule apply?" with the hint to emit, in output order.
+  const rules = [
+    [
+      /map is not a function/i.test(raw) && /querySelectorAll/i.test(code8),
+      "document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...)."
+    ],
+    [
+      /(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8),
+      "Do not fill modern React/Vue/Svelte forms by assigning .value in evaluate; use playwright_browser fill, or visual_click the field then type, so input/change events fire."
+    ],
+    [
+      /querySelectorAll|querySelector/.test(code8),
+      "For page inspection, prefer query_all, dom_summary, or observe_bundle before raw evaluate."
+    ]
+  ];
+  const hintLines = rules.filter(([applies]) => applies).map(([, hint]) => `Hint: ${hint}`);
+  return [raw.slice(0, 500), ...hintLines].join("\n");
+}
 function buildImageMarker(buffer2) {
   let mimeType = "image/png";
   let out = buffer2;
@@ -10308,7 +10632,7 @@ var init_playwright_browser = __esm({
     PLAYWRIGHT_BROWSERS_DIR = join13(PLAYWRIGHT_RUNTIME_DIR, "browsers");
     PlaywrightBrowserTool = class {
       name = "playwright_browser";
-      description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
+      description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Use fill with a selector or natural-language target for form fields; avoid raw evaluate for form filling because direct .value assignment does not fire app input/change events. This is a separate browser/runtime from browser_action; once you start a workflow here, continue here unless you intentionally navigate browser_action to the same URL. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
       parameters = {
         type: "object",
         properties: {
@@ -10358,7 +10682,7 @@ var init_playwright_browser = __esm({
               "clear_diagnostics",
               "close"
             ],
-            description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text (for form fields)\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
+            description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
           },
           url: {
             type: "string",
@@ -10374,7 +10698,7 @@ var init_playwright_browser = __esm({
           },
           target: {
             type: "string",
-            description: "Natural-language browser visual target for visual_click, for example 'the green Continue button' or 'the search field'."
+            description: "Natural-language browser visual target for visual_click or selector-less fill, for example 'the green Continue button', 'username field', or 'password field'."
           },
           value: {
             type: "string",
@@ -10494,12 +10818,22 @@ var init_playwright_browser = __esm({
               return ok(`Clicked: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
             }
             case "fill": {
-              if (!selector)
-                return fail("selector is required", start2);
               if (text === void 0)
                 return fail("text is required", start2);
-              await page.fill(selector, text, { timeout: timeout2 });
-              return ok(`Filled ${selector} with "${text}"`, start2);
+              const typingDelay = typeof args.typing_delay_ms === "number" ? Math.max(0, Math.min(500, Math.round(args.typing_delay_ms))) : 20;
+              if (selector) {
+                const resolvedSelector = resolveDomSummarySelector(selector);
+                if (!resolvedSelector)
+                  return fail(`No selector known for DOM summary reference ${selector}; run dom_summary and use the emitted selector.`, start2);
+                await page.fill(resolvedSelector, text, { timeout: timeout2 });
+                return ok(`Filled ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""} with "${text}"`, start2);
+              }
+              const target = typeof args.target === "string" && args.target.trim() ? args.target.trim() : "";
+              if (!target)
+                return fail("selector or target is required for fill. Prefer target for visual/natural-language form fields, e.g. target='username field'.", start2);
+              const result = await clickAndFillBrowserTarget(page, target, text, typingDelay);
+              const active = result.active ?? {};
+              return ok(`Filled target "${target}" via ${result.source} into <${active["tag"] || "element"}>${active["name"] ? ` name=${JSON.stringify(active["name"])}` : ""}${active["placeholder"] ? ` placeholder=${JSON.stringify(active["placeholder"])}` : ""}${active["ariaLabel"] ? ` aria-label=${JSON.stringify(active["ariaLabel"])}` : ""}.`, start2);
             }
             case "type": {
               if (text === void 0)
@@ -10642,9 +10976,13 @@ var init_playwright_browser = __esm({
             case "evaluate": {
               if (!text)
                 return fail("text (JavaScript code) is required", start2);
-              const result = await page.evaluate(text);
-              const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
-              return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
+              try {
+                const result = await page.evaluate(text);
+                const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
+                return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
+              } catch (err2) {
+                return fail(evaluateFailureMessage(err2, text), start2);
+              }
             }
             // ── Screenshot / PDF ──
             case "screenshot": {
@@ -23743,8 +24081,8 @@ var init_explore_tools = __esm({
       enter_worktree: "Create isolated git worktree for safe parallel file modifications",
       exit_worktree: "Exit and optionally remove a git worktree (keep for merge or discard)",
       notebook_edit: "Edit Jupyter .ipynb notebooks at cell level (list, replace, insert, delete cells)",
-      browser_action: "Interactive browser: login, fill forms, click buttons, screenshot — session persists between calls; for console/page-error/network diagnostics prefer playwright_browser",
-      playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, focused-element typing for visual form filling, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
+      browser_action: "Interactive Selenium browser: login, fill forms, click buttons, screenshot — session persists between browser_action calls only; separate runtime from playwright_browser",
+      playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, selector/target fill, focused-element typing, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
       carbonyl_browser: "Terminal-rendered real browser automation via Carbonyl: navigate, read rendered text, click/type, sessions, daemon mode",
       scheduler: "Schedule tasks for automatic future execution via OS cron",
       cronjob: "Alias for scheduler: OS cron-backed time triggers",
@@ -282954,6 +283292,18 @@ var init_vision_action_loop = __esm({
             enum: ["single", "double"],
             description: "Click type for click operation. Default single."
           },
+          text: {
+            type: "string",
+            description: "Optional text to type after a live click, or into the currently focused desktop control when operation='none'."
+          },
+          key: {
+            type: "string",
+            description: "Optional key/chord to press after a live click/text entry, for example Enter, Escape, Tab, ctrl+f."
+          },
+          typing_delay_ms: {
+            type: "number",
+            description: "Per-character delay for desktop text entry. Default 10ms."
+          },
           index: {
             type: "number",
             description: "If multiple target points are found, use this 1-based index. Default 1."
@@ -282978,6 +283328,10 @@ var init_vision_action_loop = __esm({
             type: "string",
             description: "Optional screenshot output directory. Relative paths resolve from the workspace."
           },
+          window_title: {
+            type: "string",
+            description: "Optional X11/XWayland window title to capture instead of the whole desktop. Useful on Wayland when root screenshots are blocked."
+          },
           clear_artifacts: {
             type: "boolean",
             description: "Only for reset. If true, also deletes this session's screenshot directory."
@@ -283064,8 +283418,12 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
         const includeVision = asBoolean(args["include_vision"], true);
         const visionPromptOverride = asString(args["vision_prompt"]);
         const language = asString(args["language"]) || "eng";
+        const windowTitle = asString(args["window_title"]);
         const button = this.parseButton(args["button"]);
         const clickType = args["click_type"] === "double" ? "double" : "single";
+        const textToType = asString(args["text"]);
+        const keyToPress = asString(args["key"]);
+        const typingDelayMs = clampInteger2(args["typing_delay_ms"], 10, 0, 500);
         const index = clampInteger2(args["index"], 1, 1, 100);
         const delayMs = clampInteger2(args["delay_ms"], 0, 0, 6e4);
         const maxSteps = action === "run" ? clampInteger2(args["max_steps"], DEFAULT_MAX_STEPS, 1, HARD_MAX_STEPS) : 1;
@@ -283088,13 +283446,28 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
           }
           const stamp = timestampSlug2();
           const screenshotPath = join52(sessionDir2, `${stamp}-step-${step}-before.png`);
-          if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
-            yield "Vision action loop: requesting desktop screenshot permission if the system prompts";
+          if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
+            yield "Vision action loop: capturing full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
           }
           yield `Vision action loop: capturing screenshot ${step}/${maxSteps}`;
           let screenshotBackend = "";
+          let captureOffset = { x: 0, y: 0 };
+          let captureWindow;
           try {
-            screenshotBackend = captureDesktopScreenshot(screenshotPath);
+            if (windowTitle) {
+              const windowCapture = captureDesktopWindowScreenshot(screenshotPath, windowTitle);
+              screenshotBackend = windowCapture.backend;
+              captureOffset = { x: windowCapture.x, y: windowCapture.y };
+              captureWindow = {
+                windowId: windowCapture.windowId,
+                x: windowCapture.x,
+                y: windowCapture.y,
+                width: windowCapture.width,
+                height: windowCapture.height
+              };
+            } else {
+              screenshotBackend = captureDesktopScreenshot(screenshotPath);
+            }
             mutatedFiles.push(screenshotPath);
           } catch (err) {
             success = false;
@@ -283121,6 +283494,9 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
           outputLines.push(`Screenshot saved: ${screenshotPath}`);
           outputLines.push(`Screen: ${dims.width}x${dims.height}`);
           outputLines.push(`Screenshot backend: ${screenshotBackend}`);
+          if (captureWindow) {
+            outputLines.push(`Window: ${windowTitle} id=${captureWindow.windowId} geometry=${captureWindow.x},${captureWindow.y} ${captureWindow.width}x${captureWindow.height}`);
+          }
           let ocr = null;
           if (includeOcr) {
             yield `Vision action loop: running OCR for screenshot ${step}/${maxSteps}`;
@@ -283209,6 +283585,14 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
               selectedPoint = pointFromVisionResult({ points: ocrPoints, source: "tesseract-ocr" }, index, dims);
             }
             if (selectedPoint) {
+              if (captureOffset.x !== 0 || captureOffset.y !== 0) {
+                selectedPoint = {
+                  ...selectedPoint,
+                  pixelX: selectedPoint.pixelX + captureOffset.x,
+                  pixelY: selectedPoint.pixelY + captureOffset.y,
+                  source: `${selectedPoint.source}+window:${captureWindow?.windowId ?? "active"}`
+                };
+              }
               outputLines.push(`Target: ${effectiveTarget}`);
               outputLines.push(`Point source: ${selectedPoint.source}`);
               outputLines.push(`Mapped point: (${Math.round(selectedPoint.pixelX)}, ${Math.round(selectedPoint.pixelY)}) normalized (${selectedPoint.x.toFixed(4)}, ${selectedPoint.y.toFixed(4)})`);
@@ -283238,15 +283622,29 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
             } else {
               yield `Vision action loop: performing ${effectiveOperation} at (${pixelX}, ${pixelY})`;
               try {
+                if (captureWindow?.windowId) {
+                  const activationBackend = activateDesktopWindow(captureWindow.windowId);
+                  outputLines.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
+                }
                 const backend = effectiveOperation === "move" ? moveDesktopPointer(pixelX, pixelY) : clickDesktopAt(pixelX, pixelY, button, clickType);
                 actionTaken = effectiveOperation === "move" ? `Moved pointer to (${pixelX}, ${pixelY}) via ${backend}` : `Clicked at (${pixelX}, ${pixelY}) via ${backend} [${button} ${clickType}]`;
                 outputLines.push(actionTaken);
+                if (textToType) {
+                  const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
+                  outputLines.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
+                  actionTaken += `; typed text via ${keyboardBackend}`;
+                }
+                if (keyToPress) {
+                  const keyboardBackend = pressDesktopKey(keyToPress);
+                  outputLines.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
+                  actionTaken += `; pressed key via ${keyboardBackend}: ${keyToPress}`;
+                }
                 afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
-                if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
-                  yield "Vision action loop: requesting desktop screenshot permission for post-action verification if the system prompts";
+                if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
+                  yield "Vision action loop: capturing post-action full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
                 }
                 yield `Vision action loop: capturing post-action screenshot ${step}/${maxSteps}`;
-                const afterBackend = captureDesktopScreenshot(afterScreenshotPath);
+                const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
                 mutatedFiles.push(afterScreenshotPath);
                 outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
                 outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
@@ -283260,6 +283658,35 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
             }
           } else if (effectiveOperation !== "none" && (effectiveTarget || hasCoordinates)) {
             outputLines.push(`Action skipped: no usable point for operation '${effectiveOperation}'.`);
+          } else if (allowActions && !dryRun && (textToType || keyToPress)) {
+            try {
+              const keyboardActions = [];
+              if (captureWindow?.windowId) {
+                const activationBackend = activateDesktopWindow(captureWindow.windowId);
+                keyboardActions.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
+              }
+              if (textToType) {
+                const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
+                keyboardActions.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
+              }
+              if (keyToPress) {
+                const keyboardBackend = pressDesktopKey(keyToPress);
+                keyboardActions.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
+              }
+              actionTaken = keyboardActions.join("; ");
+              outputLines.push(actionTaken);
+              afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
+              const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
+              mutatedFiles.push(afterScreenshotPath);
+              outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
+              outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
+            } catch (err) {
+              success = false;
+              actionError = err instanceof Error ? err.message : String(err);
+              error = actionError;
+              outputLines.push(`Keyboard action failed: ${actionError}`);
+              outputLines.push(desktopAutomationRecoveryMessage());
+            }
           } else if (action === "run" && !effectiveTarget && !hasCoordinates) {
             success = false;
             error = "Vision loop stopped: no target or coordinates were provided and visual planning did not identify a clickable target. Stopping instead of repeating observe-only screenshots.";
@@ -284152,6 +284579,7 @@ async function ensureSession(options2 = {}) {
       }
       activeSessionId = null;
       activeSessionHeadless = null;
+      activeSessionUrl = null;
     }
   }
   if (activeSessionId) {
@@ -284163,6 +284591,13 @@ async function ensureSession(options2 = {}) {
     }
     activeSessionId = null;
     activeSessionHeadless = null;
+    activeSessionUrl = null;
+  }
+  if (options2.allowCreate === false) {
+    return {
+      error: "No active browser_action Selenium session exists for this action. browser_action is a separate browser/runtime from playwright_browser; continue the current page with playwright_browser, or call browser_action({action:'navigate', url: ...}) first.",
+      sessionId: ""
+    };
   }
   const headless = options2.headless ?? defaultBrowserHeadless();
   const res = await fetch(`${BASE_URL}/session/start`, {
@@ -284180,8 +284615,16 @@ async function ensureSession(options2 = {}) {
     return { error: String(data.message ?? "Failed to start browser session"), sessionId: "" };
   activeSessionId = data.session_id;
   activeSessionHeadless = headless;
+  activeSessionUrl = null;
   return { sessionId: activeSessionId };
 }
+function browserActionRuntimeHint() {
+  return [
+    "browser_action is a separate browser/runtime from playwright_browser and uses its own Selenium/Chrome session; it does not share page state, cookies, focus, or navigation.",
+    activeSessionUrl ? `Current browser_action URL: ${activeSessionUrl}` : "Current browser_action URL: unknown or not navigated.",
+    "If this page was opened with playwright_browser, keep using playwright_browser actions such as dom_summary, fill, type, press, visual_click, and observe_bundle."
+  ].join(" ");
+}
 async function apiCall(endpoint, method = "POST", body) {
   const options2 = {
     method,
@@ -284204,7 +284647,7 @@ async function apiCall(endpoint, method = "POST", body) {
   const res = await fetch(url, options2);
   return await res.json();
 }
-var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, BrowserActionTool;
+var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
 var init_browser_action = __esm({
   "packages/execution/dist/tools/browser-action.js"() {
     "use strict";
@@ -284217,9 +284660,10 @@ var init_browser_action = __esm({
     serviceProcess = null;
     activeSessionId = null;
     activeSessionHeadless = null;
+    activeSessionUrl = null;
     BrowserActionTool = class {
       name = "browser_action";
-      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
+      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
       parameters = {
         type: "object",
         properties: {
@@ -284289,27 +284733,38 @@ var init_browser_action = __esm({
         const requestedWidth = args.width == null ? void 0 : asPositiveInt2(args.width, 1280, 320, 3840);
         const requestedHeight = args.height == null ? void 0 : asPositiveInt2(args.height, 720, 240, 2160);
         const requestedScale = args.device_scale_factor == null ? void 0 : asPositiveNumber(args.device_scale_factor, 1, 0.25, 3);
-        const launchErr = await launchService();
-        if (launchErr) {
-          return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
-        }
         if (action === "close") {
-          if (activeSessionId) {
+          if (activeSessionId || await probeService()) {
             try {
               await apiCall("/session/close");
             } catch {
             }
             activeSessionId = null;
             activeSessionHeadless = null;
+            activeSessionUrl = null;
           }
           return { success: true, output: "Browser session closed.", durationMs: Date.now() - start2 };
         }
+        const actionStartsSession = action === "navigate";
+        if (!actionStartsSession && !activeSessionId) {
+          return {
+            success: false,
+            output: "",
+            error: `browser_action ${action || "(missing action)"} requires an active browser_action session. ` + browserActionRuntimeHint(),
+            durationMs: Date.now() - start2
+          };
+        }
+        const launchErr = await launchService();
+        if (launchErr) {
+          return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
+        }
         const session = await ensureSession({
           width: requestedWidth,
           height: requestedHeight,
           deviceScaleFactor: requestedScale,
           headless: asOptionalBoolean2(args.headless),
-          forceNew: asOptionalBoolean2(args.force_new) === true
+          forceNew: asOptionalBoolean2(args.force_new) === true,
+          allowCreate: actionStartsSession
         });
         if (session.error) {
           return { success: false, output: "", error: session.error, durationMs: Date.now() - start2 };
@@ -284327,7 +284782,13 @@ var init_browser_action = __esm({
               }
               result = await apiCall("/navigate", "POST", { url: args.url });
               if (result.ok) {
-                return { success: true, output: `Navigated to ${args.url}`, durationMs: Date.now() - start2 };
+                activeSessionUrl = args.url;
+                return {
+                  success: true,
+                  output: `Navigated to ${args.url}
+Runtime: browser_action Selenium/Chrome session. Continue with browser_action for this page, or use playwright_browser separately after navigating it.`,
+                  durationMs: Date.now() - start2
+                };
               }
               const navMsg = String(result.message ?? "Navigation failed");
               const navHint = navMsg.toLowerCase().includes("connection") || navMsg.toLowerCase().includes("refused") || navMsg.toLowerCase().includes("err_connection") ? " (the URL appears unreachable — check if the target server is running and accepting connections)" : navMsg.toLowerCase().includes("timeout") ? " (page load timed out — try again or use a different URL)" : "";
@@ -284349,7 +284810,7 @@ var init_browser_action = __esm({
               return {
                 success: false,
                 output: `Click on ${args.selector} failed: ${clickMsg}`,
-                error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page.`,
+                error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page. ${browserActionRuntimeHint()}`,
                 durationMs: Date.now() - start2
               };
             }
@@ -284391,7 +284852,7 @@ var init_browser_action = __esm({
               return {
                 success: false,
                 output: `Type into ${args.selector} failed: ${typeMsg}`,
-                error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check.`,
+                error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check. ${browserActionRuntimeHint()}`,
                 durationMs: Date.now() - start2
               };
             }
@@ -284532,7 +284993,7 @@ var init_browser_action = __esm({
                 if (!pointResult || pointResult.points.length === 0) {
                   return {
                     success: false,
-                    output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead.`,
+                    output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead. ${browserActionRuntimeHint()}`,
                     error: "No point backend returned normalized coordinates.",
                     durationMs: Date.now() - start2
                   };

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.205",
+  "version": "1.0.207",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.205",
+      "version": "1.0.207",
       "bundleDependencies": [
         "image-to-ascii"
       ],
@@ -4565,9 +4565,19 @@
       }
     },
     "node_modules/js-yaml": {
-      "version": "4.1.1",
-      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
-      "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.2.0.tgz",
+      "integrity": "sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/puzrin"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/nodeca"
+        }
+      ],
       "license": "MIT",
       "dependencies": {
         "argparse": "^2.0.1"

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.205",
+  "version": "1.0.207",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",