omnius 1.0.205 → 1.0.207

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3987,9 +3987,33 @@ var init_system_deps = __esm({
3987
3987
  // packages/execution/dist/tools/desktop-control.js
3988
3988
  import { execSync as execSync4 } from "node:child_process";
3989
3989
  import { existsSync as existsSync6, statSync as statSync3 } from "node:fs";
/**
 * Brings the given desktop window to the foreground.
 *
 * Only Linux backends are implemented: `xdotool windowactivate --sync` is
 * tried first, then `wmctrl -ia`. Every backend that is unavailable or fails
 * is recorded so the thrown error explains what was attempted.
 *
 * @param {string|number} windowId - Window id passed to xdotool/wmctrl.
 * @returns {string} Label of the backend that succeeded ("xdotool" or "wmctrl").
 * @throws {Error} When no backend activates the window.
 */
function activateDesktopWindow(windowId) {
  const attempts = [];
  if (process.platform !== "linux") {
    // Without this entry the error below would list no attempts at all.
    attempts.push({
      label: "platform",
      message: `window activation is not implemented for ${process.platform}`
    });
  } else {
    const quotedId = quoteShell(windowId);
    const backends = [
      { label: "xdotool", command: `xdotool windowactivate --sync ${quotedId}` },
      { label: "wmctrl", command: `wmctrl -ia ${quotedId}` }
    ];
    for (const { label, command } of backends) {
      if (!hasCommand2(label)) {
        attempts.push({ label, message: "not found on PATH" });
        continue;
      }
      const result = run(command, 5e3);
      if (result.ok)
        return label;
      attempts.push({ label, message: result.message });
    }
  }
  throw new Error("No desktop window activation backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
3990
4013
  function captureDesktopScreenshot(outputPath3) {
3991
4014
  const attempts = [];
3992
4015
  const out = quoteShell(outputPath3);
4016
+ const allowInteractiveWaylandScreenshot = envFlag("OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT", false);
3993
4017
  const tryCapture = (label, command, timeout2 = 1e4) => {
3994
4018
  const result = run(command, timeout2);
3995
4019
  if (result.ok && existsSync6(outputPath3)) {
@@ -4022,11 +4046,22 @@ $bitmap.Dispose()
4022
4046
  return backend;
4023
4047
  } else if (process.platform === "linux") {
4024
4048
  const desktop = `${process.env["XDG_CURRENT_DESKTOP"] || ""} ${process.env["DESKTOP_SESSION"] || ""}`;
4025
- if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor")) {
4049
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && /gnome/i.test(desktop) && hasCommand2("gdbus") && allowInteractiveWaylandScreenshot) {
4050
+ const backend = tryCapture("gnome-shell-interactive-screenshot-dbus", gnomeInteractiveScreenshotCommand(outputPath3), 13e4);
4051
+ if (backend)
4052
+ return backend;
4053
+ }
4054
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor") && allowInteractiveWaylandScreenshot) {
4026
4055
  const backend = tryCapture("xdg-desktop-portal-screenshot", portalScreenshotCommand(outputPath3), 13e4);
4027
4056
  if (backend)
4028
4057
  return backend;
4029
4058
  }
4059
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && !allowInteractiveWaylandScreenshot) {
4060
+ attempts.push({
4061
+ label: "interactive-wayland-screenshot",
4062
+ message: "skipped by default to avoid unattended GNOME/portal screenshot selection stalls. For desktop app loops, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window. Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt."
4063
+ });
4064
+ }
4030
4065
  if (/gnome/i.test(desktop) && hasCommand2("gdbus")) {
4031
4066
  const backend = tryCapture("gnome-shell-screenshot-dbus", `gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.Screenshot false false ${out}`);
4032
4067
  if (backend)
@@ -4079,6 +4114,83 @@ $bitmap.Dispose()
4079
4114
  }
4080
4115
  throw new Error("No desktop screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
4081
4116
  }
/**
 * Captures a single X11/XWayland window into `outputPath3` using xdotool to
 * locate the window and ImageMagick `import` to grab it.
 *
 * @param {string} outputPath3 - Destination image path.
 * @param {string} [windowTitle] - Title pattern handed to `xdotool search --name`;
 *   when omitted the currently active window is captured.
 * @returns {{backend: string, windowId: string, x: number, y: number, width: number, height: number}}
 *   Backend label, resolved window id and the window geometry.
 * @throws {Error} On non-Linux platforms, or when any step of the capture fails.
 */
function captureDesktopWindowScreenshot(outputPath3, windowTitle) {
  if (process.platform !== "linux") {
    throw new Error("Window screenshot capture is currently implemented for Linux/X11/XWayland windows only.");
  }
  const attempts = [];
  const failure = () => new Error("No desktop window screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
  // Runs a command and returns trimmed stdout, or null after recording the failure.
  const runText = (command, timeout2 = 5e3) => {
    const result = runCaptureText(command, timeout2);
    if (result.ok)
      return result.output.trim();
    attempts.push({ label: command.split(/\s+/)[0] || command, message: result.message });
    return null;
  };
  if (!hasCommand2("xdotool")) {
    attempts.push({ label: "xdotool", message: "not found on PATH" });
    throw failure();
  }
  if (!hasCommand2("import")) {
    attempts.push({ label: "import", message: "ImageMagick import not found on PATH" });
    throw failure();
  }
  // Single-quote user-controlled values so the shell does not expand `$` or backticks.
  const windowId = windowTitle
    ? runText(`xdotool search --name ${quoteShellLiteral(String(windowTitle))} | tail -n1`)
    : runText("xdotool getactivewindow");
  if (windowId === null)
    throw failure();
  if (windowId === "") {
    // `| tail -n1` hides xdotool's non-zero exit, so "no match" arrives as empty output.
    attempts.push({
      label: "xdotool",
      message: windowTitle ? `no window matched title ${JSON.stringify(windowTitle)}` : "no active window was reported"
    });
    throw failure();
  }
  const geometry = runText(`xdotool getwindowgeometry --shell ${quoteShellLiteral(windowId)}`);
  if (geometry === null)
    throw failure();
  const parsed = parseXdotoolGeometry(geometry);
  if (!parsed) {
    attempts.push({ label: "xdotool getwindowgeometry", message: `Could not parse geometry for window ${windowId}: ${geometry}` });
    throw failure();
  }
  const result = run(`import -window ${quoteShellLiteral(windowId)} ${quoteShellLiteral(String(outputPath3))}`, 1e4);
  if (!(result.ok && existsSync6(outputPath3))) {
    attempts.push({ label: "import-window", message: result.message });
    throw failure();
  }
  const inspection = inspectScreenshot(outputPath3);
  if (!inspection.ok) {
    attempts.push({ label: "import-window", message: inspection.message });
    throw failure();
  }
  return {
    backend: "imagemagick-import-window",
    windowId,
    ...parsed
  };
}
/**
 * Builds the `bash -lc '<script>'` command line that asks GNOME Shell for an
 * interactive screenshot over D-Bus and copies the resulting file to
 * `outputPath3`.
 *
 * The generated script calls `org.gnome.Shell.Screenshot.InteractiveScreenshot`,
 * extracts the `file:` URI from the reply, and copies it with `gio` or, failing
 * that, a small `python3` helper. It exits non-zero when the call fails, no URI
 * is returned, or neither copy tool exists.
 *
 * The output path is single-quoted inside the script so `$` or backticks in the
 * path are not expanded by bash.
 *
 * @param {string} outputPath3 - Destination image path.
 * @returns {string} Shell command string.
 */
function gnomeInteractiveScreenshotCommand(outputPath3) {
  const script = `
set -eu
out=${quoteShellLiteral(String(outputPath3))}
printf '%s\\n' 'Omnius desktop screenshot: choose Full Screen in the GNOME screenshot UI, then press Enter.' >&2
call_out="$(gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.InteractiveScreenshot 2>&1)" || {
  printf '%s\\n' "$call_out" >&2
  exit 1
}
uri="$(printf '%s\\n' "$call_out" | sed -n "s/.*'\\(file:[^']*\\)'.*/\\1/p" | tail -n 1)"
if [ -z "$uri" ]; then
  printf 'GNOME interactive screenshot did not return a file URI: %s\\n' "$call_out" >&2
  exit 1
fi

if command -v gio >/dev/null 2>&1; then
  gio copy -f "$uri" "$out"
elif command -v python3 >/dev/null 2>&1; then
  python3 - "$uri" "$out" <<'PY'
import shutil, sys, urllib.parse
uri, out = sys.argv[1], sys.argv[2]
parsed = urllib.parse.urlparse(uri)
if parsed.scheme != "file":
    raise SystemExit(f"Unsupported screenshot URI scheme: {parsed.scheme}")
shutil.copyfile(urllib.parse.unquote(parsed.path), out)
PY
else
  printf 'Need gio or python3 to copy GNOME screenshot URI %s to %s.\\n' "$uri" "$out" >&2
  exit 1
fi
`.trim();
  return `bash -lc ${quoteShellLiteral(script)}`;
}
4082
4194
  function portalScreenshotCommand(outputPath3) {
4083
4195
  const script = `
4084
4196
  set -eu
@@ -4086,6 +4198,7 @@ out=${quoteShell(outputPath3)}
4086
4198
  monitor_file="$(mktemp)"
4087
4199
  cleanup() {
4088
4200
  if [ -n "\${monpid:-}" ]; then kill "$monpid" >/dev/null 2>&1 || true; fi
4201
+ if [ -n "\${keypid:-}" ]; then kill "$keypid" >/dev/null 2>&1 || true; fi
4089
4202
  rm -f "$monitor_file"
4090
4203
  }
4091
4204
  trap cleanup EXIT
@@ -4102,6 +4215,25 @@ fi
4102
4215
 
4103
4216
  dbus-monitor --session "type='signal',interface='org.freedesktop.portal.Request',member='Response',path='$handle'" > "$monitor_file" 2>&1 &
4104
4217
  monpid=$!
4218
+ if [ "\${OMNIUS_SCREENSHOT_AUTO_CONFIRM:-1}" != "0" ]; then
4219
+ (
4220
+ sleep 1
4221
+ i=0
4222
+ while [ "$i" -lt 8 ]; do
4223
+ if command -v xdotool >/dev/null 2>&1; then
4224
+ DISPLAY="\${DISPLAY:-:0}" xdotool key Return >/dev/null 2>&1 || true
4225
+ elif command -v ydotool >/dev/null 2>&1; then
4226
+ ydotool key 28:1 28:0 >/dev/null 2>&1 || true
4227
+ elif command -v dotool >/dev/null 2>&1; then
4228
+ printf 'key enter
4229
+ ' | dotool >/dev/null 2>&1 || true
4230
+ fi
4231
+ i=$((i + 1))
4232
+ sleep 0.5
4233
+ done
4234
+ ) &
4235
+ keypid=$!
4236
+ fi
4105
4237
  deadline=$(( $(date +%s) + 120 ))
4106
4238
  while :; do
4107
4239
  if grep -q "member=Response" "$monitor_file"; then break; fi
@@ -4144,7 +4276,7 @@ else
4144
4276
  exit 1
4145
4277
  fi
4146
4278
  `.trim();
4147
- return `bash -lc ${quoteShell(script)}`;
4279
+ return `bash -lc ${quoteShellLiteral(script)}`;
4148
4280
  }
4149
4281
  function moveDesktopPointer(x, y) {
4150
4282
  const result = performDesktopPointerAction({ x, y, moveOnly: true });
@@ -4167,6 +4299,25 @@ function clickDesktopAt(x, y, button, clickType) {
4167
4299
  lastPointer = { x: Math.round(x), y: Math.round(y) };
4168
4300
  return result.backend;
4169
4301
  }
/**
 * Types `text` into the focused desktop window through the keyboard backend
 * chain in performDesktopKeyboardAction.
 *
 * @param {string} text - Text to type.
 * @param {number} [delayMs=10] - Per-keystroke delay forwarded to the backend.
 * @returns {string} Label of the backend that performed the typing.
 * @throws {Error} With the backend failure message when no backend succeeds.
 */
function typeDesktopText(text, delayMs = 10) {
  const outcome = performDesktopKeyboardAction({ kind: "type", text, delayMs });
  if (outcome.ok) {
    return outcome.backend;
  }
  throw new Error(outcome.message);
}
/**
 * Presses a single key (or key chord) through the keyboard backend chain in
 * performDesktopKeyboardAction.
 *
 * @param {string} key - Key name forwarded unchanged to the backend.
 * @returns {string} Label of the backend that pressed the key.
 * @throws {Error} With the backend failure message when no backend succeeds.
 */
function pressDesktopKey(key) {
  const outcome = performDesktopKeyboardAction({ kind: "key", key });
  if (outcome.ok) {
    return outcome.backend;
  }
  throw new Error(outcome.message);
}
4170
4321
  function tryRunXdotoolShellFallback(command) {
4171
4322
  if (!/\bxdotool\b|\bxdtool\b/.test(command))
4172
4323
  return null;
@@ -4278,7 +4429,9 @@ function desktopAutomationRecoveryMessage(command) {
4278
4429
  " Linux pointer control: xdotool/X11, ydotool, dotool, python-xlib",
4279
4430
  " macOS: cliclick, then System Events",
4280
4431
  " Windows: PowerShell user32 input",
4281
- "On Wayland, Omnius requests screenshot permission through xdg-desktop-portal when available. Approve the system screenshot prompt to continue.",
4432
+ "On Wayland, unattended full-desktop screenshots are compositor-restricted and interactive screenshot prompts are skipped by default.",
4433
+ "For desktop app loops on GNOME Wayland, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window without a full-screen prompt.",
4434
+ "Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt.",
4282
4435
  "On GNOME Wayland, unattended screenshots may still be denied by compositor policy. Install gnome-screenshot or grant screenshot permission for the session if capture is blocked.",
4283
4436
  "On Wayland pointer control, install and enable ydotool or dotool when xdotool cannot open an X display."
4284
4437
  ].join("\n");
@@ -4382,6 +4535,72 @@ ${options2.moveOnly ? "" : `for _ in range(${clicks}):
4382
4535
  message: "No desktop mouse backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
4383
4536
  };
4384
4537
  }
/**
 * Types text or presses a key on the desktop using the first backend that works.
 *
 * Linux tries xdotool, then ydotool, then dotool; macOS uses System Events via
 * osascript. Each unavailable or failing backend is recorded so the failure
 * message lists what was attempted.
 *
 * User-supplied text and key names are single-quoted for the shell, so `$`,
 * backticks and `%` reach the backend literally instead of being expanded.
 *
 * @param {{kind: "type", text: string, delayMs?: number} | {kind: "key", key: string}} options2
 *   Requested action. `delayMs` is clamped to 0..500 and defaults to 10 when
 *   missing or not a finite number.
 * @returns {{ok: true, backend: string} | {ok: false, message: string}}
 */
function performDesktopKeyboardAction(options2) {
  const attempts = [];
  const isType = options2.kind === "type";
  const payload = String(isType ? options2.text : options2.key);
  const tryAction = (label, command) => {
    const result = run(command, 1e4);
    if (result.ok)
      return label;
    attempts.push({ label, message: result.message });
    return null;
  };
  if (process.platform === "linux") {
    const arg = quoteShellLiteral(payload);
    const requestedDelay = Number(options2.delayMs);
    // Guard against `--delay NaN` when the caller omits delayMs.
    const delay = Number.isFinite(requestedDelay) ? Math.max(0, Math.min(500, Math.round(requestedDelay))) : 10;
    const backends = [
      {
        label: "xdotool",
        command: isType ? `xdotool type --clearmodifiers --delay ${delay} ${arg}` : `xdotool key --clearmodifiers ${arg}`
      },
      {
        label: "ydotool",
        command: isType ? `ydotool type ${arg}` : `ydotool key ${arg}`
      },
      {
        // Pass the payload as a printf argument, never as the format string.
        // NOTE(review): a newline inside the text would start a new dotool command — confirm callers.
        label: "dotool",
        command: `printf '%s\\n' ${quoteShellLiteral(`${isType ? "type" : "key"} ${payload}`)} | dotool`
      }
    ];
    for (const { label, command } of backends) {
      if (!hasCommand2(label)) {
        attempts.push({ label, message: "not found on PATH" });
        continue;
      }
      const backend = tryAction(label, command);
      if (backend)
        return { ok: true, backend };
    }
  } else if (process.platform === "darwin") {
    // NOTE(review): AppleScript `key code` normally takes a numeric code — confirm what callers pass as `key`.
    const script = isType
      ? `tell application "System Events" to keystroke ${quoteAppleScript(options2.text)}`
      : `tell application "System Events" to key code ${quoteAppleScript(options2.key)}`;
    const backend = tryAction("osascript-system-events", `osascript -e ${quoteShellLiteral(script)}`);
    if (backend)
      return { ok: true, backend };
  } else {
    attempts.push({
      label: "platform",
      message: `keyboard control is not implemented for ${process.platform}`
    });
  }
  return {
    ok: false,
    message: "No desktop keyboard backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
  };
}
/**
 * Parses `KEY=value` lines (as printed by `xdotool getwindowgeometry --shell`)
 * into a geometry record.
 *
 * Only lines of the exact form `UPPERCASE=<integer>` are considered; when a key
 * repeats, the last occurrence wins.
 *
 * @param {string} raw - Raw command output.
 * @returns {{x: number, y: number, width: number, height: number} | null}
 *   The geometry, or null unless X, Y, WIDTH and HEIGHT are all finite numbers.
 */
function parseXdotoolGeometry(raw) {
  const assignment = /^([A-Z]+)=(-?\d+)$/;
  const fields = {};
  raw.split(/\r?\n/).forEach((entry) => {
    const hit = assignment.exec(entry);
    if (hit !== null) {
      fields[hit[1]] = Number(hit[2]);
    }
  });
  const { X: x, Y: y, WIDTH: width, HEIGHT: height } = fields;
  const complete = [x, y, width, height].every((value2) => Number.isFinite(value2));
  return complete ? { x, y, width, height } : null;
}
4385
4604
  function windowsMouseScript(x, y, down, up, clicks = 1) {
4386
4605
  const clickBody = down == null || up == null ? "" : `
4387
4606
  for ($i = 0; $i -lt ${clicks}; $i++) {
@@ -4505,6 +4724,26 @@ function run(command, timeout2) {
4505
4724
  };
4506
4725
  }
4507
4726
  }
/**
 * Runs a shell command synchronously and captures its stdout as UTF-8 text.
 *
 * @param {string} command - Command line passed to execSync.
 * @param {number} timeout2 - Timeout in milliseconds.
 * @returns {{ok: boolean, output: string, message: string}}
 *   On success `output` is stdout and `message` is empty. On failure `output`
 *   is whatever stdout was captured and `message` is the first 800 characters
 *   of stderr, stdout, or the error text (in that order of preference).
 */
function runCaptureText(command, timeout2) {
  const execOptions = {
    stdio: ["pipe", "pipe", "pipe"],
    timeout: timeout2,
    env: { ...process.env },
    encoding: "utf8"
  };
  let output;
  try {
    output = execSync4(command, execOptions);
  } catch (err) {
    const failure = err;
    const capturedOut = bufferishToString(failure.stdout);
    const capturedErr = bufferishToString(failure.stderr);
    const detail = capturedErr || capturedOut || failure.message || String(err);
    return {
      ok: false,
      output: capturedOut,
      message: detail.trim().slice(0, 800)
    };
  }
  return { ok: true, output, message: "" };
}
4508
4747
  function bufferishToString(value2) {
4509
4748
  if (Buffer.isBuffer(value2))
4510
4749
  return value2.toString("utf8");
@@ -4517,6 +4756,18 @@ function powershellCommand(script) {
/**
 * Wraps a value in double quotes by JSON-encoding it.
 *
 * NOTE(review): a POSIX shell still expands `$` and backticks inside double
 * quotes, so this is not a safe quoting for untrusted text — verify callers.
 *
 * @param {*} value2 - Value to encode.
 * @returns {string} The JSON representation of `value2`.
 */
function quoteShell(value2) {
  const encoded = JSON.stringify(value2);
  return encoded;
}
/**
 * Quotes a value as a single POSIX shell word using single quotes, so the
 * shell performs no expansion on it. Embedded single quotes are emitted as
 * `'"'"'` (close quote, double-quoted quote, reopen quote).
 *
 * Non-string values (for example numeric window ids) are converted with
 * `String()` instead of throwing.
 *
 * @param {*} value2 - Value to quote.
 * @returns {string} Single-quoted shell literal.
 */
function quoteShellLiteral(value2) {
  return `'${String(value2).split("'").join(`'"'"'`)}'`;
}
/**
 * Reads a boolean flag from the environment.
 *
 * @param {string} name10 - Environment variable name.
 * @param {boolean} fallback - Value returned when the variable is not set.
 * @returns {boolean} `fallback` when unset; otherwise true only when the trimmed
 *   value is 1, true, yes or on (case-insensitive). Any other set value is false.
 */
function envFlag(name10, fallback) {
  const raw = process.env[name10];
  return raw === undefined ? fallback : /^(1|true|yes|on)$/i.test(raw.trim());
}
/**
 * Encodes a value as a double-quoted string for embedding in AppleScript
 * source, using JSON encoding.
 *
 * NOTE(review): JSON escapes such as `\uXXXX` may not be understood by
 * AppleScript — confirm for text containing control characters.
 *
 * @param {*} value2 - Value to encode.
 * @returns {string} The JSON representation of `value2`.
 */
function quoteAppleScript(value2) {
  const encoded = JSON.stringify(value2);
  return encoded;
}
4520
4771
  function psString(value2) {
4521
4772
  return value2.replace(/'/g, "''");
4522
4773
  }
@@ -8507,7 +8758,7 @@ function resetMoondreamClient() {
8507
8758
  function getVisionPointDiagnostics() {
8508
8759
  return [...lastPointDiagnostics];
8509
8760
  }
8510
- function envFlag(value2, fallback = false) {
8761
+ function envFlag2(value2, fallback = false) {
8511
8762
  if (value2 === void 0)
8512
8763
  return fallback;
8513
8764
  if (/^(1|true|yes|on)$/i.test(value2.trim()))
@@ -8627,8 +8878,8 @@ function resolveHuggingFaceVisionModelCandidates(preferredModel) {
8627
8878
  for (const model of splitModelList(process.env["OMNIUS_MOONDREAM_HF_MODELS"]))
8628
8879
  add2(model, true);
8629
8880
  add2(process.env["MOONDREAM_HF_MODEL"] || "", Boolean(process.env["MOONDREAM_HF_MODEL"]));
8630
- const fullPreviewAuto = envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
8631
- const compactFallbackAuto = envFlag(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
8881
+ const fullPreviewAuto = envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
8882
+ const compactFallbackAuto = envFlag2(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
8632
8883
  const hasExplicitHf = candidates.some((candidate) => candidate.explicit);
8633
8884
  const hasExplicitNonHf = Boolean(preferred) && !isHuggingFaceVisionModel(preferred);
8634
8885
  if (hasExplicitNonHf)
@@ -8937,7 +9188,7 @@ async function callOllamaVision(ollamaHost, model, prompt, imageBase64, timeoutM
8937
9188
  return typeof data.response === "string" && data.response.trim() ? data.response : null;
8938
9189
  }
8939
9190
  function shouldAutoPullOllamaVisionModel(model) {
8940
- if (!envFlag(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
9191
+ if (!envFlag2(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
8941
9192
  return false;
8942
9193
  return Boolean(model.trim());
8943
9194
  }
@@ -9031,7 +9282,7 @@ async function prepareHuggingFaceVisionCandidate(candidate, diagnostics) {
9031
9282
  return { python, env: env2, gpuIndex: decision2.lease.gpuIndex, release: () => decision2.lease.release() };
9032
9283
  }
9033
9284
  function ensureHuggingFaceVisionPython(modelId) {
9034
- const managed = envFlag(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
9285
+ const managed = envFlag2(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
9035
9286
  if (!managed)
9036
9287
  return legacyHuggingFaceVisionPython(modelId);
9037
9288
  ensureUnifiedCacheDirs();
@@ -10075,6 +10326,79 @@ function pngDimensions(buffer2) {
10075
10326
  }
10076
10327
  return null;
10077
10328
  }
/**
 * Describes the element that currently has focus in the page and reports
 * whether it can receive typed text.
 *
 * An element counts as editable when it is an `<input>`/`<textarea>`, is
 * content-editable, or has an ARIA role of textbox, searchbox or combobox.
 * A missing `contenteditable` attribute does NOT count as editable; it is
 * read as null rather than folded into the empty string.
 *
 * @param {{evaluate: (script: string) => Promise<*>}} pageHandle - Page-like
 *   object whose `evaluate` runs the script in the page context.
 * @returns {Promise<object|null>} Descriptor with tag, id, name, role,
 *   ariaLabel, type, placeholder, text, isEditable and rect, or null when no
 *   element is focused.
 */
async function describeFocusedEditable(pageHandle) {
  const active = await pageHandle.evaluate(`(() => {
    const el = document.activeElement;
    if (!el) return null;
    const rect = el.getBoundingClientRect();
    const role = (el.getAttribute("role") || "").toLowerCase();
    const editableAttr = el.getAttribute("contenteditable");
    const contentEditable = el.isContentEditable === true
      || (editableAttr !== null && ["", "true", "plaintext-only"].includes(String(editableAttr).toLowerCase()));
    const isEditable = el.matches("input, textarea")
      || contentEditable
      || ["textbox", "searchbox", "combobox"].includes(role);
    return {
      tag: String(el.tagName || "").toLowerCase(),
      id: el.id || "",
      name: el.getAttribute("name") || "",
      role,
      ariaLabel: el.getAttribute("aria-label") || "",
      type: el.getAttribute("type") || "",
      placeholder: el.getAttribute("placeholder") || "",
      text: String(el.textContent || "").trim().slice(0, 120),
      isEditable,
      rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
    };
  })()`);
  return active && typeof active === "object" ? active : null;
}
/**
 * Fills a form field identified by a natural-language target: locates a
 * matching candidate, clicks its centre, verifies an editable element took
 * focus, then selects all existing content and types the new text.
 *
 * The candidate lookup is attempted once as-is and, if nothing matches, once
 * more with the two extra flags enabled; when that second lookup reports
 * `scrolledIntoView`, the returned source is suffixed with "+scroll".
 *
 * @param {object} pageHandle - Playwright-style page (viewportSize, mouse,
 *   keyboard, waitForTimeout).
 * @param {string} target - Natural-language description of the field.
 * @param {string} text - Text to type into the field.
 * @param {number} typingDelay - Per-character delay passed to keyboard.type.
 * @returns {Promise<{candidate: object, active: object, source: string}>}
 * @throws {Error} When no candidate with a finite centre is found, or when the
 *   click does not leave an editable element focused.
 */
async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
  const { width, height } = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
  const midX = width / 2;
  const midY = height / 2;
  let source = "dom-candidate";
  let candidate = await findBrowserVisualCandidate(pageHandle, target, midX, midY, true);
  if (!candidate) {
    candidate = await findBrowserVisualCandidate(pageHandle, target, midX, midY, true, true, true);
    if (candidate) {
      if (candidate["scrolledIntoView"] === true) {
        source += "+scroll";
      }
      // Give the page a moment to settle after the retry lookup.
      await pageHandle.waitForTimeout(150);
    }
  }
  const x = Number(candidate?.["center"]?.x);
  const y = Number(candidate?.["center"]?.y);
  if (!(Number.isFinite(x) && Number.isFinite(y))) {
    throw new Error(`No visible editable candidate matched target "${target}". Run observe_bundle or dom_summary to inspect available labels/selectors.`);
  }
  await pageHandle.mouse.click(x, y);
  await pageHandle.waitForTimeout(80);
  const active = await describeFocusedEditable(pageHandle);
  if (!active?.["isEditable"]) {
    const matched = candidate ? JSON.stringify({
      tag: candidate["tag"],
      text: candidate["text"],
      ariaLabel: candidate["ariaLabel"],
      placeholder: candidate["placeholder"],
      name: candidate["name"]
    }) : "(none)";
    throw new Error(`Target "${target}" was clicked, but no editable element became focused. Matched element: ${matched}.`);
  }
  // Replace existing content: select everything, then type over it.
  await pageHandle.keyboard.press(process.platform === "darwin" ? "Meta+A" : "Control+A");
  await pageHandle.keyboard.type(text, { delay: typingDelay });
  return { candidate, active, source };
}
/**
 * Turns a failed page `evaluate` into a message for the caller: the error text
 * (capped at 500 characters) followed by zero or more `Hint:` lines chosen by
 * pattern-matching the error and the submitted code.
 *
 * @param {*} err - Thrown value; non-Error values are stringified.
 * @param {string} code8 - The JavaScript source that was evaluated.
 * @returns {string} Newline-joined message and hints.
 */
function evaluateFailureMessage(err, code8) {
  const raw = err instanceof Error ? err.message : String(err);
  const rules = [
    {
      applies: /map is not a function/i.test(raw) && /querySelectorAll/i.test(code8),
      hint: "document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...)."
    },
    {
      applies: /(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8),
      hint: "Do not fill modern React/Vue/Svelte forms by assigning .value in evaluate; use playwright_browser fill, or visual_click the field then type, so input/change events fire."
    },
    {
      applies: /querySelectorAll|querySelector/.test(code8),
      hint: "For page inspection, prefer query_all, dom_summary, or observe_bundle before raw evaluate."
    }
  ];
  const lines = [raw.slice(0, 500)];
  for (const { applies, hint } of rules) {
    if (applies) {
      lines.push(`Hint: ${hint}`);
    }
  }
  return lines.join("\n");
}
10078
10402
  function buildImageMarker(buffer2) {
10079
10403
  let mimeType = "image/png";
10080
10404
  let out = buffer2;
@@ -10308,7 +10632,7 @@ var init_playwright_browser = __esm({
10308
10632
  PLAYWRIGHT_BROWSERS_DIR = join13(PLAYWRIGHT_RUNTIME_DIR, "browsers");
10309
10633
  PlaywrightBrowserTool = class {
10310
10634
  name = "playwright_browser";
10311
- description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
10635
+ description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Use fill with a selector or natural-language target for form fields; avoid raw evaluate for form filling because direct .value assignment does not fire app input/change events. This is a separate browser/runtime from browser_action; once you start a workflow here, continue here unless you intentionally navigate browser_action to the same URL. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
10312
10636
  parameters = {
10313
10637
  type: "object",
10314
10638
  properties: {
@@ -10358,7 +10682,7 @@ var init_playwright_browser = __esm({
10358
10682
  "clear_diagnostics",
10359
10683
  "close"
10360
10684
  ],
10361
- description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text (for form fields)\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10685
+ description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10362
10686
  },
10363
10687
  url: {
10364
10688
  type: "string",
@@ -10374,7 +10698,7 @@ var init_playwright_browser = __esm({
10374
10698
  },
10375
10699
  target: {
10376
10700
  type: "string",
10377
- description: "Natural-language browser visual target for visual_click, for example 'the green Continue button' or 'the search field'."
10701
+ description: "Natural-language browser visual target for visual_click or selector-less fill, for example 'the green Continue button', 'username field', or 'password field'."
10378
10702
  },
10379
10703
  value: {
10380
10704
  type: "string",
@@ -10494,12 +10818,22 @@ var init_playwright_browser = __esm({
10494
10818
  return ok(`Clicked: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
10495
10819
  }
10496
10820
  case "fill": {
10497
- if (!selector)
10498
- return fail("selector is required", start2);
10499
10821
  if (text === void 0)
10500
10822
  return fail("text is required", start2);
10501
- await page.fill(selector, text, { timeout: timeout2 });
10502
- return ok(`Filled ${selector} with "${text}"`, start2);
10823
+ const typingDelay = typeof args.typing_delay_ms === "number" ? Math.max(0, Math.min(500, Math.round(args.typing_delay_ms))) : 20;
10824
+ if (selector) {
10825
+ const resolvedSelector = resolveDomSummarySelector(selector);
10826
+ if (!resolvedSelector)
10827
+ return fail(`No selector known for DOM summary reference ${selector}; run dom_summary and use the emitted selector.`, start2);
10828
+ await page.fill(resolvedSelector, text, { timeout: timeout2 });
10829
+ return ok(`Filled ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""} with "${text}"`, start2);
10830
+ }
10831
+ const target = typeof args.target === "string" && args.target.trim() ? args.target.trim() : "";
10832
+ if (!target)
10833
+ return fail("selector or target is required for fill. Prefer target for visual/natural-language form fields, e.g. target='username field'.", start2);
10834
+ const result = await clickAndFillBrowserTarget(page, target, text, typingDelay);
10835
+ const active = result.active ?? {};
10836
+ return ok(`Filled target "${target}" via ${result.source} into <${active["tag"] || "element"}>${active["name"] ? ` name=${JSON.stringify(active["name"])}` : ""}${active["placeholder"] ? ` placeholder=${JSON.stringify(active["placeholder"])}` : ""}${active["ariaLabel"] ? ` aria-label=${JSON.stringify(active["ariaLabel"])}` : ""}.`, start2);
10503
10837
  }
10504
10838
  case "type": {
10505
10839
  if (text === void 0)
@@ -10642,9 +10976,13 @@ var init_playwright_browser = __esm({
10642
10976
  case "evaluate": {
10643
10977
  if (!text)
10644
10978
  return fail("text (JavaScript code) is required", start2);
10645
- const result = await page.evaluate(text);
10646
- const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
10647
- return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
10979
+ try {
10980
+ const result = await page.evaluate(text);
10981
+ const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
10982
+ return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
10983
+ } catch (err2) {
10984
+ return fail(evaluateFailureMessage(err2, text), start2);
10985
+ }
10648
10986
  }
10649
10987
  // ── Screenshot / PDF ──
10650
10988
  case "screenshot": {
@@ -23743,8 +24081,8 @@ var init_explore_tools = __esm({
23743
24081
  enter_worktree: "Create isolated git worktree for safe parallel file modifications",
23744
24082
  exit_worktree: "Exit and optionally remove a git worktree (keep for merge or discard)",
23745
24083
  notebook_edit: "Edit Jupyter .ipynb notebooks at cell level (list, replace, insert, delete cells)",
23746
- browser_action: "Interactive browser: login, fill forms, click buttons, screenshot — session persists between calls; for console/page-error/network diagnostics prefer playwright_browser",
23747
- playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, focused-element typing for visual form filling, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
24084
+ browser_action: "Interactive Selenium browser: login, fill forms, click buttons, screenshot — session persists between browser_action calls only; separate runtime from playwright_browser",
24085
+ playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, selector/target fill, focused-element typing, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
23748
24086
  carbonyl_browser: "Terminal-rendered real browser automation via Carbonyl: navigate, read rendered text, click/type, sessions, daemon mode",
23749
24087
  scheduler: "Schedule tasks for automatic future execution via OS cron",
23750
24088
  cronjob: "Alias for scheduler: OS cron-backed time triggers",
@@ -282954,6 +283292,18 @@ var init_vision_action_loop = __esm({
282954
283292
  enum: ["single", "double"],
282955
283293
  description: "Click type for click operation. Default single."
282956
283294
  },
283295
+ text: {
283296
+ type: "string",
283297
+ description: "Optional text to type after a live click, or into the currently focused desktop control when operation='none'."
283298
+ },
283299
+ key: {
283300
+ type: "string",
283301
+ description: "Optional key/chord to press after a live click/text entry, for example Enter, Escape, Tab, ctrl+f."
283302
+ },
283303
+ typing_delay_ms: {
283304
+ type: "number",
283305
+ description: "Per-character delay for desktop text entry. Default 10ms."
283306
+ },
282957
283307
  index: {
282958
283308
  type: "number",
282959
283309
  description: "If multiple target points are found, use this 1-based index. Default 1."
@@ -282978,6 +283328,10 @@ var init_vision_action_loop = __esm({
282978
283328
  type: "string",
282979
283329
  description: "Optional screenshot output directory. Relative paths resolve from the workspace."
282980
283330
  },
283331
+ window_title: {
283332
+ type: "string",
283333
+ description: "Optional X11/XWayland window title to capture instead of the whole desktop. Useful on Wayland when root screenshots are blocked."
283334
+ },
282981
283335
  clear_artifacts: {
282982
283336
  type: "boolean",
282983
283337
  description: "Only for reset. If true, also deletes this session's screenshot directory."
@@ -283064,8 +283418,12 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283064
283418
  const includeVision = asBoolean(args["include_vision"], true);
283065
283419
  const visionPromptOverride = asString(args["vision_prompt"]);
283066
283420
  const language = asString(args["language"]) || "eng";
283421
+ const windowTitle = asString(args["window_title"]);
283067
283422
  const button = this.parseButton(args["button"]);
283068
283423
  const clickType = args["click_type"] === "double" ? "double" : "single";
283424
+ const textToType = asString(args["text"]);
283425
+ const keyToPress = asString(args["key"]);
283426
+ const typingDelayMs = clampInteger2(args["typing_delay_ms"], 10, 0, 500);
283069
283427
  const index = clampInteger2(args["index"], 1, 1, 100);
283070
283428
  const delayMs = clampInteger2(args["delay_ms"], 0, 0, 6e4);
283071
283429
  const maxSteps = action === "run" ? clampInteger2(args["max_steps"], DEFAULT_MAX_STEPS, 1, HARD_MAX_STEPS) : 1;
@@ -283088,13 +283446,28 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283088
283446
  }
283089
283447
  const stamp = timestampSlug2();
283090
283448
  const screenshotPath = join52(sessionDir2, `${stamp}-step-${step}-before.png`);
283091
- if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283092
- yield "Vision action loop: requesting desktop screenshot permission if the system prompts";
283449
+ if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283450
+ yield "Vision action loop: capturing full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
283093
283451
  }
283094
283452
  yield `Vision action loop: capturing screenshot ${step}/${maxSteps}`;
283095
283453
  let screenshotBackend = "";
283454
+ let captureOffset = { x: 0, y: 0 };
283455
+ let captureWindow;
283096
283456
  try {
283097
- screenshotBackend = captureDesktopScreenshot(screenshotPath);
283457
+ if (windowTitle) {
283458
+ const windowCapture = captureDesktopWindowScreenshot(screenshotPath, windowTitle);
283459
+ screenshotBackend = windowCapture.backend;
283460
+ captureOffset = { x: windowCapture.x, y: windowCapture.y };
283461
+ captureWindow = {
283462
+ windowId: windowCapture.windowId,
283463
+ x: windowCapture.x,
283464
+ y: windowCapture.y,
283465
+ width: windowCapture.width,
283466
+ height: windowCapture.height
283467
+ };
283468
+ } else {
283469
+ screenshotBackend = captureDesktopScreenshot(screenshotPath);
283470
+ }
283098
283471
  mutatedFiles.push(screenshotPath);
283099
283472
  } catch (err) {
283100
283473
  success = false;
@@ -283121,6 +283494,9 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283121
283494
  outputLines.push(`Screenshot saved: ${screenshotPath}`);
283122
283495
  outputLines.push(`Screen: ${dims.width}x${dims.height}`);
283123
283496
  outputLines.push(`Screenshot backend: ${screenshotBackend}`);
283497
+ if (captureWindow) {
283498
+ outputLines.push(`Window: ${windowTitle} id=${captureWindow.windowId} geometry=${captureWindow.x},${captureWindow.y} ${captureWindow.width}x${captureWindow.height}`);
283499
+ }
283124
283500
  let ocr = null;
283125
283501
  if (includeOcr) {
283126
283502
  yield `Vision action loop: running OCR for screenshot ${step}/${maxSteps}`;
@@ -283209,6 +283585,14 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283209
283585
  selectedPoint = pointFromVisionResult({ points: ocrPoints, source: "tesseract-ocr" }, index, dims);
283210
283586
  }
283211
283587
  if (selectedPoint) {
283588
+ if (captureOffset.x !== 0 || captureOffset.y !== 0) {
283589
+ selectedPoint = {
283590
+ ...selectedPoint,
283591
+ pixelX: selectedPoint.pixelX + captureOffset.x,
283592
+ pixelY: selectedPoint.pixelY + captureOffset.y,
283593
+ source: `${selectedPoint.source}+window:${captureWindow?.windowId ?? "active"}`
283594
+ };
283595
+ }
283212
283596
  outputLines.push(`Target: ${effectiveTarget}`);
283213
283597
  outputLines.push(`Point source: ${selectedPoint.source}`);
283214
283598
  outputLines.push(`Mapped point: (${Math.round(selectedPoint.pixelX)}, ${Math.round(selectedPoint.pixelY)}) normalized (${selectedPoint.x.toFixed(4)}, ${selectedPoint.y.toFixed(4)})`);
@@ -283238,15 +283622,29 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283238
283622
  } else {
283239
283623
  yield `Vision action loop: performing ${effectiveOperation} at (${pixelX}, ${pixelY})`;
283240
283624
  try {
283625
+ if (captureWindow?.windowId) {
283626
+ const activationBackend = activateDesktopWindow(captureWindow.windowId);
283627
+ outputLines.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
283628
+ }
283241
283629
  const backend = effectiveOperation === "move" ? moveDesktopPointer(pixelX, pixelY) : clickDesktopAt(pixelX, pixelY, button, clickType);
283242
283630
  actionTaken = effectiveOperation === "move" ? `Moved pointer to (${pixelX}, ${pixelY}) via ${backend}` : `Clicked at (${pixelX}, ${pixelY}) via ${backend} [${button} ${clickType}]`;
283243
283631
  outputLines.push(actionTaken);
283632
+ if (textToType) {
283633
+ const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
283634
+ outputLines.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
283635
+ actionTaken += `; typed text via ${keyboardBackend}`;
283636
+ }
283637
+ if (keyToPress) {
283638
+ const keyboardBackend = pressDesktopKey(keyToPress);
283639
+ outputLines.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
283640
+ actionTaken += `; pressed key via ${keyboardBackend}: ${keyToPress}`;
283641
+ }
283244
283642
  afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
283245
- if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283246
- yield "Vision action loop: requesting desktop screenshot permission for post-action verification if the system prompts";
283643
+ if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283644
+ yield "Vision action loop: capturing post-action full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
283247
283645
  }
283248
283646
  yield `Vision action loop: capturing post-action screenshot ${step}/${maxSteps}`;
283249
- const afterBackend = captureDesktopScreenshot(afterScreenshotPath);
283647
+ const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
283250
283648
  mutatedFiles.push(afterScreenshotPath);
283251
283649
  outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
283252
283650
  outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
@@ -283260,6 +283658,35 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283260
283658
  }
283261
283659
  } else if (effectiveOperation !== "none" && (effectiveTarget || hasCoordinates)) {
283262
283660
  outputLines.push(`Action skipped: no usable point for operation '${effectiveOperation}'.`);
283661
+ } else if (allowActions && !dryRun && (textToType || keyToPress)) {
283662
+ try {
283663
+ const keyboardActions = [];
283664
+ if (captureWindow?.windowId) {
283665
+ const activationBackend = activateDesktopWindow(captureWindow.windowId);
283666
+ keyboardActions.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
283667
+ }
283668
+ if (textToType) {
283669
+ const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
283670
+ keyboardActions.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
283671
+ }
283672
+ if (keyToPress) {
283673
+ const keyboardBackend = pressDesktopKey(keyToPress);
283674
+ keyboardActions.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
283675
+ }
283676
+ actionTaken = keyboardActions.join("; ");
283677
+ outputLines.push(actionTaken);
283678
+ afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
283679
+ const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
283680
+ mutatedFiles.push(afterScreenshotPath);
283681
+ outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
283682
+ outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
283683
+ } catch (err) {
283684
+ success = false;
283685
+ actionError = err instanceof Error ? err.message : String(err);
283686
+ error = actionError;
283687
+ outputLines.push(`Keyboard action failed: ${actionError}`);
283688
+ outputLines.push(desktopAutomationRecoveryMessage());
283689
+ }
283263
283690
  } else if (action === "run" && !effectiveTarget && !hasCoordinates) {
283264
283691
  success = false;
283265
283692
  error = "Vision loop stopped: no target or coordinates were provided and visual planning did not identify a clickable target. Stopping instead of repeating observe-only screenshots.";
@@ -284152,6 +284579,7 @@ async function ensureSession(options2 = {}) {
284152
284579
  }
284153
284580
  activeSessionId = null;
284154
284581
  activeSessionHeadless = null;
284582
+ activeSessionUrl = null;
284155
284583
  }
284156
284584
  }
284157
284585
  if (activeSessionId) {
@@ -284163,6 +284591,13 @@ async function ensureSession(options2 = {}) {
284163
284591
  }
284164
284592
  activeSessionId = null;
284165
284593
  activeSessionHeadless = null;
284594
+ activeSessionUrl = null;
284595
+ }
284596
+ if (options2.allowCreate === false) {
284597
+ return {
284598
+ error: "No active browser_action Selenium session exists for this action. browser_action is a separate browser/runtime from playwright_browser; continue the current page with playwright_browser, or call browser_action({action:'navigate', url: ...}) first.",
284599
+ sessionId: ""
284600
+ };
284166
284601
  }
284167
284602
  const headless = options2.headless ?? defaultBrowserHeadless();
284168
284603
  const res = await fetch(`${BASE_URL}/session/start`, {
@@ -284180,8 +284615,16 @@ async function ensureSession(options2 = {}) {
284180
284615
  return { error: String(data.message ?? "Failed to start browser session"), sessionId: "" };
284181
284616
  activeSessionId = data.session_id;
284182
284617
  activeSessionHeadless = headless;
284618
+ activeSessionUrl = null;
284183
284619
  return { sessionId: activeSessionId };
284184
284620
  }
284621
+ function browserActionRuntimeHint() {
284622
+ return [
284623
+ "browser_action is a separate browser/runtime from playwright_browser and uses its own Selenium/Chrome session; it does not share page state, cookies, focus, or navigation.",
284624
+ activeSessionUrl ? `Current browser_action URL: ${activeSessionUrl}` : "Current browser_action URL: unknown or not navigated.",
284625
+ "If this page was opened with playwright_browser, keep using playwright_browser actions such as dom_summary, fill, type, press, visual_click, and observe_bundle."
284626
+ ].join(" ");
284627
+ }
284185
284628
  async function apiCall(endpoint, method = "POST", body) {
284186
284629
  const options2 = {
284187
284630
  method,
@@ -284204,7 +284647,7 @@ async function apiCall(endpoint, method = "POST", body) {
284204
284647
  const res = await fetch(url, options2);
284205
284648
  return await res.json();
284206
284649
  }
284207
- var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, BrowserActionTool;
284650
+ var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
284208
284651
  var init_browser_action = __esm({
284209
284652
  "packages/execution/dist/tools/browser-action.js"() {
284210
284653
  "use strict";
@@ -284217,9 +284660,10 @@ var init_browser_action = __esm({
284217
284660
  serviceProcess = null;
284218
284661
  activeSessionId = null;
284219
284662
  activeSessionHeadless = null;
284663
+ activeSessionUrl = null;
284220
284664
  BrowserActionTool = class {
284221
284665
  name = "browser_action";
284222
- description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284666
+ description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284223
284667
  parameters = {
284224
284668
  type: "object",
284225
284669
  properties: {
@@ -284289,27 +284733,38 @@ var init_browser_action = __esm({
284289
284733
  const requestedWidth = args.width == null ? void 0 : asPositiveInt2(args.width, 1280, 320, 3840);
284290
284734
  const requestedHeight = args.height == null ? void 0 : asPositiveInt2(args.height, 720, 240, 2160);
284291
284735
  const requestedScale = args.device_scale_factor == null ? void 0 : asPositiveNumber(args.device_scale_factor, 1, 0.25, 3);
284292
- const launchErr = await launchService();
284293
- if (launchErr) {
284294
- return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
284295
- }
284296
284736
  if (action === "close") {
284297
- if (activeSessionId) {
284737
+ if (activeSessionId || await probeService()) {
284298
284738
  try {
284299
284739
  await apiCall("/session/close");
284300
284740
  } catch {
284301
284741
  }
284302
284742
  activeSessionId = null;
284303
284743
  activeSessionHeadless = null;
284744
+ activeSessionUrl = null;
284304
284745
  }
284305
284746
  return { success: true, output: "Browser session closed.", durationMs: Date.now() - start2 };
284306
284747
  }
284748
+ const actionStartsSession = action === "navigate";
284749
+ if (!actionStartsSession && !activeSessionId) {
284750
+ return {
284751
+ success: false,
284752
+ output: "",
284753
+ error: `browser_action ${action || "(missing action)"} requires an active browser_action session. ` + browserActionRuntimeHint(),
284754
+ durationMs: Date.now() - start2
284755
+ };
284756
+ }
284757
+ const launchErr = await launchService();
284758
+ if (launchErr) {
284759
+ return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
284760
+ }
284307
284761
  const session = await ensureSession({
284308
284762
  width: requestedWidth,
284309
284763
  height: requestedHeight,
284310
284764
  deviceScaleFactor: requestedScale,
284311
284765
  headless: asOptionalBoolean2(args.headless),
284312
- forceNew: asOptionalBoolean2(args.force_new) === true
284766
+ forceNew: asOptionalBoolean2(args.force_new) === true,
284767
+ allowCreate: actionStartsSession
284313
284768
  });
284314
284769
  if (session.error) {
284315
284770
  return { success: false, output: "", error: session.error, durationMs: Date.now() - start2 };
@@ -284327,7 +284782,13 @@ var init_browser_action = __esm({
284327
284782
  }
284328
284783
  result = await apiCall("/navigate", "POST", { url: args.url });
284329
284784
  if (result.ok) {
284330
- return { success: true, output: `Navigated to ${args.url}`, durationMs: Date.now() - start2 };
284785
+ activeSessionUrl = args.url;
284786
+ return {
284787
+ success: true,
284788
+ output: `Navigated to ${args.url}
284789
+ Runtime: browser_action Selenium/Chrome session. Continue with browser_action for this page, or use playwright_browser separately after navigating it.`,
284790
+ durationMs: Date.now() - start2
284791
+ };
284331
284792
  }
284332
284793
  const navMsg = String(result.message ?? "Navigation failed");
284333
284794
  const navHint = navMsg.toLowerCase().includes("connection") || navMsg.toLowerCase().includes("refused") || navMsg.toLowerCase().includes("err_connection") ? " (the URL appears unreachable — check if the target server is running and accepting connections)" : navMsg.toLowerCase().includes("timeout") ? " (page load timed out — try again or use a different URL)" : "";
@@ -284349,7 +284810,7 @@ var init_browser_action = __esm({
284349
284810
  return {
284350
284811
  success: false,
284351
284812
  output: `Click on ${args.selector} failed: ${clickMsg}`,
284352
- error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page.`,
284813
+ error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page. ${browserActionRuntimeHint()}`,
284353
284814
  durationMs: Date.now() - start2
284354
284815
  };
284355
284816
  }
@@ -284391,7 +284852,7 @@ var init_browser_action = __esm({
284391
284852
  return {
284392
284853
  success: false,
284393
284854
  output: `Type into ${args.selector} failed: ${typeMsg}`,
284394
- error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check.`,
284855
+ error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check. ${browserActionRuntimeHint()}`,
284395
284856
  durationMs: Date.now() - start2
284396
284857
  };
284397
284858
  }
@@ -284532,7 +284993,7 @@ var init_browser_action = __esm({
284532
284993
  if (!pointResult || pointResult.points.length === 0) {
284533
284994
  return {
284534
284995
  success: false,
284535
- output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead.`,
284996
+ output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead. ${browserActionRuntimeHint()}`,
284536
284997
  error: "No point backend returned normalized coordinates.",
284537
284998
  durationMs: Date.now() - start2
284538
284999
  };
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.205",
3
+ "version": "1.0.207",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.205",
9
+ "version": "1.0.207",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
@@ -4565,9 +4565,19 @@
4565
4565
  }
4566
4566
  },
4567
4567
  "node_modules/js-yaml": {
4568
- "version": "4.1.1",
4569
- "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
4570
- "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
4568
+ "version": "4.2.0",
4569
+ "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.2.0.tgz",
4570
+ "integrity": "sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==",
4571
+ "funding": [
4572
+ {
4573
+ "type": "github",
4574
+ "url": "https://github.com/sponsors/puzrin"
4575
+ },
4576
+ {
4577
+ "type": "github",
4578
+ "url": "https://github.com/sponsors/nodeca"
4579
+ }
4580
+ ],
4571
4581
  "license": "MIT",
4572
4582
  "dependencies": {
4573
4583
  "argparse": "^2.0.1"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.205",
3
+ "version": "1.0.207",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",