omnius 1.0.205 → 1.0.207
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +499 -38
- package/npm-shrinkwrap.json +15 -5
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -3987,9 +3987,33 @@ var init_system_deps = __esm({
|
|
|
3987
3987
|
// packages/execution/dist/tools/desktop-control.js
|
|
3988
3988
|
import { execSync as execSync4 } from "node:child_process";
|
|
3989
3989
|
import { existsSync as existsSync6, statSync as statSync3 } from "node:fs";
|
|
3990
|
+
/**
 * Bring a desktop window to the foreground.
 *
 * Tries `xdotool windowactivate --sync` first and `wmctrl -ia` second
 * (Linux only) and stops at the first backend whose command succeeds.
 *
 * @param {string|number} windowId - Window id handed to xdotool/wmctrl.
 * @returns {string} Label of the backend that succeeded ("xdotool" or "wmctrl").
 * @throws {Error} When no backend succeeds; the message lists every attempt.
 */
function activateDesktopWindow(windowId) {
  const attempts = [];
  // Single-quote the id: inside JSON-style double quotes the shell would
  // still expand `$(...)` and backticks.
  const quotedId = quoteShellLiteral(String(windowId));
  const backends = [
    { label: "xdotool", command: `xdotool windowactivate --sync ${quotedId}` },
    { label: "wmctrl", command: `wmctrl -ia ${quotedId}` }
  ];
  if (process.platform === "linux") {
    for (const { label, command } of backends) {
      if (!hasCommand2(label)) {
        // Record missing tools so the final error explains why nothing ran.
        attempts.push({ label, message: "not found on PATH" });
        continue;
      }
      const result = run(command, 5e3);
      if (result.ok) {
        return label;
      }
      attempts.push({ label, message: result.message });
    }
  } else {
    attempts.push({
      label: "platform",
      message: `window activation is only implemented for Linux (current platform: ${process.platform})`
    });
  }
  throw new Error("No desktop window activation backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
|
|
3990
4013
|
function captureDesktopScreenshot(outputPath3) {
|
|
3991
4014
|
const attempts = [];
|
|
3992
4015
|
const out = quoteShell(outputPath3);
|
|
4016
|
+
const allowInteractiveWaylandScreenshot = envFlag("OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT", false);
|
|
3993
4017
|
const tryCapture = (label, command, timeout2 = 1e4) => {
|
|
3994
4018
|
const result = run(command, timeout2);
|
|
3995
4019
|
if (result.ok && existsSync6(outputPath3)) {
|
|
@@ -4022,11 +4046,22 @@ $bitmap.Dispose()
|
|
|
4022
4046
|
return backend;
|
|
4023
4047
|
} else if (process.platform === "linux") {
|
|
4024
4048
|
const desktop = `${process.env["XDG_CURRENT_DESKTOP"] || ""} ${process.env["DESKTOP_SESSION"] || ""}`;
|
|
4025
|
-
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) &&
|
|
4049
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && /gnome/i.test(desktop) && hasCommand2("gdbus") && allowInteractiveWaylandScreenshot) {
|
|
4050
|
+
const backend = tryCapture("gnome-shell-interactive-screenshot-dbus", gnomeInteractiveScreenshotCommand(outputPath3), 13e4);
|
|
4051
|
+
if (backend)
|
|
4052
|
+
return backend;
|
|
4053
|
+
}
|
|
4054
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor") && allowInteractiveWaylandScreenshot) {
|
|
4026
4055
|
const backend = tryCapture("xdg-desktop-portal-screenshot", portalScreenshotCommand(outputPath3), 13e4);
|
|
4027
4056
|
if (backend)
|
|
4028
4057
|
return backend;
|
|
4029
4058
|
}
|
|
4059
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && !allowInteractiveWaylandScreenshot) {
|
|
4060
|
+
attempts.push({
|
|
4061
|
+
label: "interactive-wayland-screenshot",
|
|
4062
|
+
message: "skipped by default to avoid unattended GNOME/portal screenshot selection stalls. For desktop app loops, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window. Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt."
|
|
4063
|
+
});
|
|
4064
|
+
}
|
|
4030
4065
|
if (/gnome/i.test(desktop) && hasCommand2("gdbus")) {
|
|
4031
4066
|
const backend = tryCapture("gnome-shell-screenshot-dbus", `gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.Screenshot false false ${out}`);
|
|
4032
4067
|
if (backend)
|
|
@@ -4079,6 +4114,83 @@ $bitmap.Dispose()
|
|
|
4079
4114
|
}
|
|
4080
4115
|
throw new Error("No desktop screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
|
|
4081
4116
|
}
|
|
4117
|
+
/**
 * Capture a single window into `outputPath3` using xdotool (to find the
 * window and read its geometry) and ImageMagick `import -window`.
 *
 * @param {string} outputPath3 - File path the screenshot is written to.
 * @param {string} [windowTitle] - Pattern passed to `xdotool search --name`;
 *   when falsy the currently active window is captured instead.
 * @returns {{backend: string, windowId: string, x: number, y: number, width: number, height: number}}
 *   Backend label, the captured window id and its parsed geometry.
 * @throws {Error} On non-Linux platforms, or when any step fails; the message
 *   lists every recorded attempt.
 */
function captureDesktopWindowScreenshot(outputPath3, windowTitle) {
  if (process.platform !== "linux") {
    throw new Error("Window screenshot capture is currently implemented for Linux/X11/XWayland windows only.");
  }
  const attempts = [];
  const failure = () => new Error("No desktop window screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
  // Runs a command and returns its trimmed stdout, or null after recording the failure.
  const runText = (command, timeout2 = 5e3) => {
    const result = runCaptureText(command, timeout2);
    if (result.ok) {
      return result.output.trim();
    }
    attempts.push({ label: command.split(/\s+/)[0] || command, message: result.message });
    return null;
  };

  // Report every missing tool at once instead of only the first one.
  if (!hasCommand2("xdotool")) {
    attempts.push({ label: "xdotool", message: "not found on PATH" });
  }
  if (!hasCommand2("import")) {
    attempts.push({ label: "import", message: "ImageMagick import not found on PATH" });
  }
  if (attempts.length > 0) {
    throw failure();
  }

  // Single quotes keep `$`, backticks and backslashes in the title literal.
  const windowId = windowTitle
    ? runText(`xdotool search --name ${quoteShellLiteral(String(windowTitle))} | tail -n1`)
    : runText("xdotool getactivewindow");
  if (windowId === null) {
    throw failure();
  }
  if (windowId === "") {
    // The pipeline's exit status is tail's, so "no match" arrives as empty
    // output rather than as a failed command.
    attempts.push({
      label: windowTitle ? "xdotool search" : "xdotool getactivewindow",
      message: windowTitle ? `no window matched title ${JSON.stringify(String(windowTitle))}` : "no active window id was returned"
    });
    throw failure();
  }

  const quotedId = quoteShellLiteral(windowId);
  const geometry = runText(`xdotool getwindowgeometry --shell ${quotedId}`);
  if (geometry === null) {
    throw failure();
  }
  const parsed = parseXdotoolGeometry(geometry);
  if (!parsed) {
    attempts.push({ label: "xdotool getwindowgeometry", message: `Could not parse geometry for window ${windowId}: ${geometry}` });
    throw failure();
  }

  const result = run(`import -window ${quotedId} ${quoteShellLiteral(String(outputPath3))}`, 1e4);
  if (!result.ok || !existsSync6(outputPath3)) {
    attempts.push({ label: "import-window", message: result.message });
    throw failure();
  }
  const inspection = inspectScreenshot(outputPath3);
  if (!inspection.ok) {
    attempts.push({ label: "import-window", message: inspection.message });
    throw failure();
  }
  return {
    backend: "imagemagick-import-window",
    windowId,
    ...parsed
  };
}
|
|
4161
|
+
/**
 * Build the `bash -lc '<script>'` command line that drives GNOME Shell's
 * interactive screenshot D-Bus method and copies the resulting file to
 * `outputPath3`.
 *
 * The script calls `org.gnome.Shell.Screenshot.InteractiveScreenshot`,
 * extracts the `file:` URI from the reply with sed, and copies it with
 * `gio copy` or, failing that, a small python3 helper.
 *
 * @param {string} outputPath3 - Destination path for the screenshot.
 * @returns {string} Shell command line ready to be executed.
 */
function gnomeInteractiveScreenshotCommand(outputPath3) {
  // `out` is single-quoted so `$`, backticks and backslashes in the path are
  // taken literally by bash instead of being expanded.
  const script = `
set -eu
out=${quoteShellLiteral(String(outputPath3))}
printf '%s\\n' 'Omnius desktop screenshot: choose Full Screen in the GNOME screenshot UI, then press Enter.' >&2
call_out="$(gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.InteractiveScreenshot 2>&1)" || {
  printf '%s\\n' "$call_out" >&2
  exit 1
}
uri="$(printf '%s\\n' "$call_out" | sed -n "s/.*'\\(file:[^']*\\)'.*/\\1/p" | tail -n 1)"
if [ -z "$uri" ]; then
  printf 'GNOME interactive screenshot did not return a file URI: %s\\n' "$call_out" >&2
  exit 1
fi

if command -v gio >/dev/null 2>&1; then
  gio copy -f "$uri" "$out"
elif command -v python3 >/dev/null 2>&1; then
  python3 - "$uri" "$out" <<'PY'
import shutil, sys, urllib.parse
uri, out = sys.argv[1], sys.argv[2]
parsed = urllib.parse.urlparse(uri)
if parsed.scheme != "file":
    raise SystemExit(f"Unsupported screenshot URI scheme: {parsed.scheme}")
shutil.copyfile(urllib.parse.unquote(parsed.path), out)
PY
else
  printf 'Need gio or python3 to copy GNOME screenshot URI %s to %s.\\n' "$uri" "$out" >&2
  exit 1
fi
`.trim();
  return `bash -lc ${quoteShellLiteral(script)}`;
}
|
|
4082
4194
|
function portalScreenshotCommand(outputPath3) {
|
|
4083
4195
|
const script = `
|
|
4084
4196
|
set -eu
|
|
@@ -4086,6 +4198,7 @@ out=${quoteShell(outputPath3)}
|
|
|
4086
4198
|
monitor_file="$(mktemp)"
|
|
4087
4199
|
cleanup() {
|
|
4088
4200
|
if [ -n "\${monpid:-}" ]; then kill "$monpid" >/dev/null 2>&1 || true; fi
|
|
4201
|
+
if [ -n "\${keypid:-}" ]; then kill "$keypid" >/dev/null 2>&1 || true; fi
|
|
4089
4202
|
rm -f "$monitor_file"
|
|
4090
4203
|
}
|
|
4091
4204
|
trap cleanup EXIT
|
|
@@ -4102,6 +4215,25 @@ fi
|
|
|
4102
4215
|
|
|
4103
4216
|
dbus-monitor --session "type='signal',interface='org.freedesktop.portal.Request',member='Response',path='$handle'" > "$monitor_file" 2>&1 &
|
|
4104
4217
|
monpid=$!
|
|
4218
|
+
if [ "\${OMNIUS_SCREENSHOT_AUTO_CONFIRM:-1}" != "0" ]; then
|
|
4219
|
+
(
|
|
4220
|
+
sleep 1
|
|
4221
|
+
i=0
|
|
4222
|
+
while [ "$i" -lt 8 ]; do
|
|
4223
|
+
if command -v xdotool >/dev/null 2>&1; then
|
|
4224
|
+
DISPLAY="\${DISPLAY:-:0}" xdotool key Return >/dev/null 2>&1 || true
|
|
4225
|
+
elif command -v ydotool >/dev/null 2>&1; then
|
|
4226
|
+
ydotool key 28:1 28:0 >/dev/null 2>&1 || true
|
|
4227
|
+
elif command -v dotool >/dev/null 2>&1; then
|
|
4228
|
+
printf 'key enter
|
|
4229
|
+
' | dotool >/dev/null 2>&1 || true
|
|
4230
|
+
fi
|
|
4231
|
+
i=$((i + 1))
|
|
4232
|
+
sleep 0.5
|
|
4233
|
+
done
|
|
4234
|
+
) &
|
|
4235
|
+
keypid=$!
|
|
4236
|
+
fi
|
|
4105
4237
|
deadline=$(( $(date +%s) + 120 ))
|
|
4106
4238
|
while :; do
|
|
4107
4239
|
if grep -q "member=Response" "$monitor_file"; then break; fi
|
|
@@ -4144,7 +4276,7 @@ else
|
|
|
4144
4276
|
exit 1
|
|
4145
4277
|
fi
|
|
4146
4278
|
`.trim();
|
|
4147
|
-
return `bash -lc ${
|
|
4279
|
+
return `bash -lc ${quoteShellLiteral(script)}`;
|
|
4148
4280
|
}
|
|
4149
4281
|
function moveDesktopPointer(x, y) {
|
|
4150
4282
|
const result = performDesktopPointerAction({ x, y, moveOnly: true });
|
|
@@ -4167,6 +4299,25 @@ function clickDesktopAt(x, y, button, clickType) {
|
|
|
4167
4299
|
lastPointer = { x: Math.round(x), y: Math.round(y) };
|
|
4168
4300
|
return result.backend;
|
|
4169
4301
|
}
|
|
4302
|
+
/**
 * Type `text` through the desktop keyboard backends.
 *
 * @param {string} text - Text to type.
 * @param {number} [delayMs=10] - Per-keystroke delay forwarded to the backend.
 * @returns {string} Label of the backend that performed the action.
 * @throws {Error} With the backend's message when the action did not succeed.
 */
function typeDesktopText(text, delayMs = 10) {
  const outcome = performDesktopKeyboardAction({ kind: "type", text, delayMs });
  if (outcome.ok) {
    return outcome.backend;
  }
  throw new Error(outcome.message);
}
|
|
4312
|
+
/**
 * Press a single key through the desktop keyboard backends.
 *
 * @param {string} key - Key name forwarded to the backend.
 * @returns {string} Label of the backend that performed the action.
 * @throws {Error} With the backend's message when the action did not succeed.
 */
function pressDesktopKey(key) {
  const outcome = performDesktopKeyboardAction({ kind: "key", key });
  if (outcome.ok) {
    return outcome.backend;
  }
  throw new Error(outcome.message);
}
|
|
4170
4321
|
function tryRunXdotoolShellFallback(command) {
|
|
4171
4322
|
if (!/\bxdotool\b|\bxdtool\b/.test(command))
|
|
4172
4323
|
return null;
|
|
@@ -4278,7 +4429,9 @@ function desktopAutomationRecoveryMessage(command) {
|
|
|
4278
4429
|
" Linux pointer control: xdotool/X11, ydotool, dotool, python-xlib",
|
|
4279
4430
|
" macOS: cliclick, then System Events",
|
|
4280
4431
|
" Windows: PowerShell user32 input",
|
|
4281
|
-
"On Wayland,
|
|
4432
|
+
"On Wayland, unattended full-desktop screenshots are compositor-restricted and interactive screenshot prompts are skipped by default.",
|
|
4433
|
+
"For desktop app loops on GNOME Wayland, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window without a full-screen prompt.",
|
|
4434
|
+
"Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt.",
|
|
4282
4435
|
"On GNOME Wayland, unattended screenshots may still be denied by compositor policy. Install gnome-screenshot or grant screenshot permission for the session if capture is blocked.",
|
|
4283
4436
|
"On Wayland pointer control, install and enable ydotool or dotool when xdotool cannot open an X display."
|
|
4284
4437
|
].join("\n");
|
|
@@ -4382,6 +4535,72 @@ ${options2.moveOnly ? "" : `for _ in range(${clicks}):
|
|
|
4382
4535
|
message: "No desktop mouse backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
|
|
4383
4536
|
};
|
|
4384
4537
|
}
|
|
4538
|
+
/**
 * Type text or press a key using the first desktop keyboard backend that works.
 *
 * Linux tries xdotool, then ydotool, then dotool. macOS uses System Events via
 * osascript. Every failed or unavailable backend is recorded so the returned
 * message explains what was tried.
 *
 * @param {{kind: "type"|"key", text?: string, key?: string|number, delayMs?: number}} options2
 *   `kind: "type"` types `text` (xdotool honours `delayMs`, clamped to 0..500 ms);
 *   any other kind presses `key`.
 * @returns {{ok: true, backend: string} | {ok: false, message: string}}
 */
function performDesktopKeyboardAction(options2) {
  const attempts = [];
  const isTyping = options2.kind === "type";
  const text = String(options2.text ?? "");
  const key = String(options2.key ?? "");
  // Guard against NaN so the xdotool command never contains `--delay NaN`.
  const requestedDelay = Number(options2.delayMs);
  const delay = Number.isFinite(requestedDelay) ? Math.max(0, Math.min(500, Math.round(requestedDelay))) : 10;
  // Single quotes: the shell must not expand `$`/backticks in user text, and a
  // real newline must reach the tool as a newline, not as the two characters `\n`.
  const q = (value2) => quoteShellLiteral(value2);

  const tryAction = (label, command) => {
    const result = run(command, 1e4);
    if (result.ok) {
      return label;
    }
    attempts.push({ label, message: result.message });
    return null;
  };
  // Runs `command` with `tool` when it is installed; records the miss otherwise.
  const tryTool = (tool, command) => {
    if (!hasCommand2(tool)) {
      attempts.push({ label: tool, message: "not found on PATH" });
      return null;
    }
    return tryAction(tool, command);
  };

  // dotool reads one command per line, so multi-line text becomes
  // `type <line>` commands separated by `key enter`.
  const dotoolScript = () => {
    if (!isTyping) {
      return `key ${key.replace(/[\r\n]+/g, " ")}`;
    }
    const lines = [];
    text.split(/\r?\n/).forEach((line, index) => {
      if (index > 0) {
        lines.push("key enter");
      }
      if (line !== "") {
        lines.push(`type ${line}`);
      }
    });
    return lines.join("\n");
  };

  // AppleScript `key code` takes an integer virtual key code, so key names are
  // translated; single characters are sent with `keystroke` instead.
  const macKeyScript = () => {
    const macKeyCodes = {
      return: 36, enter: 36, tab: 48, space: 49, backspace: 51, delete: 51,
      escape: 53, esc: 53, home: 115, pageup: 116, forwarddelete: 117,
      end: 119, pagedown: 121, left: 123, right: 124, down: 125, up: 126
    };
    if (/^\d+$/.test(key)) {
      return `tell application "System Events" to key code ${key}`;
    }
    const normalized = key.toLowerCase().replace(/[_\s-]/g, "");
    if (Object.prototype.hasOwnProperty.call(macKeyCodes, normalized)) {
      return `tell application "System Events" to key code ${macKeyCodes[normalized]}`;
    }
    if ([...key].length === 1) {
      return `tell application "System Events" to keystroke ${quoteAppleScript(key)}`;
    }
    return null;
  };

  if (process.platform === "linux") {
    // `--` stops option parsing so text/keys starting with `-` are not read as flags.
    const xdotoolCommand = isTyping
      ? `xdotool type --clearmodifiers --delay ${delay} -- ${q(text)}`
      : `xdotool key --clearmodifiers -- ${q(key)}`;
    const ydotoolCommand = isTyping ? `ydotool type ${q(text)}` : `ydotool key ${q(key)}`;
    // '%s\n' keeps `%` in the payload from being treated as a printf directive.
    const dotoolCommand = `printf '%s\\n' ${q(dotoolScript())} | dotool`;
    const backend = tryTool("xdotool", xdotoolCommand) || tryTool("ydotool", ydotoolCommand) || tryTool("dotool", dotoolCommand);
    if (backend) {
      return { ok: true, backend };
    }
  } else if (process.platform === "darwin") {
    const script = isTyping ? `tell application "System Events" to keystroke ${quoteAppleScript(text)}` : macKeyScript();
    if (script === null) {
      attempts.push({
        label: "osascript-system-events",
        message: `unsupported key ${JSON.stringify(key)}; pass a macOS key code, a single character, or a common key name such as Return, Tab or Escape`
      });
    } else {
      const backend = tryAction("osascript-system-events", `osascript -e ${q(script)}`);
      if (backend) {
        return { ok: true, backend };
      }
    }
  } else {
    attempts.push({ label: "platform", message: `no keyboard backend is implemented for ${process.platform}` });
  }
  return {
    ok: false,
    message: "No desktop keyboard backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
  };
}
|
|
4588
|
+
/**
 * Parse `KEY=value` lines (as printed by `xdotool getwindowgeometry --shell`)
 * into a geometry object.
 *
 * Only lines of the form `UPPERCASE=integer` are considered; surrounding
 * whitespace on a line is ignored. All of X, Y, WIDTH and HEIGHT must be
 * present.
 *
 * @param {string} raw - Raw command output.
 * @returns {{x: number, y: number, width: number, height: number} | null}
 *   The geometry, or null when `raw` is not a string or a field is missing.
 */
function parseXdotoolGeometry(raw) {
  if (typeof raw !== "string") {
    return null;
  }
  const values = /* @__PURE__ */ new Map();
  for (const line of raw.split(/\r?\n/)) {
    // Trim first so stray spaces or tabs around a line do not hide a valid entry.
    const match = /^([A-Z]+)=(-?\d+)$/.exec(line.trim());
    if (match) {
      values.set(match[1], Number(match[2]));
    }
  }
  const [x, y, width, height] = ["X", "Y", "WIDTH", "HEIGHT"].map((field) => values.get(field));
  if ([x, y, width, height].every((value2) => Number.isFinite(value2))) {
    return { x, y, width, height };
  }
  return null;
}
|
|
4385
4604
|
function windowsMouseScript(x, y, down, up, clicks = 1) {
|
|
4386
4605
|
const clickBody = down == null || up == null ? "" : `
|
|
4387
4606
|
for ($i = 0; $i -lt ${clicks}; $i++) {
|
|
@@ -4505,6 +4724,26 @@ function run(command, timeout2) {
|
|
|
4505
4724
|
};
|
|
4506
4725
|
}
|
|
4507
4726
|
}
|
|
4727
|
+
/**
 * Run a shell command synchronously and capture its stdout as UTF-8 text.
 *
 * @param {string} command - Command line passed to execSync.
 * @param {number} timeout2 - Timeout in milliseconds.
 * @returns {{ok: boolean, output: string, message: string}}
 *   On success `output` is stdout and `message` is empty. On failure `output`
 *   is whatever stdout was captured and `message` is stderr, stdout or the
 *   error text (first non-empty), trimmed and cut to 800 characters.
 */
function runCaptureText(command, timeout2) {
  const spawnOptions = {
    stdio: ["pipe", "pipe", "pipe"],
    timeout: timeout2,
    env: { ...process.env },
    encoding: "utf8"
  };
  let output;
  try {
    output = execSync4(command, spawnOptions);
  } catch (err) {
    const failure = err;
    const capturedOut = bufferishToString(failure.stdout);
    const capturedErr = bufferishToString(failure.stderr);
    const detail = capturedErr || capturedOut || failure.message || String(err);
    return {
      ok: false,
      output: capturedOut,
      message: detail.trim().slice(0, 800)
    };
  }
  return { ok: true, output, message: "" };
}
|
|
4508
4747
|
function bufferishToString(value2) {
|
|
4509
4748
|
if (Buffer.isBuffer(value2))
|
|
4510
4749
|
return value2.toString("utf8");
|
|
@@ -4517,6 +4756,18 @@ function powershellCommand(script) {
|
|
|
4517
4756
|
/**
 * Quote a value for interpolation into a shell command line.
 *
 * The value is JSON-encoded (double-quoted, with `"` and `\` escaped). On
 * non-Windows platforms `$` and backticks are additionally backslash-escaped,
 * because POSIX shells still perform parameter expansion and command
 * substitution inside double quotes. Values without `$` or backticks produce
 * exactly the same output as plain `JSON.stringify`.
 *
 * @param {*} value2 - Value to quote (normally a string).
 * @returns {string|undefined} The quoted text; `undefined` only when
 *   `JSON.stringify` itself returns `undefined` for the input.
 */
function quoteShell(value2) {
  const quoted = JSON.stringify(value2);
  // Windows shells do not treat backslash as an escape character, so the
  // JSON form is left untouched there.
  if (typeof quoted !== "string" || process.platform === "win32") {
    return quoted;
  }
  return quoted.replace(/[$`]/g, "\\$&");
}
|
|
4759
|
+
/**
 * Wrap a string in single quotes for a POSIX shell.
 *
 * Each embedded single quote is replaced by `'"'"'` (close the quote, emit a
 * double-quoted single quote, reopen the quote).
 *
 * @param {string} value2 - Text to quote.
 * @returns {string} Single-quoted shell word.
 */
function quoteShellLiteral(value2) {
  const escapedQuote = `'"'"'`;
  const body = value2.split("'").join(escapedQuote);
  return "'" + body + "'";
}
|
|
4762
|
+
/**
 * Read a boolean flag from the environment.
 *
 * @param {string} name10 - Environment variable name.
 * @param {boolean} fallback - Value returned when the variable is not set.
 * @returns {boolean} `fallback` when unset; otherwise true only when the
 *   trimmed value is 1, true, yes or on (case-insensitive).
 */
function envFlag(name10, fallback) {
  const raw = process.env[name10];
  return raw === void 0 ? fallback : /^(1|true|yes|on)$/i.test(raw.trim());
}
|
|
4768
|
+
/**
 * Produce a double-quoted literal for embedding a value in AppleScript source.
 *
 * The literal is the value's JSON encoding.
 * NOTE(review): JSON escapes such as \uXXXX, \b and \f may not be accepted by
 * AppleScript; confirm before passing text that contains control characters.
 *
 * @param {*} value2 - Value to encode (normally a string).
 * @returns {string} JSON-encoded literal.
 */
function quoteAppleScript(value2) {
  const literal = JSON.stringify(value2);
  return literal;
}
|
|
4520
4771
|
/**
 * Escape a string for use inside a single-quoted PowerShell literal by
 * doubling every single quote.
 *
 * @param {string} value2 - Text to escape.
 * @returns {string} Text with each `'` replaced by `''`.
 */
function psString(value2) {
  const pieces = value2.split("'");
  return pieces.join("''");
}
|
|
@@ -8507,7 +8758,7 @@ function resetMoondreamClient() {
|
|
|
8507
8758
|
/**
 * Return a shallow copy of the module-level `lastPointDiagnostics` collection,
 * so callers cannot mutate the stored list.
 *
 * @returns {Array} New array with the same entries.
 */
function getVisionPointDiagnostics() {
  const snapshot = Array.from(lastPointDiagnostics);
  return snapshot;
}
|
|
8510
|
-
function
|
|
8761
|
+
function envFlag2(value2, fallback = false) {
|
|
8511
8762
|
if (value2 === void 0)
|
|
8512
8763
|
return fallback;
|
|
8513
8764
|
if (/^(1|true|yes|on)$/i.test(value2.trim()))
|
|
@@ -8627,8 +8878,8 @@ function resolveHuggingFaceVisionModelCandidates(preferredModel) {
|
|
|
8627
8878
|
for (const model of splitModelList(process.env["OMNIUS_MOONDREAM_HF_MODELS"]))
|
|
8628
8879
|
add2(model, true);
|
|
8629
8880
|
add2(process.env["MOONDREAM_HF_MODEL"] || "", Boolean(process.env["MOONDREAM_HF_MODEL"]));
|
|
8630
|
-
const fullPreviewAuto =
|
|
8631
|
-
const compactFallbackAuto =
|
|
8881
|
+
const fullPreviewAuto = envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
|
|
8882
|
+
const compactFallbackAuto = envFlag2(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
|
|
8632
8883
|
const hasExplicitHf = candidates.some((candidate) => candidate.explicit);
|
|
8633
8884
|
const hasExplicitNonHf = Boolean(preferred) && !isHuggingFaceVisionModel(preferred);
|
|
8634
8885
|
if (hasExplicitNonHf)
|
|
@@ -8937,7 +9188,7 @@ async function callOllamaVision(ollamaHost, model, prompt, imageBase64, timeoutM
|
|
|
8937
9188
|
return typeof data.response === "string" && data.response.trim() ? data.response : null;
|
|
8938
9189
|
}
|
|
8939
9190
|
/**
 * Decide whether an Ollama vision model should be pulled automatically.
 *
 * Auto-pull is controlled by OMNIUS_OLLAMA_VISION_AUTO_PULL (read through
 * envFlag2 with a default of true) and additionally requires a model name
 * that is not blank.
 *
 * @param {string} model - Model name.
 * @returns {boolean} True when auto-pull is enabled and `model` is non-blank.
 */
function shouldAutoPullOllamaVisionModel(model) {
  const autoPullEnabled = envFlag2(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true);
  if (autoPullEnabled) {
    return model.trim() !== "";
  }
  return false;
}
|
|
@@ -9031,7 +9282,7 @@ async function prepareHuggingFaceVisionCandidate(candidate, diagnostics) {
|
|
|
9031
9282
|
return { python, env: env2, gpuIndex: decision2.lease.gpuIndex, release: () => decision2.lease.release() };
|
|
9032
9283
|
}
|
|
9033
9284
|
function ensureHuggingFaceVisionPython(modelId) {
|
|
9034
|
-
const managed =
|
|
9285
|
+
const managed = envFlag2(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
|
|
9035
9286
|
if (!managed)
|
|
9036
9287
|
return legacyHuggingFaceVisionPython(modelId);
|
|
9037
9288
|
ensureUnifiedCacheDirs();
|
|
@@ -10075,6 +10326,79 @@ function pngDimensions(buffer2) {
|
|
|
10075
10326
|
}
|
|
10076
10327
|
return null;
|
|
10077
10328
|
}
|
|
10329
|
+
/**
 * Describe the element that currently has focus in the page and report
 * whether it can accept typed text.
 *
 * An element counts as editable when it is an input/textarea, is content
 * editable (via `isContentEditable` or an explicit `contenteditable`
 * attribute of "", "true" or "plaintext-only"), or carries an ARIA role of
 * textbox, searchbox or combobox. An element WITHOUT a `contenteditable`
 * attribute is no longer treated as editable just because the attribute
 * lookup yields an empty string.
 *
 * @param {{evaluate: (script: string) => Promise<*>}} pageHandle - Page-like
 *   object whose `evaluate` runs the script in page context.
 * @returns {Promise<object|null>} Tag, id, name, role, labels, text (first
 *   120 characters), `isEditable` and bounding rect of the focused element,
 *   or null when there is none.
 */
async function describeFocusedEditable(pageHandle) {
  const active = await pageHandle.evaluate(`(() => {
    const el = document.activeElement;
    if (!el) return null;
    const rect = el.getBoundingClientRect();
    const role = (el.getAttribute("role") || "").toLowerCase();
    const editableAttr = el.hasAttribute("contenteditable")
      ? String(el.getAttribute("contenteditable") || "").toLowerCase()
      : null;
    const isEditable = el.matches("input, textarea")
      || el.isContentEditable === true
      || editableAttr === "" || editableAttr === "true" || editableAttr === "plaintext-only"
      || ["textbox", "searchbox", "combobox"].includes(role);
    return {
      tag: String(el.tagName || "").toLowerCase(),
      id: el.id || "",
      name: el.getAttribute("name") || "",
      role,
      ariaLabel: el.getAttribute("aria-label") || "",
      type: el.getAttribute("type") || "",
      placeholder: el.getAttribute("placeholder") || "",
      text: String(el.textContent || "").trim().slice(0, 120),
      isEditable,
      rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
    };
  })()`);
  return active && typeof active === "object" ? active : null;
}
|
|
10354
|
+
/**
 * Locate a form field by natural-language target, click it, and replace its
 * contents with `text`.
 *
 * A first lookup is made around the viewport centre; if it finds nothing, a
 * second lookup is made with two extra flags set (the code then checks the
 * result's `scrolledIntoView` marker). After clicking, the focused element
 * must report `isEditable`; then select-all is pressed and the text is typed.
 *
 * @param {object} pageHandle - Playwright-style page (mouse, keyboard,
 *   waitForTimeout, optional viewportSize).
 * @param {string} target - Natural-language description of the field.
 * @param {string} text - Text to type into the field.
 * @param {number} typingDelay - Per-character delay passed to keyboard.type.
 * @returns {Promise<{candidate: object, active: object, source: string}>}
 * @throws {Error} When no candidate with a usable centre is found, or when
 *   the click does not focus an editable element.
 */
async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
  const { width, height } = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
  const midX = width / 2;
  const midY = height / 2;

  let source = "dom-candidate";
  let candidate = await findBrowserVisualCandidate(pageHandle, target, midX, midY, true);
  if (!candidate) {
    candidate = await findBrowserVisualCandidate(pageHandle, target, midX, midY, true, true, true);
    if (candidate?.["scrolledIntoView"] === true) {
      source = `${source}+scroll`;
    }
    if (candidate) {
      await pageHandle.waitForTimeout(150);
    }
  }

  const center = candidate?.["center"];
  const x = Number(center?.x);
  const y = Number(center?.y);
  const hasPoint = Number.isFinite(x) && Number.isFinite(y);
  if (!hasPoint) {
    throw new Error(`No visible editable candidate matched target "${target}". Run observe_bundle or dom_summary to inspect available labels/selectors.`);
  }

  await pageHandle.mouse.click(x, y);
  await pageHandle.waitForTimeout(80);

  const active = await describeFocusedEditable(pageHandle);
  if (!active?.["isEditable"]) {
    let matched = "(none)";
    if (candidate) {
      matched = JSON.stringify({
        tag: candidate["tag"],
        text: candidate["text"],
        ariaLabel: candidate["ariaLabel"],
        placeholder: candidate["placeholder"],
        name: candidate["name"]
      });
    }
    throw new Error(`Target "${target}" was clicked, but no editable element became focused. Matched element: ${matched}.`);
  }

  // Select the existing content so the typed text replaces it.
  const selectAll = process.platform === "darwin" ? "Meta+A" : "Control+A";
  await pageHandle.keyboard.press(selectAll);
  await pageHandle.keyboard.type(text, { delay: typingDelay });
  return { candidate, active, source };
}
|
|
10388
|
+
/**
 * Turn a failed page `evaluate` into a message with targeted hints.
 *
 * The first line is the error text cut to 500 characters; each applicable
 * hint follows on its own line prefixed with "Hint: ".
 *
 * @param {*} err - The thrown value (an Error's message is used; anything
 *   else is stringified).
 * @param {string} code8 - The JavaScript source that was evaluated.
 * @returns {string} Newline-joined message.
 */
function evaluateFailureMessage(err, code8) {
  const raw = err instanceof Error ? err.message : String(err);
  const rules = [
    {
      applies: () => /map is not a function/i.test(raw) && /querySelectorAll/i.test(code8),
      hint: "document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...)."
    },
    {
      applies: () => /(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8),
      hint: "Do not fill modern React/Vue/Svelte forms by assigning .value in evaluate; use playwright_browser fill, or visual_click the field then type, so input/change events fire."
    },
    {
      applies: () => /querySelectorAll|querySelector/.test(code8),
      hint: "For page inspection, prefer query_all, dom_summary, or observe_bundle before raw evaluate."
    }
  ];
  const lines = [raw.slice(0, 500)];
  for (const rule of rules) {
    if (rule.applies()) {
      lines.push(`Hint: ${rule.hint}`);
    }
  }
  return lines.join("\n");
}
|
|
10078
10402
|
function buildImageMarker(buffer2) {
|
|
10079
10403
|
let mimeType = "image/png";
|
|
10080
10404
|
let out = buffer2;
|
|
@@ -10308,7 +10632,7 @@ var init_playwright_browser = __esm({
|
|
|
10308
10632
|
PLAYWRIGHT_BROWSERS_DIR = join13(PLAYWRIGHT_RUNTIME_DIR, "browsers");
|
|
10309
10633
|
PlaywrightBrowserTool = class {
|
|
10310
10634
|
name = "playwright_browser";
|
|
10311
|
-
description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
|
|
10635
|
+
description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Use fill with a selector or natural-language target for form fields; avoid raw evaluate for form filling because direct .value assignment does not fire app input/change events. This is a separate browser/runtime from browser_action; once you start a workflow here, continue here unless you intentionally navigate browser_action to the same URL. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
|
|
10312
10636
|
parameters = {
|
|
10313
10637
|
type: "object",
|
|
10314
10638
|
properties: {
|
|
@@ -10358,7 +10682,7 @@ var init_playwright_browser = __esm({
|
|
|
10358
10682
|
"clear_diagnostics",
|
|
10359
10683
|
"close"
|
|
10360
10684
|
],
|
|
10361
|
-
description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text
|
|
10685
|
+
description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
|
|
10362
10686
|
},
|
|
10363
10687
|
url: {
|
|
10364
10688
|
type: "string",
|
|
@@ -10374,7 +10698,7 @@ var init_playwright_browser = __esm({
|
|
|
10374
10698
|
},
|
|
10375
10699
|
target: {
|
|
10376
10700
|
type: "string",
|
|
10377
|
-
description: "Natural-language browser visual target for visual_click, for example 'the green Continue button' or '
|
|
10701
|
+
description: "Natural-language browser visual target for visual_click or selector-less fill, for example 'the green Continue button', 'username field', or 'password field'."
|
|
10378
10702
|
},
|
|
10379
10703
|
value: {
|
|
10380
10704
|
type: "string",
|
|
@@ -10494,12 +10818,22 @@ var init_playwright_browser = __esm({
|
|
|
10494
10818
|
return ok(`Clicked: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
|
|
10495
10819
|
}
|
|
10496
10820
|
case "fill": {
|
|
10497
|
-
if (!selector)
|
|
10498
|
-
return fail("selector is required", start2);
|
|
10499
10821
|
if (text === void 0)
|
|
10500
10822
|
return fail("text is required", start2);
|
|
10501
|
-
|
|
10502
|
-
|
|
10823
|
+
const typingDelay = typeof args.typing_delay_ms === "number" ? Math.max(0, Math.min(500, Math.round(args.typing_delay_ms))) : 20;
|
|
10824
|
+
if (selector) {
|
|
10825
|
+
const resolvedSelector = resolveDomSummarySelector(selector);
|
|
10826
|
+
if (!resolvedSelector)
|
|
10827
|
+
return fail(`No selector known for DOM summary reference ${selector}; run dom_summary and use the emitted selector.`, start2);
|
|
10828
|
+
await page.fill(resolvedSelector, text, { timeout: timeout2 });
|
|
10829
|
+
return ok(`Filled ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""} with "${text}"`, start2);
|
|
10830
|
+
}
|
|
10831
|
+
const target = typeof args.target === "string" && args.target.trim() ? args.target.trim() : "";
|
|
10832
|
+
if (!target)
|
|
10833
|
+
return fail("selector or target is required for fill. Prefer target for visual/natural-language form fields, e.g. target='username field'.", start2);
|
|
10834
|
+
const result = await clickAndFillBrowserTarget(page, target, text, typingDelay);
|
|
10835
|
+
const active = result.active ?? {};
|
|
10836
|
+
return ok(`Filled target "${target}" via ${result.source} into <${active["tag"] || "element"}>${active["name"] ? ` name=${JSON.stringify(active["name"])}` : ""}${active["placeholder"] ? ` placeholder=${JSON.stringify(active["placeholder"])}` : ""}${active["ariaLabel"] ? ` aria-label=${JSON.stringify(active["ariaLabel"])}` : ""}.`, start2);
|
|
10503
10837
|
}
|
|
10504
10838
|
case "type": {
|
|
10505
10839
|
if (text === void 0)
|
|
@@ -10642,9 +10976,13 @@ var init_playwright_browser = __esm({
|
|
|
10642
10976
|
case "evaluate": {
|
|
10643
10977
|
if (!text)
|
|
10644
10978
|
return fail("text (JavaScript code) is required", start2);
|
|
10645
|
-
|
|
10646
|
-
|
|
10647
|
-
|
|
10979
|
+
try {
|
|
10980
|
+
const result = await page.evaluate(text);
|
|
10981
|
+
const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
|
|
10982
|
+
return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
|
|
10983
|
+
} catch (err2) {
|
|
10984
|
+
return fail(evaluateFailureMessage(err2, text), start2);
|
|
10985
|
+
}
|
|
10648
10986
|
}
|
|
10649
10987
|
// ── Screenshot / PDF ──
|
|
10650
10988
|
case "screenshot": {
|
|
@@ -23743,8 +24081,8 @@ var init_explore_tools = __esm({
|
|
|
23743
24081
|
enter_worktree: "Create isolated git worktree for safe parallel file modifications",
|
|
23744
24082
|
exit_worktree: "Exit and optionally remove a git worktree (keep for merge or discard)",
|
|
23745
24083
|
notebook_edit: "Edit Jupyter .ipynb notebooks at cell level (list, replace, insert, delete cells)",
|
|
23746
|
-
browser_action: "Interactive browser: login, fill forms, click buttons, screenshot — session persists between calls;
|
|
23747
|
-
playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, focused-element typing
|
|
24084
|
+
browser_action: "Interactive Selenium browser: login, fill forms, click buttons, screenshot — session persists between browser_action calls only; separate runtime from playwright_browser",
|
|
24085
|
+
playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, selector/target fill, focused-element typing, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
|
|
23748
24086
|
carbonyl_browser: "Terminal-rendered real browser automation via Carbonyl: navigate, read rendered text, click/type, sessions, daemon mode",
|
|
23749
24087
|
scheduler: "Schedule tasks for automatic future execution via OS cron",
|
|
23750
24088
|
cronjob: "Alias for scheduler: OS cron-backed time triggers",
|
|
@@ -282954,6 +283292,18 @@ var init_vision_action_loop = __esm({
|
|
|
282954
283292
|
enum: ["single", "double"],
|
|
282955
283293
|
description: "Click type for click operation. Default single."
|
|
282956
283294
|
},
|
|
283295
|
+
text: {
|
|
283296
|
+
type: "string",
|
|
283297
|
+
description: "Optional text to type after a live click, or into the currently focused desktop control when operation='none'."
|
|
283298
|
+
},
|
|
283299
|
+
key: {
|
|
283300
|
+
type: "string",
|
|
283301
|
+
description: "Optional key/chord to press after a live click/text entry, for example Enter, Escape, Tab, ctrl+f."
|
|
283302
|
+
},
|
|
283303
|
+
typing_delay_ms: {
|
|
283304
|
+
type: "number",
|
|
283305
|
+
description: "Per-character delay for desktop text entry. Default 10ms."
|
|
283306
|
+
},
|
|
282957
283307
|
index: {
|
|
282958
283308
|
type: "number",
|
|
282959
283309
|
description: "If multiple target points are found, use this 1-based index. Default 1."
|
|
@@ -282978,6 +283328,10 @@ var init_vision_action_loop = __esm({
|
|
|
282978
283328
|
type: "string",
|
|
282979
283329
|
description: "Optional screenshot output directory. Relative paths resolve from the workspace."
|
|
282980
283330
|
},
|
|
283331
|
+
window_title: {
|
|
283332
|
+
type: "string",
|
|
283333
|
+
description: "Optional X11/XWayland window title to capture instead of the whole desktop. Useful on Wayland when root screenshots are blocked."
|
|
283334
|
+
},
|
|
282981
283335
|
clear_artifacts: {
|
|
282982
283336
|
type: "boolean",
|
|
282983
283337
|
description: "Only for reset. If true, also deletes this session's screenshot directory."
|
|
@@ -283064,8 +283418,12 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283064
283418
|
const includeVision = asBoolean(args["include_vision"], true);
|
|
283065
283419
|
const visionPromptOverride = asString(args["vision_prompt"]);
|
|
283066
283420
|
const language = asString(args["language"]) || "eng";
|
|
283421
|
+
const windowTitle = asString(args["window_title"]);
|
|
283067
283422
|
const button = this.parseButton(args["button"]);
|
|
283068
283423
|
const clickType = args["click_type"] === "double" ? "double" : "single";
|
|
283424
|
+
const textToType = asString(args["text"]);
|
|
283425
|
+
const keyToPress = asString(args["key"]);
|
|
283426
|
+
const typingDelayMs = clampInteger2(args["typing_delay_ms"], 10, 0, 500);
|
|
283069
283427
|
const index = clampInteger2(args["index"], 1, 1, 100);
|
|
283070
283428
|
const delayMs = clampInteger2(args["delay_ms"], 0, 0, 6e4);
|
|
283071
283429
|
const maxSteps = action === "run" ? clampInteger2(args["max_steps"], DEFAULT_MAX_STEPS, 1, HARD_MAX_STEPS) : 1;
|
|
@@ -283088,13 +283446,28 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283088
283446
|
}
|
|
283089
283447
|
const stamp = timestampSlug2();
|
|
283090
283448
|
const screenshotPath = join52(sessionDir2, `${stamp}-step-${step}-before.png`);
|
|
283091
|
-
if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283092
|
-
yield "Vision action loop:
|
|
283449
|
+
if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283450
|
+
yield "Vision action loop: capturing full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
|
|
283093
283451
|
}
|
|
283094
283452
|
yield `Vision action loop: capturing screenshot ${step}/${maxSteps}`;
|
|
283095
283453
|
let screenshotBackend = "";
|
|
283454
|
+
let captureOffset = { x: 0, y: 0 };
|
|
283455
|
+
let captureWindow;
|
|
283096
283456
|
try {
|
|
283097
|
-
|
|
283457
|
+
if (windowTitle) {
|
|
283458
|
+
const windowCapture = captureDesktopWindowScreenshot(screenshotPath, windowTitle);
|
|
283459
|
+
screenshotBackend = windowCapture.backend;
|
|
283460
|
+
captureOffset = { x: windowCapture.x, y: windowCapture.y };
|
|
283461
|
+
captureWindow = {
|
|
283462
|
+
windowId: windowCapture.windowId,
|
|
283463
|
+
x: windowCapture.x,
|
|
283464
|
+
y: windowCapture.y,
|
|
283465
|
+
width: windowCapture.width,
|
|
283466
|
+
height: windowCapture.height
|
|
283467
|
+
};
|
|
283468
|
+
} else {
|
|
283469
|
+
screenshotBackend = captureDesktopScreenshot(screenshotPath);
|
|
283470
|
+
}
|
|
283098
283471
|
mutatedFiles.push(screenshotPath);
|
|
283099
283472
|
} catch (err) {
|
|
283100
283473
|
success = false;
|
|
@@ -283121,6 +283494,9 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283121
283494
|
outputLines.push(`Screenshot saved: ${screenshotPath}`);
|
|
283122
283495
|
outputLines.push(`Screen: ${dims.width}x${dims.height}`);
|
|
283123
283496
|
outputLines.push(`Screenshot backend: ${screenshotBackend}`);
|
|
283497
|
+
if (captureWindow) {
|
|
283498
|
+
outputLines.push(`Window: ${windowTitle} id=${captureWindow.windowId} geometry=${captureWindow.x},${captureWindow.y} ${captureWindow.width}x${captureWindow.height}`);
|
|
283499
|
+
}
|
|
283124
283500
|
let ocr = null;
|
|
283125
283501
|
if (includeOcr) {
|
|
283126
283502
|
yield `Vision action loop: running OCR for screenshot ${step}/${maxSteps}`;
|
|
@@ -283209,6 +283585,14 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283209
283585
|
selectedPoint = pointFromVisionResult({ points: ocrPoints, source: "tesseract-ocr" }, index, dims);
|
|
283210
283586
|
}
|
|
283211
283587
|
if (selectedPoint) {
|
|
283588
|
+
if (captureOffset.x !== 0 || captureOffset.y !== 0) {
|
|
283589
|
+
selectedPoint = {
|
|
283590
|
+
...selectedPoint,
|
|
283591
|
+
pixelX: selectedPoint.pixelX + captureOffset.x,
|
|
283592
|
+
pixelY: selectedPoint.pixelY + captureOffset.y,
|
|
283593
|
+
source: `${selectedPoint.source}+window:${captureWindow?.windowId ?? "active"}`
|
|
283594
|
+
};
|
|
283595
|
+
}
|
|
283212
283596
|
outputLines.push(`Target: ${effectiveTarget}`);
|
|
283213
283597
|
outputLines.push(`Point source: ${selectedPoint.source}`);
|
|
283214
283598
|
outputLines.push(`Mapped point: (${Math.round(selectedPoint.pixelX)}, ${Math.round(selectedPoint.pixelY)}) normalized (${selectedPoint.x.toFixed(4)}, ${selectedPoint.y.toFixed(4)})`);
|
|
@@ -283238,15 +283622,29 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283238
283622
|
} else {
|
|
283239
283623
|
yield `Vision action loop: performing ${effectiveOperation} at (${pixelX}, ${pixelY})`;
|
|
283240
283624
|
try {
|
|
283625
|
+
if (captureWindow?.windowId) {
|
|
283626
|
+
const activationBackend = activateDesktopWindow(captureWindow.windowId);
|
|
283627
|
+
outputLines.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
|
|
283628
|
+
}
|
|
283241
283629
|
const backend = effectiveOperation === "move" ? moveDesktopPointer(pixelX, pixelY) : clickDesktopAt(pixelX, pixelY, button, clickType);
|
|
283242
283630
|
actionTaken = effectiveOperation === "move" ? `Moved pointer to (${pixelX}, ${pixelY}) via ${backend}` : `Clicked at (${pixelX}, ${pixelY}) via ${backend} [${button} ${clickType}]`;
|
|
283243
283631
|
outputLines.push(actionTaken);
|
|
283632
|
+
if (textToType) {
|
|
283633
|
+
const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
|
|
283634
|
+
outputLines.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
|
|
283635
|
+
actionTaken += `; typed text via ${keyboardBackend}`;
|
|
283636
|
+
}
|
|
283637
|
+
if (keyToPress) {
|
|
283638
|
+
const keyboardBackend = pressDesktopKey(keyToPress);
|
|
283639
|
+
outputLines.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
|
|
283640
|
+
actionTaken += `; pressed key via ${keyboardBackend}: ${keyToPress}`;
|
|
283641
|
+
}
|
|
283244
283642
|
afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
|
|
283245
|
-
if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283246
|
-
yield "Vision action loop:
|
|
283643
|
+
if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283644
|
+
yield "Vision action loop: capturing post-action full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
|
|
283247
283645
|
}
|
|
283248
283646
|
yield `Vision action loop: capturing post-action screenshot ${step}/${maxSteps}`;
|
|
283249
|
-
const afterBackend = captureDesktopScreenshot(afterScreenshotPath);
|
|
283647
|
+
const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
|
|
283250
283648
|
mutatedFiles.push(afterScreenshotPath);
|
|
283251
283649
|
outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
|
|
283252
283650
|
outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
|
|
@@ -283260,6 +283658,35 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283260
283658
|
}
|
|
283261
283659
|
} else if (effectiveOperation !== "none" && (effectiveTarget || hasCoordinates)) {
|
|
283262
283660
|
outputLines.push(`Action skipped: no usable point for operation '${effectiveOperation}'.`);
|
|
283661
|
+
} else if (allowActions && !dryRun && (textToType || keyToPress)) {
|
|
283662
|
+
try {
|
|
283663
|
+
const keyboardActions = [];
|
|
283664
|
+
if (captureWindow?.windowId) {
|
|
283665
|
+
const activationBackend = activateDesktopWindow(captureWindow.windowId);
|
|
283666
|
+
keyboardActions.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
|
|
283667
|
+
}
|
|
283668
|
+
if (textToType) {
|
|
283669
|
+
const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
|
|
283670
|
+
keyboardActions.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
|
|
283671
|
+
}
|
|
283672
|
+
if (keyToPress) {
|
|
283673
|
+
const keyboardBackend = pressDesktopKey(keyToPress);
|
|
283674
|
+
keyboardActions.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
|
|
283675
|
+
}
|
|
283676
|
+
actionTaken = keyboardActions.join("; ");
|
|
283677
|
+
outputLines.push(actionTaken);
|
|
283678
|
+
afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
|
|
283679
|
+
const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
|
|
283680
|
+
mutatedFiles.push(afterScreenshotPath);
|
|
283681
|
+
outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
|
|
283682
|
+
outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
|
|
283683
|
+
} catch (err) {
|
|
283684
|
+
success = false;
|
|
283685
|
+
actionError = err instanceof Error ? err.message : String(err);
|
|
283686
|
+
error = actionError;
|
|
283687
|
+
outputLines.push(`Keyboard action failed: ${actionError}`);
|
|
283688
|
+
outputLines.push(desktopAutomationRecoveryMessage());
|
|
283689
|
+
}
|
|
283263
283690
|
} else if (action === "run" && !effectiveTarget && !hasCoordinates) {
|
|
283264
283691
|
success = false;
|
|
283265
283692
|
error = "Vision loop stopped: no target or coordinates were provided and visual planning did not identify a clickable target. Stopping instead of repeating observe-only screenshots.";
|
|
@@ -284152,6 +284579,7 @@ async function ensureSession(options2 = {}) {
|
|
|
284152
284579
|
}
|
|
284153
284580
|
activeSessionId = null;
|
|
284154
284581
|
activeSessionHeadless = null;
|
|
284582
|
+
activeSessionUrl = null;
|
|
284155
284583
|
}
|
|
284156
284584
|
}
|
|
284157
284585
|
if (activeSessionId) {
|
|
@@ -284163,6 +284591,13 @@ async function ensureSession(options2 = {}) {
|
|
|
284163
284591
|
}
|
|
284164
284592
|
activeSessionId = null;
|
|
284165
284593
|
activeSessionHeadless = null;
|
|
284594
|
+
activeSessionUrl = null;
|
|
284595
|
+
}
|
|
284596
|
+
if (options2.allowCreate === false) {
|
|
284597
|
+
return {
|
|
284598
|
+
error: "No active browser_action Selenium session exists for this action. browser_action is a separate browser/runtime from playwright_browser; continue the current page with playwright_browser, or call browser_action({action:'navigate', url: ...}) first.",
|
|
284599
|
+
sessionId: ""
|
|
284600
|
+
};
|
|
284166
284601
|
}
|
|
284167
284602
|
const headless = options2.headless ?? defaultBrowserHeadless();
|
|
284168
284603
|
const res = await fetch(`${BASE_URL}/session/start`, {
|
|
@@ -284180,8 +284615,16 @@ async function ensureSession(options2 = {}) {
|
|
|
284180
284615
|
return { error: String(data.message ?? "Failed to start browser session"), sessionId: "" };
|
|
284181
284616
|
activeSessionId = data.session_id;
|
|
284182
284617
|
activeSessionHeadless = headless;
|
|
284618
|
+
activeSessionUrl = null;
|
|
284183
284619
|
return { sessionId: activeSessionId };
|
|
284184
284620
|
}
|
|
284621
|
+
function browserActionRuntimeHint() {
|
|
284622
|
+
return [
|
|
284623
|
+
"browser_action is a separate browser/runtime from playwright_browser and uses its own Selenium/Chrome session; it does not share page state, cookies, focus, or navigation.",
|
|
284624
|
+
activeSessionUrl ? `Current browser_action URL: ${activeSessionUrl}` : "Current browser_action URL: unknown or not navigated.",
|
|
284625
|
+
"If this page was opened with playwright_browser, keep using playwright_browser actions such as dom_summary, fill, type, press, visual_click, and observe_bundle."
|
|
284626
|
+
].join(" ");
|
|
284627
|
+
}
|
|
284185
284628
|
async function apiCall(endpoint, method = "POST", body) {
|
|
284186
284629
|
const options2 = {
|
|
284187
284630
|
method,
|
|
@@ -284204,7 +284647,7 @@ async function apiCall(endpoint, method = "POST", body) {
|
|
|
284204
284647
|
const res = await fetch(url, options2);
|
|
284205
284648
|
return await res.json();
|
|
284206
284649
|
}
|
|
284207
|
-
var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, BrowserActionTool;
|
|
284650
|
+
var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
|
|
284208
284651
|
var init_browser_action = __esm({
|
|
284209
284652
|
"packages/execution/dist/tools/browser-action.js"() {
|
|
284210
284653
|
"use strict";
|
|
@@ -284217,9 +284660,10 @@ var init_browser_action = __esm({
|
|
|
284217
284660
|
serviceProcess = null;
|
|
284218
284661
|
activeSessionId = null;
|
|
284219
284662
|
activeSessionHeadless = null;
|
|
284663
|
+
activeSessionUrl = null;
|
|
284220
284664
|
BrowserActionTool = class {
|
|
284221
284665
|
name = "browser_action";
|
|
284222
|
-
description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
|
|
284666
|
+
description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
|
|
284223
284667
|
parameters = {
|
|
284224
284668
|
type: "object",
|
|
284225
284669
|
properties: {
|
|
@@ -284289,27 +284733,38 @@ var init_browser_action = __esm({
|
|
|
284289
284733
|
const requestedWidth = args.width == null ? void 0 : asPositiveInt2(args.width, 1280, 320, 3840);
|
|
284290
284734
|
const requestedHeight = args.height == null ? void 0 : asPositiveInt2(args.height, 720, 240, 2160);
|
|
284291
284735
|
const requestedScale = args.device_scale_factor == null ? void 0 : asPositiveNumber(args.device_scale_factor, 1, 0.25, 3);
|
|
284292
|
-
const launchErr = await launchService();
|
|
284293
|
-
if (launchErr) {
|
|
284294
|
-
return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
|
|
284295
|
-
}
|
|
284296
284736
|
if (action === "close") {
|
|
284297
|
-
if (activeSessionId) {
|
|
284737
|
+
if (activeSessionId || await probeService()) {
|
|
284298
284738
|
try {
|
|
284299
284739
|
await apiCall("/session/close");
|
|
284300
284740
|
} catch {
|
|
284301
284741
|
}
|
|
284302
284742
|
activeSessionId = null;
|
|
284303
284743
|
activeSessionHeadless = null;
|
|
284744
|
+
activeSessionUrl = null;
|
|
284304
284745
|
}
|
|
284305
284746
|
return { success: true, output: "Browser session closed.", durationMs: Date.now() - start2 };
|
|
284306
284747
|
}
|
|
284748
|
+
const actionStartsSession = action === "navigate";
|
|
284749
|
+
if (!actionStartsSession && !activeSessionId) {
|
|
284750
|
+
return {
|
|
284751
|
+
success: false,
|
|
284752
|
+
output: "",
|
|
284753
|
+
error: `browser_action ${action || "(missing action)"} requires an active browser_action session. ` + browserActionRuntimeHint(),
|
|
284754
|
+
durationMs: Date.now() - start2
|
|
284755
|
+
};
|
|
284756
|
+
}
|
|
284757
|
+
const launchErr = await launchService();
|
|
284758
|
+
if (launchErr) {
|
|
284759
|
+
return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
|
|
284760
|
+
}
|
|
284307
284761
|
const session = await ensureSession({
|
|
284308
284762
|
width: requestedWidth,
|
|
284309
284763
|
height: requestedHeight,
|
|
284310
284764
|
deviceScaleFactor: requestedScale,
|
|
284311
284765
|
headless: asOptionalBoolean2(args.headless),
|
|
284312
|
-
forceNew: asOptionalBoolean2(args.force_new) === true
|
|
284766
|
+
forceNew: asOptionalBoolean2(args.force_new) === true,
|
|
284767
|
+
allowCreate: actionStartsSession
|
|
284313
284768
|
});
|
|
284314
284769
|
if (session.error) {
|
|
284315
284770
|
return { success: false, output: "", error: session.error, durationMs: Date.now() - start2 };
|
|
@@ -284327,7 +284782,13 @@ var init_browser_action = __esm({
|
|
|
284327
284782
|
}
|
|
284328
284783
|
result = await apiCall("/navigate", "POST", { url: args.url });
|
|
284329
284784
|
if (result.ok) {
|
|
284330
|
-
|
|
284785
|
+
activeSessionUrl = args.url;
|
|
284786
|
+
return {
|
|
284787
|
+
success: true,
|
|
284788
|
+
output: `Navigated to ${args.url}
|
|
284789
|
+
Runtime: browser_action Selenium/Chrome session. Continue with browser_action for this page, or use playwright_browser separately after navigating it.`,
|
|
284790
|
+
durationMs: Date.now() - start2
|
|
284791
|
+
};
|
|
284331
284792
|
}
|
|
284332
284793
|
const navMsg = String(result.message ?? "Navigation failed");
|
|
284333
284794
|
const navHint = navMsg.toLowerCase().includes("connection") || navMsg.toLowerCase().includes("refused") || navMsg.toLowerCase().includes("err_connection") ? " (the URL appears unreachable — check if the target server is running and accepting connections)" : navMsg.toLowerCase().includes("timeout") ? " (page load timed out — try again or use a different URL)" : "";
|
|
@@ -284349,7 +284810,7 @@ var init_browser_action = __esm({
|
|
|
284349
284810
|
return {
|
|
284350
284811
|
success: false,
|
|
284351
284812
|
output: `Click on ${args.selector} failed: ${clickMsg}`,
|
|
284352
|
-
error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page
|
|
284813
|
+
error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page. ${browserActionRuntimeHint()}`,
|
|
284353
284814
|
durationMs: Date.now() - start2
|
|
284354
284815
|
};
|
|
284355
284816
|
}
|
|
@@ -284391,7 +284852,7 @@ var init_browser_action = __esm({
|
|
|
284391
284852
|
return {
|
|
284392
284853
|
success: false,
|
|
284393
284854
|
output: `Type into ${args.selector} failed: ${typeMsg}`,
|
|
284394
|
-
error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check
|
|
284855
|
+
error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check. ${browserActionRuntimeHint()}`,
|
|
284395
284856
|
durationMs: Date.now() - start2
|
|
284396
284857
|
};
|
|
284397
284858
|
}
|
|
@@ -284532,7 +284993,7 @@ var init_browser_action = __esm({
|
|
|
284532
284993
|
if (!pointResult || pointResult.points.length === 0) {
|
|
284533
284994
|
return {
|
|
284534
284995
|
success: false,
|
|
284535
|
-
output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead
|
|
284996
|
+
output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead. ${browserActionRuntimeHint()}`,
|
|
284536
284997
|
error: "No point backend returned normalized coordinates.",
|
|
284537
284998
|
durationMs: Date.now() - start2
|
|
284538
284999
|
};
|
package/npm-shrinkwrap.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "omnius",
|
|
3
|
-
"version": "1.0.205",
|
|
3
|
+
"version": "1.0.207",
|
|
4
4
|
"lockfileVersion": 3,
|
|
5
5
|
"requires": true,
|
|
6
6
|
"packages": {
|
|
7
7
|
"": {
|
|
8
8
|
"name": "omnius",
|
|
9
|
-
"version": "1.0.205",
|
|
9
|
+
"version": "1.0.207",
|
|
10
10
|
"bundleDependencies": [
|
|
11
11
|
"image-to-ascii"
|
|
12
12
|
],
|
|
@@ -4565,9 +4565,19 @@
|
|
|
4565
4565
|
}
|
|
4566
4566
|
},
|
|
4567
4567
|
"node_modules/js-yaml": {
|
|
4568
|
-
"version": "4.
|
|
4569
|
-
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.
|
|
4570
|
-
"integrity": "sha512-
|
|
4568
|
+
"version": "4.2.0",
|
|
4569
|
+
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.2.0.tgz",
|
|
4570
|
+
"integrity": "sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==",
|
|
4571
|
+
"funding": [
|
|
4572
|
+
{
|
|
4573
|
+
"type": "github",
|
|
4574
|
+
"url": "https://github.com/sponsors/puzrin"
|
|
4575
|
+
},
|
|
4576
|
+
{
|
|
4577
|
+
"type": "github",
|
|
4578
|
+
"url": "https://github.com/sponsors/nodeca"
|
|
4579
|
+
}
|
|
4580
|
+
],
|
|
4571
4581
|
"license": "MIT",
|
|
4572
4582
|
"dependencies": {
|
|
4573
4583
|
"argparse": "^2.0.1"
|
package/package.json
CHANGED