omnius 1.0.205 → 1.0.206
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +354 -14
- package/npm-shrinkwrap.json +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -3987,9 +3987,33 @@ var init_system_deps = __esm({
|
|
|
3987
3987
|
// packages/execution/dist/tools/desktop-control.js
|
|
3988
3988
|
import { execSync as execSync4 } from "node:child_process";
|
|
3989
3989
|
import { existsSync as existsSync6, statSync as statSync3 } from "node:fs";
|
|
3990
|
+
/**
 * Bring an X11/XWayland window to the foreground.
 *
 * On Linux, tries `xdotool windowactivate --sync` and then `wmctrl -ia`.
 * Every backend that is unavailable or fails is recorded, so the thrown error
 * explains what was tried instead of listing nothing.
 *
 * @param {string} windowId - Window id to activate (shell-quoted before use).
 * @returns {string} Label of the backend that succeeded ("xdotool" or "wmctrl").
 * @throws {Error} When no backend succeeds; the message contains the formatted
 *   attempts followed by desktopAutomationRecoveryMessage().
 */
function activateDesktopWindow(windowId) {
  const attempts = [];
  if (process.platform === "linux") {
    const quotedId = quoteShell(windowId);
    const backends = [
      ["xdotool", `xdotool windowactivate --sync ${quotedId}`],
      ["wmctrl", `wmctrl -ia ${quotedId}`]
    ];
    for (const [label, command] of backends) {
      if (!hasCommand2(label)) {
        // Record the skip so the final error names the missing tool.
        attempts.push({ label, message: "not found on PATH" });
        continue;
      }
      const result = run(command, 5e3);
      if (result.ok)
        return label;
      attempts.push({ label, message: result.message });
    }
  } else {
    attempts.push({ label: "platform", message: `window activation is not implemented for ${process.platform}` });
  }
  throw new Error("No desktop window activation backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
|
|
3990
4013
|
function captureDesktopScreenshot(outputPath3) {
|
|
3991
4014
|
const attempts = [];
|
|
3992
4015
|
const out = quoteShell(outputPath3);
|
|
4016
|
+
const allowInteractiveWaylandScreenshot = envFlag("OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT", false);
|
|
3993
4017
|
const tryCapture = (label, command, timeout2 = 1e4) => {
|
|
3994
4018
|
const result = run(command, timeout2);
|
|
3995
4019
|
if (result.ok && existsSync6(outputPath3)) {
|
|
@@ -4022,11 +4046,22 @@ $bitmap.Dispose()
|
|
|
4022
4046
|
return backend;
|
|
4023
4047
|
} else if (process.platform === "linux") {
|
|
4024
4048
|
const desktop = `${process.env["XDG_CURRENT_DESKTOP"] || ""} ${process.env["DESKTOP_SESSION"] || ""}`;
|
|
4025
|
-
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) &&
|
|
4049
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && /gnome/i.test(desktop) && hasCommand2("gdbus") && allowInteractiveWaylandScreenshot) {
|
|
4050
|
+
const backend = tryCapture("gnome-shell-interactive-screenshot-dbus", gnomeInteractiveScreenshotCommand(outputPath3), 13e4);
|
|
4051
|
+
if (backend)
|
|
4052
|
+
return backend;
|
|
4053
|
+
}
|
|
4054
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor") && allowInteractiveWaylandScreenshot) {
|
|
4026
4055
|
const backend = tryCapture("xdg-desktop-portal-screenshot", portalScreenshotCommand(outputPath3), 13e4);
|
|
4027
4056
|
if (backend)
|
|
4028
4057
|
return backend;
|
|
4029
4058
|
}
|
|
4059
|
+
if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && !allowInteractiveWaylandScreenshot) {
|
|
4060
|
+
attempts.push({
|
|
4061
|
+
label: "interactive-wayland-screenshot",
|
|
4062
|
+
message: "skipped by default to avoid unattended GNOME/portal screenshot selection stalls. For desktop app loops, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window. Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt."
|
|
4063
|
+
});
|
|
4064
|
+
}
|
|
4030
4065
|
if (/gnome/i.test(desktop) && hasCommand2("gdbus")) {
|
|
4031
4066
|
const backend = tryCapture("gnome-shell-screenshot-dbus", `gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.Screenshot false false ${out}`);
|
|
4032
4067
|
if (backend)
|
|
@@ -4079,6 +4114,83 @@ $bitmap.Dispose()
|
|
|
4079
4114
|
}
|
|
4080
4115
|
throw new Error("No desktop screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
|
|
4081
4116
|
}
|
|
4117
|
+
/**
 * Capture a single X11/XWayland window to a file using xdotool + ImageMagick `import`.
 *
 * The window is looked up by title (`xdotool search --name`, last match) or,
 * when no title is given, taken from `xdotool getactivewindow`. Its geometry is
 * read with `xdotool getwindowgeometry --shell` and returned with the result.
 *
 * @param {string} outputPath3 - Destination image path.
 * @param {string} [windowTitle] - Title pattern handed to `xdotool search --name`.
 * @returns {{backend: string, windowId: string, x: number, y: number, width: number, height: number}}
 * @throws {Error} On non-Linux platforms, or when any step fails; the message
 *   lists every recorded attempt followed by desktopAutomationRecoveryMessage().
 */
function captureDesktopWindowScreenshot(outputPath3, windowTitle) {
  if (process.platform !== "linux") {
    throw new Error("Window screenshot capture is currently implemented for Linux/X11/XWayland windows only.");
  }
  const attempts = [];
  const out = quoteShell(outputPath3);
  const fail = (label, message) => {
    attempts.push({ label, message });
    return null;
  };
  const runText = (command, timeout2 = 5e3) => {
    const result = runCaptureText(command, timeout2);
    if (result.ok)
      return result.output.trim();
    return fail(command.split(/\s+/)[0] || command, result.message);
  };
  const capture = () => {
    if (!hasCommand2("xdotool"))
      return fail("xdotool", "not found on PATH");
    if (!hasCommand2("import"))
      return fail("import", "ImageMagick import not found on PATH");
    const windowId = windowTitle ? runText(`xdotool search --name ${quoteShell(windowTitle)} | tail -n1`) : runText("xdotool getactivewindow");
    if (windowId === null)
      return null;
    if (!windowId) {
      // The pipeline's exit status is tail's, so "no match" arrives as empty
      // output rather than as a failure; report it explicitly.
      return fail(windowTitle ? "xdotool search" : "xdotool getactivewindow", windowTitle ? `no window matched title ${JSON.stringify(windowTitle)}` : "no active window id was reported");
    }
    const geometry = runText(`xdotool getwindowgeometry --shell ${quoteShell(windowId)}`);
    if (geometry === null)
      return null;
    const parsed = parseXdotoolGeometry(geometry);
    if (!parsed)
      return fail("xdotool getwindowgeometry", `Could not parse geometry for window ${windowId}: ${geometry}`);
    const result = run(`import -window ${quoteShell(windowId)} ${out}`, 1e4);
    if (!result.ok)
      return fail("import-window", result.message);
    if (!existsSync6(outputPath3))
      return fail("import-window", result.message || "import exited successfully but did not create the output file");
    const inspection = inspectScreenshot(outputPath3);
    if (!inspection.ok)
      return fail("import-window", inspection.message);
    return {
      backend: "imagemagick-import-window",
      windowId,
      ...parsed
    };
  };
  const captured = capture();
  if (captured)
    return captured;
  throw new Error("No desktop window screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
|
|
4161
|
+
/**
 * Build the shell command that takes a screenshot through GNOME Shell's
 * interactive screenshot D-Bus method and copies the result to `outputPath3`.
 *
 * The generated bash script calls
 * `org.gnome.Shell.Screenshot.InteractiveScreenshot`, extracts the `file:` URI
 * from the reply, and copies it to the output path with `gio copy` or, failing
 * that, a small python3 snippet.
 *
 * The output path is embedded with single-quote shell quoting so that `$` and
 * backticks in the path are never expanded by bash.
 *
 * @param {string} outputPath3 - Destination image path.
 * @returns {string} A `bash -lc '<script>'` command line.
 */
function gnomeInteractiveScreenshotCommand(outputPath3) {
  const script = `
set -eu
out=${quoteShellLiteral(outputPath3)}
printf '%s\\n' 'Omnius desktop screenshot: choose Full Screen in the GNOME screenshot UI, then press Enter.' >&2
call_out="$(gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.InteractiveScreenshot 2>&1)" || {
  printf '%s\\n' "$call_out" >&2
  exit 1
}
uri="$(printf '%s\\n' "$call_out" | sed -n "s/.*'\\(file:[^']*\\)'.*/\\1/p" | tail -n 1)"
if [ -z "$uri" ]; then
  printf 'GNOME interactive screenshot did not return a file URI: %s\\n' "$call_out" >&2
  exit 1
fi

if command -v gio >/dev/null 2>&1; then
  gio copy -f "$uri" "$out"
elif command -v python3 >/dev/null 2>&1; then
  python3 - "$uri" "$out" <<'PY'
import shutil, sys, urllib.parse
uri, out = sys.argv[1], sys.argv[2]
parsed = urllib.parse.urlparse(uri)
if parsed.scheme != "file":
    raise SystemExit(f"Unsupported screenshot URI scheme: {parsed.scheme}")
shutil.copyfile(urllib.parse.unquote(parsed.path), out)
PY
else
  printf 'Need gio or python3 to copy GNOME screenshot URI %s to %s.\\n' "$uri" "$out" >&2
  exit 1
fi
`.trim();
  return `bash -lc ${quoteShellLiteral(script)}`;
}
|
|
4082
4194
|
function portalScreenshotCommand(outputPath3) {
|
|
4083
4195
|
const script = `
|
|
4084
4196
|
set -eu
|
|
@@ -4086,6 +4198,7 @@ out=${quoteShell(outputPath3)}
|
|
|
4086
4198
|
monitor_file="$(mktemp)"
|
|
4087
4199
|
cleanup() {
|
|
4088
4200
|
if [ -n "\${monpid:-}" ]; then kill "$monpid" >/dev/null 2>&1 || true; fi
|
|
4201
|
+
if [ -n "\${keypid:-}" ]; then kill "$keypid" >/dev/null 2>&1 || true; fi
|
|
4089
4202
|
rm -f "$monitor_file"
|
|
4090
4203
|
}
|
|
4091
4204
|
trap cleanup EXIT
|
|
@@ -4102,6 +4215,25 @@ fi
|
|
|
4102
4215
|
|
|
4103
4216
|
dbus-monitor --session "type='signal',interface='org.freedesktop.portal.Request',member='Response',path='$handle'" > "$monitor_file" 2>&1 &
|
|
4104
4217
|
monpid=$!
|
|
4218
|
+
if [ "\${OMNIUS_SCREENSHOT_AUTO_CONFIRM:-1}" != "0" ]; then
|
|
4219
|
+
(
|
|
4220
|
+
sleep 1
|
|
4221
|
+
i=0
|
|
4222
|
+
while [ "$i" -lt 8 ]; do
|
|
4223
|
+
if command -v xdotool >/dev/null 2>&1; then
|
|
4224
|
+
DISPLAY="\${DISPLAY:-:0}" xdotool key Return >/dev/null 2>&1 || true
|
|
4225
|
+
elif command -v ydotool >/dev/null 2>&1; then
|
|
4226
|
+
ydotool key 28:1 28:0 >/dev/null 2>&1 || true
|
|
4227
|
+
elif command -v dotool >/dev/null 2>&1; then
|
|
4228
|
+
printf 'key enter
|
|
4229
|
+
' | dotool >/dev/null 2>&1 || true
|
|
4230
|
+
fi
|
|
4231
|
+
i=$((i + 1))
|
|
4232
|
+
sleep 0.5
|
|
4233
|
+
done
|
|
4234
|
+
) &
|
|
4235
|
+
keypid=$!
|
|
4236
|
+
fi
|
|
4105
4237
|
deadline=$(( $(date +%s) + 120 ))
|
|
4106
4238
|
while :; do
|
|
4107
4239
|
if grep -q "member=Response" "$monitor_file"; then break; fi
|
|
@@ -4144,7 +4276,7 @@ else
|
|
|
4144
4276
|
exit 1
|
|
4145
4277
|
fi
|
|
4146
4278
|
`.trim();
|
|
4147
|
-
return `bash -lc ${
|
|
4279
|
+
return `bash -lc ${quoteShellLiteral(script)}`;
|
|
4148
4280
|
}
|
|
4149
4281
|
function moveDesktopPointer(x, y) {
|
|
4150
4282
|
const result = performDesktopPointerAction({ x, y, moveOnly: true });
|
|
@@ -4167,6 +4299,25 @@ function clickDesktopAt(x, y, button, clickType) {
|
|
|
4167
4299
|
lastPointer = { x: Math.round(x), y: Math.round(y) };
|
|
4168
4300
|
return result.backend;
|
|
4169
4301
|
}
|
|
4302
|
+
/**
 * Type a string into whatever desktop control currently has keyboard focus.
 *
 * @param {string} text - Text to type.
 * @param {number} [delayMs=10] - Per-character delay forwarded to the backend.
 * @returns {string} Label of the keyboard backend that performed the action.
 * @throws {Error} With the backend-provided message when no backend succeeds.
 */
function typeDesktopText(text, delayMs = 10) {
  const { ok, backend, message } = performDesktopKeyboardAction({ kind: "type", text, delayMs });
  if (ok)
    return backend;
  throw new Error(message);
}
|
|
4312
|
+
/**
 * Press a single key or key chord on the desktop.
 *
 * @param {string} key - Key or chord name handed to the keyboard backend.
 * @returns {string} Label of the keyboard backend that performed the action.
 * @throws {Error} With the backend-provided message when no backend succeeds.
 */
function pressDesktopKey(key) {
  const { ok, backend, message } = performDesktopKeyboardAction({ kind: "key", key });
  if (ok)
    return backend;
  throw new Error(message);
}
|
|
4170
4321
|
function tryRunXdotoolShellFallback(command) {
|
|
4171
4322
|
if (!/\bxdotool\b|\bxdtool\b/.test(command))
|
|
4172
4323
|
return null;
|
|
@@ -4278,7 +4429,9 @@ function desktopAutomationRecoveryMessage(command) {
|
|
|
4278
4429
|
" Linux pointer control: xdotool/X11, ydotool, dotool, python-xlib",
|
|
4279
4430
|
" macOS: cliclick, then System Events",
|
|
4280
4431
|
" Windows: PowerShell user32 input",
|
|
4281
|
-
"On Wayland,
|
|
4432
|
+
"On Wayland, unattended full-desktop screenshots are compositor-restricted and interactive screenshot prompts are skipped by default.",
|
|
4433
|
+
"For desktop app loops on GNOME Wayland, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window without a full-screen prompt.",
|
|
4434
|
+
"Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt.",
|
|
4282
4435
|
"On GNOME Wayland, unattended screenshots may still be denied by compositor policy. Install gnome-screenshot or grant screenshot permission for the session if capture is blocked.",
|
|
4283
4436
|
"On Wayland pointer control, install and enable ydotool or dotool when xdotool cannot open an X display."
|
|
4284
4437
|
].join("\n");
|
|
@@ -4382,6 +4535,72 @@ ${options2.moveOnly ? "" : `for _ in range(${clicks}):
|
|
|
4382
4535
|
message: "No desktop mouse backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
|
|
4383
4536
|
};
|
|
4384
4537
|
}
|
|
4538
|
+
/**
 * Type text or press a key/chord on the local desktop, trying each available
 * backend in turn.
 *
 * Linux backends are tried in order: xdotool, ydotool, dotool. macOS drives
 * System Events through osascript. Any other platform has no backend here and
 * is reported as such.
 *
 * @param {{kind: "type", text: string, delayMs?: number} | {kind: "key", key: string}} options2
 *   What to do. `delayMs` is clamped to 0..500 and defaults to 10 when it is
 *   not a finite number.
 * @returns {{ok: true, backend: string} | {ok: false, message: string}}
 *   The backend label on success, otherwise a message listing every attempt.
 */
function performDesktopKeyboardAction(options2) {
  const attempts = [];
  const note = (label, message) => {
    attempts.push({ label, message });
  };
  const tryAction = (label, command) => {
    const result = run(command, 1e4);
    if (result.ok)
      return label;
    note(label, result.message);
    return null;
  };
  const isTyping = options2.kind === "type";
  const text = String(options2.text ?? "");
  const key = String(options2.key ?? "").trim();

  // dotool reads one command per line on stdin. Split typed text on newlines so
  // that a line break becomes an Enter press instead of letting the remainder
  // of the text be parsed as further dotool commands.
  const dotoolLines = () => {
    if (!isTyping)
      return [`key ${key.replace(/\s+/g, " ")}`];
    const lines = [];
    text.split(/\r?\n/).forEach((segment, position) => {
      if (position > 0)
        lines.push("key enter");
      if (segment)
        lines.push(`type ${segment}`);
    });
    return lines;
  };

  // AppleScript `key code` takes a numeric virtual key code, so named keys are
  // translated; single characters go through `keystroke` instead.
  const macKeyCodes = /* @__PURE__ */ new Map([
    ["return", 36],
    ["enter", 36],
    ["tab", 48],
    ["space", 49],
    ["backspace", 51],
    ["delete", 51],
    ["escape", 53],
    ["esc", 53],
    ["home", 115],
    ["pageup", 116],
    ["forwarddelete", 117],
    ["end", 119],
    ["pagedown", 121],
    ["left", 123],
    ["right", 124],
    ["down", 125],
    ["up", 126]
  ]);
  const macModifiers = /* @__PURE__ */ new Map([
    ["cmd", "command down"],
    ["command", "command down"],
    ["super", "command down"],
    ["meta", "command down"],
    ["ctrl", "control down"],
    ["control", "control down"],
    ["alt", "option down"],
    ["opt", "option down"],
    ["option", "option down"],
    ["shift", "shift down"]
  ]);
  const macKeyAction = (chord) => {
    const parts = chord.split("+").map((part) => part.trim());
    let main = parts.pop() ?? "";
    if (main === "" && chord.endsWith("+")) {
      // A trailing "+" names the plus key itself, e.g. "ctrl++".
      main = "+";
      if (parts[parts.length - 1] === "")
        parts.pop();
    }
    if (main === "")
      return null;
    const modifiers = [];
    for (const part of parts) {
      const modifier = macModifiers.get(part.toLowerCase());
      if (!modifier)
        return null;
      if (!modifiers.includes(modifier))
        modifiers.push(modifier);
    }
    const using = modifiers.length > 0 ? ` using {${modifiers.join(", ")}}` : "";
    if (/^\d+$/.test(main))
      return `key code ${Number(main)}${using}`;
    const code = macKeyCodes.get(main.toLowerCase());
    if (code !== void 0)
      return `key code ${code}${using}`;
    if ([...main].length === 1)
      return `keystroke ${quoteAppleScript(main)}${using}`;
    return null;
  };

  if (process.platform === "linux") {
    const requestedDelay = Number(options2.delayMs);
    const delay = Number.isFinite(requestedDelay) ? Math.max(0, Math.min(500, Math.round(requestedDelay))) : 10;
    const backends = [
      // "--" stops option parsing so text that starts with "-" is typed, not parsed.
      ["xdotool", () => isTyping ? `xdotool type --clearmodifiers --delay ${delay} -- ${quoteShell(text)}` : `xdotool key --clearmodifiers ${quoteShell(key)}`],
      ["ydotool", () => isTyping ? `ydotool type ${quoteShell(text)}` : `ydotool key ${quoteShell(key)}`],
      // A fixed '%s\n' format keeps "%" and "\" in the text from being
      // interpreted by printf; each dotool command is passed as its own argument.
      ["dotool", () => `printf '%s\\n' ${dotoolLines().map((line) => quoteShellLiteral(line)).join(" ")} | dotool`]
    ];
    for (const [label, buildCommand] of backends) {
      if (!hasCommand2(label)) {
        note(label, "not found on PATH");
        continue;
      }
      const backend = tryAction(label, buildCommand());
      if (backend)
        return { ok: true, backend };
    }
  } else if (process.platform === "darwin") {
    const action = isTyping ? `keystroke ${quoteAppleScript(text)}` : macKeyAction(key);
    if (action) {
      const script = `tell application "System Events" to ${action}`;
      const backend = tryAction("osascript-system-events", `osascript -e ${quoteShell(script)}`);
      if (backend)
        return { ok: true, backend };
    } else {
      note("osascript-system-events", `unsupported key or chord ${JSON.stringify(key)}`);
    }
  } else {
    note("platform", `keyboard automation is not implemented for ${process.platform}`);
  }
  return {
    ok: false,
    message: "No desktop keyboard backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
  };
}
|
|
4588
|
+
/**
 * Parse `KEY=integer` lines (the format of `xdotool getwindowgeometry --shell`)
 * into a geometry object.
 *
 * Only upper-case keys with (optionally negative) integer values are read;
 * surrounding whitespace on a line is ignored and every other line is skipped.
 *
 * @param {string} raw - Raw command output.
 * @returns {{x: number, y: number, width: number, height: number} | null}
 *   The geometry, or null when `raw` is not a string or any of X, Y, WIDTH,
 *   HEIGHT is missing.
 */
function parseXdotoolGeometry(raw) {
  if (typeof raw !== "string")
    return null;
  const values = /* @__PURE__ */ new Map();
  for (const line of raw.split(/\r?\n/)) {
    const match = /^([A-Z]+)=(-?\d+)$/.exec(line.trim());
    if (match)
      values.set(match[1], Number(match[2]));
  }
  const [x, y, width, height] = ["X", "Y", "WIDTH", "HEIGHT"].map((name) => values.get(name));
  const complete = [x, y, width, height].every((value2) => Number.isFinite(value2));
  return complete ? { x, y, width, height } : null;
}
|
|
4385
4604
|
function windowsMouseScript(x, y, down, up, clicks = 1) {
|
|
4386
4605
|
const clickBody = down == null || up == null ? "" : `
|
|
4387
4606
|
for ($i = 0; $i -lt ${clicks}; $i++) {
|
|
@@ -4505,6 +4724,26 @@ function run(command, timeout2) {
|
|
|
4505
4724
|
};
|
|
4506
4725
|
}
|
|
4507
4726
|
}
|
|
4727
|
+
/**
 * Run a shell command synchronously and capture its standard output as text.
 *
 * @param {string} command - Command line to execute.
 * @param {number} timeout2 - Timeout in milliseconds passed to execSync.
 * @returns {{ok: boolean, output: string, message: string}} On success `ok` is
 *   true and `output` holds stdout. On failure `ok` is false, `output` holds
 *   whatever stdout was captured, and `message` is the first 800 characters of
 *   stderr, stdout, or the error text (first non-empty, trimmed).
 */
function runCaptureText(command, timeout2) {
  const execOptions = {
    stdio: ["pipe", "pipe", "pipe"],
    timeout: timeout2,
    env: { ...process.env },
    encoding: "utf8"
  };
  let output;
  try {
    output = execSync4(command, execOptions);
  } catch (err) {
    const failure = err;
    const capturedStdout = bufferishToString(failure.stdout);
    const capturedStderr = bufferishToString(failure.stderr);
    const detail = capturedStderr || capturedStdout || failure.message || String(err);
    return {
      ok: false,
      output: capturedStdout,
      message: detail.trim().slice(0, 800)
    };
  }
  return { ok: true, output, message: "" };
}
|
|
4508
4747
|
function bufferishToString(value2) {
|
|
4509
4748
|
if (Buffer.isBuffer(value2))
|
|
4510
4749
|
return value2.toString("utf8");
|
|
@@ -4517,6 +4756,18 @@ function powershellCommand(script) {
|
|
|
4517
4756
|
/**
 * Quote a value as one double-quoted shell word.
 *
 * On POSIX shells the four characters that stay special inside double quotes
 * (backslash, double quote, `$`, backtick) are backslash-escaped, so the value
 * reaches the command verbatim: no variable or command substitution takes
 * place, and control characters such as newlines are passed through as-is
 * rather than as `\n` text. For values without those characters the result is
 * the same `"..."` form as before.
 *
 * On Windows the previous JSON-style quoting is kept unchanged.
 *
 * @param {*} value2 - Value to quote; non-strings are converted with String().
 * @returns {string} The quoted word.
 */
function quoteShell(value2) {
  if (process.platform === "win32")
    return JSON.stringify(value2);
  const escaped = String(value2).replace(/[\\"$`]/g, "\\$&");
  return `"${escaped}"`;
}
|
|
4759
|
+
/**
 * Quote a value as one single-quoted POSIX shell word.
 *
 * Nothing is expanded inside single quotes; an embedded single quote is
 * written by closing the quote, emitting a double-quoted `'`, and reopening.
 *
 * @param {*} value2 - Value to quote; non-strings are converted with String()
 *   instead of throwing.
 * @returns {string} The quoted word.
 */
function quoteShellLiteral(value2) {
  const body = String(value2).replace(/'/g, `'"'"'`);
  return `'${body}'`;
}
|
|
4762
|
+
/**
 * Read a boolean feature flag from the environment.
 *
 * `1`, `true`, `yes` and `on` (case-insensitive, surrounding whitespace
 * ignored) mean true. A variable that is unset, or set to an empty or
 * whitespace-only string, yields `fallback`. Any other value means false.
 *
 * @param {string} name10 - Environment variable name.
 * @param {boolean} fallback - Result when the variable is unset or blank.
 * @returns {boolean}
 */
function envFlag(name10, fallback) {
  const raw = process.env[name10];
  if (raw === void 0)
    return fallback;
  const value2 = raw.trim();
  // `VAR=` (exported but blank) is treated like "not configured".
  if (value2 === "")
    return fallback;
  return /^(1|true|yes|on)$/i.test(value2);
}
|
|
4768
|
+
/**
 * Quote a value as an AppleScript string literal.
 *
 * Only the escapes AppleScript understands are produced: `\\`, `\"`, `\n`,
 * `\r` and `\t`. All other characters are emitted unchanged, because JSON-style
 * escapes such as `\u001b`, `\b` or `\f` are not valid in AppleScript strings.
 *
 * @param {*} value2 - Value to quote; non-strings are converted with String().
 * @returns {string} The double-quoted AppleScript literal.
 */
function quoteAppleScript(value2) {
  const escaped = String(value2)
    .replace(/[\\"]/g, "\\$&")
    .replace(/\n/g, "\\n")
    .replace(/\r/g, "\\r")
    .replace(/\t/g, "\\t");
  return `"${escaped}"`;
}
|
|
4520
4771
|
/**
 * Escape text for use inside a single-quoted PowerShell string by doubling
 * every quote character.
 *
 * PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and U+201B
 * as single-quote delimiters as well as the ASCII apostrophe, so all five are
 * doubled; otherwise one of them would terminate the literal early.
 *
 * @param {string} value2 - Raw text.
 * @returns {string} Text safe to place between single quotes.
 */
function psString(value2) {
  return value2.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
}
|
|
@@ -8507,7 +8758,7 @@ function resetMoondreamClient() {
|
|
|
8507
8758
|
/**
 * Return a shallow copy of the most recent point diagnostics, so callers
 * cannot modify the module-level list.
 *
 * @returns {Array} A new array with the entries of `lastPointDiagnostics`.
 */
function getVisionPointDiagnostics() {
  const snapshot = Array.from(lastPointDiagnostics);
  return snapshot;
}
|
|
8510
|
-
function
|
|
8761
|
+
function envFlag2(value2, fallback = false) {
|
|
8511
8762
|
if (value2 === void 0)
|
|
8512
8763
|
return fallback;
|
|
8513
8764
|
if (/^(1|true|yes|on)$/i.test(value2.trim()))
|
|
@@ -8627,8 +8878,8 @@ function resolveHuggingFaceVisionModelCandidates(preferredModel) {
|
|
|
8627
8878
|
for (const model of splitModelList(process.env["OMNIUS_MOONDREAM_HF_MODELS"]))
|
|
8628
8879
|
add2(model, true);
|
|
8629
8880
|
add2(process.env["MOONDREAM_HF_MODEL"] || "", Boolean(process.env["MOONDREAM_HF_MODEL"]));
|
|
8630
|
-
const fullPreviewAuto =
|
|
8631
|
-
const compactFallbackAuto =
|
|
8881
|
+
const fullPreviewAuto = envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
|
|
8882
|
+
const compactFallbackAuto = envFlag2(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
|
|
8632
8883
|
const hasExplicitHf = candidates.some((candidate) => candidate.explicit);
|
|
8633
8884
|
const hasExplicitNonHf = Boolean(preferred) && !isHuggingFaceVisionModel(preferred);
|
|
8634
8885
|
if (hasExplicitNonHf)
|
|
@@ -8937,7 +9188,7 @@ async function callOllamaVision(ollamaHost, model, prompt, imageBase64, timeoutM
|
|
|
8937
9188
|
return typeof data.response === "string" && data.response.trim() ? data.response : null;
|
|
8938
9189
|
}
|
|
8939
9190
|
/**
 * Decide whether a missing Ollama vision model should be pulled automatically.
 *
 * Auto-pull is on unless `OMNIUS_OLLAMA_VISION_AUTO_PULL` turns it off, and it
 * only applies when a non-blank model name is given.
 *
 * @param {string} model - Model name.
 * @returns {boolean}
 */
function shouldAutoPullOllamaVisionModel(model) {
  const autoPullEnabled = envFlag2(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true);
  return autoPullEnabled ? Boolean(model.trim()) : false;
}
|
|
@@ -9031,7 +9282,7 @@ async function prepareHuggingFaceVisionCandidate(candidate, diagnostics) {
|
|
|
9031
9282
|
return { python, env: env2, gpuIndex: decision2.lease.gpuIndex, release: () => decision2.lease.release() };
|
|
9032
9283
|
}
|
|
9033
9284
|
function ensureHuggingFaceVisionPython(modelId) {
|
|
9034
|
-
const managed =
|
|
9285
|
+
const managed = envFlag2(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
|
|
9035
9286
|
if (!managed)
|
|
9036
9287
|
return legacyHuggingFaceVisionPython(modelId);
|
|
9037
9288
|
ensureUnifiedCacheDirs();
|
|
@@ -282954,6 +283205,18 @@ var init_vision_action_loop = __esm({
|
|
|
282954
283205
|
enum: ["single", "double"],
|
|
282955
283206
|
description: "Click type for click operation. Default single."
|
|
282956
283207
|
},
|
|
283208
|
+
text: {
|
|
283209
|
+
type: "string",
|
|
283210
|
+
description: "Optional text to type after a live click, or into the currently focused desktop control when operation='none'."
|
|
283211
|
+
},
|
|
283212
|
+
key: {
|
|
283213
|
+
type: "string",
|
|
283214
|
+
description: "Optional key/chord to press after a live click/text entry, for example Enter, Escape, Tab, ctrl+f."
|
|
283215
|
+
},
|
|
283216
|
+
typing_delay_ms: {
|
|
283217
|
+
type: "number",
|
|
283218
|
+
description: "Per-character delay for desktop text entry. Default 10ms."
|
|
283219
|
+
},
|
|
282957
283220
|
index: {
|
|
282958
283221
|
type: "number",
|
|
282959
283222
|
description: "If multiple target points are found, use this 1-based index. Default 1."
|
|
@@ -282978,6 +283241,10 @@ var init_vision_action_loop = __esm({
|
|
|
282978
283241
|
type: "string",
|
|
282979
283242
|
description: "Optional screenshot output directory. Relative paths resolve from the workspace."
|
|
282980
283243
|
},
|
|
283244
|
+
window_title: {
|
|
283245
|
+
type: "string",
|
|
283246
|
+
description: "Optional X11/XWayland window title to capture instead of the whole desktop. Useful on Wayland when root screenshots are blocked."
|
|
283247
|
+
},
|
|
282981
283248
|
clear_artifacts: {
|
|
282982
283249
|
type: "boolean",
|
|
282983
283250
|
description: "Only for reset. If true, also deletes this session's screenshot directory."
|
|
@@ -283064,8 +283331,12 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283064
283331
|
const includeVision = asBoolean(args["include_vision"], true);
|
|
283065
283332
|
const visionPromptOverride = asString(args["vision_prompt"]);
|
|
283066
283333
|
const language = asString(args["language"]) || "eng";
|
|
283334
|
+
const windowTitle = asString(args["window_title"]);
|
|
283067
283335
|
const button = this.parseButton(args["button"]);
|
|
283068
283336
|
const clickType = args["click_type"] === "double" ? "double" : "single";
|
|
283337
|
+
const textToType = asString(args["text"]);
|
|
283338
|
+
const keyToPress = asString(args["key"]);
|
|
283339
|
+
const typingDelayMs = clampInteger2(args["typing_delay_ms"], 10, 0, 500);
|
|
283069
283340
|
const index = clampInteger2(args["index"], 1, 1, 100);
|
|
283070
283341
|
const delayMs = clampInteger2(args["delay_ms"], 0, 0, 6e4);
|
|
283071
283342
|
const maxSteps = action === "run" ? clampInteger2(args["max_steps"], DEFAULT_MAX_STEPS, 1, HARD_MAX_STEPS) : 1;
|
|
@@ -283088,13 +283359,28 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283088
283359
|
}
|
|
283089
283360
|
const stamp = timestampSlug2();
|
|
283090
283361
|
const screenshotPath = join52(sessionDir2, `${stamp}-step-${step}-before.png`);
|
|
283091
|
-
if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283092
|
-
yield "Vision action loop:
|
|
283362
|
+
if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283363
|
+
yield "Vision action loop: capturing full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
|
|
283093
283364
|
}
|
|
283094
283365
|
yield `Vision action loop: capturing screenshot ${step}/${maxSteps}`;
|
|
283095
283366
|
let screenshotBackend = "";
|
|
283367
|
+
let captureOffset = { x: 0, y: 0 };
|
|
283368
|
+
let captureWindow;
|
|
283096
283369
|
try {
|
|
283097
|
-
|
|
283370
|
+
if (windowTitle) {
|
|
283371
|
+
const windowCapture = captureDesktopWindowScreenshot(screenshotPath, windowTitle);
|
|
283372
|
+
screenshotBackend = windowCapture.backend;
|
|
283373
|
+
captureOffset = { x: windowCapture.x, y: windowCapture.y };
|
|
283374
|
+
captureWindow = {
|
|
283375
|
+
windowId: windowCapture.windowId,
|
|
283376
|
+
x: windowCapture.x,
|
|
283377
|
+
y: windowCapture.y,
|
|
283378
|
+
width: windowCapture.width,
|
|
283379
|
+
height: windowCapture.height
|
|
283380
|
+
};
|
|
283381
|
+
} else {
|
|
283382
|
+
screenshotBackend = captureDesktopScreenshot(screenshotPath);
|
|
283383
|
+
}
|
|
283098
283384
|
mutatedFiles.push(screenshotPath);
|
|
283099
283385
|
} catch (err) {
|
|
283100
283386
|
success = false;
|
|
@@ -283121,6 +283407,9 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283121
283407
|
outputLines.push(`Screenshot saved: ${screenshotPath}`);
|
|
283122
283408
|
outputLines.push(`Screen: ${dims.width}x${dims.height}`);
|
|
283123
283409
|
outputLines.push(`Screenshot backend: ${screenshotBackend}`);
|
|
283410
|
+
if (captureWindow) {
|
|
283411
|
+
outputLines.push(`Window: ${windowTitle} id=${captureWindow.windowId} geometry=${captureWindow.x},${captureWindow.y} ${captureWindow.width}x${captureWindow.height}`);
|
|
283412
|
+
}
|
|
283124
283413
|
let ocr = null;
|
|
283125
283414
|
if (includeOcr) {
|
|
283126
283415
|
yield `Vision action loop: running OCR for screenshot ${step}/${maxSteps}`;
|
|
@@ -283209,6 +283498,14 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283209
283498
|
selectedPoint = pointFromVisionResult({ points: ocrPoints, source: "tesseract-ocr" }, index, dims);
|
|
283210
283499
|
}
|
|
283211
283500
|
if (selectedPoint) {
|
|
283501
|
+
if (captureOffset.x !== 0 || captureOffset.y !== 0) {
|
|
283502
|
+
selectedPoint = {
|
|
283503
|
+
...selectedPoint,
|
|
283504
|
+
pixelX: selectedPoint.pixelX + captureOffset.x,
|
|
283505
|
+
pixelY: selectedPoint.pixelY + captureOffset.y,
|
|
283506
|
+
source: `${selectedPoint.source}+window:${captureWindow?.windowId ?? "active"}`
|
|
283507
|
+
};
|
|
283508
|
+
}
|
|
283212
283509
|
outputLines.push(`Target: ${effectiveTarget}`);
|
|
283213
283510
|
outputLines.push(`Point source: ${selectedPoint.source}`);
|
|
283214
283511
|
outputLines.push(`Mapped point: (${Math.round(selectedPoint.pixelX)}, ${Math.round(selectedPoint.pixelY)}) normalized (${selectedPoint.x.toFixed(4)}, ${selectedPoint.y.toFixed(4)})`);
|
|
@@ -283238,15 +283535,29 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283238
283535
|
} else {
|
|
283239
283536
|
yield `Vision action loop: performing ${effectiveOperation} at (${pixelX}, ${pixelY})`;
|
|
283240
283537
|
try {
|
|
283538
|
+
if (captureWindow?.windowId) {
|
|
283539
|
+
const activationBackend = activateDesktopWindow(captureWindow.windowId);
|
|
283540
|
+
outputLines.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
|
|
283541
|
+
}
|
|
283241
283542
|
const backend = effectiveOperation === "move" ? moveDesktopPointer(pixelX, pixelY) : clickDesktopAt(pixelX, pixelY, button, clickType);
|
|
283242
283543
|
actionTaken = effectiveOperation === "move" ? `Moved pointer to (${pixelX}, ${pixelY}) via ${backend}` : `Clicked at (${pixelX}, ${pixelY}) via ${backend} [${button} ${clickType}]`;
|
|
283243
283544
|
outputLines.push(actionTaken);
|
|
283545
|
+
if (textToType) {
|
|
283546
|
+
const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
|
|
283547
|
+
outputLines.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
|
|
283548
|
+
actionTaken += `; typed text via ${keyboardBackend}`;
|
|
283549
|
+
}
|
|
283550
|
+
if (keyToPress) {
|
|
283551
|
+
const keyboardBackend = pressDesktopKey(keyToPress);
|
|
283552
|
+
outputLines.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
|
|
283553
|
+
actionTaken += `; pressed key via ${keyboardBackend}: ${keyToPress}`;
|
|
283554
|
+
}
|
|
283244
283555
|
afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
|
|
283245
|
-
if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283246
|
-
yield "Vision action loop:
|
|
283556
|
+
if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
|
|
283557
|
+
yield "Vision action loop: capturing post-action full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
|
|
283247
283558
|
}
|
|
283248
283559
|
yield `Vision action loop: capturing post-action screenshot ${step}/${maxSteps}`;
|
|
283249
|
-
const afterBackend = captureDesktopScreenshot(afterScreenshotPath);
|
|
283560
|
+
const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
|
|
283250
283561
|
mutatedFiles.push(afterScreenshotPath);
|
|
283251
283562
|
outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
|
|
283252
283563
|
outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
|
|
@@ -283260,6 +283571,35 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
|
|
|
283260
283571
|
}
|
|
283261
283572
|
} else if (effectiveOperation !== "none" && (effectiveTarget || hasCoordinates)) {
|
|
283262
283573
|
outputLines.push(`Action skipped: no usable point for operation '${effectiveOperation}'.`);
|
|
283574
|
+
} else if (allowActions && !dryRun && (textToType || keyToPress)) {
|
|
283575
|
+
try {
|
|
283576
|
+
const keyboardActions = [];
|
|
283577
|
+
if (captureWindow?.windowId) {
|
|
283578
|
+
const activationBackend = activateDesktopWindow(captureWindow.windowId);
|
|
283579
|
+
keyboardActions.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
|
|
283580
|
+
}
|
|
283581
|
+
if (textToType) {
|
|
283582
|
+
const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
|
|
283583
|
+
keyboardActions.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
|
|
283584
|
+
}
|
|
283585
|
+
if (keyToPress) {
|
|
283586
|
+
const keyboardBackend = pressDesktopKey(keyToPress);
|
|
283587
|
+
keyboardActions.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
|
|
283588
|
+
}
|
|
283589
|
+
actionTaken = keyboardActions.join("; ");
|
|
283590
|
+
outputLines.push(actionTaken);
|
|
283591
|
+
afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
|
|
283592
|
+
const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
|
|
283593
|
+
mutatedFiles.push(afterScreenshotPath);
|
|
283594
|
+
outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
|
|
283595
|
+
outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
|
|
283596
|
+
} catch (err) {
|
|
283597
|
+
success = false;
|
|
283598
|
+
actionError = err instanceof Error ? err.message : String(err);
|
|
283599
|
+
error = actionError;
|
|
283600
|
+
outputLines.push(`Keyboard action failed: ${actionError}`);
|
|
283601
|
+
outputLines.push(desktopAutomationRecoveryMessage());
|
|
283602
|
+
}
|
|
283263
283603
|
} else if (action === "run" && !effectiveTarget && !hasCoordinates) {
|
|
283264
283604
|
success = false;
|
|
283265
283605
|
error = "Vision loop stopped: no target or coordinates were provided and visual planning did not identify a clickable target. Stopping instead of repeating observe-only screenshots.";
|
package/npm-shrinkwrap.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "omnius",
|
|
3
|
-
"version": "1.0.205",
|
|
3
|
+
"version": "1.0.206",
|
|
4
4
|
"lockfileVersion": 3,
|
|
5
5
|
"requires": true,
|
|
6
6
|
"packages": {
|
|
7
7
|
"": {
|
|
8
8
|
"name": "omnius",
|
|
9
|
-
"version": "1.0.205",
|
|
9
|
+
"version": "1.0.206",
|
|
10
10
|
"bundleDependencies": [
|
|
11
11
|
"image-to-ascii"
|
|
12
12
|
],
|
package/package.json
CHANGED