omnius 1.0.205 → 1.0.206

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3987,9 +3987,33 @@ var init_system_deps = __esm({
3987
3987
  // packages/execution/dist/tools/desktop-control.js
3988
3988
  import { execSync as execSync4 } from "node:child_process";
3989
3989
  import { existsSync as existsSync6, statSync as statSync3 } from "node:fs";
3990
/**
 * Bring a desktop window to the foreground before pointer/keyboard input.
 *
 * Backends are tried in order (xdotool, then wmctrl) and only on Linux.
 * Every backend that is missing or fails is recorded, so the thrown error
 * always explains what was attempted instead of listing nothing.
 *
 * @param {string|number} windowId - Window id passed to
 *   `xdotool windowactivate` / `wmctrl -ia`.
 * @returns {string} Label of the backend that succeeded.
 * @throws {Error} When no backend succeeded (or the platform is not Linux).
 */
function activateDesktopWindow(windowId) {
  const attempts = [];
  const tryAction = (label, command) => {
    const result = run(command, 5e3);
    if (result.ok)
      return label;
    attempts.push({ label, message: result.message });
    return null;
  };
  if (process.platform === "linux") {
    const target = quoteShell(windowId);
    const backends = [
      ["xdotool", `xdotool windowactivate --sync ${target}`],
      ["wmctrl", `wmctrl -ia ${target}`]
    ];
    for (const [label, command] of backends) {
      if (!hasCommand2(label)) {
        // Record the gap so the final error names the tool the user could install.
        attempts.push({ label, message: "not found on PATH" });
        continue;
      }
      const backend = tryAction(label, command);
      if (backend)
        return backend;
    }
  } else {
    attempts.push({ label: "window-activation", message: `not implemented for platform ${process.platform}` });
  }
  throw new Error("No desktop window activation backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
3990
4013
  function captureDesktopScreenshot(outputPath3) {
3991
4014
  const attempts = [];
3992
4015
  const out = quoteShell(outputPath3);
4016
+ const allowInteractiveWaylandScreenshot = envFlag("OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT", false);
3993
4017
  const tryCapture = (label, command, timeout2 = 1e4) => {
3994
4018
  const result = run(command, timeout2);
3995
4019
  if (result.ok && existsSync6(outputPath3)) {
@@ -4022,11 +4046,22 @@ $bitmap.Dispose()
4022
4046
  return backend;
4023
4047
  } else if (process.platform === "linux") {
4024
4048
  const desktop = `${process.env["XDG_CURRENT_DESKTOP"] || ""} ${process.env["DESKTOP_SESSION"] || ""}`;
4025
- if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor")) {
4049
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && /gnome/i.test(desktop) && hasCommand2("gdbus") && allowInteractiveWaylandScreenshot) {
4050
+ const backend = tryCapture("gnome-shell-interactive-screenshot-dbus", gnomeInteractiveScreenshotCommand(outputPath3), 13e4);
4051
+ if (backend)
4052
+ return backend;
4053
+ }
4054
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && hasCommand2("gdbus") && hasCommand2("dbus-monitor") && allowInteractiveWaylandScreenshot) {
4026
4055
  const backend = tryCapture("xdg-desktop-portal-screenshot", portalScreenshotCommand(outputPath3), 13e4);
4027
4056
  if (backend)
4028
4057
  return backend;
4029
4058
  }
4059
+ if ((/wayland/i.test(process.env["XDG_SESSION_TYPE"] || "") || process.env["WAYLAND_DISPLAY"]) && !allowInteractiveWaylandScreenshot) {
4060
+ attempts.push({
4061
+ label: "interactive-wayland-screenshot",
4062
+ message: "skipped by default to avoid unattended GNOME/portal screenshot selection stalls. For desktop app loops, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window. Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt."
4063
+ });
4064
+ }
4030
4065
  if (/gnome/i.test(desktop) && hasCommand2("gdbus")) {
4031
4066
  const backend = tryCapture("gnome-shell-screenshot-dbus", `gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.Screenshot false false ${out}`);
4032
4067
  if (backend)
@@ -4079,6 +4114,83 @@ $bitmap.Dispose()
4079
4114
  }
4080
4115
  throw new Error("No desktop screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
4081
4116
  }
4117
/**
 * Capture a screenshot of a single X11/XWayland window with xdotool +
 * ImageMagick `import`.
 *
 * When `windowTitle` is given, the last id printed by
 * `xdotool search --name` is used; otherwise the active window is used.
 *
 * @param {string} outputPath3 - File the screenshot is written to.
 * @param {string} [windowTitle] - Title pattern handed to `xdotool search --name`.
 * @returns {{backend: string, windowId: string, x: number, y: number, width: number, height: number}}
 *   Backend label, window id and the window geometry reported by xdotool.
 * @throws {Error} On non-Linux platforms, or when no capture succeeded; the
 *   message lists every recorded attempt.
 */
function captureDesktopWindowScreenshot(outputPath3, windowTitle) {
  const attempts = [];
  const out = quoteShell(outputPath3);
  const runText = (command, timeout2 = 5e3) => {
    const result = runCaptureText(command, timeout2);
    if (result.ok)
      return result.output.trim();
    attempts.push({ label: command.split(/\s+/)[0] || command, message: result.message });
    return null;
  };
  if (process.platform !== "linux") {
    throw new Error("Window screenshot capture is currently implemented for Linux/X11/XWayland windows only.");
  }
  if (!hasCommand2("xdotool")) {
    attempts.push({ label: "xdotool", message: "not found on PATH" });
  } else if (!hasCommand2("import")) {
    attempts.push({ label: "import", message: "ImageMagick import not found on PATH" });
  } else {
    const windowId = windowTitle ? runText(`xdotool search --name ${quoteShell(windowTitle)} | tail -n1`) : runText("xdotool getactivewindow");
    if (windowId === "") {
      // The search pipeline's exit status comes from `tail`, so "no match"
      // arrives here as an empty success; record it so the error explains why.
      attempts.push({
        label: "xdotool",
        message: windowTitle ? `no window matched title ${JSON.stringify(windowTitle)}` : "no active window id was reported"
      });
    } else if (windowId) {
      const geometry = runText(`xdotool getwindowgeometry --shell ${quoteShell(windowId)}`);
      const parsed = parseXdotoolGeometry(geometry || "");
      if (!parsed) {
        // A null geometry means the command itself failed and runText already logged it.
        if (geometry !== null) {
          attempts.push({ label: "xdotool getwindowgeometry", message: `Could not parse geometry for window ${windowId}: ${geometry}` });
        }
      } else {
        const result = run(`import -window ${quoteShell(windowId)} ${out}`, 1e4);
        if (result.ok && existsSync6(outputPath3)) {
          const inspection = inspectScreenshot(outputPath3);
          if (inspection.ok) {
            return {
              backend: "imagemagick-import-window",
              windowId,
              ...parsed
            };
          }
          attempts.push({ label: "import-window", message: inspection.message });
        } else {
          attempts.push({ label: "import-window", message: result.message });
        }
      }
    }
  }
  throw new Error("No desktop window screenshot backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage());
}
4161
// Build the shell command that captures a screenshot through GNOME Shell's
// interactive screenshot D-Bus method and copies the result to `outputPath3`.
//
// The generated bash script:
//   1. prints a hint to stderr asking the user to choose Full Screen,
//   2. calls org.gnome.Shell.Screenshot.InteractiveScreenshot via `gdbus`
//      and exits 1 (echoing the gdbus output) if the call fails,
//   3. extracts the last quoted `file:` URI from the reply with sed and
//      exits 1 if none is found,
//   4. copies that URI to the output path with `gio copy -f`, or with a
//      python3 heredoc when gio is missing, and exits 1 if neither exists.
//
// Returns the string `bash -lc <script>` where the script is single-quoted
// by quoteShellLiteral. The output path is embedded with quoteShell.
+ function gnomeInteractiveScreenshotCommand(outputPath3) {
4162
+ const script = `
4163
+ set -eu
4164
+ out=${quoteShell(outputPath3)}
4165
+ printf '%s\\n' 'Omnius desktop screenshot: choose Full Screen in the GNOME screenshot UI, then press Enter.' >&2
4166
+ call_out="$(gdbus call --session --dest org.gnome.Shell.Screenshot --object-path /org/gnome/Shell/Screenshot --method org.gnome.Shell.Screenshot.InteractiveScreenshot 2>&1)" || {
4167
+ printf '%s\\n' "$call_out" >&2
4168
+ exit 1
4169
+ }
4170
+ uri="$(printf '%s\\n' "$call_out" | sed -n "s/.*'\\(file:[^']*\\)'.*/\\1/p" | tail -n 1)"
4171
+ if [ -z "$uri" ]; then
4172
+ printf 'GNOME interactive screenshot did not return a file URI: %s\\n' "$call_out" >&2
4173
+ exit 1
4174
+ fi
4175
+
4176
+ if command -v gio >/dev/null 2>&1; then
4177
+ gio copy -f "$uri" "$out"
4178
+ elif command -v python3 >/dev/null 2>&1; then
4179
+ python3 - "$uri" "$out" <<'PY'
4180
+ import shutil, sys, urllib.parse
4181
+ uri, out = sys.argv[1], sys.argv[2]
4182
+ parsed = urllib.parse.urlparse(uri)
4183
+ if parsed.scheme != "file":
4184
+ raise SystemExit(f"Unsupported screenshot URI scheme: {parsed.scheme}")
4185
+ shutil.copyfile(urllib.parse.unquote(parsed.path), out)
4186
+ PY
4187
+ else
4188
+ printf 'Need gio or python3 to copy GNOME screenshot URI %s to %s.\\n' "$uri" "$out" >&2
4189
+ exit 1
4190
+ fi
4191
+ `.trim();
4192
+ return `bash -lc ${quoteShellLiteral(script)}`;
4193
+ }
4082
4194
  function portalScreenshotCommand(outputPath3) {
4083
4195
  const script = `
4084
4196
  set -eu
@@ -4086,6 +4198,7 @@ out=${quoteShell(outputPath3)}
4086
4198
  monitor_file="$(mktemp)"
4087
4199
  cleanup() {
4088
4200
  if [ -n "\${monpid:-}" ]; then kill "$monpid" >/dev/null 2>&1 || true; fi
4201
+ if [ -n "\${keypid:-}" ]; then kill "$keypid" >/dev/null 2>&1 || true; fi
4089
4202
  rm -f "$monitor_file"
4090
4203
  }
4091
4204
  trap cleanup EXIT
@@ -4102,6 +4215,25 @@ fi
4102
4215
 
4103
4216
  dbus-monitor --session "type='signal',interface='org.freedesktop.portal.Request',member='Response',path='$handle'" > "$monitor_file" 2>&1 &
4104
4217
  monpid=$!
4218
+ if [ "\${OMNIUS_SCREENSHOT_AUTO_CONFIRM:-1}" != "0" ]; then
4219
+ (
4220
+ sleep 1
4221
+ i=0
4222
+ while [ "$i" -lt 8 ]; do
4223
+ if command -v xdotool >/dev/null 2>&1; then
4224
+ DISPLAY="\${DISPLAY:-:0}" xdotool key Return >/dev/null 2>&1 || true
4225
+ elif command -v ydotool >/dev/null 2>&1; then
4226
+ ydotool key 28:1 28:0 >/dev/null 2>&1 || true
4227
+ elif command -v dotool >/dev/null 2>&1; then
4228
+ printf 'key enter
4229
+ ' | dotool >/dev/null 2>&1 || true
4230
+ fi
4231
+ i=$((i + 1))
4232
+ sleep 0.5
4233
+ done
4234
+ ) &
4235
+ keypid=$!
4236
+ fi
4105
4237
  deadline=$(( $(date +%s) + 120 ))
4106
4238
  while :; do
4107
4239
  if grep -q "member=Response" "$monitor_file"; then break; fi
@@ -4144,7 +4276,7 @@ else
4144
4276
  exit 1
4145
4277
  fi
4146
4278
  `.trim();
4147
- return `bash -lc ${quoteShell(script)}`;
4279
+ return `bash -lc ${quoteShellLiteral(script)}`;
4148
4280
  }
4149
4281
  function moveDesktopPointer(x, y) {
4150
4282
  const result = performDesktopPointerAction({ x, y, moveOnly: true });
@@ -4167,6 +4299,25 @@ function clickDesktopAt(x, y, button, clickType) {
4167
4299
  lastPointer = { x: Math.round(x), y: Math.round(y) };
4168
4300
  return result.backend;
4169
4301
  }
4302
// Type `text` on the desktop by delegating to performDesktopKeyboardAction
// with kind "type". `delayMs` (default 10) is forwarded unchanged.
// Returns the `backend` value from the helper's result.
// Throws an Error carrying the helper's message when `result.ok` is false.
+ function typeDesktopText(text, delayMs = 10) {
4303
+ const result = performDesktopKeyboardAction({
4304
+ kind: "type",
4305
+ text,
4306
+ delayMs
4307
+ });
4308
+ if (!result.ok)
4309
+ throw new Error(result.message);
4310
+ return result.backend;
4311
+ }
4312
// Press a single key (or key expression) on the desktop by delegating to
// performDesktopKeyboardAction with kind "key".
// Returns the `backend` value from the helper's result.
// Throws an Error carrying the helper's message when `result.ok` is false.
+ function pressDesktopKey(key) {
4313
+ const result = performDesktopKeyboardAction({
4314
+ kind: "key",
4315
+ key
4316
+ });
4317
+ if (!result.ok)
4318
+ throw new Error(result.message);
4319
+ return result.backend;
4320
+ }
4170
4321
  function tryRunXdotoolShellFallback(command) {
4171
4322
  if (!/\bxdotool\b|\bxdtool\b/.test(command))
4172
4323
  return null;
@@ -4278,7 +4429,9 @@ function desktopAutomationRecoveryMessage(command) {
4278
4429
  " Linux pointer control: xdotool/X11, ydotool, dotool, python-xlib",
4279
4430
  " macOS: cliclick, then System Events",
4280
4431
  " Windows: PowerShell user32 input",
4281
- "On Wayland, Omnius requests screenshot permission through xdg-desktop-portal when available. Approve the system screenshot prompt to continue.",
4432
+ "On Wayland, unattended full-desktop screenshots are compositor-restricted and interactive screenshot prompts are skipped by default.",
4433
+ "For desktop app loops on GNOME Wayland, pass window_title to vision_action_loop so Omnius captures the named X11/XWayland window without a full-screen prompt.",
4434
+ "Set OMNIUS_DESKTOP_INTERACTIVE_SCREENSHOT=1 only when a human will complete the full-screen screenshot prompt.",
4282
4435
  "On GNOME Wayland, unattended screenshots may still be denied by compositor policy. Install gnome-screenshot or grant screenshot permission for the session if capture is blocked.",
4283
4436
  "On Wayland pointer control, install and enable ydotool or dotool when xdotool cannot open an X display."
4284
4437
  ].join("\n");
@@ -4382,6 +4535,72 @@ ${options2.moveOnly ? "" : `for _ in range(${clicks}):
4382
4535
  message: "No desktop mouse backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
4383
4536
  };
4384
4537
  }
4538
/**
 * Send keyboard input to the desktop: type a string or press a key.
 *
 * Linux backends are tried in order (xdotool, ydotool, dotool); macOS uses
 * System Events through osascript. Each missing or failing backend is
 * recorded so the failure message explains what was attempted.
 *
 * @param {{kind: "type", text: string, delayMs?: number} | {kind: "key", key: string}} options2
 *   `delayMs` is clamped to 0..500 and only used by xdotool.
 * @returns {{ok: true, backend: string} | {ok: false, message: string}}
 */
function performDesktopKeyboardAction(options2) {
  const attempts = [];
  const tryAction = (label, command) => {
    const result = run(command, 1e4);
    if (result.ok)
      return label;
    attempts.push({ label, message: result.message });
    return null;
  };
  const isType = options2.kind === "type";
  if (process.platform === "linux") {
    if (hasCommand2("xdotool")) {
      const requestedDelay = Math.round(Number(options2.delayMs));
      const delay = Number.isFinite(requestedDelay) ? Math.max(0, Math.min(500, requestedDelay)) : 10;
      // "--" ends option parsing so text beginning with "-" is typed instead
      // of being rejected as an unknown xdotool flag.
      const command = isType ? `xdotool type --clearmodifiers --delay ${delay} -- ${quoteShell(options2.text)}` : `xdotool key --clearmodifiers ${quoteShell(options2.key)}`;
      const backend = tryAction("xdotool", command);
      if (backend)
        return { ok: true, backend };
    } else {
      attempts.push({ label: "xdotool", message: "not found on PATH" });
    }
    if (hasCommand2("ydotool")) {
      const command = isType ? `ydotool type ${quoteShell(options2.text)}` : `ydotool key ${quoteShell(options2.key)}`;
      const backend = tryAction("ydotool", command);
      if (backend)
        return { ok: true, backend };
    } else {
      attempts.push({ label: "ydotool", message: "not found on PATH" });
    }
    if (hasCommand2("dotool")) {
      // dotool reads one command per stdin line. Line breaks are collapsed so
      // the payload cannot start a second dotool command, and the line is
      // passed as a printf ARGUMENT (never the format) so "%" and "\" in the
      // text are delivered literally.
      const payload = String(isType ? options2.text : options2.key).replace(/[\r\n]+/g, " ");
      const line = `${isType ? "type" : "key"} ${payload}`;
      const backend = tryAction("dotool", `printf '%s\\n' ${quoteShellLiteral(line)} | dotool`);
      if (backend)
        return { ok: true, backend };
    } else {
      attempts.push({ label: "dotool", message: "not found on PATH" });
    }
  } else if (process.platform === "darwin") {
    const script = isType ? `tell application "System Events" to keystroke ${quoteAppleScript(options2.text)}` : appleScriptKeyCommand(options2.key);
    if (script === null) {
      attempts.push({ label: "osascript-system-events", message: `unsupported key ${JSON.stringify(options2.key)}` });
    } else {
      const backend = tryAction("osascript-system-events", `osascript -e ${quoteShell(script)}`);
      if (backend)
        return { ok: true, backend };
    }
  } else {
    attempts.push({ label: "keyboard", message: `not implemented for platform ${process.platform}` });
  }
  return {
    ok: false,
    message: "No desktop keyboard backend succeeded.\n" + formatDesktopAttempts(attempts) + "\n" + desktopAutomationRecoveryMessage()
  };
}
/**
 * Translate a key or chord such as "Enter", "ctrl+f" or "36" into a
 * System Events AppleScript statement. `key code` only accepts integers,
 * so named keys are mapped to macOS virtual key codes and single
 * characters are sent with `keystroke`.
 *
 * @param {string} key - Key name, single character, numeric key code, or a
 *   "+"-separated chord whose last part is the key.
 * @returns {string|null} AppleScript source, or null when the key or a
 *   modifier is not recognised.
 */
function appleScriptKeyCommand(key) {
  const keyCodes = /* @__PURE__ */ new Map([
    ["return", 36],
    ["enter", 36],
    ["tab", 48],
    ["space", 49],
    ["backspace", 51],
    ["escape", 53],
    ["esc", 53],
    ["home", 115],
    ["pageup", 116],
    ["delete", 117],
    ["end", 119],
    ["pagedown", 121],
    ["left", 123],
    ["right", 124],
    ["down", 125],
    ["up", 126]
  ]);
  const modifierNames = /* @__PURE__ */ new Map([
    ["ctrl", "control down"],
    ["control", "control down"],
    ["cmd", "command down"],
    ["command", "command down"],
    ["super", "command down"],
    ["meta", "command down"],
    ["alt", "option down"],
    ["option", "option down"],
    ["shift", "shift down"]
  ]);
  const raw = String(key ?? "").trim();
  if (!raw)
    return null;
  // A lone "+" is the plus key itself, not an empty chord.
  const parts = raw === "+" ? ["+"] : raw.split("+").map((part) => part.trim());
  const main = parts.pop();
  if (!main)
    return null;
  const modifiers = [];
  for (const part of parts) {
    const modifier = modifierNames.get(part.toLowerCase());
    if (!modifier)
      return null;
    if (!modifiers.includes(modifier))
      modifiers.push(modifier);
  }
  let action;
  if (/^\d+$/.test(main)) {
    action = `key code ${main}`;
  } else if (keyCodes.has(main.toLowerCase())) {
    action = `key code ${keyCodes.get(main.toLowerCase())}`;
  } else if ([...main].length === 1) {
    action = `keystroke ${quoteAppleScript(main)}`;
  } else {
    return null;
  }
  const using = modifiers.length > 0 ? ` using {${modifiers.join(", ")}}` : "";
  return `tell application "System Events" to ${action}${using}`;
}
4588
// Parse `KEY=integer` lines (upper-case key, optional leading minus) such as
// the output of `xdotool getwindowgeometry --shell`, which is what the
// caller in this file passes in.
// Lines that do not match that exact shape are ignored.
// Returns { x, y, width, height } taken from the X, Y, WIDTH and HEIGHT
// keys when all four are finite numbers; otherwise returns null.
// `raw` must be a string (it is split on LF / CRLF).
+ function parseXdotoolGeometry(raw) {
4589
+ const values = /* @__PURE__ */ new Map();
4590
+ for (const line of raw.split(/\r?\n/)) {
4591
+ const match = line.match(/^([A-Z]+)=(-?\d+)$/);
4592
+ if (match)
4593
+ values.set(match[1], Number(match[2]));
4594
+ }
4595
+ const x = values.get("X");
4596
+ const y = values.get("Y");
4597
+ const width = values.get("WIDTH");
4598
+ const height = values.get("HEIGHT");
4599
+ if ([x, y, width, height].every((value2) => Number.isFinite(value2))) {
4600
+ return { x, y, width, height };
4601
+ }
4602
+ return null;
4603
+ }
4385
4604
  function windowsMouseScript(x, y, down, up, clicks = 1) {
4386
4605
  const clickBody = down == null || up == null ? "" : `
4387
4606
  for ($i = 0; $i -lt ${clicks}; $i++) {
@@ -4505,6 +4724,26 @@ function run(command, timeout2) {
4505
4724
  };
4506
4725
  }
4507
4726
  }
4727
// Run `command` synchronously with execSync and capture its stdout as text.
// stdio is fully piped, the environment is a copy of process.env, output is
// decoded as UTF-8, and `timeout2` is passed as the execSync timeout.
// Success: { ok: true, output: <stdout>, message: "" }.
// Failure (execSync threw): { ok: false, output: <captured stdout>,
// message }, where message is the first non-empty of stderr, stdout, the
// error's message, or String(err), trimmed and cut to 800 characters.
+ function runCaptureText(command, timeout2) {
4728
+ try {
4729
+ const output = execSync4(command, {
4730
+ stdio: ["pipe", "pipe", "pipe"],
4731
+ timeout: timeout2,
4732
+ env: { ...process.env },
4733
+ encoding: "utf8"
4734
+ });
4735
+ return { ok: true, output, message: "" };
4736
+ } catch (err) {
4737
+ const anyErr = err;
4738
+ const stdout = bufferishToString(anyErr.stdout);
4739
+ const stderr = bufferishToString(anyErr.stderr);
4740
+ return {
4741
+ ok: false,
4742
+ output: stdout,
4743
+ message: (stderr || stdout || anyErr.message || String(err)).trim().slice(0, 800)
4744
+ };
4745
+ }
4746
+ }
4508
4747
  function bufferishToString(value2) {
4509
4748
  if (Buffer.isBuffer(value2))
4510
4749
  return value2.toString("utf8");
@@ -4517,6 +4756,18 @@ function powershellCommand(script) {
4517
4756
  function quoteShell(value2) {
4518
4757
  return JSON.stringify(value2);
4519
4758
  }
4759
// Wrap `value2` in single quotes for a POSIX shell so its contents are taken
// literally. Each embedded single quote is replaced by the sequence
// '"'"' (close quote, double-quoted single quote, reopen quote).
// `value2` must be a string (String.prototype.replace is called on it).
+ function quoteShellLiteral(value2) {
4760
+ return `'${value2.replace(/'/g, `'"'"'`)}'`;
4761
+ }
4762
/**
 * Read a boolean feature flag from the environment.
 *
 * Recognised values are case-insensitive and may carry surrounding
 * whitespace: 1/true/yes/on enable the flag, 0/false/no/off disable it.
 * An unset, blank or unrecognised value yields `fallback`, so a stray
 * `NAME=` in the environment can no longer silently override a default.
 *
 * @param {string} name10 - Environment variable name.
 * @param {boolean} fallback - Value used when the variable gives no clear answer.
 * @returns {boolean}
 */
function envFlag(name10, fallback) {
  const raw = process.env[name10];
  if (raw === void 0)
    return fallback;
  const normalized = raw.trim();
  if (/^(1|true|yes|on)$/i.test(normalized))
    return true;
  if (/^(0|false|no|off)$/i.test(normalized))
    return false;
  return fallback;
}
4768
// Quote `value2` for embedding in AppleScript source by returning its JSON
// string form (double-quoted, with `"` and `\` backslash-escaped).
// NOTE(review): JSON also emits escapes such as \uXXXX, \b and \f for
// control characters; whether AppleScript accepts those is not established
// here - confirm before relying on this for arbitrary text.
+ function quoteAppleScript(value2) {
4769
+ return JSON.stringify(value2);
4770
+ }
4520
4771
  function psString(value2) {
4521
4772
  return value2.replace(/'/g, "''");
4522
4773
  }
@@ -8507,7 +8758,7 @@ function resetMoondreamClient() {
8507
8758
  function getVisionPointDiagnostics() {
8508
8759
  return [...lastPointDiagnostics];
8509
8760
  }
8510
- function envFlag(value2, fallback = false) {
8761
+ function envFlag2(value2, fallback = false) {
8511
8762
  if (value2 === void 0)
8512
8763
  return fallback;
8513
8764
  if (/^(1|true|yes|on)$/i.test(value2.trim()))
@@ -8627,8 +8878,8 @@ function resolveHuggingFaceVisionModelCandidates(preferredModel) {
8627
8878
  for (const model of splitModelList(process.env["OMNIUS_MOONDREAM_HF_MODELS"]))
8628
8879
  add2(model, true);
8629
8880
  add2(process.env["MOONDREAM_HF_MODEL"] || "", Boolean(process.env["MOONDREAM_HF_MODEL"]));
8630
- const fullPreviewAuto = envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
8631
- const compactFallbackAuto = envFlag(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
8881
+ const fullPreviewAuto = envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW_AUTO"], true) || envFlag2(process.env["OMNIUS_MOONDREAM3_PREVIEW"], false);
8882
+ const compactFallbackAuto = envFlag2(process.env["OMNIUS_MOONDREAM2_4BIT_AUTO"], true);
8632
8883
  const hasExplicitHf = candidates.some((candidate) => candidate.explicit);
8633
8884
  const hasExplicitNonHf = Boolean(preferred) && !isHuggingFaceVisionModel(preferred);
8634
8885
  if (hasExplicitNonHf)
@@ -8937,7 +9188,7 @@ async function callOllamaVision(ollamaHost, model, prompt, imageBase64, timeoutM
8937
9188
  return typeof data.response === "string" && data.response.trim() ? data.response : null;
8938
9189
  }
8939
9190
  // Decide whether a missing Ollama vision model may be pulled automatically.
  // Returns false when the OMNIUS_OLLAMA_VISION_AUTO_PULL flag evaluates to
  // false (the flag helper is called with a default of true); otherwise
  // returns whether `model` is non-empty after trimming.
  // This release only renames the flag helper call from envFlag to envFlag2.
  function shouldAutoPullOllamaVisionModel(model) {
8940
- if (!envFlag(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
9191
+ if (!envFlag2(process.env["OMNIUS_OLLAMA_VISION_AUTO_PULL"], true))
8941
9192
  return false;
8942
9193
  return Boolean(model.trim());
8943
9194
  }
@@ -9031,7 +9282,7 @@ async function prepareHuggingFaceVisionCandidate(candidate, diagnostics) {
9031
9282
  return { python, env: env2, gpuIndex: decision2.lease.gpuIndex, release: () => decision2.lease.release() };
9032
9283
  }
9033
9284
  function ensureHuggingFaceVisionPython(modelId) {
9034
- const managed = envFlag(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
9285
+ const managed = envFlag2(process.env["OMNIUS_MOONDREAM_HF_MANAGED"], true);
9035
9286
  if (!managed)
9036
9287
  return legacyHuggingFaceVisionPython(modelId);
9037
9288
  ensureUnifiedCacheDirs();
@@ -282954,6 +283205,18 @@ var init_vision_action_loop = __esm({
282954
283205
  enum: ["single", "double"],
282955
283206
  description: "Click type for click operation. Default single."
282956
283207
  },
283208
+ text: {
283209
+ type: "string",
283210
+ description: "Optional text to type after a live click, or into the currently focused desktop control when operation='none'."
283211
+ },
283212
+ key: {
283213
+ type: "string",
283214
+ description: "Optional key/chord to press after a live click/text entry, for example Enter, Escape, Tab, ctrl+f."
283215
+ },
283216
+ typing_delay_ms: {
283217
+ type: "number",
283218
+ description: "Per-character delay for desktop text entry. Default 10ms."
283219
+ },
282957
283220
  index: {
282958
283221
  type: "number",
282959
283222
  description: "If multiple target points are found, use this 1-based index. Default 1."
@@ -282978,6 +283241,10 @@ var init_vision_action_loop = __esm({
282978
283241
  type: "string",
282979
283242
  description: "Optional screenshot output directory. Relative paths resolve from the workspace."
282980
283243
  },
283244
+ window_title: {
283245
+ type: "string",
283246
+ description: "Optional X11/XWayland window title to capture instead of the whole desktop. Useful on Wayland when root screenshots are blocked."
283247
+ },
282981
283248
  clear_artifacts: {
282982
283249
  type: "boolean",
282983
283250
  description: "Only for reset. If true, also deletes this session's screenshot directory."
@@ -283064,8 +283331,12 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283064
283331
  const includeVision = asBoolean(args["include_vision"], true);
283065
283332
  const visionPromptOverride = asString(args["vision_prompt"]);
283066
283333
  const language = asString(args["language"]) || "eng";
283334
+ const windowTitle = asString(args["window_title"]);
283067
283335
  const button = this.parseButton(args["button"]);
283068
283336
  const clickType = args["click_type"] === "double" ? "double" : "single";
283337
+ const textToType = asString(args["text"]);
283338
+ const keyToPress = asString(args["key"]);
283339
+ const typingDelayMs = clampInteger2(args["typing_delay_ms"], 10, 0, 500);
283069
283340
  const index = clampInteger2(args["index"], 1, 1, 100);
283070
283341
  const delayMs = clampInteger2(args["delay_ms"], 0, 0, 6e4);
283071
283342
  const maxSteps = action === "run" ? clampInteger2(args["max_steps"], DEFAULT_MAX_STEPS, 1, HARD_MAX_STEPS) : 1;
@@ -283088,13 +283359,28 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283088
283359
  }
283089
283360
  const stamp = timestampSlug2();
283090
283361
  const screenshotPath = join52(sessionDir2, `${stamp}-step-${step}-before.png`);
283091
- if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283092
- yield "Vision action loop: requesting desktop screenshot permission if the system prompts";
283362
+ if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283363
+ yield "Vision action loop: capturing full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
283093
283364
  }
283094
283365
  yield `Vision action loop: capturing screenshot ${step}/${maxSteps}`;
283095
283366
  let screenshotBackend = "";
283367
+ let captureOffset = { x: 0, y: 0 };
283368
+ let captureWindow;
283096
283369
  try {
283097
- screenshotBackend = captureDesktopScreenshot(screenshotPath);
283370
+ if (windowTitle) {
283371
+ const windowCapture = captureDesktopWindowScreenshot(screenshotPath, windowTitle);
283372
+ screenshotBackend = windowCapture.backend;
283373
+ captureOffset = { x: windowCapture.x, y: windowCapture.y };
283374
+ captureWindow = {
283375
+ windowId: windowCapture.windowId,
283376
+ x: windowCapture.x,
283377
+ y: windowCapture.y,
283378
+ width: windowCapture.width,
283379
+ height: windowCapture.height
283380
+ };
283381
+ } else {
283382
+ screenshotBackend = captureDesktopScreenshot(screenshotPath);
283383
+ }
283098
283384
  mutatedFiles.push(screenshotPath);
283099
283385
  } catch (err) {
283100
283386
  success = false;
@@ -283121,6 +283407,9 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283121
283407
  outputLines.push(`Screenshot saved: ${screenshotPath}`);
283122
283408
  outputLines.push(`Screen: ${dims.width}x${dims.height}`);
283123
283409
  outputLines.push(`Screenshot backend: ${screenshotBackend}`);
283410
+ if (captureWindow) {
283411
+ outputLines.push(`Window: ${windowTitle} id=${captureWindow.windowId} geometry=${captureWindow.x},${captureWindow.y} ${captureWindow.width}x${captureWindow.height}`);
283412
+ }
283124
283413
  let ocr = null;
283125
283414
  if (includeOcr) {
283126
283415
  yield `Vision action loop: running OCR for screenshot ${step}/${maxSteps}`;
@@ -283209,6 +283498,14 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283209
283498
  selectedPoint = pointFromVisionResult({ points: ocrPoints, source: "tesseract-ocr" }, index, dims);
283210
283499
  }
283211
283500
  if (selectedPoint) {
283501
+ if (captureOffset.x !== 0 || captureOffset.y !== 0) {
283502
+ selectedPoint = {
283503
+ ...selectedPoint,
283504
+ pixelX: selectedPoint.pixelX + captureOffset.x,
283505
+ pixelY: selectedPoint.pixelY + captureOffset.y,
283506
+ source: `${selectedPoint.source}+window:${captureWindow?.windowId ?? "active"}`
283507
+ };
283508
+ }
283212
283509
  outputLines.push(`Target: ${effectiveTarget}`);
283213
283510
  outputLines.push(`Point source: ${selectedPoint.source}`);
283214
283511
  outputLines.push(`Mapped point: (${Math.round(selectedPoint.pixelX)}, ${Math.round(selectedPoint.pixelY)}) normalized (${selectedPoint.x.toFixed(4)}, ${selectedPoint.y.toFixed(4)})`);
@@ -283238,15 +283535,29 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283238
283535
  } else {
283239
283536
  yield `Vision action loop: performing ${effectiveOperation} at (${pixelX}, ${pixelY})`;
283240
283537
  try {
283538
+ if (captureWindow?.windowId) {
283539
+ const activationBackend = activateDesktopWindow(captureWindow.windowId);
283540
+ outputLines.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
283541
+ }
283241
283542
  const backend = effectiveOperation === "move" ? moveDesktopPointer(pixelX, pixelY) : clickDesktopAt(pixelX, pixelY, button, clickType);
283242
283543
  actionTaken = effectiveOperation === "move" ? `Moved pointer to (${pixelX}, ${pixelY}) via ${backend}` : `Clicked at (${pixelX}, ${pixelY}) via ${backend} [${button} ${clickType}]`;
283243
283544
  outputLines.push(actionTaken);
283545
+ if (textToType) {
283546
+ const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
283547
+ outputLines.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
283548
+ actionTaken += `; typed text via ${keyboardBackend}`;
283549
+ }
283550
+ if (keyToPress) {
283551
+ const keyboardBackend = pressDesktopKey(keyToPress);
283552
+ outputLines.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
283553
+ actionTaken += `; pressed key via ${keyboardBackend}: ${keyToPress}`;
283554
+ }
283244
283555
  afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
283245
- if (process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283246
- yield "Vision action loop: requesting desktop screenshot permission for post-action verification if the system prompts";
283556
+ if (!windowTitle && process.platform === "linux" && (process.env["WAYLAND_DISPLAY"] || /wayland/i.test(process.env["XDG_SESSION_TYPE"] || ""))) {
283557
+ yield "Vision action loop: capturing post-action full desktop on Wayland. For unattended app loops, pass window_title to avoid compositor screenshot prompts.";
283247
283558
  }
283248
283559
  yield `Vision action loop: capturing post-action screenshot ${step}/${maxSteps}`;
283249
- const afterBackend = captureDesktopScreenshot(afterScreenshotPath);
283560
+ const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
283250
283561
  mutatedFiles.push(afterScreenshotPath);
283251
283562
  outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
283252
283563
  outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
@@ -283260,6 +283571,35 @@ State: ${statePath}${clearArtifacts ? "\nArtifacts cleared: true" : ""}`,
283260
283571
  }
283261
283572
  } else if (effectiveOperation !== "none" && (effectiveTarget || hasCoordinates)) {
283262
283573
  outputLines.push(`Action skipped: no usable point for operation '${effectiveOperation}'.`);
283574
+ } else if (allowActions && !dryRun && (textToType || keyToPress)) {
283575
+ try {
283576
+ const keyboardActions = [];
283577
+ if (captureWindow?.windowId) {
283578
+ const activationBackend = activateDesktopWindow(captureWindow.windowId);
283579
+ keyboardActions.push(`Activated window via ${activationBackend}: ${captureWindow.windowId}`);
283580
+ }
283581
+ if (textToType) {
283582
+ const keyboardBackend = typeDesktopText(textToType, typingDelayMs);
283583
+ keyboardActions.push(`Typed text via ${keyboardBackend}: ${JSON.stringify(textToType)}`);
283584
+ }
283585
+ if (keyToPress) {
283586
+ const keyboardBackend = pressDesktopKey(keyToPress);
283587
+ keyboardActions.push(`Pressed key via ${keyboardBackend}: ${keyToPress}`);
283588
+ }
283589
+ actionTaken = keyboardActions.join("; ");
283590
+ outputLines.push(actionTaken);
283591
+ afterScreenshotPath = join52(sessionDir2, `${timestampSlug2()}-step-${step}-after.png`);
283592
+ const afterBackend = windowTitle ? captureDesktopWindowScreenshot(afterScreenshotPath, windowTitle).backend : captureDesktopScreenshot(afterScreenshotPath);
283593
+ mutatedFiles.push(afterScreenshotPath);
283594
+ outputLines.push(`Post-action screenshot: ${afterScreenshotPath}`);
283595
+ outputLines.push(`Post-action screenshot backend: ${afterBackend}`);
283596
+ } catch (err) {
283597
+ success = false;
283598
+ actionError = err instanceof Error ? err.message : String(err);
283599
+ error = actionError;
283600
+ outputLines.push(`Keyboard action failed: ${actionError}`);
283601
+ outputLines.push(desktopAutomationRecoveryMessage());
283602
+ }
283263
283603
  } else if (action === "run" && !effectiveTarget && !hasCoordinates) {
283264
283604
  success = false;
283265
283605
  error = "Vision loop stopped: no target or coordinates were provided and visual planning did not identify a clickable target. Stopping instead of repeating observe-only screenshots.";
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.205",
3
+ "version": "1.0.206",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.205",
9
+ "version": "1.0.206",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.205",
3
+ "version": "1.0.206",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",