omnius 1.0.207 → 1.0.208

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10326,15 +10326,19 @@ function pngDimensions(buffer2) {
10326
10326
  }
10327
10327
  return null;
10328
10328
  }
10329
- async function describeFocusedEditable(pageHandle) {
10330
- const active = await pageHandle.evaluate(`(() => {
10329
+ async function describeFocusedEditableInContext(context2, frameMeta) {
10330
+ const active = await context2.evaluate(`(() => {
10331
10331
  const el = document.activeElement;
10332
10332
  if (!el) return null;
10333
10333
  const rect = el.getBoundingClientRect();
10334
10334
  const role = (el.getAttribute("role") || "").toLowerCase();
10335
- const contentEditable = String(el.getAttribute("contenteditable") || "").toLowerCase();
10336
- const isEditable = el.matches("input, textarea")
10337
- || contentEditable === "" || contentEditable === "true"
10335
+ const contentEditableAttr = el.getAttribute("contenteditable");
10336
+ const contentEditable = contentEditableAttr !== null
10337
+ && (contentEditableAttr === "" || String(contentEditableAttr).toLowerCase() === "true");
10338
+ const disabled = !!el.disabled;
10339
+ const readOnly = !!el.readOnly;
10340
+ const isEditable = (el.matches("input, textarea") && !disabled && !readOnly)
10341
+ || contentEditable || el.isContentEditable === true
10338
10342
  || ["textbox", "searchbox", "combobox"].includes(role);
10339
10343
  return {
10340
10344
  tag: String(el.tagName || "").toLowerCase(),
@@ -10346,10 +10350,35 @@ async function describeFocusedEditable(pageHandle) {
10346
10350
  placeholder: el.getAttribute("placeholder") || "",
10347
10351
  text: String(el.textContent || "").trim().slice(0, 120),
10348
10352
  isEditable,
10353
+ disabled,
10354
+ readOnly,
10349
10355
  rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10350
10356
  };
10351
10357
  })()`);
10352
- return active && typeof active === "object" ? active : null;
10358
+ if (!active || typeof active !== "object")
10359
+ return null;
10360
+ return frameMeta ? { ...active, frame: frameMeta } : active;
10361
+ }
10362
+ async function describeFocusedEditable(pageHandle) {
10363
+ const main2 = await describeFocusedEditableInContext(pageHandle, { kind: "main", url: pageHandle.url?.() ?? "" }).catch(() => null);
10364
+ if (main2?.["isEditable"])
10365
+ return main2;
10366
+ const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
10367
+ const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
10368
+ for (let i2 = 0; i2 < frames.length; i2++) {
10369
+ const frame = frames[i2];
10370
+ if (!frame || frame === mainFrame)
10371
+ continue;
10372
+ const active = await describeFocusedEditableInContext(frame, {
10373
+ kind: "frame",
10374
+ index: i2,
10375
+ url: typeof frame.url === "function" ? frame.url() : "",
10376
+ name: typeof frame.name === "function" ? frame.name() : ""
10377
+ }).catch(() => null);
10378
+ if (active?.["isEditable"])
10379
+ return active;
10380
+ }
10381
+ return main2;
10353
10382
  }
10354
10383
  async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
10355
10384
  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
@@ -10464,8 +10493,8 @@ ${input.text.slice(0, 2e4)}`.toLowerCase();
10464
10493
  }
10465
10494
  return { kind: "none", confidence: 0, evidence: [] };
10466
10495
  }
10467
- async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10468
- const candidate = await pageHandle.evaluate(`(() => {
10496
+ async function findBrowserVisualCandidateInContext(context2, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10497
+ const candidate = await context2.evaluate(`(() => {
10469
10498
  const target = ${JSON.stringify(target)};
10470
10499
  const visualX = ${JSON.stringify(visualX)};
10471
10500
  const visualY = ${JSON.stringify(visualY)};
@@ -10496,12 +10525,55 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10496
10525
  if (window.CSS && typeof window.CSS.escape === "function") return window.CSS.escape(id);
10497
10526
  return String(id).replace(/["\\\\]/g, "\\\\$&");
10498
10527
  };
10528
+ const hasVisibleStyle = (el) => {
10529
+ for (let cur = el; cur && cur.nodeType === 1; cur = cur.parentElement) {
10530
+ const style = getComputedStyle(cur);
10531
+ if (style.display === "none" || style.visibility === "hidden" || style.visibility === "collapse") return false;
10532
+ const opacity = Number(style.opacity);
10533
+ if (Number.isFinite(opacity) && opacity <= 0.02) return false;
10534
+ }
10535
+ return true;
10536
+ };
10537
+ const hasUsableBox = (el) => {
10538
+ const rect = el.getBoundingClientRect();
10539
+ return rect.width > 1 && rect.height > 1;
10540
+ };
10541
+ const inViewport = (rect) => !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10542
+ const isRendered = (el) => !!el && hasUsableBox(el) && hasVisibleStyle(el);
10499
10543
  const associatedControl = (el) => {
10500
10544
  if (/^label$/i.test(el.tagName || "") && el.getAttribute("for")) {
10501
10545
  return document.getElementById(el.getAttribute("for"));
10502
10546
  }
10547
+ if (/^label$/i.test(el.tagName || "")) {
10548
+ if (el.control) return el.control;
10549
+ const nested = el.querySelector("input, textarea, select, [contenteditable='true'], [role='textbox']");
10550
+ if (nested) return nested;
10551
+ const labelRect = el.getBoundingClientRect();
10552
+ let scope = el.parentElement;
10553
+ for (let depth = 0; depth < 5 && scope; depth++, scope = scope.parentElement) {
10554
+ const controls = Array.from(scope.querySelectorAll("input, textarea, select, [contenteditable='true'], [role='textbox']"))
10555
+ .filter(control => {
10556
+ const rect = control.getBoundingClientRect();
10557
+ return rect.width > 1 && rect.height > 1 && rect.top >= labelRect.top - 12 && Math.abs(rect.left - labelRect.left) < 260;
10558
+ })
10559
+ .sort((a, b) => {
10560
+ const ar = a.getBoundingClientRect();
10561
+ const br = b.getBoundingClientRect();
10562
+ return (Math.abs(ar.top - labelRect.bottom) - Math.abs(br.top - labelRect.bottom))
10563
+ || (Math.abs(ar.left - labelRect.left) - Math.abs(br.left - labelRect.left));
10564
+ });
10565
+ if (controls[0]) return controls[0];
10566
+ }
10567
+ }
10503
10568
  return el;
10504
10569
  };
10570
+ const clickTargetFor = (el) => {
10571
+ const control = associatedControl(el) || el;
10572
+ if (control === el) return el;
10573
+ if (isRendered(control)) return control;
10574
+ if (/^label$/i.test(el.tagName || "") && isRendered(el)) return el;
10575
+ return control;
10576
+ };
10505
10577
  const associatedLabelText = (el) => {
10506
10578
  const control = associatedControl(el);
10507
10579
  const id = control && control.id ? control.id : el.id;
@@ -10544,20 +10616,25 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10544
10616
  el.id || "",
10545
10617
  el.className || "",
10546
10618
  el.getAttribute("type") || "",
10619
+ el.getAttribute("autocomplete") || "",
10547
10620
  ].join(" ").toLowerCase();
10548
10621
  const infoFor = (el, score) => {
10549
- const rect = el.getBoundingClientRect();
10550
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10622
+ const control = associatedControl(el) || el;
10623
+ const clickTarget = clickTargetFor(el);
10624
+ const rect = clickTarget.getBoundingClientRect();
10625
+ const visible = inViewport(rect) && hasVisibleStyle(clickTarget);
10551
10626
  return {
10552
- tag: String(el.tagName || "").toLowerCase(),
10553
- id: el.id || "",
10554
- className: String(el.className || "").slice(0, 160),
10555
- role: el.getAttribute("role") || "",
10556
- ariaLabel: el.getAttribute("aria-label") || "",
10557
- name: el.getAttribute("name") || "",
10558
- type: el.getAttribute("type") || "",
10559
- placeholder: el.getAttribute("placeholder") || "",
10560
- text: String(el.innerText || el.textContent || el.getAttribute("value") || "").trim().slice(0, 240),
10627
+ tag: String(control.tagName || el.tagName || "").toLowerCase(),
10628
+ id: control.id || el.id || "",
10629
+ className: String(control.className || el.className || "").slice(0, 160),
10630
+ role: control.getAttribute("role") || el.getAttribute("role") || "",
10631
+ ariaLabel: control.getAttribute("aria-label") || el.getAttribute("aria-label") || "",
10632
+ name: control.getAttribute("name") || el.getAttribute("name") || "",
10633
+ type: control.getAttribute("type") || el.getAttribute("type") || "",
10634
+ autocomplete: control.getAttribute("autocomplete") || el.getAttribute("autocomplete") || "",
10635
+ placeholder: control.getAttribute("placeholder") || el.getAttribute("placeholder") || "",
10636
+ text: String(el.innerText || el.textContent || control.innerText || control.textContent || control.getAttribute("value") || "").trim().slice(0, 240),
10637
+ clickTag: String(clickTarget.tagName || "").toLowerCase(),
10561
10638
  rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10562
10639
  center: { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 },
10563
10640
  visible,
@@ -10579,9 +10656,11 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10579
10656
  let best = null;
10580
10657
  let bestElement = null;
10581
10658
  for (const el of Array.from(document.querySelectorAll(selectors))) {
10582
- const rect = el.getBoundingClientRect();
10583
- if (rect.width <= 1 || rect.height <= 1) continue;
10584
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10659
+ if (!isRendered(el)) continue;
10660
+ const clickTarget = clickTargetFor(el);
10661
+ if (!isRendered(clickTarget)) continue;
10662
+ const rect = clickTarget.getBoundingClientRect();
10663
+ const visible = inViewport(rect);
10585
10664
  if (!includeOffscreen && !visible) continue;
10586
10665
  const hay = textFor(el);
10587
10666
  const tokenHits = tokens.filter(t => hay.includes(t)).length;
@@ -10592,7 +10671,7 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10592
10671
  if (score <= (forceCandidate ? 8 : 0)) continue;
10593
10672
  if (!best || score > best.score) {
10594
10673
  best = infoFor(el, score);
10595
- bestElement = el;
10674
+ bestElement = clickTarget;
10596
10675
  }
10597
10676
  }
10598
10677
  if (bestElement && scrollIntoView && best && !best.visible) {
@@ -10604,6 +10683,82 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10604
10683
  })()`);
10605
10684
  return candidate && typeof candidate === "object" ? candidate : null;
10606
10685
  }
10686
+ function offsetBrowserCandidate(candidate, offset, viewport, frameMeta) {
10687
+ const rect = candidate["rect"];
10688
+ const center = candidate["center"];
10689
+ const x = Number(rect?.x) + offset.x;
10690
+ const y = Number(rect?.y) + offset.y;
10691
+ const width = Number(rect?.width);
10692
+ const height = Number(rect?.height);
10693
+ const cx = Number(center?.x) + offset.x;
10694
+ const cy = Number(center?.y) + offset.y;
10695
+ const globalRect = {
10696
+ x,
10697
+ y,
10698
+ width,
10699
+ height
10700
+ };
10701
+ const visible = Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(width) && Number.isFinite(height) && !(y + height < 0 || x + width < 0 || y > viewport.height || x > viewport.width);
10702
+ return {
10703
+ ...candidate,
10704
+ rect: globalRect,
10705
+ center: { x: cx, y: cy },
10706
+ visible: candidate["visible"] === true && visible,
10707
+ ...frameMeta ? {
10708
+ frame: frameMeta,
10709
+ frameLocalRect: rect,
10710
+ frameLocalCenter: center
10711
+ } : {}
10712
+ };
10713
+ }
10714
+ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10715
+ const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
10716
+ const candidates = [];
10717
+ const top = await findBrowserVisualCandidateInContext(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
10718
+ if (top)
10719
+ candidates.push(offsetBrowserCandidate(top, { x: 0, y: 0 }, viewport));
10720
+ const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
10721
+ const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
10722
+ for (let i2 = 0; i2 < frames.length; i2++) {
10723
+ const frame = frames[i2];
10724
+ if (!frame || frame === mainFrame)
10725
+ continue;
10726
+ const elementHandle = typeof frame.frameElement === "function" ? await frame.frameElement().catch(() => null) : null;
10727
+ if (!elementHandle)
10728
+ continue;
10729
+ let box = await elementHandle.boundingBox().catch(() => null);
10730
+ if (!box || box.width <= 1 || box.height <= 1)
10731
+ continue;
10732
+ const frameVisible = !(box.y + box.height < 0 || box.x + box.width < 0 || box.y > viewport.height || box.x > viewport.width);
10733
+ if (!includeOffscreen && !frameVisible)
10734
+ continue;
10735
+ const localX = Math.max(0, Math.min(box.width, visualX - box.x));
10736
+ const localY = Math.max(0, Math.min(box.height, visualY - box.y));
10737
+ let candidate = await findBrowserVisualCandidateInContext(frame, target, localX, localY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
10738
+ if (!candidate)
10739
+ continue;
10740
+ if (scrollIntoView && (!candidate["visible"] || !frameVisible)) {
10741
+ if (typeof elementHandle.scrollIntoViewIfNeeded === "function") {
10742
+ await elementHandle.scrollIntoViewIfNeeded().catch(() => void 0);
10743
+ }
10744
+ box = await elementHandle.boundingBox().catch(() => box);
10745
+ candidate = {
10746
+ ...candidate,
10747
+ scrolledIntoView: true
10748
+ };
10749
+ }
10750
+ if (!box)
10751
+ continue;
10752
+ candidates.push(offsetBrowserCandidate(candidate, { x: box.x, y: box.y }, viewport, {
10753
+ kind: "frame",
10754
+ index: i2,
10755
+ url: typeof frame.url === "function" ? frame.url() : "",
10756
+ name: typeof frame.name === "function" ? frame.name() : "",
10757
+ rect: { x: box.x, y: box.y, width: box.width, height: box.height }
10758
+ }));
10759
+ }
10760
+ return candidates.filter((candidate) => includeOffscreen || candidate["visible"] === true).sort((a2, b) => Number(b["score"] ?? 0) - Number(a2["score"] ?? 0))[0] ?? null;
10761
+ }
10607
10762
  function ok(output, start2) {
10608
10763
  return { success: true, output, durationMs: Date.now() - start2 };
10609
10764
  }
@@ -10682,7 +10837,7 @@ var init_playwright_browser = __esm({
10682
10837
  "clear_diagnostics",
10683
10838
  "close"
10684
10839
  ],
10685
- description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10840
+ description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear, or sleep for timeout ms when no selector is provided\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10686
10841
  },
10687
10842
  url: {
10688
10843
  type: "string",
@@ -10843,30 +10998,14 @@ var init_playwright_browser = __esm({
10843
10998
  await page.type(selector, text, { timeout: timeout2, delay: typingDelay });
10844
10999
  return ok(`Typed "${text}" into ${selector}`, start2);
10845
11000
  }
10846
- const active = await page.evaluate(`(() => {
10847
- const el = document.activeElement;
10848
- if (!el) return null;
10849
- const rect = el.getBoundingClientRect();
10850
- return {
10851
- tag: String(el.tagName || "").toLowerCase(),
10852
- id: el.id || "",
10853
- name: el.getAttribute("name") || "",
10854
- role: el.getAttribute("role") || "",
10855
- ariaLabel: el.getAttribute("aria-label") || "",
10856
- type: el.getAttribute("type") || "",
10857
- placeholder: el.getAttribute("placeholder") || "",
10858
- text: String(el.textContent || "").trim().slice(0, 120),
10859
- isEditable: el.matches("input, textarea, [contenteditable=''], [contenteditable='true']")
10860
- || ["textbox", "searchbox", "combobox"].includes((el.getAttribute("role") || "").toLowerCase()),
10861
- rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10862
- };
10863
- })()`);
11001
+ const active = await describeFocusedEditable(page);
10864
11002
  if (!active || typeof active !== "object" || active.isEditable !== true) {
10865
11003
  return fail("No editable focused element is active; use visual_click on a form field or pass a selector to type.", start2);
10866
11004
  }
10867
11005
  await page.keyboard.type(text, { delay: typingDelay });
10868
11006
  const label = active && typeof active === "object" ? `<${active.tag || "element"}>${active.id ? `#${active.id}` : ""}` : "focused element";
10869
- return ok(`Typed "${text}" into ${label}`, start2);
11007
+ const frame = active["frame"];
11008
+ return ok(`Typed "${text}" into ${label}${frame?.kind === "frame" ? ` in frame ${frame.index}` : ""}`, start2);
10870
11009
  }
10871
11010
  case "press": {
10872
11011
  const key = text || "Enter";
@@ -10907,7 +11046,14 @@ var init_playwright_browser = __esm({
10907
11046
  return ok(`Hovered: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
10908
11047
  }
10909
11048
  // ── Waiting ──
10910
- case "wait":
11049
+ case "wait": {
11050
+ if (selector) {
11051
+ await page.waitForSelector(selector, { timeout: timeout2 });
11052
+ return ok(`Element appeared: ${selector}`, start2);
11053
+ }
11054
+ await page.waitForTimeout(timeout2);
11055
+ return ok(`Waited ${timeout2}ms`, start2);
11056
+ }
10911
11057
  case "waitForSelector": {
10912
11058
  if (!selector)
10913
11059
  return fail("selector is required", start2);
@@ -11360,8 +11506,13 @@ ${JSON.stringify(data, null, 2)}`, start2);
11360
11506
  })()`);
11361
11507
  let clickSource = point.source || pointResult?.source || "vision";
11362
11508
  const candidate = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, false);
11363
- if (candidate) {
11364
- const candidateRecord = candidate;
11509
+ let candidateRecord = candidate;
11510
+ if (!candidateRecord) {
11511
+ candidateRecord = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, true, true, true);
11512
+ if (candidateRecord)
11513
+ await page.waitForTimeout(150);
11514
+ }
11515
+ if (candidateRecord) {
11365
11516
  const center = candidateRecord["center"];
11366
11517
  const nextX = Number(center?.x);
11367
11518
  const nextY = Number(center?.y);
@@ -11369,7 +11520,7 @@ ${JSON.stringify(data, null, 2)}`, start2);
11369
11520
  cssX = Math.max(0, Math.min(viewport.width, nextX));
11370
11521
  cssY = Math.max(0, Math.min(viewport.height, nextY));
11371
11522
  elementInfo = candidateRecord;
11372
- clickSource = `${clickSource}+dom-candidate`;
11523
+ clickSource = `${clickSource}+dom-candidate${candidateRecord["scrolledIntoView"] === true ? "+scroll" : ""}`;
11373
11524
  }
11374
11525
  }
11375
11526
  await page.mouse.move(cssX, cssY, { steps: 12 });
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.207",
3
+ "version": "1.0.208",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.207",
9
+ "version": "1.0.208",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.207",
3
+ "version": "1.0.208",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",