omnius 1.0.207 → 1.0.209

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10326,15 +10326,19 @@ function pngDimensions(buffer2) {
10326
10326
  }
10327
10327
  return null;
10328
10328
  }
10329
- async function describeFocusedEditable(pageHandle) {
10330
- const active = await pageHandle.evaluate(`(() => {
10329
+ async function describeFocusedEditableInContext(context2, frameMeta) {
10330
+ const active = await context2.evaluate(`(() => {
10331
10331
  const el = document.activeElement;
10332
10332
  if (!el) return null;
10333
10333
  const rect = el.getBoundingClientRect();
10334
10334
  const role = (el.getAttribute("role") || "").toLowerCase();
10335
- const contentEditable = String(el.getAttribute("contenteditable") || "").toLowerCase();
10336
- const isEditable = el.matches("input, textarea")
10337
- || contentEditable === "" || contentEditable === "true"
10335
+ const contentEditableAttr = el.getAttribute("contenteditable");
10336
+ const contentEditable = contentEditableAttr !== null
10337
+ && (contentEditableAttr === "" || String(contentEditableAttr).toLowerCase() === "true");
10338
+ const disabled = !!el.disabled;
10339
+ const readOnly = !!el.readOnly;
10340
+ const isEditable = (el.matches("input, textarea") && !disabled && !readOnly)
10341
+ || contentEditable || el.isContentEditable === true
10338
10342
  || ["textbox", "searchbox", "combobox"].includes(role);
10339
10343
  return {
10340
10344
  tag: String(el.tagName || "").toLowerCase(),
@@ -10346,10 +10350,35 @@ async function describeFocusedEditable(pageHandle) {
10346
10350
  placeholder: el.getAttribute("placeholder") || "",
10347
10351
  text: String(el.textContent || "").trim().slice(0, 120),
10348
10352
  isEditable,
10353
+ disabled,
10354
+ readOnly,
10349
10355
  rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10350
10356
  };
10351
10357
  })()`);
10352
- return active && typeof active === "object" ? active : null;
10358
+ if (!active || typeof active !== "object")
10359
+ return null;
10360
+ return frameMeta ? { ...active, frame: frameMeta } : active;
10361
+ }
10362
+ async function describeFocusedEditable(pageHandle) {
10363
+ const main2 = await describeFocusedEditableInContext(pageHandle, { kind: "main", url: pageHandle.url?.() ?? "" }).catch(() => null);
10364
+ if (main2?.["isEditable"])
10365
+ return main2;
10366
+ const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
10367
+ const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
10368
+ for (let i2 = 0; i2 < frames.length; i2++) {
10369
+ const frame = frames[i2];
10370
+ if (!frame || frame === mainFrame)
10371
+ continue;
10372
+ const active = await describeFocusedEditableInContext(frame, {
10373
+ kind: "frame",
10374
+ index: i2,
10375
+ url: typeof frame.url === "function" ? frame.url() : "",
10376
+ name: typeof frame.name === "function" ? frame.name() : ""
10377
+ }).catch(() => null);
10378
+ if (active?.["isEditable"])
10379
+ return active;
10380
+ }
10381
+ return main2;
10353
10382
  }
10354
10383
  async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
10355
10384
  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
@@ -10464,8 +10493,8 @@ ${input.text.slice(0, 2e4)}`.toLowerCase();
10464
10493
  }
10465
10494
  return { kind: "none", confidence: 0, evidence: [] };
10466
10495
  }
10467
- async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10468
- const candidate = await pageHandle.evaluate(`(() => {
10496
+ async function findBrowserVisualCandidateInContext(context2, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10497
+ const candidate = await context2.evaluate(`(() => {
10469
10498
  const target = ${JSON.stringify(target)};
10470
10499
  const visualX = ${JSON.stringify(visualX)};
10471
10500
  const visualY = ${JSON.stringify(visualY)};
@@ -10496,12 +10525,55 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10496
10525
  if (window.CSS && typeof window.CSS.escape === "function") return window.CSS.escape(id);
10497
10526
  return String(id).replace(/["\\\\]/g, "\\\\$&");
10498
10527
  };
10528
+ const hasVisibleStyle = (el) => {
10529
+ for (let cur = el; cur && cur.nodeType === 1; cur = cur.parentElement) {
10530
+ const style = getComputedStyle(cur);
10531
+ if (style.display === "none" || style.visibility === "hidden" || style.visibility === "collapse") return false;
10532
+ const opacity = Number(style.opacity);
10533
+ if (Number.isFinite(opacity) && opacity <= 0.02) return false;
10534
+ }
10535
+ return true;
10536
+ };
10537
+ const hasUsableBox = (el) => {
10538
+ const rect = el.getBoundingClientRect();
10539
+ return rect.width > 1 && rect.height > 1;
10540
+ };
10541
+ const inViewport = (rect) => !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10542
+ const isRendered = (el) => !!el && hasUsableBox(el) && hasVisibleStyle(el);
10499
10543
  const associatedControl = (el) => {
10500
10544
  if (/^label$/i.test(el.tagName || "") && el.getAttribute("for")) {
10501
10545
  return document.getElementById(el.getAttribute("for"));
10502
10546
  }
10547
+ if (/^label$/i.test(el.tagName || "")) {
10548
+ if (el.control) return el.control;
10549
+ const nested = el.querySelector("input, textarea, select, [contenteditable='true'], [role='textbox']");
10550
+ if (nested) return nested;
10551
+ const labelRect = el.getBoundingClientRect();
10552
+ let scope = el.parentElement;
10553
+ for (let depth = 0; depth < 5 && scope; depth++, scope = scope.parentElement) {
10554
+ const controls = Array.from(scope.querySelectorAll("input, textarea, select, [contenteditable='true'], [role='textbox']"))
10555
+ .filter(control => {
10556
+ const rect = control.getBoundingClientRect();
10557
+ return rect.width > 1 && rect.height > 1 && rect.top >= labelRect.top - 12 && Math.abs(rect.left - labelRect.left) < 260;
10558
+ })
10559
+ .sort((a, b) => {
10560
+ const ar = a.getBoundingClientRect();
10561
+ const br = b.getBoundingClientRect();
10562
+ return (Math.abs(ar.top - labelRect.bottom) - Math.abs(br.top - labelRect.bottom))
10563
+ || (Math.abs(ar.left - labelRect.left) - Math.abs(br.left - labelRect.left));
10564
+ });
10565
+ if (controls[0]) return controls[0];
10566
+ }
10567
+ }
10503
10568
  return el;
10504
10569
  };
10570
+ const clickTargetFor = (el) => {
10571
+ const control = associatedControl(el) || el;
10572
+ if (control === el) return el;
10573
+ if (isRendered(control)) return control;
10574
+ if (/^label$/i.test(el.tagName || "") && isRendered(el)) return el;
10575
+ return control;
10576
+ };
10505
10577
  const associatedLabelText = (el) => {
10506
10578
  const control = associatedControl(el);
10507
10579
  const id = control && control.id ? control.id : el.id;
@@ -10544,20 +10616,25 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10544
10616
  el.id || "",
10545
10617
  el.className || "",
10546
10618
  el.getAttribute("type") || "",
10619
+ el.getAttribute("autocomplete") || "",
10547
10620
  ].join(" ").toLowerCase();
10548
10621
  const infoFor = (el, score) => {
10549
- const rect = el.getBoundingClientRect();
10550
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10622
+ const control = associatedControl(el) || el;
10623
+ const clickTarget = clickTargetFor(el);
10624
+ const rect = clickTarget.getBoundingClientRect();
10625
+ const visible = inViewport(rect) && hasVisibleStyle(clickTarget);
10551
10626
  return {
10552
- tag: String(el.tagName || "").toLowerCase(),
10553
- id: el.id || "",
10554
- className: String(el.className || "").slice(0, 160),
10555
- role: el.getAttribute("role") || "",
10556
- ariaLabel: el.getAttribute("aria-label") || "",
10557
- name: el.getAttribute("name") || "",
10558
- type: el.getAttribute("type") || "",
10559
- placeholder: el.getAttribute("placeholder") || "",
10560
- text: String(el.innerText || el.textContent || el.getAttribute("value") || "").trim().slice(0, 240),
10627
+ tag: String(control.tagName || el.tagName || "").toLowerCase(),
10628
+ id: control.id || el.id || "",
10629
+ className: String(control.className || el.className || "").slice(0, 160),
10630
+ role: control.getAttribute("role") || el.getAttribute("role") || "",
10631
+ ariaLabel: control.getAttribute("aria-label") || el.getAttribute("aria-label") || "",
10632
+ name: control.getAttribute("name") || el.getAttribute("name") || "",
10633
+ type: control.getAttribute("type") || el.getAttribute("type") || "",
10634
+ autocomplete: control.getAttribute("autocomplete") || el.getAttribute("autocomplete") || "",
10635
+ placeholder: control.getAttribute("placeholder") || el.getAttribute("placeholder") || "",
10636
+ text: String(el.innerText || el.textContent || control.innerText || control.textContent || control.getAttribute("value") || "").trim().slice(0, 240),
10637
+ clickTag: String(clickTarget.tagName || "").toLowerCase(),
10561
10638
  rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10562
10639
  center: { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 },
10563
10640
  visible,
@@ -10579,9 +10656,11 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10579
10656
  let best = null;
10580
10657
  let bestElement = null;
10581
10658
  for (const el of Array.from(document.querySelectorAll(selectors))) {
10582
- const rect = el.getBoundingClientRect();
10583
- if (rect.width <= 1 || rect.height <= 1) continue;
10584
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10659
+ if (!isRendered(el)) continue;
10660
+ const clickTarget = clickTargetFor(el);
10661
+ if (!isRendered(clickTarget)) continue;
10662
+ const rect = clickTarget.getBoundingClientRect();
10663
+ const visible = inViewport(rect);
10585
10664
  if (!includeOffscreen && !visible) continue;
10586
10665
  const hay = textFor(el);
10587
10666
  const tokenHits = tokens.filter(t => hay.includes(t)).length;
@@ -10592,7 +10671,7 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10592
10671
  if (score <= (forceCandidate ? 8 : 0)) continue;
10593
10672
  if (!best || score > best.score) {
10594
10673
  best = infoFor(el, score);
10595
- bestElement = el;
10674
+ bestElement = clickTarget;
10596
10675
  }
10597
10676
  }
10598
10677
  if (bestElement && scrollIntoView && best && !best.visible) {
@@ -10604,6 +10683,82 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10604
10683
  })()`);
10605
10684
  return candidate && typeof candidate === "object" ? candidate : null;
10606
10685
  }
10686
+ function offsetBrowserCandidate(candidate, offset, viewport, frameMeta) {
10687
+ const rect = candidate["rect"];
10688
+ const center = candidate["center"];
10689
+ const x = Number(rect?.x) + offset.x;
10690
+ const y = Number(rect?.y) + offset.y;
10691
+ const width = Number(rect?.width);
10692
+ const height = Number(rect?.height);
10693
+ const cx = Number(center?.x) + offset.x;
10694
+ const cy = Number(center?.y) + offset.y;
10695
+ const globalRect = {
10696
+ x,
10697
+ y,
10698
+ width,
10699
+ height
10700
+ };
10701
+ const visible = Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(width) && Number.isFinite(height) && !(y + height < 0 || x + width < 0 || y > viewport.height || x > viewport.width);
10702
+ return {
10703
+ ...candidate,
10704
+ rect: globalRect,
10705
+ center: { x: cx, y: cy },
10706
+ visible: candidate["visible"] === true && visible,
10707
+ ...frameMeta ? {
10708
+ frame: frameMeta,
10709
+ frameLocalRect: rect,
10710
+ frameLocalCenter: center
10711
+ } : {}
10712
+ };
10713
+ }
10714
+ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10715
+ const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
10716
+ const candidates = [];
10717
+ const top = await findBrowserVisualCandidateInContext(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
10718
+ if (top)
10719
+ candidates.push(offsetBrowserCandidate(top, { x: 0, y: 0 }, viewport));
10720
+ const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
10721
+ const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
10722
+ for (let i2 = 0; i2 < frames.length; i2++) {
10723
+ const frame = frames[i2];
10724
+ if (!frame || frame === mainFrame)
10725
+ continue;
10726
+ const elementHandle = typeof frame.frameElement === "function" ? await frame.frameElement().catch(() => null) : null;
10727
+ if (!elementHandle)
10728
+ continue;
10729
+ let box = await elementHandle.boundingBox().catch(() => null);
10730
+ if (!box || box.width <= 1 || box.height <= 1)
10731
+ continue;
10732
+ const frameVisible = !(box.y + box.height < 0 || box.x + box.width < 0 || box.y > viewport.height || box.x > viewport.width);
10733
+ if (!includeOffscreen && !frameVisible)
10734
+ continue;
10735
+ const localX = Math.max(0, Math.min(box.width, visualX - box.x));
10736
+ const localY = Math.max(0, Math.min(box.height, visualY - box.y));
10737
+ let candidate = await findBrowserVisualCandidateInContext(frame, target, localX, localY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
10738
+ if (!candidate)
10739
+ continue;
10740
+ if (scrollIntoView && (!candidate["visible"] || !frameVisible)) {
10741
+ if (typeof elementHandle.scrollIntoViewIfNeeded === "function") {
10742
+ await elementHandle.scrollIntoViewIfNeeded().catch(() => void 0);
10743
+ }
10744
+ box = await elementHandle.boundingBox().catch(() => box);
10745
+ candidate = {
10746
+ ...candidate,
10747
+ scrolledIntoView: true
10748
+ };
10749
+ }
10750
+ if (!box)
10751
+ continue;
10752
+ candidates.push(offsetBrowserCandidate(candidate, { x: box.x, y: box.y }, viewport, {
10753
+ kind: "frame",
10754
+ index: i2,
10755
+ url: typeof frame.url === "function" ? frame.url() : "",
10756
+ name: typeof frame.name === "function" ? frame.name() : "",
10757
+ rect: { x: box.x, y: box.y, width: box.width, height: box.height }
10758
+ }));
10759
+ }
10760
+ return candidates.filter((candidate) => includeOffscreen || candidate["visible"] === true).sort((a2, b) => Number(b["score"] ?? 0) - Number(a2["score"] ?? 0))[0] ?? null;
10761
+ }
10607
10762
  function ok(output, start2) {
10608
10763
  return { success: true, output, durationMs: Date.now() - start2 };
10609
10764
  }
@@ -10682,7 +10837,7 @@ var init_playwright_browser = __esm({
10682
10837
  "clear_diagnostics",
10683
10838
  "close"
10684
10839
  ],
10685
- description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10840
+ description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear, or sleep for timeout ms when no selector is provided\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10686
10841
  },
10687
10842
  url: {
10688
10843
  type: "string",
@@ -10843,30 +10998,14 @@ var init_playwright_browser = __esm({
10843
10998
  await page.type(selector, text, { timeout: timeout2, delay: typingDelay });
10844
10999
  return ok(`Typed "${text}" into ${selector}`, start2);
10845
11000
  }
10846
- const active = await page.evaluate(`(() => {
10847
- const el = document.activeElement;
10848
- if (!el) return null;
10849
- const rect = el.getBoundingClientRect();
10850
- return {
10851
- tag: String(el.tagName || "").toLowerCase(),
10852
- id: el.id || "",
10853
- name: el.getAttribute("name") || "",
10854
- role: el.getAttribute("role") || "",
10855
- ariaLabel: el.getAttribute("aria-label") || "",
10856
- type: el.getAttribute("type") || "",
10857
- placeholder: el.getAttribute("placeholder") || "",
10858
- text: String(el.textContent || "").trim().slice(0, 120),
10859
- isEditable: el.matches("input, textarea, [contenteditable=''], [contenteditable='true']")
10860
- || ["textbox", "searchbox", "combobox"].includes((el.getAttribute("role") || "").toLowerCase()),
10861
- rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10862
- };
10863
- })()`);
11001
+ const active = await describeFocusedEditable(page);
10864
11002
  if (!active || typeof active !== "object" || active.isEditable !== true) {
10865
11003
  return fail("No editable focused element is active; use visual_click on a form field or pass a selector to type.", start2);
10866
11004
  }
10867
11005
  await page.keyboard.type(text, { delay: typingDelay });
10868
11006
  const label = active && typeof active === "object" ? `<${active.tag || "element"}>${active.id ? `#${active.id}` : ""}` : "focused element";
10869
- return ok(`Typed "${text}" into ${label}`, start2);
11007
+ const frame = active["frame"];
11008
+ return ok(`Typed "${text}" into ${label}${frame?.kind === "frame" ? ` in frame ${frame.index}` : ""}`, start2);
10870
11009
  }
10871
11010
  case "press": {
10872
11011
  const key = text || "Enter";
@@ -10907,7 +11046,14 @@ var init_playwright_browser = __esm({
10907
11046
  return ok(`Hovered: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
10908
11047
  }
10909
11048
  // ── Waiting ──
10910
- case "wait":
11049
+ case "wait": {
11050
+ if (selector) {
11051
+ await page.waitForSelector(selector, { timeout: timeout2 });
11052
+ return ok(`Element appeared: ${selector}`, start2);
11053
+ }
11054
+ await page.waitForTimeout(timeout2);
11055
+ return ok(`Waited ${timeout2}ms`, start2);
11056
+ }
10911
11057
  case "waitForSelector": {
10912
11058
  if (!selector)
10913
11059
  return fail("selector is required", start2);
@@ -11360,8 +11506,13 @@ ${JSON.stringify(data, null, 2)}`, start2);
11360
11506
  })()`);
11361
11507
  let clickSource = point.source || pointResult?.source || "vision";
11362
11508
  const candidate = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, false);
11363
- if (candidate) {
11364
- const candidateRecord = candidate;
11509
+ let candidateRecord = candidate;
11510
+ if (!candidateRecord) {
11511
+ candidateRecord = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, true, true, true);
11512
+ if (candidateRecord)
11513
+ await page.waitForTimeout(150);
11514
+ }
11515
+ if (candidateRecord) {
11365
11516
  const center = candidateRecord["center"];
11366
11517
  const nextX = Number(center?.x);
11367
11518
  const nextY = Number(center?.y);
@@ -11369,7 +11520,7 @@ ${JSON.stringify(data, null, 2)}`, start2);
11369
11520
  cssX = Math.max(0, Math.min(viewport.width, nextX));
11370
11521
  cssY = Math.max(0, Math.min(viewport.height, nextY));
11371
11522
  elementInfo = candidateRecord;
11372
- clickSource = `${clickSource}+dom-candidate`;
11523
+ clickSource = `${clickSource}+dom-candidate${candidateRecord["scrolledIntoView"] === true ? "+scroll" : ""}`;
11373
11524
  }
11374
11525
  }
11375
11526
  await page.mouse.move(cssX, cssY, { steps: 12 });
@@ -284467,15 +284618,52 @@ function findScrapeScript() {
284467
284618
  ];
284468
284619
  return candidates.find((p2) => existsSync44(p2)) || candidates[0];
284469
284620
  }
284470
- async function probeService() {
284621
+ async function probeServiceInfo() {
284471
284622
  try {
284472
284623
  const controller = new AbortController();
284473
284624
  const timeout2 = setTimeout(() => controller.abort(), 3e3);
284474
284625
  const res = await fetch(`${BASE_URL}/health`, { signal: controller.signal });
284475
284626
  clearTimeout(timeout2);
284476
- return res.ok;
284627
+ if (!res.ok)
284628
+ return null;
284629
+ const data = await res.json().catch(() => null);
284630
+ return data && typeof data === "object" ? data : {};
284477
284631
  } catch {
284632
+ return null;
284633
+ }
284634
+ }
284635
+ async function probeService() {
284636
+ return Boolean(await probeServiceInfo());
284637
+ }
284638
+ function serviceHasCapabilities(info) {
284639
+ if (!info)
284478
284640
  return false;
284641
+ const raw = info["capabilities"];
284642
+ const capabilities = Array.isArray(raw) ? raw.map(String) : [];
284643
+ return REQUIRED_SERVICE_CAPABILITIES.every((capability) => capabilities.includes(capability));
284644
+ }
284645
+ function killBrowserActionServicePort() {
284646
+ if (serviceProcess && serviceProcess.pid && !serviceProcess.killed) {
284647
+ try {
284648
+ process.kill(-serviceProcess.pid, "SIGTERM");
284649
+ } catch {
284650
+ }
284651
+ try {
284652
+ serviceProcess.kill("SIGTERM");
284653
+ } catch {
284654
+ }
284655
+ serviceProcess = null;
284656
+ }
284657
+ const commands = [
284658
+ `lsof -ti tcp:${DEFAULT_PORT} | xargs -r kill -TERM`,
284659
+ `fuser -k ${DEFAULT_PORT}/tcp`
284660
+ ];
284661
+ for (const cmd of commands) {
284662
+ try {
284663
+ execSync22(cmd, { stdio: "ignore", timeout: 5e3 });
284664
+ break;
284665
+ } catch {
284666
+ }
284479
284667
  }
284480
284668
  }
284481
284669
  function findPython3() {
@@ -284490,8 +284678,17 @@ function findPython3() {
284490
284678
  return null;
284491
284679
  }
284492
284680
  async function launchService() {
284493
- if (await probeService())
284494
- return null;
284681
+ const existing = await probeServiceInfo();
284682
+ if (existing) {
284683
+ if (serviceHasCapabilities(existing))
284684
+ return null;
284685
+ killBrowserActionServicePort();
284686
+ for (let i2 = 0; i2 < 20; i2++) {
284687
+ await new Promise((r2) => setTimeout(r2, 250));
284688
+ if (!await probeService())
284689
+ break;
284690
+ }
284691
+ }
284495
284692
  const python = findPython3();
284496
284693
  if (!python)
284497
284694
  return "Python 3 not found. Install Python 3.9+ to use browser automation.";
@@ -284503,6 +284700,7 @@ async function launchService() {
284503
284700
  env: {
284504
284701
  ...process.env,
284505
284702
  SCRAPE_PORT: String(DEFAULT_PORT),
284703
+ OMNIUS_BROWSER_ACTION_VENV: join55(omniusHomeDir(), "runtimes", "browser", ".venv-selenium"),
284506
284704
  SCRAPE_HEADLESS_DEFAULT: process.env["SCRAPE_HEADLESS_DEFAULT"] ?? (defaultBrowserHeadless() ? "1" : "0"),
284507
284705
  SCRAPE_REQUIRE_AUTH: "0"
284508
284706
  }
@@ -284645,13 +284843,33 @@ async function apiCall(endpoint, method = "POST", body) {
284645
284843
  url += `?${params.toString()}`;
284646
284844
  }
284647
284845
  const res = await fetch(url, options2);
284648
- return await res.json();
284846
+ const raw = await res.text();
284847
+ try {
284848
+ return JSON.parse(raw);
284849
+ } catch {
284850
+ return {
284851
+ ok: false,
284852
+ error: `HTTP ${res.status} from browser_action service: ${raw.slice(0, 500)}`
284853
+ };
284854
+ }
284855
+ }
284856
+ function evaluateFailureMessage2(err, code8) {
284857
+ const raw = err instanceof Error ? err.message : String(err);
284858
+ const hints = [];
284859
+ if (/map is not a function/i.test(raw) && /querySelectorAll/i.test(code8)) {
284860
+ hints.push("document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...).");
284861
+ }
284862
+ if (/(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8)) {
284863
+ hints.push("Direct .value assignment can bypass framework input/change handlers. Prefer browser_action type, browser_action click_xy plus input/sync paths, or playwright_browser fill/visual_click.");
284864
+ }
284865
+ return [raw.slice(0, 500), ...hints.map((hint) => `Hint: ${hint}`)].join("\n");
284649
284866
  }
284650
- var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
284867
+ var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, REQUIRED_SERVICE_CAPABILITIES, BrowserActionTool;
284651
284868
  var init_browser_action = __esm({
284652
284869
  "packages/execution/dist/tools/browser-action.js"() {
284653
284870
  "use strict";
284654
284871
  init_dom_summary();
284872
+ init_model_store();
284655
284873
  init_network_egress_policy();
284656
284874
  __dirname3 = dirname14(fileURLToPath6(import.meta.url));
284657
284875
  DEFAULT_PORT = 8130;
@@ -284661,16 +284879,17 @@ var init_browser_action = __esm({
284661
284879
  activeSessionId = null;
284662
284880
  activeSessionHeadless = null;
284663
284881
  activeSessionUrl = null;
284882
+ REQUIRED_SERVICE_CAPABILITIES = ["evaluate"];
284664
284883
  BrowserActionTool = class {
284665
284884
  name = "browser_action";
284666
- description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284885
+ description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, evaluate, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284667
284886
  parameters = {
284668
284887
  type: "object",
284669
284888
  properties: {
284670
284889
  action: {
284671
284890
  type: "string",
284672
- enum: ["navigate", "click", "click_xy", "type", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
284673
- description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)"
284891
+ enum: ["navigate", "click", "click_xy", "type", "evaluate", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
284892
+ description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)\n- 'evaluate': run JavaScript in the active Selenium page; pass code in text"
284674
284893
  },
284675
284894
  url: {
284676
284895
  type: "string",
@@ -284682,7 +284901,7 @@ var init_browser_action = __esm({
284682
284901
  },
284683
284902
  text: {
284684
284903
  type: "string",
284685
- description: "Text to type (for 'type' action) OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
284904
+ description: "Text to type (for 'type' action), JS code (for 'evaluate'), OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
284686
284905
  },
284687
284906
  x: {
284688
284907
  type: "number",
@@ -284856,6 +285075,32 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
284856
285075
  durationMs: Date.now() - start2
284857
285076
  };
284858
285077
  }
285078
+ case "evaluate": {
285079
+ const code8 = typeof args.text === "string" ? args.text : typeof args.value === "string" ? args.value : "";
285080
+ if (!code8.trim())
285081
+ return { success: false, output: "", error: "text is required for evaluate action", durationMs: Date.now() - start2 };
285082
+ result = await apiCall("/evaluate", "POST", { script: code8 });
285083
+ if (result.ok) {
285084
+ const resultType = String(result["result_type"] ?? "unknown");
285085
+ const payload = result["result"];
285086
+ const rendered = payload === void 0 ? "undefined" : typeof payload === "string" ? payload : JSON.stringify(payload, null, 2);
285087
+ const truncated = rendered.length > 2e4 ? `${rendered.slice(0, 2e4)}
285088
+ ... (truncated)` : rendered;
285089
+ return {
285090
+ success: true,
285091
+ output: `Evaluation result (${resultType}):
285092
+ ${truncated}`,
285093
+ durationMs: Date.now() - start2
285094
+ };
285095
+ }
285096
+ const evalMsg = String(result.error ?? result.message ?? "Evaluate failed");
285097
+ return {
285098
+ success: false,
285099
+ output: "",
285100
+ error: `browser_action evaluate failed: ${evaluateFailureMessage2(evalMsg, code8)} ${browserActionRuntimeHint()}`,
285101
+ durationMs: Date.now() - start2
285102
+ };
285103
+ }
284859
285104
  case "screenshot": {
284860
285105
  if (requestedWidth || requestedHeight || requestedScale) {
284861
285106
  const currentW = requestedWidth ?? 1280;
@@ -285039,7 +285284,7 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
285039
285284
  result = await apiCall("/history/forward", "POST");
285040
285285
  return { success: !!result.ok, output: "Navigated forward", durationMs: Date.now() - start2 };
285041
285286
  default:
285042
- return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
285287
+ return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, evaluate, screenshot, dom, dom_summary, vision_click, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
285043
285288
  }
285044
285289
  } catch (err) {
285045
285290
  return {
@@ -32,7 +32,13 @@ from typing import Dict, Optional
32
32
  # ──────────────────────────────────────────────────────────────
33
33
  # 0) Embedded venv bootstrap (same pattern as other services)
34
34
  # ──────────────────────────────────────────────────────────────
35
- VENV_DIR = Path.cwd() / ".venv"
35
+ SCRIPT_PATH = Path(__file__).resolve()
36
+ SCRIPT_DIR = SCRIPT_PATH.parent
37
+ OMNIUS_HOME = Path(os.environ.get("OMNIUS_HOME") or (Path.home() / ".omnius"))
38
+ VENV_DIR = Path(
39
+ os.environ.get("OMNIUS_BROWSER_ACTION_VENV")
40
+ or (OMNIUS_HOME / "runtimes" / "browser" / ".venv-selenium")
41
+ )
36
42
 
37
43
 
38
44
  def _in_venv() -> bool:
@@ -48,6 +54,7 @@ def _ensure_venv_and_reexec() -> None:
48
54
  return
49
55
  python = sys.executable
50
56
  if not VENV_DIR.exists():
57
+ VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
51
58
  print(f"[bootstrap] creating virtualenv at {VENV_DIR}", file=sys.stderr)
52
59
  subprocess.check_call([python, "-m", "venv", str(VENV_DIR)])
53
60
  pip_bin = VENV_DIR / ("Scripts/pip.exe" if os.name == "nt" else "bin/pip")
@@ -69,10 +76,21 @@ _ensure_venv_and_reexec()
69
76
  # ──────────────────────────────────────────────────────────────
70
77
  import subprocess # noqa: E402 (re-import after re-exec)
71
78
 
72
- SCRIPT_PATH = Path(__file__).resolve()
73
- SCRIPT_DIR = SCRIPT_PATH.parent
74
- SETUP_MARKER = SCRIPT_DIR / ".scrape_setup_complete"
79
+ SETUP_MARKER = VENV_DIR / ".scrape_setup_complete"
75
80
  OUT_DIR = SCRIPT_DIR / "frames"
81
+ SERVICE_VERSION = "2026-06-01-evaluate-v1"
82
+ SERVICE_CAPABILITIES = [
83
+ "navigate",
84
+ "click",
85
+ "click_xy",
86
+ "type",
87
+ "evaluate",
88
+ "screenshot",
89
+ "dom",
90
+ "scroll",
91
+ "history",
92
+ "events",
93
+ ]
76
94
 
77
95
 
78
96
  def _pip_install(*pkgs: str) -> None:
@@ -129,6 +147,7 @@ from selenium.webdriver.common.by import By # noqa: E402
129
147
  from selenium.webdriver.common.keys import Keys # noqa: E402
130
148
  from selenium.webdriver.chrome.options import Options # noqa: E402
131
149
  from selenium.webdriver.chrome.service import Service # noqa: E402
150
+ from selenium.webdriver.remote.webelement import WebElement # noqa: E402
132
151
  from selenium.webdriver.support import expected_conditions as EC # noqa: E402
133
152
  from selenium.webdriver.support.ui import WebDriverWait # noqa: E402
134
153
  from webdriver_manager.chrome import ChromeDriverManager # noqa: E402
@@ -160,6 +179,64 @@ def _truthy(value) -> bool:
160
179
  return str(value).lower() in ("1", "true", "yes", "on")
161
180
 
162
181
 
182
def _serialize_script_result(value, depth: int = 0, seen: Optional[set] = None):
    """Convert a script result into plain, JSON-friendly Python data.

    Args:
        value: The object to convert.
        depth: Current nesting level; containers nested deeper than 5 levels
            are replaced by ``str(value)``.
        seen: ``id()`` values of the dicts currently being walked, used to
            detect a dict that contains itself. Callers normally omit it.

    Returns:
        ``None``/str/int/float/bool unchanged; a summary dict for a
        ``WebElement``; a list (at most 200 items) for list/tuple/set; a dict
        with stringified keys (at most 200 entries, plus an
        ``__omnius_truncated`` flag when cut) for dicts; ``"[Circular]"`` for a
        dict that is its own ancestor; ``str(value)`` for anything else.
    """
    # The annotation uses plain ``set`` so that evaluating the signature does
    # not depend on subscriptable builtins (``set[int]`` needs Python 3.9+).
    if seen is None:
        seen = set()
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if depth > 5:
        return str(value)
    if isinstance(value, WebElement):
        # Each accessor is wrapped separately so that one failing lookup does
        # not discard the fields that could still be read.
        try:
            rect = value.rect or {}
        except Exception:
            rect = {}
        try:
            text = value.text or ""
        except Exception:
            text = ""
        try:
            tag = value.tag_name or ""
        except Exception:
            tag = ""

        def attr(name: str) -> str:
            try:
                return value.get_attribute(name) or ""
            except Exception:
                return ""

        return {
            "__omnius_type": "element",
            "tag": tag,
            "id": attr("id"),
            "name": attr("name"),
            "type": attr("type"),
            "role": attr("role"),
            "ariaLabel": attr("aria-label"),
            "text": text[:240],
            "rect": {
                "x": rect.get("x", 0),
                "y": rect.get("y", 0),
                "width": rect.get("width", 0),
                "height": rect.get("height", 0),
            },
        }
    if isinstance(value, (list, tuple, set)):
        return [_serialize_script_result(item, depth + 1, seen) for item in list(value)[:200]]
    if isinstance(value, dict):
        ident = id(value)
        if ident in seen:
            return "[Circular]"
        seen.add(ident)
        try:
            out = {}
            for idx, (key, item) in enumerate(value.items()):
                if idx >= 200:
                    out["__omnius_truncated"] = True
                    break
                out[str(key)] = _serialize_script_result(item, depth + 1, seen)
            return out
        finally:
            # Only ancestors count as circular: forget this dict once it has
            # been walked so a second, non-cyclic reference to the same object
            # is serialized normally instead of being reported as "[Circular]".
            seen.discard(ident)
    return str(value)
238
+
239
+
163
240
  class Tools:
164
241
  _driver: Optional[webdriver.Chrome] = None
165
242
 
@@ -421,6 +498,21 @@ class Tools:
421
498
  log_message(f"[dom] snapshot failed: {exc}", "WARNING")
422
499
  return ""
423
500
 
501
@staticmethod
def evaluate(script: str):
    """Run ``script`` through the active driver's ``execute_script``.

    Returns a dict: ``{"ok": False, "error": ...}`` when no driver is set or
    the call raises, otherwise ``{"ok": True, "result": ..., "result_type": ...}``
    where ``result`` is the serialized return value and ``result_type`` is
    ``"undefined"`` for ``None`` or the Python type name of the raw value.
    """
    driver = Tools._driver
    if not driver:
        return {"ok": False, "error": "browser not open"}
    try:
        raw_value = driver.execute_script(script)
        payload = _serialize_script_result(raw_value)
        if raw_value is None:
            kind = "undefined"
        else:
            kind = type(raw_value).__name__
    except Exception as exc:
        # Failures are logged and reported to the caller rather than raised.
        log_message(f"[evaluate] script failed: {exc}", "ERROR")
        return {"ok": False, "error": str(exc)}
    return {"ok": True, "result": payload, "result_type": kind}
515
+
424
516
  @staticmethod
425
517
  def scroll(amount: int = 600) -> str:
426
518
  if not Tools._driver:
@@ -921,7 +1013,15 @@ def _error(message: str, status: int = 400):
921
1013
  # ──────────────────────────────────────────────────────────────
922
1014
  @app.get("/health")
923
1015
  def health():
924
- return jsonify({"status": "ok", "browser_open": Tools.is_browser_open(), "sessions": len(_SESSIONS)})
1016
+ return jsonify({
1017
+ "status": "ok",
1018
+ "service": "browser_action",
1019
+ "version": SERVICE_VERSION,
1020
+ "capabilities": SERVICE_CAPABILITIES,
1021
+ "browser_open": Tools.is_browser_open(),
1022
+ "sessions": len(_SESSIONS),
1023
+ "venv": str(VENV_DIR),
1024
+ })
925
1025
 
926
1026
 
927
1027
  @app.post("/session/start")
@@ -1041,6 +1141,23 @@ def type_text():
1041
1141
  return _ok(message=msg)
1042
1142
 
1043
1143
 
1144
@app.post("/evaluate")
def evaluate_script():
    """HTTP handler for POST /evaluate: run a script in the open browser.

    The script is read from the JSON body's ``script``, ``text`` or ``code``
    field (first non-empty one). Responds 401 when ``_auth_ok`` rejects the
    request, 400 when no script is supplied, 500 when the evaluation fails,
    and otherwise returns ``_ok`` with ``result`` and ``result_type``.
    """
    if not _auth_ok(request):
        return _error("unauthorized", 401)
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # A missing, unparsable or non-object JSON body (array, string, number)
        # has no .get(); treat it as an empty object so the request gets the
        # regular "missing script" response instead of an AttributeError.
        data = {}
    script = data.get("script") or data.get("text") or data.get("code") or ""
    if not str(script).strip():
        return _error("missing script", 400)
    with _slot():
        result = Tools.evaluate(str(script))
    if not isinstance(result, dict) or not result.get("ok"):
        # Always hand _error a non-empty message, even if the failure dict
        # carries no "error" text.
        detail = result.get("error") if isinstance(result, dict) else None
        return _error(detail or "evaluate failed", 500)
    # Fall back to the first known session id (or "") when the caller gave none.
    sid = data.get("sid") or next(iter(_SESSIONS), "")
    _queue_event(sid, {"type": "status", "msg": "evaluate", "ts": int(time.time() * 1000)})
    return _ok(result=result.get("result"), result_type=result.get("result_type"))
1159
+
1160
+
1044
1161
  @app.post("/scroll")
1045
1162
  def scroll():
1046
1163
  if not _auth_ok(request):
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.207",
3
+ "version": "1.0.209",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.207",
9
+ "version": "1.0.209",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.207",
3
+ "version": "1.0.209",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",