npm - omnius - Versions diffs - 1.0.207 → 1.0.209 - Mend

omnius 1.0.207 → 1.0.209

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.

Files changed (4)

package/dist/index.js +303 -58
package/dist/scripts/web_scrape.py +122 -5
package/npm-shrinkwrap.json +2 -2
package/package.json +1 -1

package/dist/index.js CHANGED

@@ -10326,15 +10326,19 @@ function pngDimensions(buffer2) {
   }
   return null;
 }
-async function describeFocusedEditable(pageHandle) {
-  const active = await pageHandle.evaluate(`(() => {
+async function describeFocusedEditableInContext(context2, frameMeta) {
+  const active = await context2.evaluate(`(() => {
     const el = document.activeElement;
     if (!el) return null;
     const rect = el.getBoundingClientRect();
     const role = (el.getAttribute("role") || "").toLowerCase();
-    const contentEditable = String(el.getAttribute("contenteditable") || "").toLowerCase();
-    const isEditable = el.matches("input, textarea")
-      || contentEditable === "" || contentEditable === "true"
+    const contentEditableAttr = el.getAttribute("contenteditable");
+    const contentEditable = contentEditableAttr !== null
+      && (contentEditableAttr === "" || String(contentEditableAttr).toLowerCase() === "true");
+    const disabled = !!el.disabled;
+    const readOnly = !!el.readOnly;
+    const isEditable = (el.matches("input, textarea") && !disabled && !readOnly)
+      || contentEditable || el.isContentEditable === true
       || ["textbox", "searchbox", "combobox"].includes(role);
     return {
       tag: String(el.tagName || "").toLowerCase(),
@@ -10346,10 +10350,35 @@ async function describeFocusedEditable(pageHandle) {
       placeholder: el.getAttribute("placeholder") || "",
       text: String(el.textContent || "").trim().slice(0, 120),
       isEditable,
+      disabled,
+      readOnly,
       rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
     };
   })()`);
-  return active && typeof active === "object" ? active : null;
+  if (!active || typeof active !== "object")
+    return null;
+  return frameMeta ? { ...active, frame: frameMeta } : active;
+}
+async function describeFocusedEditable(pageHandle) {
+  const main2 = await describeFocusedEditableInContext(pageHandle, { kind: "main", url: pageHandle.url?.() ?? "" }).catch(() => null);
+  if (main2?.["isEditable"])
+    return main2;
+  const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
+  const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
+  for (let i2 = 0; i2 < frames.length; i2++) {
+    const frame = frames[i2];
+    if (!frame || frame === mainFrame)
+      continue;
+    const active = await describeFocusedEditableInContext(frame, {
+      kind: "frame",
+      index: i2,
+      url: typeof frame.url === "function" ? frame.url() : "",
+      name: typeof frame.name === "function" ? frame.name() : ""
+    }).catch(() => null);
+    if (active?.["isEditable"])
+      return active;
+  }
+  return main2;
 }
 async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
   const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
@@ -10464,8 +10493,8 @@ ${input.text.slice(0, 2e4)}`.toLowerCase();
   }
   return { kind: "none", confidence: 0, evidence: [] };
 }
-async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
-  const candidate = await pageHandle.evaluate(`(() => {
+async function findBrowserVisualCandidateInContext(context2, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
+  const candidate = await context2.evaluate(`(() => {
     const target = ${JSON.stringify(target)};
     const visualX = ${JSON.stringify(visualX)};
     const visualY = ${JSON.stringify(visualY)};
@@ -10496,12 +10525,55 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
       if (window.CSS && typeof window.CSS.escape === "function") return window.CSS.escape(id);
       return String(id).replace(/["\\\\]/g, "\\\\$&");
     };
+    const hasVisibleStyle = (el) => {
+      for (let cur = el; cur && cur.nodeType === 1; cur = cur.parentElement) {
+        const style = getComputedStyle(cur);
+        if (style.display === "none" || style.visibility === "hidden" || style.visibility === "collapse") return false;
+        const opacity = Number(style.opacity);
+        if (Number.isFinite(opacity) && opacity <= 0.02) return false;
+      }
+      return true;
+    };
+    const hasUsableBox = (el) => {
+      const rect = el.getBoundingClientRect();
+      return rect.width > 1 && rect.height > 1;
+    };
+    const inViewport = (rect) => !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
+    const isRendered = (el) => !!el && hasUsableBox(el) && hasVisibleStyle(el);
     const associatedControl = (el) => {
       if (/^label$/i.test(el.tagName || "") && el.getAttribute("for")) {
         return document.getElementById(el.getAttribute("for"));
       }
+      if (/^label$/i.test(el.tagName || "")) {
+        if (el.control) return el.control;
+        const nested = el.querySelector("input, textarea, select, [contenteditable='true'], [role='textbox']");
+        if (nested) return nested;
+        const labelRect = el.getBoundingClientRect();
+        let scope = el.parentElement;
+        for (let depth = 0; depth < 5 && scope; depth++, scope = scope.parentElement) {
+          const controls = Array.from(scope.querySelectorAll("input, textarea, select, [contenteditable='true'], [role='textbox']"))
+            .filter(control => {
+              const rect = control.getBoundingClientRect();
+              return rect.width > 1 && rect.height > 1 && rect.top >= labelRect.top - 12 && Math.abs(rect.left - labelRect.left) < 260;
+            })
+            .sort((a, b) => {
+              const ar = a.getBoundingClientRect();
+              const br = b.getBoundingClientRect();
+              return (Math.abs(ar.top - labelRect.bottom) - Math.abs(br.top - labelRect.bottom))
+                || (Math.abs(ar.left - labelRect.left) - Math.abs(br.left - labelRect.left));
+            });
+          if (controls[0]) return controls[0];
+        }
+      }
       return el;
     };
+    const clickTargetFor = (el) => {
+      const control = associatedControl(el) || el;
+      if (control === el) return el;
+      if (isRendered(control)) return control;
+      if (/^label$/i.test(el.tagName || "") && isRendered(el)) return el;
+      return control;
+    };
     const associatedLabelText = (el) => {
       const control = associatedControl(el);
       const id = control && control.id ? control.id : el.id;
@@ -10544,20 +10616,25 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
       el.id || "",
       el.className || "",
       el.getAttribute("type") || "",
+      el.getAttribute("autocomplete") || "",
     ].join(" ").toLowerCase();
     const infoFor = (el, score) => {
-      const rect = el.getBoundingClientRect();
-      const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
+      const control = associatedControl(el) || el;
+      const clickTarget = clickTargetFor(el);
+      const rect = clickTarget.getBoundingClientRect();
+      const visible = inViewport(rect) && hasVisibleStyle(clickTarget);
       return {
-        tag: String(el.tagName || "").toLowerCase(),
-        id: el.id || "",
-        className: String(el.className || "").slice(0, 160),
-        role: el.getAttribute("role") || "",
-        ariaLabel: el.getAttribute("aria-label") || "",
-        name: el.getAttribute("name") || "",
-        type: el.getAttribute("type") || "",
-        placeholder: el.getAttribute("placeholder") || "",
-        text: String(el.innerText || el.textContent || el.getAttribute("value") || "").trim().slice(0, 240),
+        tag: String(control.tagName || el.tagName || "").toLowerCase(),
+        id: control.id || el.id || "",
+        className: String(control.className || el.className || "").slice(0, 160),
+        role: control.getAttribute("role") || el.getAttribute("role") || "",
+        ariaLabel: control.getAttribute("aria-label") || el.getAttribute("aria-label") || "",
+        name: control.getAttribute("name") || el.getAttribute("name") || "",
+        type: control.getAttribute("type") || el.getAttribute("type") || "",
+        autocomplete: control.getAttribute("autocomplete") || el.getAttribute("autocomplete") || "",
+        placeholder: control.getAttribute("placeholder") || el.getAttribute("placeholder") || "",
+        text: String(el.innerText || el.textContent || control.innerText || control.textContent || control.getAttribute("value") || "").trim().slice(0, 240),
+        clickTag: String(clickTarget.tagName || "").toLowerCase(),
         rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
         center: { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 },
         visible,
@@ -10579,9 +10656,11 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
     let best = null;
     let bestElement = null;
     for (const el of Array.from(document.querySelectorAll(selectors))) {
-      const rect = el.getBoundingClientRect();
-      if (rect.width <= 1 || rect.height <= 1) continue;
-      const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
+      if (!isRendered(el)) continue;
+      const clickTarget = clickTargetFor(el);
+      if (!isRendered(clickTarget)) continue;
+      const rect = clickTarget.getBoundingClientRect();
+      const visible = inViewport(rect);
       if (!includeOffscreen && !visible) continue;
       const hay = textFor(el);
       const tokenHits = tokens.filter(t => hay.includes(t)).length;
@@ -10592,7 +10671,7 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
       if (score <= (forceCandidate ? 8 : 0)) continue;
       if (!best || score > best.score) {
         best = infoFor(el, score);
-        bestElement = el;
+        bestElement = clickTarget;
       }
     }
     if (bestElement && scrollIntoView && best && !best.visible) {
@@ -10604,6 +10683,82 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
   })()`);
   return candidate && typeof candidate === "object" ? candidate : null;
 }
+function offsetBrowserCandidate(candidate, offset, viewport, frameMeta) {
+  const rect = candidate["rect"];
+  const center = candidate["center"];
+  const x = Number(rect?.x) + offset.x;
+  const y = Number(rect?.y) + offset.y;
+  const width = Number(rect?.width);
+  const height = Number(rect?.height);
+  const cx = Number(center?.x) + offset.x;
+  const cy = Number(center?.y) + offset.y;
+  const globalRect = {
+    x,
+    y,
+    width,
+    height
+  };
+  const visible = Number.isFinite(x) && Number.isFinite(y) && Number.isFinite(width) && Number.isFinite(height) && !(y + height < 0 || x + width < 0 || y > viewport.height || x > viewport.width);
+  return {
+    ...candidate,
+    rect: globalRect,
+    center: { x: cx, y: cy },
+    visible: candidate["visible"] === true && visible,
+    ...frameMeta ? {
+      frame: frameMeta,
+      frameLocalRect: rect,
+      frameLocalCenter: center
+    } : {}
+  };
+}
+async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
+  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
+  const candidates = [];
+  const top = await findBrowserVisualCandidateInContext(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
+  if (top)
+    candidates.push(offsetBrowserCandidate(top, { x: 0, y: 0 }, viewport));
+  const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
+  const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
+  for (let i2 = 0; i2 < frames.length; i2++) {
+    const frame = frames[i2];
+    if (!frame || frame === mainFrame)
+      continue;
+    const elementHandle = typeof frame.frameElement === "function" ? await frame.frameElement().catch(() => null) : null;
+    if (!elementHandle)
+      continue;
+    let box = await elementHandle.boundingBox().catch(() => null);
+    if (!box || box.width <= 1 || box.height <= 1)
+      continue;
+    const frameVisible = !(box.y + box.height < 0 || box.x + box.width < 0 || box.y > viewport.height || box.x > viewport.width);
+    if (!includeOffscreen && !frameVisible)
+      continue;
+    const localX = Math.max(0, Math.min(box.width, visualX - box.x));
+    const localY = Math.max(0, Math.min(box.height, visualY - box.y));
+    let candidate = await findBrowserVisualCandidateInContext(frame, target, localX, localY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
+    if (!candidate)
+      continue;
+    if (scrollIntoView && (!candidate["visible"] || !frameVisible)) {
+      if (typeof elementHandle.scrollIntoViewIfNeeded === "function") {
+        await elementHandle.scrollIntoViewIfNeeded().catch(() => void 0);
+      }
+      box = await elementHandle.boundingBox().catch(() => box);
+      candidate = {
+        ...candidate,
+        scrolledIntoView: true
+      };
+    }
+    if (!box)
+      continue;
+    candidates.push(offsetBrowserCandidate(candidate, { x: box.x, y: box.y }, viewport, {
+      kind: "frame",
+      index: i2,
+      url: typeof frame.url === "function" ? frame.url() : "",
+      name: typeof frame.name === "function" ? frame.name() : "",
+      rect: { x: box.x, y: box.y, width: box.width, height: box.height }
+    }));
+  }
+  return candidates.filter((candidate) => includeOffscreen || candidate["visible"] === true).sort((a2, b) => Number(b["score"] ?? 0) - Number(a2["score"] ?? 0))[0] ?? null;
+}
 function ok(output, start2) {
   return { success: true, output, durationMs: Date.now() - start2 };
 }
@@ -10682,7 +10837,7 @@ var init_playwright_browser = __esm({
               "clear_diagnostics",
               "close"
             ],
-            description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
+            description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear, or sleep for timeout ms when no selector is provided\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
           },
           url: {
             type: "string",
@@ -10843,30 +10998,14 @@ var init_playwright_browser = __esm({
                 await page.type(selector, text, { timeout: timeout2, delay: typingDelay });
                 return ok(`Typed "${text}" into ${selector}`, start2);
               }
-              const active = await page.evaluate(`(() => {
-            const el = document.activeElement;
-            if (!el) return null;
-            const rect = el.getBoundingClientRect();
-            return {
-              tag: String(el.tagName || "").toLowerCase(),
-              id: el.id || "",
-              name: el.getAttribute("name") || "",
-              role: el.getAttribute("role") || "",
-              ariaLabel: el.getAttribute("aria-label") || "",
-              type: el.getAttribute("type") || "",
-              placeholder: el.getAttribute("placeholder") || "",
-              text: String(el.textContent || "").trim().slice(0, 120),
-              isEditable: el.matches("input, textarea, [contenteditable=''], [contenteditable='true']")
-                || ["textbox", "searchbox", "combobox"].includes((el.getAttribute("role") || "").toLowerCase()),
-              rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
-            };
-          })()`);
+              const active = await describeFocusedEditable(page);
               if (!active || typeof active !== "object" || active.isEditable !== true) {
                 return fail("No editable focused element is active; use visual_click on a form field or pass a selector to type.", start2);
               }
               await page.keyboard.type(text, { delay: typingDelay });
               const label = active && typeof active === "object" ? `<${active.tag || "element"}>${active.id ? `#${active.id}` : ""}` : "focused element";
-              return ok(`Typed "${text}" into ${label}`, start2);
+              const frame = active["frame"];
+              return ok(`Typed "${text}" into ${label}${frame?.kind === "frame" ? ` in frame ${frame.index}` : ""}`, start2);
             }
             case "press": {
               const key = text || "Enter";
@@ -10907,7 +11046,14 @@ var init_playwright_browser = __esm({
               return ok(`Hovered: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
             }
             // ── Waiting ──
-            case "wait":
+            case "wait": {
+              if (selector) {
+                await page.waitForSelector(selector, { timeout: timeout2 });
+                return ok(`Element appeared: ${selector}`, start2);
+              }
+              await page.waitForTimeout(timeout2);
+              return ok(`Waited ${timeout2}ms`, start2);
+            }
             case "waitForSelector": {
               if (!selector)
                 return fail("selector is required", start2);
@@ -11360,8 +11506,13 @@ ${JSON.stringify(data, null, 2)}`, start2);
           })()`);
               let clickSource = point.source || pointResult?.source || "vision";
               const candidate = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, false);
-              if (candidate) {
-                const candidateRecord = candidate;
+              let candidateRecord = candidate;
+              if (!candidateRecord) {
+                candidateRecord = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, true, true, true);
+                if (candidateRecord)
+                  await page.waitForTimeout(150);
+              }
+              if (candidateRecord) {
                 const center = candidateRecord["center"];
                 const nextX = Number(center?.x);
                 const nextY = Number(center?.y);
@@ -11369,7 +11520,7 @@ ${JSON.stringify(data, null, 2)}`, start2);
                   cssX = Math.max(0, Math.min(viewport.width, nextX));
                   cssY = Math.max(0, Math.min(viewport.height, nextY));
                   elementInfo = candidateRecord;
-                  clickSource = `${clickSource}+dom-candidate`;
+                  clickSource = `${clickSource}+dom-candidate${candidateRecord["scrolledIntoView"] === true ? "+scroll" : ""}`;
                 }
               }
               await page.mouse.move(cssX, cssY, { steps: 12 });
@@ -284467,15 +284618,52 @@ function findScrapeScript() {
   ];
   return candidates.find((p2) => existsSync44(p2)) || candidates[0];
 }
-async function probeService() {
+async function probeServiceInfo() {
   try {
     const controller = new AbortController();
     const timeout2 = setTimeout(() => controller.abort(), 3e3);
     const res = await fetch(`${BASE_URL}/health`, { signal: controller.signal });
     clearTimeout(timeout2);
-    return res.ok;
+    if (!res.ok)
+      return null;
+    const data = await res.json().catch(() => null);
+    return data && typeof data === "object" ? data : {};
   } catch {
+    return null;
+  }
+}
+async function probeService() {
+  return Boolean(await probeServiceInfo());
+}
+function serviceHasCapabilities(info) {
+  if (!info)
     return false;
+  const raw = info["capabilities"];
+  const capabilities = Array.isArray(raw) ? raw.map(String) : [];
+  return REQUIRED_SERVICE_CAPABILITIES.every((capability) => capabilities.includes(capability));
+}
+function killBrowserActionServicePort() {
+  if (serviceProcess && serviceProcess.pid && !serviceProcess.killed) {
+    try {
+      process.kill(-serviceProcess.pid, "SIGTERM");
+    } catch {
+    }
+    try {
+      serviceProcess.kill("SIGTERM");
+    } catch {
+    }
+    serviceProcess = null;
+  }
+  const commands = [
+    `lsof -ti tcp:${DEFAULT_PORT} | xargs -r kill -TERM`,
+    `fuser -k ${DEFAULT_PORT}/tcp`
+  ];
+  for (const cmd of commands) {
+    try {
+      execSync22(cmd, { stdio: "ignore", timeout: 5e3 });
+      break;
+    } catch {
+    }
   }
 }
 function findPython3() {
@@ -284490,8 +284678,17 @@ function findPython3() {
   return null;
 }
 async function launchService() {
-  if (await probeService())
-    return null;
+  const existing = await probeServiceInfo();
+  if (existing) {
+    if (serviceHasCapabilities(existing))
+      return null;
+    killBrowserActionServicePort();
+    for (let i2 = 0; i2 < 20; i2++) {
+      await new Promise((r2) => setTimeout(r2, 250));
+      if (!await probeService())
+        break;
+    }
+  }
   const python = findPython3();
   if (!python)
     return "Python 3 not found. Install Python 3.9+ to use browser automation.";
@@ -284503,6 +284700,7 @@ async function launchService() {
     env: {
       ...process.env,
       SCRAPE_PORT: String(DEFAULT_PORT),
+      OMNIUS_BROWSER_ACTION_VENV: join55(omniusHomeDir(), "runtimes", "browser", ".venv-selenium"),
       SCRAPE_HEADLESS_DEFAULT: process.env["SCRAPE_HEADLESS_DEFAULT"] ?? (defaultBrowserHeadless() ? "1" : "0"),
       SCRAPE_REQUIRE_AUTH: "0"
     }
@@ -284645,13 +284843,33 @@ async function apiCall(endpoint, method = "POST", body) {
     url += `?${params.toString()}`;
   }
   const res = await fetch(url, options2);
-  return await res.json();
+  const raw = await res.text();
+  try {
+    return JSON.parse(raw);
+  } catch {
+    return {
+      ok: false,
+      error: `HTTP ${res.status} from browser_action service: ${raw.slice(0, 500)}`
+    };
+  }
+}
+function evaluateFailureMessage2(err, code8) {
+  const raw = err instanceof Error ? err.message : String(err);
+  const hints = [];
+  if (/map is not a function/i.test(raw) && /querySelectorAll/i.test(code8)) {
+    hints.push("document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...).");
+  }
+  if (/(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8)) {
+    hints.push("Direct .value assignment can bypass framework input/change handlers. Prefer browser_action type, browser_action click_xy plus input/sync paths, or playwright_browser fill/visual_click.");
+  }
+  return [raw.slice(0, 500), ...hints.map((hint) => `Hint: ${hint}`)].join("\n");
 }
-var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
+var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, REQUIRED_SERVICE_CAPABILITIES, BrowserActionTool;
 var init_browser_action = __esm({
   "packages/execution/dist/tools/browser-action.js"() {
     "use strict";
     init_dom_summary();
+    init_model_store();
     init_network_egress_policy();
     __dirname3 = dirname14(fileURLToPath6(import.meta.url));
     DEFAULT_PORT = 8130;
@@ -284661,16 +284879,17 @@ var init_browser_action = __esm({
     activeSessionId = null;
     activeSessionHeadless = null;
     activeSessionUrl = null;
+    REQUIRED_SERVICE_CAPABILITIES = ["evaluate"];
     BrowserActionTool = class {
       name = "browser_action";
-      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
+      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, evaluate, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
       parameters = {
         type: "object",
         properties: {
           action: {
             type: "string",
-            enum: ["navigate", "click", "click_xy", "type", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
-            description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)"
+            enum: ["navigate", "click", "click_xy", "type", "evaluate", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
+            description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)\n- 'evaluate': run JavaScript in the active Selenium page; pass code in text"
           },
           url: {
             type: "string",
@@ -284682,7 +284901,7 @@ var init_browser_action = __esm({
           },
           text: {
             type: "string",
-            description: "Text to type (for 'type' action) OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
+            description: "Text to type (for 'type' action), JS code (for 'evaluate'), OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
           },
           x: {
             type: "number",
@@ -284856,6 +285075,32 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
                 durationMs: Date.now() - start2
               };
             }
+            case "evaluate": {
+              const code8 = typeof args.text === "string" ? args.text : typeof args.value === "string" ? args.value : "";
+              if (!code8.trim())
+                return { success: false, output: "", error: "text is required for evaluate action", durationMs: Date.now() - start2 };
+              result = await apiCall("/evaluate", "POST", { script: code8 });
+              if (result.ok) {
+                const resultType = String(result["result_type"] ?? "unknown");
+                const payload = result["result"];
+                const rendered = payload === void 0 ? "undefined" : typeof payload === "string" ? payload : JSON.stringify(payload, null, 2);
+                const truncated = rendered.length > 2e4 ? `${rendered.slice(0, 2e4)}
+... (truncated)` : rendered;
+                return {
+                  success: true,
+                  output: `Evaluation result (${resultType}):
+${truncated}`,
+                  durationMs: Date.now() - start2
+                };
+              }
+              const evalMsg = String(result.error ?? result.message ?? "Evaluate failed");
+              return {
+                success: false,
+                output: "",
+                error: `browser_action evaluate failed: ${evaluateFailureMessage2(evalMsg, code8)} ${browserActionRuntimeHint()}`,
+                durationMs: Date.now() - start2
+              };
+            }
             case "screenshot": {
               if (requestedWidth || requestedHeight || requestedScale) {
                 const currentW = requestedWidth ?? 1280;
@@ -285039,7 +285284,7 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
               result = await apiCall("/history/forward", "POST");
               return { success: !!result.ok, output: "Navigated forward", durationMs: Date.now() - start2 };
             default:
-              return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
+              return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, evaluate, screenshot, dom, dom_summary, vision_click, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
           }
         } catch (err) {
           return {

package/dist/scripts/web_scrape.py CHANGED Viewed

@@ -32,7 +32,13 @@ from typing import Dict, Optional
 # ──────────────────────────────────────────────────────────────
 # 0) Embedded venv bootstrap (same pattern as other services)
 # ──────────────────────────────────────────────────────────────
-VENV_DIR = Path.cwd() / ".venv"
+SCRIPT_PATH = Path(__file__).resolve()
+SCRIPT_DIR = SCRIPT_PATH.parent
+OMNIUS_HOME = Path(os.environ.get("OMNIUS_HOME") or (Path.home() / ".omnius"))
+VENV_DIR = Path(
+    os.environ.get("OMNIUS_BROWSER_ACTION_VENV")
+    or (OMNIUS_HOME / "runtimes" / "browser" / ".venv-selenium")
+)
 def _in_venv() -> bool:
@@ -48,6 +54,7 @@ def _ensure_venv_and_reexec() -> None:
         return
     python = sys.executable
     if not VENV_DIR.exists():
+        VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
         print(f"[bootstrap] creating virtualenv at {VENV_DIR}", file=sys.stderr)
         subprocess.check_call([python, "-m", "venv", str(VENV_DIR)])
         pip_bin = VENV_DIR / ("Scripts/pip.exe" if os.name == "nt" else "bin/pip")
@@ -69,10 +76,21 @@ _ensure_venv_and_reexec()
 # ──────────────────────────────────────────────────────────────
 import subprocess  # noqa: E402  (re-import after re-exec)
-SCRIPT_PATH = Path(__file__).resolve()
-SCRIPT_DIR = SCRIPT_PATH.parent
-SETUP_MARKER = SCRIPT_DIR / ".scrape_setup_complete"
+SETUP_MARKER = VENV_DIR / ".scrape_setup_complete"
 OUT_DIR = SCRIPT_DIR / "frames"
+SERVICE_VERSION = "2026-06-01-evaluate-v1"
+SERVICE_CAPABILITIES = [
+    "navigate",
+    "click",
+    "click_xy",
+    "type",
+    "evaluate",
+    "screenshot",
+    "dom",
+    "scroll",
+    "history",
+    "events",
+]
 def _pip_install(*pkgs: str) -> None:
@@ -129,6 +147,7 @@ from selenium.webdriver.common.by import By  # noqa: E402
 from selenium.webdriver.common.keys import Keys  # noqa: E402
 from selenium.webdriver.chrome.options import Options  # noqa: E402
 from selenium.webdriver.chrome.service import Service  # noqa: E402
+from selenium.webdriver.remote.webelement import WebElement  # noqa: E402
 from selenium.webdriver.support import expected_conditions as EC  # noqa: E402
 from selenium.webdriver.support.ui import WebDriverWait  # noqa: E402
 from webdriver_manager.chrome import ChromeDriverManager  # noqa: E402
@@ -160,6 +179,64 @@ def _truthy(value) -> bool:
     return str(value).lower() in ("1", "true", "yes", "on")
+def _serialize_script_result(value, depth: int = 0, seen: Optional[set[int]] = None):
+    if seen is None:
+        seen = set()
+    if value is None or isinstance(value, (str, int, float, bool)):
+        return value
+    if depth > 5:
+        return str(value)
+    if isinstance(value, WebElement):
+        try:
+            rect = value.rect or {}
+        except Exception:
+            rect = {}
+        try:
+            text = value.text or ""
+        except Exception:
+            text = ""
+        try:
+            tag = value.tag_name or ""
+        except Exception:
+            tag = ""
+        def attr(name: str) -> str:
+            try:
+                return value.get_attribute(name) or ""
+            except Exception:
+                return ""
+        return {
+            "__omnius_type": "element",
+            "tag": tag,
+            "id": attr("id"),
+            "name": attr("name"),
+            "type": attr("type"),
+            "role": attr("role"),
+            "ariaLabel": attr("aria-label"),
+            "text": text[:240],
+            "rect": {
+                "x": rect.get("x", 0),
+                "y": rect.get("y", 0),
+                "width": rect.get("width", 0),
+                "height": rect.get("height", 0),
+            },
+        }
+    if isinstance(value, (list, tuple, set)):
+        return [_serialize_script_result(item, depth + 1, seen) for item in list(value)[:200]]
+    if isinstance(value, dict):
+        ident = id(value)
+        if ident in seen:
+            return "[Circular]"
+        seen.add(ident)
+        out = {}
+        for idx, (key, item) in enumerate(value.items()):
+            if idx >= 200:
+                out["__omnius_truncated"] = True
+                break
+            out[str(key)] = _serialize_script_result(item, depth + 1, seen)
+        return out
+    return str(value)
 class Tools:
     _driver: Optional[webdriver.Chrome] = None
@@ -421,6 +498,21 @@ class Tools:
             log_message(f"[dom] snapshot failed: {exc}", "WARNING")
             return ""
+    @staticmethod
+    def evaluate(script: str):
+        if not Tools._driver:
+            return {"ok": False, "error": "browser not open"}
+        try:
+            result = Tools._driver.execute_script(script)
+            return {
+                "ok": True,
+                "result": _serialize_script_result(result),
+                "result_type": "undefined" if result is None else type(result).__name__,
+            }
+        except Exception as exc:
+            log_message(f"[evaluate] script failed: {exc}", "ERROR")
+            return {"ok": False, "error": str(exc)}
     @staticmethod
     def scroll(amount: int = 600) -> str:
         if not Tools._driver:
@@ -921,7 +1013,15 @@ def _error(message: str, status: int = 400):
 # ──────────────────────────────────────────────────────────────
 @app.get("/health")
 def health():
-    return jsonify({"status": "ok", "browser_open": Tools.is_browser_open(), "sessions": len(_SESSIONS)})
+    return jsonify({
+        "status": "ok",
+        "service": "browser_action",
+        "version": SERVICE_VERSION,
+        "capabilities": SERVICE_CAPABILITIES,
+        "browser_open": Tools.is_browser_open(),
+        "sessions": len(_SESSIONS),
+        "venv": str(VENV_DIR),
+    })
 @app.post("/session/start")
@@ -1041,6 +1141,23 @@ def type_text():
     return _ok(message=msg)
+@app.post("/evaluate")
+def evaluate_script():
+    if not _auth_ok(request):
+        return _error("unauthorized", 401)
+    data = request.get_json(silent=True) or {}
+    script = data.get("script") or data.get("text") or data.get("code") or ""
+    if not str(script).strip():
+        return _error("missing script", 400)
+    with _slot():
+        result = Tools.evaluate(str(script))
+    if not isinstance(result, dict) or not result.get("ok"):
+        return _error(result.get("error") if isinstance(result, dict) else "evaluate failed", 500)
+    sid = data.get("sid") or next(iter(_SESSIONS), "")
+    _queue_event(sid, {"type": "status", "msg": "evaluate", "ts": int(time.time() * 1000)})
+    return _ok(result=result.get("result"), result_type=result.get("result_type"))
 @app.post("/scroll")
 def scroll():
     if not _auth_ok(request):

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.207",
+  "version": "1.0.209",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.207",
+      "version": "1.0.209",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.207",
+  "version": "1.0.209",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",