npm - omnius - Versions diffs - 1.0.208 → 1.0.209 - Mend

omnius 1.0.208 → 1.0.209

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (4)

package/dist/index.js +105 -11
package/dist/scripts/web_scrape.py +122 -5
package/npm-shrinkwrap.json +2 -2
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -284618,15 +284618,52 @@ function findScrapeScript() {
   ];
   return candidates.find((p2) => existsSync44(p2)) || candidates[0];
 }
-async function probeService() {
+async function probeServiceInfo() {
   try {
     const controller = new AbortController();
     const timeout2 = setTimeout(() => controller.abort(), 3e3);
     const res = await fetch(`${BASE_URL}/health`, { signal: controller.signal });
     clearTimeout(timeout2);
-    return res.ok;
+    if (!res.ok)
+      return null;
+    const data = await res.json().catch(() => null);
+    return data && typeof data === "object" ? data : {};
   } catch {
+    return null;
+  }
+}
+async function probeService() {
+  return Boolean(await probeServiceInfo());
+}
+function serviceHasCapabilities(info) {
+  if (!info)
     return false;
+  const raw = info["capabilities"];
+  const capabilities = Array.isArray(raw) ? raw.map(String) : [];
+  return REQUIRED_SERVICE_CAPABILITIES.every((capability) => capabilities.includes(capability));
+}
+function killBrowserActionServicePort() {
+  if (serviceProcess && serviceProcess.pid && !serviceProcess.killed) {
+    try {
+      process.kill(-serviceProcess.pid, "SIGTERM");
+    } catch {
+    }
+    try {
+      serviceProcess.kill("SIGTERM");
+    } catch {
+    }
+    serviceProcess = null;
+  }
+  const commands = [
+    `lsof -ti tcp:${DEFAULT_PORT} | xargs -r kill -TERM`,
+    `fuser -k ${DEFAULT_PORT}/tcp`
+  ];
+  for (const cmd of commands) {
+    try {
+      execSync22(cmd, { stdio: "ignore", timeout: 5e3 });
+      break;
+    } catch {
+    }
   }
 }
 function findPython3() {
@@ -284641,8 +284678,17 @@ function findPython3() {
   return null;
 }
 async function launchService() {
-  if (await probeService())
-    return null;
+  const existing = await probeServiceInfo();
+  if (existing) {
+    if (serviceHasCapabilities(existing))
+      return null;
+    killBrowserActionServicePort();
+    for (let i2 = 0; i2 < 20; i2++) {
+      await new Promise((r2) => setTimeout(r2, 250));
+      if (!await probeService())
+        break;
+    }
+  }
   const python = findPython3();
   if (!python)
     return "Python 3 not found. Install Python 3.9+ to use browser automation.";
@@ -284654,6 +284700,7 @@ async function launchService() {
     env: {
       ...process.env,
       SCRAPE_PORT: String(DEFAULT_PORT),
+      OMNIUS_BROWSER_ACTION_VENV: join55(omniusHomeDir(), "runtimes", "browser", ".venv-selenium"),
       SCRAPE_HEADLESS_DEFAULT: process.env["SCRAPE_HEADLESS_DEFAULT"] ?? (defaultBrowserHeadless() ? "1" : "0"),
       SCRAPE_REQUIRE_AUTH: "0"
     }
@@ -284796,13 +284843,33 @@ async function apiCall(endpoint, method = "POST", body) {
     url += `?${params.toString()}`;
   }
   const res = await fetch(url, options2);
-  return await res.json();
+  const raw = await res.text();
+  try {
+    return JSON.parse(raw);
+  } catch {
+    return {
+      ok: false,
+      error: `HTTP ${res.status} from browser_action service: ${raw.slice(0, 500)}`
+    };
+  }
+}
+function evaluateFailureMessage2(err, code8) {
+  const raw = err instanceof Error ? err.message : String(err);
+  const hints = [];
+  if (/map is not a function/i.test(raw) && /querySelectorAll/i.test(code8)) {
+    hints.push("document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...).");
+  }
+  if (/(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8)) {
+    hints.push("Direct .value assignment can bypass framework input/change handlers. Prefer browser_action type, browser_action click_xy plus input/sync paths, or playwright_browser fill/visual_click.");
+  }
+  return [raw.slice(0, 500), ...hints.map((hint) => `Hint: ${hint}`)].join("\n");
 }
-var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
+var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, REQUIRED_SERVICE_CAPABILITIES, BrowserActionTool;
 var init_browser_action = __esm({
   "packages/execution/dist/tools/browser-action.js"() {
     "use strict";
     init_dom_summary();
+    init_model_store();
     init_network_egress_policy();
     __dirname3 = dirname14(fileURLToPath6(import.meta.url));
     DEFAULT_PORT = 8130;
@@ -284812,16 +284879,17 @@ var init_browser_action = __esm({
     activeSessionId = null;
     activeSessionHeadless = null;
     activeSessionUrl = null;
+    REQUIRED_SERVICE_CAPABILITIES = ["evaluate"];
     BrowserActionTool = class {
       name = "browser_action";
-      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
+      description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, evaluate, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
       parameters = {
         type: "object",
         properties: {
           action: {
             type: "string",
-            enum: ["navigate", "click", "click_xy", "type", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
-            description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)"
+            enum: ["navigate", "click", "click_xy", "type", "evaluate", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
+            description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)\n- 'evaluate': run JavaScript in the active Selenium page; pass code in text"
           },
           url: {
             type: "string",
@@ -284833,7 +284901,7 @@ var init_browser_action = __esm({
           },
           text: {
             type: "string",
-            description: "Text to type (for 'type' action) OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
+            description: "Text to type (for 'type' action), JS code (for 'evaluate'), OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
           },
           x: {
             type: "number",
@@ -285007,6 +285075,32 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
                 durationMs: Date.now() - start2
               };
             }
+            case "evaluate": {
+              const code8 = typeof args.text === "string" ? args.text : typeof args.value === "string" ? args.value : "";
+              if (!code8.trim())
+                return { success: false, output: "", error: "text is required for evaluate action", durationMs: Date.now() - start2 };
+              result = await apiCall("/evaluate", "POST", { script: code8 });
+              if (result.ok) {
+                const resultType = String(result["result_type"] ?? "unknown");
+                const payload = result["result"];
+                const rendered = payload === void 0 ? "undefined" : typeof payload === "string" ? payload : JSON.stringify(payload, null, 2);
+                const truncated = rendered.length > 2e4 ? `${rendered.slice(0, 2e4)}
+... (truncated)` : rendered;
+                return {
+                  success: true,
+                  output: `Evaluation result (${resultType}):
+${truncated}`,
+                  durationMs: Date.now() - start2
+                };
+              }
+              const evalMsg = String(result.error ?? result.message ?? "Evaluate failed");
+              return {
+                success: false,
+                output: "",
+                error: `browser_action evaluate failed: ${evaluateFailureMessage2(evalMsg, code8)} ${browserActionRuntimeHint()}`,
+                durationMs: Date.now() - start2
+              };
+            }
             case "screenshot": {
               if (requestedWidth || requestedHeight || requestedScale) {
                 const currentW = requestedWidth ?? 1280;
@@ -285190,7 +285284,7 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
               result = await apiCall("/history/forward", "POST");
               return { success: !!result.ok, output: "Navigated forward", durationMs: Date.now() - start2 };
             default:
-              return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
+              return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, evaluate, screenshot, dom, dom_summary, vision_click, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
           }
         } catch (err) {
           return {

package/dist/scripts/web_scrape.py CHANGED Viewed

@@ -32,7 +32,13 @@ from typing import Dict, Optional
 # ──────────────────────────────────────────────────────────────
 # 0) Embedded venv bootstrap (same pattern as other services)
 # ──────────────────────────────────────────────────────────────
-VENV_DIR = Path.cwd() / ".venv"
+SCRIPT_PATH = Path(__file__).resolve()
+SCRIPT_DIR = SCRIPT_PATH.parent
+OMNIUS_HOME = Path(os.environ.get("OMNIUS_HOME") or (Path.home() / ".omnius"))
+VENV_DIR = Path(
+    os.environ.get("OMNIUS_BROWSER_ACTION_VENV")
+    or (OMNIUS_HOME / "runtimes" / "browser" / ".venv-selenium")
+)
 def _in_venv() -> bool:
@@ -48,6 +54,7 @@ def _ensure_venv_and_reexec() -> None:
         return
     python = sys.executable
     if not VENV_DIR.exists():
+        VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
         print(f"[bootstrap] creating virtualenv at {VENV_DIR}", file=sys.stderr)
         subprocess.check_call([python, "-m", "venv", str(VENV_DIR)])
         pip_bin = VENV_DIR / ("Scripts/pip.exe" if os.name == "nt" else "bin/pip")
@@ -69,10 +76,21 @@ _ensure_venv_and_reexec()
 # ──────────────────────────────────────────────────────────────
 import subprocess  # noqa: E402  (re-import after re-exec)
-SCRIPT_PATH = Path(__file__).resolve()
-SCRIPT_DIR = SCRIPT_PATH.parent
-SETUP_MARKER = SCRIPT_DIR / ".scrape_setup_complete"
+SETUP_MARKER = VENV_DIR / ".scrape_setup_complete"
 OUT_DIR = SCRIPT_DIR / "frames"
+SERVICE_VERSION = "2026-06-01-evaluate-v1"
+SERVICE_CAPABILITIES = [
+    "navigate",
+    "click",
+    "click_xy",
+    "type",
+    "evaluate",
+    "screenshot",
+    "dom",
+    "scroll",
+    "history",
+    "events",
+]
 def _pip_install(*pkgs: str) -> None:
@@ -129,6 +147,7 @@ from selenium.webdriver.common.by import By  # noqa: E402
 from selenium.webdriver.common.keys import Keys  # noqa: E402
 from selenium.webdriver.chrome.options import Options  # noqa: E402
 from selenium.webdriver.chrome.service import Service  # noqa: E402
+from selenium.webdriver.remote.webelement import WebElement  # noqa: E402
 from selenium.webdriver.support import expected_conditions as EC  # noqa: E402
 from selenium.webdriver.support.ui import WebDriverWait  # noqa: E402
 from webdriver_manager.chrome import ChromeDriverManager  # noqa: E402
@@ -160,6 +179,64 @@ def _truthy(value) -> bool:
     return str(value).lower() in ("1", "true", "yes", "on")
+def _serialize_script_result(value, depth: int = 0, seen: Optional[set[int]] = None):
+    if seen is None:
+        seen = set()
+    if value is None or isinstance(value, (str, int, float, bool)):
+        return value
+    if depth > 5:
+        return str(value)
+    if isinstance(value, WebElement):
+        try:
+            rect = value.rect or {}
+        except Exception:
+            rect = {}
+        try:
+            text = value.text or ""
+        except Exception:
+            text = ""
+        try:
+            tag = value.tag_name or ""
+        except Exception:
+            tag = ""
+        def attr(name: str) -> str:
+            try:
+                return value.get_attribute(name) or ""
+            except Exception:
+                return ""
+        return {
+            "__omnius_type": "element",
+            "tag": tag,
+            "id": attr("id"),
+            "name": attr("name"),
+            "type": attr("type"),
+            "role": attr("role"),
+            "ariaLabel": attr("aria-label"),
+            "text": text[:240],
+            "rect": {
+                "x": rect.get("x", 0),
+                "y": rect.get("y", 0),
+                "width": rect.get("width", 0),
+                "height": rect.get("height", 0),
+            },
+        }
+    if isinstance(value, (list, tuple, set)):
+        return [_serialize_script_result(item, depth + 1, seen) for item in list(value)[:200]]
+    if isinstance(value, dict):
+        ident = id(value)
+        if ident in seen:
+            return "[Circular]"
+        seen.add(ident)
+        out = {}
+        for idx, (key, item) in enumerate(value.items()):
+            if idx >= 200:
+                out["__omnius_truncated"] = True
+                break
+            out[str(key)] = _serialize_script_result(item, depth + 1, seen)
+        return out
+    return str(value)
 class Tools:
     _driver: Optional[webdriver.Chrome] = None
@@ -421,6 +498,21 @@ class Tools:
             log_message(f"[dom] snapshot failed: {exc}", "WARNING")
             return ""
+    @staticmethod
+    def evaluate(script: str):
+        if not Tools._driver:
+            return {"ok": False, "error": "browser not open"}
+        try:
+            result = Tools._driver.execute_script(script)
+            return {
+                "ok": True,
+                "result": _serialize_script_result(result),
+                "result_type": "undefined" if result is None else type(result).__name__,
+            }
+        except Exception as exc:
+            log_message(f"[evaluate] script failed: {exc}", "ERROR")
+            return {"ok": False, "error": str(exc)}
     @staticmethod
     def scroll(amount: int = 600) -> str:
         if not Tools._driver:
@@ -921,7 +1013,15 @@ def _error(message: str, status: int = 400):
 # ──────────────────────────────────────────────────────────────
 @app.get("/health")
 def health():
-    return jsonify({"status": "ok", "browser_open": Tools.is_browser_open(), "sessions": len(_SESSIONS)})
+    return jsonify({
+        "status": "ok",
+        "service": "browser_action",
+        "version": SERVICE_VERSION,
+        "capabilities": SERVICE_CAPABILITIES,
+        "browser_open": Tools.is_browser_open(),
+        "sessions": len(_SESSIONS),
+        "venv": str(VENV_DIR),
+    })
 @app.post("/session/start")
@@ -1041,6 +1141,23 @@ def type_text():
     return _ok(message=msg)
+@app.post("/evaluate")
+def evaluate_script():
+    if not _auth_ok(request):
+        return _error("unauthorized", 401)
+    data = request.get_json(silent=True) or {}
+    script = data.get("script") or data.get("text") or data.get("code") or ""
+    if not str(script).strip():
+        return _error("missing script", 400)
+    with _slot():
+        result = Tools.evaluate(str(script))
+    if not isinstance(result, dict) or not result.get("ok"):
+        return _error(result.get("error") if isinstance(result, dict) else "evaluate failed", 500)
+    sid = data.get("sid") or next(iter(_SESSIONS), "")
+    _queue_event(sid, {"type": "status", "msg": "evaluate", "ts": int(time.time() * 1000)})
+    return _ok(result=result.get("result"), result_type=result.get("result_type"))
 @app.post("/scroll")
 def scroll():
     if not _auth_ok(request):

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.208",
+  "version": "1.0.209",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.208",
+      "version": "1.0.209",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.208",
+  "version": "1.0.209",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",