omnius 1.0.208 → 1.0.209

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -284618,15 +284618,52 @@ function findScrapeScript() {
284618
284618
  ];
284619
284619
  return candidates.find((p2) => existsSync44(p2)) || candidates[0];
284620
284620
  }
284621
- async function probeService() {
284621
+ async function probeServiceInfo() {
284622
284622
  try {
284623
284623
  const controller = new AbortController();
284624
284624
  const timeout2 = setTimeout(() => controller.abort(), 3e3);
284625
284625
  const res = await fetch(`${BASE_URL}/health`, { signal: controller.signal });
284626
284626
  clearTimeout(timeout2);
284627
- return res.ok;
284627
+ if (!res.ok)
284628
+ return null;
284629
+ const data = await res.json().catch(() => null);
284630
+ return data && typeof data === "object" ? data : {};
284628
284631
  } catch {
284632
+ return null;
284633
+ }
284634
+ }
284635
+ async function probeService() {
284636
+ return Boolean(await probeServiceInfo());
284637
+ }
284638
+ function serviceHasCapabilities(info) {
284639
+ if (!info)
284629
284640
  return false;
284641
+ const raw = info["capabilities"];
284642
+ const capabilities = Array.isArray(raw) ? raw.map(String) : [];
284643
+ return REQUIRED_SERVICE_CAPABILITIES.every((capability) => capabilities.includes(capability));
284644
+ }
284645
+ function killBrowserActionServicePort() {
284646
+ if (serviceProcess && serviceProcess.pid && !serviceProcess.killed) {
284647
+ try {
284648
+ process.kill(-serviceProcess.pid, "SIGTERM");
284649
+ } catch {
284650
+ }
284651
+ try {
284652
+ serviceProcess.kill("SIGTERM");
284653
+ } catch {
284654
+ }
284655
+ serviceProcess = null;
284656
+ }
284657
+ const commands = [
284658
+ `lsof -ti tcp:${DEFAULT_PORT} | xargs -r kill -TERM`,
284659
+ `fuser -k ${DEFAULT_PORT}/tcp`
284660
+ ];
284661
+ for (const cmd of commands) {
284662
+ try {
284663
+ execSync22(cmd, { stdio: "ignore", timeout: 5e3 });
284664
+ break;
284665
+ } catch {
284666
+ }
284630
284667
  }
284631
284668
  }
284632
284669
  function findPython3() {
@@ -284641,8 +284678,17 @@ function findPython3() {
284641
284678
  return null;
284642
284679
  }
284643
284680
  async function launchService() {
284644
- if (await probeService())
284645
- return null;
284681
+ const existing = await probeServiceInfo();
284682
+ if (existing) {
284683
+ if (serviceHasCapabilities(existing))
284684
+ return null;
284685
+ killBrowserActionServicePort();
284686
+ for (let i2 = 0; i2 < 20; i2++) {
284687
+ await new Promise((r2) => setTimeout(r2, 250));
284688
+ if (!await probeService())
284689
+ break;
284690
+ }
284691
+ }
284646
284692
  const python = findPython3();
284647
284693
  if (!python)
284648
284694
  return "Python 3 not found. Install Python 3.9+ to use browser automation.";
@@ -284654,6 +284700,7 @@ async function launchService() {
284654
284700
  env: {
284655
284701
  ...process.env,
284656
284702
  SCRAPE_PORT: String(DEFAULT_PORT),
284703
+ OMNIUS_BROWSER_ACTION_VENV: join55(omniusHomeDir(), "runtimes", "browser", ".venv-selenium"),
284657
284704
  SCRAPE_HEADLESS_DEFAULT: process.env["SCRAPE_HEADLESS_DEFAULT"] ?? (defaultBrowserHeadless() ? "1" : "0"),
284658
284705
  SCRAPE_REQUIRE_AUTH: "0"
284659
284706
  }
@@ -284796,13 +284843,33 @@ async function apiCall(endpoint, method = "POST", body) {
284796
284843
  url += `?${params.toString()}`;
284797
284844
  }
284798
284845
  const res = await fetch(url, options2);
284799
- return await res.json();
284846
+ const raw = await res.text();
284847
+ try {
284848
+ return JSON.parse(raw);
284849
+ } catch {
284850
+ return {
284851
+ ok: false,
284852
+ error: `HTTP ${res.status} from browser_action service: ${raw.slice(0, 500)}`
284853
+ };
284854
+ }
284855
+ }
284856
+ function evaluateFailureMessage2(err, code8) {
284857
+ const raw = err instanceof Error ? err.message : String(err);
284858
+ const hints = [];
284859
+ if (/map is not a function/i.test(raw) && /querySelectorAll/i.test(code8)) {
284860
+ hints.push("document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...).");
284861
+ }
284862
+ if (/(?:\.value\s*=|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8)) {
284863
+ hints.push("Direct .value assignment can bypass framework input/change handlers. Prefer browser_action type, browser_action click_xy plus input/sync paths, or playwright_browser fill/visual_click.");
284864
+ }
284865
+ return [raw.slice(0, 500), ...hints.map((hint) => `Hint: ${hint}`)].join("\n");
284800
284866
  }
284801
- var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
284867
+ var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, REQUIRED_SERVICE_CAPABILITIES, BrowserActionTool;
284802
284868
  var init_browser_action = __esm({
284803
284869
  "packages/execution/dist/tools/browser-action.js"() {
284804
284870
  "use strict";
284805
284871
  init_dom_summary();
284872
+ init_model_store();
284806
284873
  init_network_egress_policy();
284807
284874
  __dirname3 = dirname14(fileURLToPath6(import.meta.url));
284808
284875
  DEFAULT_PORT = 8130;
@@ -284812,16 +284879,17 @@ var init_browser_action = __esm({
284812
284879
  activeSessionId = null;
284813
284880
  activeSessionHeadless = null;
284814
284881
  activeSessionUrl = null;
284882
+ REQUIRED_SERVICE_CAPABILITIES = ["evaluate"];
284815
284883
  BrowserActionTool = class {
284816
284884
  name = "browser_action";
284817
- description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284885
+ description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, evaluate, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284818
284886
  parameters = {
284819
284887
  type: "object",
284820
284888
  properties: {
284821
284889
  action: {
284822
284890
  type: "string",
284823
- enum: ["navigate", "click", "click_xy", "type", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
284824
- description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)"
284891
+ enum: ["navigate", "click", "click_xy", "type", "evaluate", "screenshot", "dom", "dom_summary", "vision_click", "scroll", "scroll_up", "scroll_down", "back", "forward", "close"],
284892
+ description: "Browser action to perform. Key actions:\n- 'screenshot': capture the headless browser render at width/height; returns an image part and a local file path if output_path is provided\n- 'dom_summary': compact view of interactive elements (~1KB vs 200KB raw DOM)\n- 'vision_click': screenshot the page, use Moondream vision to find an element by description, then click it. Pass the element description in 'text' parameter (e.g. text='the login button'). This is the visual grounding loop from SeeAct.\n- 'click': click by CSS selector (fastest when you know the selector)\n- 'click_xy': click at pixel coordinates (when you have exact coords)\n- 'evaluate': run JavaScript in the active Selenium page; pass code in text"
284825
284893
  },
284826
284894
  url: {
284827
284895
  type: "string",
@@ -284833,7 +284901,7 @@ var init_browser_action = __esm({
284833
284901
  },
284834
284902
  text: {
284835
284903
  type: "string",
284836
- description: "Text to type (for 'type' action) OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
284904
+ description: "Text to type (for 'type' action), JS code (for 'evaluate'), OR element description to find and click (for 'vision_click' action, e.g. 'the submit button', 'the search field', 'the country dropdown')"
284837
284905
  },
284838
284906
  x: {
284839
284907
  type: "number",
@@ -285007,6 +285075,32 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
285007
285075
  durationMs: Date.now() - start2
285008
285076
  };
285009
285077
  }
285078
+ case "evaluate": {
285079
+ const code8 = typeof args.text === "string" ? args.text : typeof args.value === "string" ? args.value : "";
285080
+ if (!code8.trim())
285081
+ return { success: false, output: "", error: "text is required for evaluate action", durationMs: Date.now() - start2 };
285082
+ result = await apiCall("/evaluate", "POST", { script: code8 });
285083
+ if (result.ok) {
285084
+ const resultType = String(result["result_type"] ?? "unknown");
285085
+ const payload = result["result"];
285086
+ const rendered = payload === void 0 ? "undefined" : typeof payload === "string" ? payload : JSON.stringify(payload, null, 2);
285087
+ const truncated = rendered.length > 2e4 ? `${rendered.slice(0, 2e4)}
285088
+ ... (truncated)` : rendered;
285089
+ return {
285090
+ success: true,
285091
+ output: `Evaluation result (${resultType}):
285092
+ ${truncated}`,
285093
+ durationMs: Date.now() - start2
285094
+ };
285095
+ }
285096
+ const evalMsg = String(result.error ?? result.message ?? "Evaluate failed");
285097
+ return {
285098
+ success: false,
285099
+ output: "",
285100
+ error: `browser_action evaluate failed: ${evaluateFailureMessage2(evalMsg, code8)} ${browserActionRuntimeHint()}`,
285101
+ durationMs: Date.now() - start2
285102
+ };
285103
+ }
285010
285104
  case "screenshot": {
285011
285105
  if (requestedWidth || requestedHeight || requestedScale) {
285012
285106
  const currentW = requestedWidth ?? 1280;
@@ -285190,7 +285284,7 @@ Runtime: browser_action Selenium/Chrome session. Continue with browser_action fo
285190
285284
  result = await apiCall("/history/forward", "POST");
285191
285285
  return { success: !!result.ok, output: "Navigated forward", durationMs: Date.now() - start2 };
285192
285286
  default:
285193
- return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
285287
+ return { success: false, output: "", error: `Unknown action: ${action}. Available: navigate, click, click_xy, type, evaluate, screenshot, dom, dom_summary, vision_click, scroll, scroll_up, scroll_down, back, forward, close`, durationMs: Date.now() - start2 };
285194
285288
  }
285195
285289
  } catch (err) {
285196
285290
  return {
@@ -32,7 +32,13 @@ from typing import Dict, Optional
32
32
  # ──────────────────────────────────────────────────────────────
33
33
  # 0) Embedded venv bootstrap (same pattern as other services)
34
34
  # ──────────────────────────────────────────────────────────────
35
- VENV_DIR = Path.cwd() / ".venv"
35
+ SCRIPT_PATH = Path(__file__).resolve()
36
+ SCRIPT_DIR = SCRIPT_PATH.parent
37
+ OMNIUS_HOME = Path(os.environ.get("OMNIUS_HOME") or (Path.home() / ".omnius"))
38
+ VENV_DIR = Path(
39
+ os.environ.get("OMNIUS_BROWSER_ACTION_VENV")
40
+ or (OMNIUS_HOME / "runtimes" / "browser" / ".venv-selenium")
41
+ )
36
42
 
37
43
 
38
44
  def _in_venv() -> bool:
@@ -48,6 +54,7 @@ def _ensure_venv_and_reexec() -> None:
48
54
  return
49
55
  python = sys.executable
50
56
  if not VENV_DIR.exists():
57
+ VENV_DIR.parent.mkdir(parents=True, exist_ok=True)
51
58
  print(f"[bootstrap] creating virtualenv at {VENV_DIR}", file=sys.stderr)
52
59
  subprocess.check_call([python, "-m", "venv", str(VENV_DIR)])
53
60
  pip_bin = VENV_DIR / ("Scripts/pip.exe" if os.name == "nt" else "bin/pip")
@@ -69,10 +76,21 @@ _ensure_venv_and_reexec()
69
76
  # ──────────────────────────────────────────────────────────────
70
77
  import subprocess # noqa: E402 (re-import after re-exec)
71
78
 
72
- SCRIPT_PATH = Path(__file__).resolve()
73
- SCRIPT_DIR = SCRIPT_PATH.parent
74
- SETUP_MARKER = SCRIPT_DIR / ".scrape_setup_complete"
79
+ SETUP_MARKER = VENV_DIR / ".scrape_setup_complete"
75
80
  OUT_DIR = SCRIPT_DIR / "frames"
81
+ SERVICE_VERSION = "2026-06-01-evaluate-v1"
82
+ SERVICE_CAPABILITIES = [
83
+ "navigate",
84
+ "click",
85
+ "click_xy",
86
+ "type",
87
+ "evaluate",
88
+ "screenshot",
89
+ "dom",
90
+ "scroll",
91
+ "history",
92
+ "events",
93
+ ]
76
94
 
77
95
 
78
96
  def _pip_install(*pkgs: str) -> None:
@@ -129,6 +147,7 @@ from selenium.webdriver.common.by import By # noqa: E402
129
147
  from selenium.webdriver.common.keys import Keys # noqa: E402
130
148
  from selenium.webdriver.chrome.options import Options # noqa: E402
131
149
  from selenium.webdriver.chrome.service import Service # noqa: E402
150
+ from selenium.webdriver.remote.webelement import WebElement # noqa: E402
132
151
  from selenium.webdriver.support import expected_conditions as EC # noqa: E402
133
152
  from selenium.webdriver.support.ui import WebDriverWait # noqa: E402
134
153
  from webdriver_manager.chrome import ChromeDriverManager # noqa: E402
@@ -160,6 +179,64 @@ def _truthy(value) -> bool:
160
179
  return str(value).lower() in ("1", "true", "yes", "on")
161
180
 
162
181
 
182
+ def _serialize_script_result(value, depth: int = 0, seen: Optional[set[int]] = None):
183
+ if seen is None:
184
+ seen = set()
185
+ if value is None or isinstance(value, (str, int, float, bool)):
186
+ return value
187
+ if depth > 5:
188
+ return str(value)
189
+ if isinstance(value, WebElement):
190
+ try:
191
+ rect = value.rect or {}
192
+ except Exception:
193
+ rect = {}
194
+ try:
195
+ text = value.text or ""
196
+ except Exception:
197
+ text = ""
198
+ try:
199
+ tag = value.tag_name or ""
200
+ except Exception:
201
+ tag = ""
202
+ def attr(name: str) -> str:
203
+ try:
204
+ return value.get_attribute(name) or ""
205
+ except Exception:
206
+ return ""
207
+ return {
208
+ "__omnius_type": "element",
209
+ "tag": tag,
210
+ "id": attr("id"),
211
+ "name": attr("name"),
212
+ "type": attr("type"),
213
+ "role": attr("role"),
214
+ "ariaLabel": attr("aria-label"),
215
+ "text": text[:240],
216
+ "rect": {
217
+ "x": rect.get("x", 0),
218
+ "y": rect.get("y", 0),
219
+ "width": rect.get("width", 0),
220
+ "height": rect.get("height", 0),
221
+ },
222
+ }
223
+ if isinstance(value, (list, tuple, set)):
224
+ return [_serialize_script_result(item, depth + 1, seen) for item in list(value)[:200]]
225
+ if isinstance(value, dict):
226
+ ident = id(value)
227
+ if ident in seen:
228
+ return "[Circular]"
229
+ seen.add(ident)
230
+ out = {}
231
+ for idx, (key, item) in enumerate(value.items()):
232
+ if idx >= 200:
233
+ out["__omnius_truncated"] = True
234
+ break
235
+ out[str(key)] = _serialize_script_result(item, depth + 1, seen)
236
+ return out
237
+ return str(value)
238
+
239
+
163
240
  class Tools:
164
241
  _driver: Optional[webdriver.Chrome] = None
165
242
 
@@ -421,6 +498,21 @@ class Tools:
421
498
  log_message(f"[dom] snapshot failed: {exc}", "WARNING")
422
499
  return ""
423
500
 
501
+ @staticmethod
502
+ def evaluate(script: str):
503
+ if not Tools._driver:
504
+ return {"ok": False, "error": "browser not open"}
505
+ try:
506
+ result = Tools._driver.execute_script(script)
507
+ return {
508
+ "ok": True,
509
+ "result": _serialize_script_result(result),
510
+ "result_type": "undefined" if result is None else type(result).__name__,
511
+ }
512
+ except Exception as exc:
513
+ log_message(f"[evaluate] script failed: {exc}", "ERROR")
514
+ return {"ok": False, "error": str(exc)}
515
+
424
516
  @staticmethod
425
517
  def scroll(amount: int = 600) -> str:
426
518
  if not Tools._driver:
@@ -921,7 +1013,15 @@ def _error(message: str, status: int = 400):
921
1013
  # ──────────────────────────────────────────────────────────────
922
1014
  @app.get("/health")
923
1015
  def health():
924
- return jsonify({"status": "ok", "browser_open": Tools.is_browser_open(), "sessions": len(_SESSIONS)})
1016
+ return jsonify({
1017
+ "status": "ok",
1018
+ "service": "browser_action",
1019
+ "version": SERVICE_VERSION,
1020
+ "capabilities": SERVICE_CAPABILITIES,
1021
+ "browser_open": Tools.is_browser_open(),
1022
+ "sessions": len(_SESSIONS),
1023
+ "venv": str(VENV_DIR),
1024
+ })
925
1025
 
926
1026
 
927
1027
  @app.post("/session/start")
@@ -1041,6 +1141,23 @@ def type_text():
1041
1141
  return _ok(message=msg)
1042
1142
 
1043
1143
 
1144
+ @app.post("/evaluate")
1145
+ def evaluate_script():
1146
+ if not _auth_ok(request):
1147
+ return _error("unauthorized", 401)
1148
+ data = request.get_json(silent=True) or {}
1149
+ script = data.get("script") or data.get("text") or data.get("code") or ""
1150
+ if not str(script).strip():
1151
+ return _error("missing script", 400)
1152
+ with _slot():
1153
+ result = Tools.evaluate(str(script))
1154
+ if not isinstance(result, dict) or not result.get("ok"):
1155
+ return _error(result.get("error") if isinstance(result, dict) else "evaluate failed", 500)
1156
+ sid = data.get("sid") or next(iter(_SESSIONS), "")
1157
+ _queue_event(sid, {"type": "status", "msg": "evaluate", "ts": int(time.time() * 1000)})
1158
+ return _ok(result=result.get("result"), result_type=result.get("result_type"))
1159
+
1160
+
1044
1161
  @app.post("/scroll")
1045
1162
  def scroll():
1046
1163
  if not _auth_ok(request):
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.208",
3
+ "version": "1.0.209",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.208",
9
+ "version": "1.0.209",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.208",
3
+ "version": "1.0.209",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",