omnius 1.0.206 → 1.0.208

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10326,6 +10326,108 @@ function pngDimensions(buffer2) {
10326
10326
  }
10327
10327
  return null;
10328
10328
  }
10329
/**
 * Describes the element that currently holds focus inside one evaluation
 * context (anything exposing `evaluate(script)`, e.g. a page or a frame).
 *
 * The snapshot is computed by a script evaluated inside the context, so every
 * field reflects the live DOM at call time.
 *
 * @param {{ evaluate: (script: string) => Promise<unknown> }} context2
 *   Handle whose `evaluate` runs the script in the target document.
 * @param {object} [frameMeta]
 *   Optional metadata; when truthy it is attached to the result as `frame`.
 * @returns {Promise<object|null>}
 *   `{ tag, id, name, role, ariaLabel, type, placeholder, text, isEditable,
 *   disabled, readOnly, rect }` (plus `frame`), or `null` when the script
 *   yields no object (for example when there is no active element).
 */
async function describeFocusedEditableInContext(context2, frameMeta) {
  const active = await context2.evaluate(`(() => {
    const el = document.activeElement;
    if (!el) return null;
    const rect = el.getBoundingClientRect();
    const tag = String(el.tagName || "").toLowerCase();
    const role = (el.getAttribute("role") || "").toLowerCase();
    const inputType = (el.getAttribute("type") || "").toLowerCase();
    const contentEditableAttr = el.getAttribute("contenteditable");
    const contentEditable = contentEditableAttr !== null
      && (contentEditableAttr === "" || String(contentEditableAttr).toLowerCase() === "true");
    const disabled = !!el.disabled;
    const readOnly = !!el.readOnly;
    // These input types accept no typed text; sending keystrokes to them would
    // toggle or activate the control instead of filling it.
    const nonTextInputTypes = ["button", "checkbox", "color", "file", "hidden", "image", "radio", "range", "reset", "submit"];
    const acceptsText = el.matches("input, textarea")
      && !(tag === "input" && nonTextInputTypes.includes(inputType));
    const isEditable = (acceptsText && !disabled && !readOnly)
      || contentEditable || el.isContentEditable === true
      || ["textbox", "searchbox", "combobox"].includes(role);
    return {
      tag,
      id: el.id || "",
      name: el.getAttribute("name") || "",
      role,
      ariaLabel: el.getAttribute("aria-label") || "",
      type: el.getAttribute("type") || "",
      placeholder: el.getAttribute("placeholder") || "",
      text: String(el.textContent || "").trim().slice(0, 120),
      isEditable,
      disabled,
      readOnly,
      rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
    };
  })()`);
  if (!active || typeof active !== "object")
    return null;
  return frameMeta ? { ...active, frame: frameMeta } : active;
}
10362
/**
 * Finds the focused editable element, looking at the page itself first and
 * then at each non-main frame in order.
 *
 * @param {object} pageHandle - Page-like handle; `url`, `frames` and
 *   `mainFrame` are used only when present.
 * @returns {Promise<object|null>} The first snapshot whose `isEditable` is
 *   truthy; otherwise the main-context snapshot (which may be `null`).
 */
async function describeFocusedEditable(pageHandle) {
  const mainMeta = { kind: "main", url: pageHandle.url?.() ?? "" };
  // A failing evaluate in any context is treated as "nothing focused there".
  const main2 = await describeFocusedEditableInContext(pageHandle, mainMeta).catch(() => null);
  if (main2?.isEditable) {
    return main2;
  }

  const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
  const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;

  for (const [index, frame] of Array.from(frames).entries()) {
    // The main frame was already covered by the page-level lookup above.
    const isChildFrame = Boolean(frame) && frame !== mainFrame;
    if (isChildFrame) {
      const meta = {
        kind: "frame",
        index,
        url: typeof frame.url === "function" ? frame.url() : "",
        name: typeof frame.name === "function" ? frame.name() : ""
      };
      const focused = await describeFocusedEditableInContext(frame, meta).catch(() => null);
      if (focused?.isEditable) {
        return focused;
      }
    }
  }

  // Nothing editable anywhere: report whatever the main context had focused.
  return main2;
}
10383
/**
 * Fills a form field identified by a natural-language target: locate a DOM
 * candidate, click its centre, confirm an editable element took focus, then
 * replace its content with `text`.
 *
 * @param {object} pageHandle - Page-like handle providing `mouse`, `keyboard`,
 *   `waitForTimeout` and optionally `viewportSize`.
 * @param {string} target - Natural-language description of the field.
 * @param {string} text - Replacement content; an empty string clears the field.
 * @param {number} typingDelay - Per-keystroke delay passed to `keyboard.type`.
 * @returns {Promise<{candidate: object, active: object, source: string}>}
 *   The matched candidate, the focused-element snapshot, and how the
 *   candidate was found (`"dom-candidate"` or `"dom-candidate+scroll"`).
 * @throws {Error} When no candidate with finite centre coordinates is found,
 *   or when the click does not focus an editable element.
 */
async function clickAndFillBrowserTarget(pageHandle, target, text, typingDelay) {
  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
  let candidate = await findBrowserVisualCandidate(pageHandle, target, viewport.width / 2, viewport.height / 2, true);
  let source = "dom-candidate";
  if (!candidate) {
    // Second attempt also considers off-screen elements and scrolls the match into view.
    candidate = await findBrowserVisualCandidate(pageHandle, target, viewport.width / 2, viewport.height / 2, true, true, true);
    if (candidate?.["scrolledIntoView"] === true)
      source += "+scroll";
    if (candidate)
      await pageHandle.waitForTimeout(150);
  }
  const center = candidate?.["center"];
  const x = Number(center?.x);
  const y = Number(center?.y);
  if (!Number.isFinite(x) || !Number.isFinite(y)) {
    throw new Error(`No visible editable candidate matched target "${target}". Run observe_bundle or dom_summary to inspect available labels/selectors.`);
  }
  await pageHandle.mouse.click(x, y);
  await pageHandle.waitForTimeout(80);
  const active = await describeFocusedEditable(pageHandle);
  if (!active?.["isEditable"]) {
    // candidate is always non-null here: finite coordinates can only come from a candidate.
    throw new Error(`Target "${target}" was clicked, but no editable element became focused. Matched element: ${JSON.stringify({
      tag: candidate["tag"],
      text: candidate["text"],
      ariaLabel: candidate["ariaLabel"],
      placeholder: candidate["placeholder"],
      name: candidate["name"]
    })}.`);
  }
  const selectAll = process.platform === "darwin" ? "Meta+A" : "Control+A";
  await pageHandle.keyboard.press(selectAll);
  if (text === "") {
    // Typing an empty string sends no keys, which would leave the old content
    // selected but intact; delete the selection so an empty fill clears the field.
    await pageHandle.keyboard.press("Backspace");
  } else {
    await pageHandle.keyboard.type(text, { delay: typingDelay });
  }
  return { candidate, active, source };
}
10417
/**
 * Builds the failure text for a rejected `evaluate` call: the raw error
 * message (capped at 500 characters) followed by zero or more `Hint:` lines
 * derived from the error text and the submitted code.
 *
 * @param {unknown} err - Thrown value; non-Error values are stringified.
 * @param {string} code8 - JavaScript source that was evaluated.
 * @returns {string} Newline-joined message and hints.
 */
function evaluateFailureMessage(err, code8) {
  const raw = err instanceof Error ? err.message : String(err);
  const hints = [];
  if (/map is not a function/i.test(raw) && /querySelectorAll/i.test(code8)) {
    hints.push("document.querySelectorAll() returns a NodeList; use Array.from(document.querySelectorAll(selector)).map(...) or [...document.querySelectorAll(selector)].map(...).");
  }
  // `(?!=)` keeps comparisons (`.value == x`, `.value === x`) from being
  // mistaken for assignments; only real writes to .value get this hint.
  if (/(?:\.value\s*=(?!=)|setAttribute\(['"]value['"])/.test(code8) && /\b(input|textarea|querySelector)/i.test(code8)) {
    hints.push("Do not fill modern React/Vue/Svelte forms by assigning .value in evaluate; use playwright_browser fill, or visual_click the field then type, so input/change events fire.");
  }
  // "querySelector" is a prefix of "querySelectorAll", so one pattern covers both.
  if (/querySelector/.test(code8)) {
    hints.push("For page inspection, prefer query_all, dom_summary, or observe_bundle before raw evaluate.");
  }
  return [raw.slice(0, 500), ...hints.map((hint) => `Hint: ${hint}`)].join("\n");
}
10329
10431
  function buildImageMarker(buffer2) {
10330
10432
  let mimeType = "image/png";
10331
10433
  let out = buffer2;
@@ -10391,8 +10493,8 @@ ${input.text.slice(0, 2e4)}`.toLowerCase();
10391
10493
  }
10392
10494
  return { kind: "none", confidence: 0, evidence: [] };
10393
10495
  }
10394
- async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10395
- const candidate = await pageHandle.evaluate(`(() => {
10496
+ async function findBrowserVisualCandidateInContext(context2, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
10497
+ const candidate = await context2.evaluate(`(() => {
10396
10498
  const target = ${JSON.stringify(target)};
10397
10499
  const visualX = ${JSON.stringify(visualX)};
10398
10500
  const visualY = ${JSON.stringify(visualY)};
@@ -10423,12 +10525,55 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10423
10525
  if (window.CSS && typeof window.CSS.escape === "function") return window.CSS.escape(id);
10424
10526
  return String(id).replace(/["\\\\]/g, "\\\\$&");
10425
10527
  };
10528
+ const hasVisibleStyle = (el) => {
10529
+ for (let cur = el; cur && cur.nodeType === 1; cur = cur.parentElement) {
10530
+ const style = getComputedStyle(cur);
10531
+ if (style.display === "none" || style.visibility === "hidden" || style.visibility === "collapse") return false;
10532
+ const opacity = Number(style.opacity);
10533
+ if (Number.isFinite(opacity) && opacity <= 0.02) return false;
10534
+ }
10535
+ return true;
10536
+ };
10537
+ const hasUsableBox = (el) => {
10538
+ const rect = el.getBoundingClientRect();
10539
+ return rect.width > 1 && rect.height > 1;
10540
+ };
10541
+ const inViewport = (rect) => !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10542
+ const isRendered = (el) => !!el && hasUsableBox(el) && hasVisibleStyle(el);
10426
10543
  const associatedControl = (el) => {
10427
10544
  if (/^label$/i.test(el.tagName || "") && el.getAttribute("for")) {
10428
10545
  return document.getElementById(el.getAttribute("for"));
10429
10546
  }
10547
+ if (/^label$/i.test(el.tagName || "")) {
10548
+ if (el.control) return el.control;
10549
+ const nested = el.querySelector("input, textarea, select, [contenteditable='true'], [role='textbox']");
10550
+ if (nested) return nested;
10551
+ const labelRect = el.getBoundingClientRect();
10552
+ let scope = el.parentElement;
10553
+ for (let depth = 0; depth < 5 && scope; depth++, scope = scope.parentElement) {
10554
+ const controls = Array.from(scope.querySelectorAll("input, textarea, select, [contenteditable='true'], [role='textbox']"))
10555
+ .filter(control => {
10556
+ const rect = control.getBoundingClientRect();
10557
+ return rect.width > 1 && rect.height > 1 && rect.top >= labelRect.top - 12 && Math.abs(rect.left - labelRect.left) < 260;
10558
+ })
10559
+ .sort((a, b) => {
10560
+ const ar = a.getBoundingClientRect();
10561
+ const br = b.getBoundingClientRect();
10562
+ return (Math.abs(ar.top - labelRect.bottom) - Math.abs(br.top - labelRect.bottom))
10563
+ || (Math.abs(ar.left - labelRect.left) - Math.abs(br.left - labelRect.left));
10564
+ });
10565
+ if (controls[0]) return controls[0];
10566
+ }
10567
+ }
10430
10568
  return el;
10431
10569
  };
10570
+ const clickTargetFor = (el) => {
10571
+ const control = associatedControl(el) || el;
10572
+ if (control === el) return el;
10573
+ if (isRendered(control)) return control;
10574
+ if (/^label$/i.test(el.tagName || "") && isRendered(el)) return el;
10575
+ return control;
10576
+ };
10432
10577
  const associatedLabelText = (el) => {
10433
10578
  const control = associatedControl(el);
10434
10579
  const id = control && control.id ? control.id : el.id;
@@ -10471,20 +10616,25 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10471
10616
  el.id || "",
10472
10617
  el.className || "",
10473
10618
  el.getAttribute("type") || "",
10619
+ el.getAttribute("autocomplete") || "",
10474
10620
  ].join(" ").toLowerCase();
10475
10621
  const infoFor = (el, score) => {
10476
- const rect = el.getBoundingClientRect();
10477
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10622
+ const control = associatedControl(el) || el;
10623
+ const clickTarget = clickTargetFor(el);
10624
+ const rect = clickTarget.getBoundingClientRect();
10625
+ const visible = inViewport(rect) && hasVisibleStyle(clickTarget);
10478
10626
  return {
10479
- tag: String(el.tagName || "").toLowerCase(),
10480
- id: el.id || "",
10481
- className: String(el.className || "").slice(0, 160),
10482
- role: el.getAttribute("role") || "",
10483
- ariaLabel: el.getAttribute("aria-label") || "",
10484
- name: el.getAttribute("name") || "",
10485
- type: el.getAttribute("type") || "",
10486
- placeholder: el.getAttribute("placeholder") || "",
10487
- text: String(el.innerText || el.textContent || el.getAttribute("value") || "").trim().slice(0, 240),
10627
+ tag: String(control.tagName || el.tagName || "").toLowerCase(),
10628
+ id: control.id || el.id || "",
10629
+ className: String(control.className || el.className || "").slice(0, 160),
10630
+ role: control.getAttribute("role") || el.getAttribute("role") || "",
10631
+ ariaLabel: control.getAttribute("aria-label") || el.getAttribute("aria-label") || "",
10632
+ name: control.getAttribute("name") || el.getAttribute("name") || "",
10633
+ type: control.getAttribute("type") || el.getAttribute("type") || "",
10634
+ autocomplete: control.getAttribute("autocomplete") || el.getAttribute("autocomplete") || "",
10635
+ placeholder: control.getAttribute("placeholder") || el.getAttribute("placeholder") || "",
10636
+ text: String(el.innerText || el.textContent || control.innerText || control.textContent || control.getAttribute("value") || "").trim().slice(0, 240),
10637
+ clickTag: String(clickTarget.tagName || "").toLowerCase(),
10488
10638
  rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10489
10639
  center: { x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 },
10490
10640
  visible,
@@ -10506,9 +10656,11 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10506
10656
  let best = null;
10507
10657
  let bestElement = null;
10508
10658
  for (const el of Array.from(document.querySelectorAll(selectors))) {
10509
- const rect = el.getBoundingClientRect();
10510
- if (rect.width <= 1 || rect.height <= 1) continue;
10511
- const visible = !(rect.bottom < 0 || rect.right < 0 || rect.top > window.innerHeight || rect.left > window.innerWidth);
10659
+ if (!isRendered(el)) continue;
10660
+ const clickTarget = clickTargetFor(el);
10661
+ if (!isRendered(clickTarget)) continue;
10662
+ const rect = clickTarget.getBoundingClientRect();
10663
+ const visible = inViewport(rect);
10512
10664
  if (!includeOffscreen && !visible) continue;
10513
10665
  const hay = textFor(el);
10514
10666
  const tokenHits = tokens.filter(t => hay.includes(t)).length;
@@ -10519,7 +10671,7 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10519
10671
  if (score <= (forceCandidate ? 8 : 0)) continue;
10520
10672
  if (!best || score > best.score) {
10521
10673
  best = infoFor(el, score);
10522
- bestElement = el;
10674
+ bestElement = clickTarget;
10523
10675
  }
10524
10676
  }
10525
10677
  if (bestElement && scrollIntoView && best && !best.visible) {
@@ -10531,6 +10683,82 @@ async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY,
10531
10683
  })()`);
10532
10684
  return candidate && typeof candidate === "object" ? candidate : null;
10533
10685
  }
10686
/**
 * Translates a candidate's `rect` and `center` by `offset` and recomputes
 * `visible` against `viewport`.
 *
 * @param {object} candidate - Candidate record carrying `rect`, `center` and `visible`.
 * @param {{x: number, y: number}} offset - Amount added to the x/y coordinates.
 * @param {{width: number, height: number}} viewport - Bounds used for the visibility test.
 * @param {object} [frameMeta] - When truthy, stored as `frame` and the
 *   untranslated geometry is kept as `frameLocalRect` / `frameLocalCenter`.
 * @returns {object} Copy of `candidate` with translated geometry; `visible`
 *   is true only if the candidate was already visible and its translated,
 *   fully finite rect is not entirely outside the viewport.
 */
function offsetBrowserCandidate(candidate, offset, viewport, frameMeta) {
  const { rect: localRect, center: localCenter } = candidate;

  const shiftedRect = {
    x: Number(localRect?.x) + offset.x,
    y: Number(localRect?.y) + offset.y,
    width: Number(localRect?.width),
    height: Number(localRect?.height)
  };
  const shiftedCenter = {
    x: Number(localCenter?.x) + offset.x,
    y: Number(localCenter?.y) + offset.y
  };

  // A rect with any non-finite component can never count as visible.
  const measurable = [shiftedRect.x, shiftedRect.y, shiftedRect.width, shiftedRect.height]
    .every((value) => Number.isFinite(value));
  const onScreen = measurable
    && !(shiftedRect.y + shiftedRect.height < 0)
    && !(shiftedRect.x + shiftedRect.width < 0)
    && !(shiftedRect.y > viewport.height)
    && !(shiftedRect.x > viewport.width);

  const result = {
    ...candidate,
    rect: shiftedRect,
    center: shiftedCenter,
    visible: candidate.visible === true && onScreen
  };
  if (frameMeta) {
    result.frame = frameMeta;
    result.frameLocalRect = localRect;
    result.frameLocalCenter = localCenter;
  }
  return result;
}
10714
/**
 * Searches the page and every non-main frame for the best DOM candidate
 * matching `target`, and returns the highest-scoring one with its geometry
 * expressed in page (top-level viewport) coordinates.
 *
 * @param {object} pageHandle - Page-like handle; `viewportSize`, `frames` and
 *   `mainFrame` are used only when present.
 * @param {string} target - Natural-language description of the element.
 * @param {number} visualX - Reference x coordinate in page coordinates.
 * @param {number} visualY - Reference y coordinate in page coordinates.
 * @param {boolean} forceCandidate - Forwarded to the per-context search.
 * @param {boolean} [includeOffscreen=false] - Also consider frames/candidates
 *   outside the viewport.
 * @param {boolean} [scrollIntoView=false] - Scroll the frame element into view
 *   when the frame or its candidate is not visible.
 * @returns {Promise<object|null>} Best candidate (frame candidates carry
 *   `frame`, `frameLocalRect`, `frameLocalCenter`), or `null` if none qualify.
 */
async function findBrowserVisualCandidate(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen = false, scrollIntoView = false) {
  const viewport = pageHandle.viewportSize?.() ?? { width: 1280, height: 720 };
  const candidates = [];
  // A context whose evaluate fails simply contributes no candidate.
  const top = await findBrowserVisualCandidateInContext(pageHandle, target, visualX, visualY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
  if (top)
    candidates.push(offsetBrowserCandidate(top, { x: 0, y: 0 }, viewport));
  const frames = typeof pageHandle.frames === "function" ? pageHandle.frames() : [];
  const mainFrame = typeof pageHandle.mainFrame === "function" ? pageHandle.mainFrame() : null;
  for (let i2 = 0; i2 < frames.length; i2++) {
    const frame = frames[i2];
    if (!frame || frame === mainFrame)
      continue;
    const elementHandle = typeof frame.frameElement === "function" ? await frame.frameElement().catch(() => null) : null;
    if (!elementHandle)
      continue;
    try {
      let box = await elementHandle.boundingBox().catch(() => null);
      if (!box || box.width <= 1 || box.height <= 1)
        continue;
      const frameVisible = !(box.y + box.height < 0 || box.x + box.width < 0 || box.y > viewport.height || box.x > viewport.width);
      if (!includeOffscreen && !frameVisible)
        continue;
      // Convert the page-level reference point into frame-local coordinates,
      // clamped to the frame's box.
      const localX = Math.max(0, Math.min(box.width, visualX - box.x));
      const localY = Math.max(0, Math.min(box.height, visualY - box.y));
      let candidate = await findBrowserVisualCandidateInContext(frame, target, localX, localY, forceCandidate, includeOffscreen, scrollIntoView).catch(() => null);
      if (!candidate)
        continue;
      if (scrollIntoView && (!candidate["visible"] || !frameVisible)) {
        if (typeof elementHandle.scrollIntoViewIfNeeded === "function") {
          await elementHandle.scrollIntoViewIfNeeded().catch(() => void 0);
        }
        // Re-measure: scrolling may have moved the frame within the page.
        box = await elementHandle.boundingBox().catch(() => box);
        candidate = {
          ...candidate,
          scrolledIntoView: true
        };
      }
      if (!box)
        continue;
      candidates.push(offsetBrowserCandidate(candidate, { x: box.x, y: box.y }, viewport, {
        kind: "frame",
        index: i2,
        url: typeof frame.url === "function" ? frame.url() : "",
        name: typeof frame.name === "function" ? frame.name() : "",
        rect: { x: box.x, y: box.y, width: box.width, height: box.height }
      }));
    } finally {
      // Release the frame-element handle on every exit path so repeated
      // lookups in a long-lived session do not accumulate handles.
      // Best-effort: a failed dispose must not mask the lookup result.
      if (typeof elementHandle.dispose === "function") {
        try {
          await elementHandle.dispose();
        } catch {
          // Handle already gone (e.g. frame detached); nothing left to release.
        }
      }
    }
  }
  // NOTE(review): with scrollIntoView, a later frame's scroll may shift the page
  // after an earlier candidate's coordinates were captured — confirm whether
  // the winner's geometry should be re-measured before clicking.
  return candidates.filter((candidate) => includeOffscreen || candidate["visible"] === true).sort((a2, b) => Number(b["score"] ?? 0) - Number(a2["score"] ?? 0))[0] ?? null;
}
10534
10762
  function ok(output, start2) {
10535
10763
  return { success: true, output, durationMs: Date.now() - start2 };
10536
10764
  }
@@ -10559,7 +10787,7 @@ var init_playwright_browser = __esm({
10559
10787
  PLAYWRIGHT_BROWSERS_DIR = join13(PLAYWRIGHT_RUNTIME_DIR, "browsers");
10560
10788
  PlaywrightBrowserTool = class {
10561
10789
  name = "playwright_browser";
10562
- description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
10790
+ description = "Full-scope Playwright browser automation + diagnostic capture. Launches a persistent headless Chromium session by default, with optional visible/headed mode when a GUI display is available. Beyond navigation/interaction, this tool buffers everything the running app emits (console messages, network requests, JS exceptions, accessibility tree) so the agent can verify what is ACTUALLY happening — not just what the build/test reports. Auto-installs Playwright + Chromium on first use without sudo or OS package manager escalation. Diagnostic actions: observe_bundle, dom_summary, dom, console_logs, network_log, page_errors, a11y_snapshot, bounding_box, query_all, performance, cookies, storage, viewport, clear_diagnostics. Interaction actions: navigate, click, visual_click, fill, type, press, select, check, hover. Use fill with a selector or natural-language target for form fields; avoid raw evaluate for form filling because direct .value assignment does not fire app input/change events. This is a separate browser/runtime from browser_action; once you start a workflow here, continue here unless you intentionally navigate browser_action to the same URL. Capture actions: screenshot, pdf, content, innerText, innerHTML, getAttribute, evaluate. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Workflow for user-facing work: start/serve the system with the stack-native tool, navigate to the real URL, then inspect page_errors, console_logs, network_log, DOM/accessibility, and screenshot evidence before completion. Build/typecheck/test output is only one layer; runtime browser evidence is required when the delivered artifact is a page, app, dashboard, game, form, visualization, or other UI. Repeat navigate/act/observe until the actual user flow is clean.";
10563
10791
  parameters = {
10564
10792
  type: "object",
10565
10793
  properties: {
@@ -10609,7 +10837,7 @@ var init_playwright_browser = __esm({
10609
10837
  "clear_diagnostics",
10610
10838
  "close"
10611
10839
  ],
10612
- description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text (for form fields)\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10840
+ description: "Action to perform:\n- navigate: go to a URL\n- click: click element by selector\n- fill: clear input and type text by selector, or by natural-language target when selector is absent\n- type: type text character by character into a selector, or into the currently focused element after visual_click\n- press: press a key (Enter, Tab, Escape, etc.)\n- screenshot: capture the headless browser page, not the desktop; use value to choose the output file path\n- observe_bundle: capture URL/title/viewport, DOM summary, a11y, diagnostics, screenshot, and gate assessment\n- visual_click: browser screenshot -> Moondream point -> elementFromPoint -> human-like Playwright mouse click -> post-action screenshot\n- evaluate: run JavaScript in page context\n- content: get page text content (readable, stripped)\n- dom: get raw page HTML (truncated)\n- dom_summary: compact interactive DOM summary with selectors\n- innerText: get innerText of a specific element\n- select: select dropdown option by value\n- check/uncheck: toggle checkbox\n- hover: hover over element\n- wait: wait for a selector to appear, or sleep for timeout ms when no selector is provided\n- waitForNavigation: wait for page navigation to complete\n- waitForSelector: wait for element matching selector\n- title: get page title\n- url: get current URL\n- getAttribute: get element attribute value\n- innerHTML: get element's innerHTML\n- textContent: get element's textContent\n- goBack/goForward/reload: browser navigation\n- pdf: save page as PDF\n- close: close browser session"
10613
10841
  },
10614
10842
  url: {
10615
10843
  type: "string",
@@ -10625,7 +10853,7 @@ var init_playwright_browser = __esm({
10625
10853
  },
10626
10854
  target: {
10627
10855
  type: "string",
10628
- description: "Natural-language browser visual target for visual_click, for example 'the green Continue button' or 'the search field'."
10856
+ description: "Natural-language browser visual target for visual_click or selector-less fill, for example 'the green Continue button', 'username field', or 'password field'."
10629
10857
  },
10630
10858
  value: {
10631
10859
  type: "string",
@@ -10745,12 +10973,22 @@ var init_playwright_browser = __esm({
10745
10973
  return ok(`Clicked: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
10746
10974
  }
10747
10975
  case "fill": {
10748
- if (!selector)
10749
- return fail("selector is required", start2);
10750
10976
  if (text === void 0)
10751
10977
  return fail("text is required", start2);
10752
- await page.fill(selector, text, { timeout: timeout2 });
10753
- return ok(`Filled ${selector} with "${text}"`, start2);
10978
+ const typingDelay = typeof args.typing_delay_ms === "number" ? Math.max(0, Math.min(500, Math.round(args.typing_delay_ms))) : 20;
10979
+ if (selector) {
10980
+ const resolvedSelector = resolveDomSummarySelector(selector);
10981
+ if (!resolvedSelector)
10982
+ return fail(`No selector known for DOM summary reference ${selector}; run dom_summary and use the emitted selector.`, start2);
10983
+ await page.fill(resolvedSelector, text, { timeout: timeout2 });
10984
+ return ok(`Filled ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""} with "${text}"`, start2);
10985
+ }
10986
+ const target = typeof args.target === "string" && args.target.trim() ? args.target.trim() : "";
10987
+ if (!target)
10988
+ return fail("selector or target is required for fill. Prefer target for visual/natural-language form fields, e.g. target='username field'.", start2);
10989
+ const result = await clickAndFillBrowserTarget(page, target, text, typingDelay);
10990
+ const active = result.active ?? {};
10991
+ return ok(`Filled target "${target}" via ${result.source} into <${active["tag"] || "element"}>${active["name"] ? ` name=${JSON.stringify(active["name"])}` : ""}${active["placeholder"] ? ` placeholder=${JSON.stringify(active["placeholder"])}` : ""}${active["ariaLabel"] ? ` aria-label=${JSON.stringify(active["ariaLabel"])}` : ""}.`, start2);
10754
10992
  }
10755
10993
  case "type": {
10756
10994
  if (text === void 0)
@@ -10760,30 +10998,14 @@ var init_playwright_browser = __esm({
10760
10998
  await page.type(selector, text, { timeout: timeout2, delay: typingDelay });
10761
10999
  return ok(`Typed "${text}" into ${selector}`, start2);
10762
11000
  }
10763
- const active = await page.evaluate(`(() => {
10764
- const el = document.activeElement;
10765
- if (!el) return null;
10766
- const rect = el.getBoundingClientRect();
10767
- return {
10768
- tag: String(el.tagName || "").toLowerCase(),
10769
- id: el.id || "",
10770
- name: el.getAttribute("name") || "",
10771
- role: el.getAttribute("role") || "",
10772
- ariaLabel: el.getAttribute("aria-label") || "",
10773
- type: el.getAttribute("type") || "",
10774
- placeholder: el.getAttribute("placeholder") || "",
10775
- text: String(el.textContent || "").trim().slice(0, 120),
10776
- isEditable: el.matches("input, textarea, [contenteditable=''], [contenteditable='true']")
10777
- || ["textbox", "searchbox", "combobox"].includes((el.getAttribute("role") || "").toLowerCase()),
10778
- rect: { x: rect.x, y: rect.y, width: rect.width, height: rect.height },
10779
- };
10780
- })()`);
11001
+ const active = await describeFocusedEditable(page);
10781
11002
  if (!active || typeof active !== "object" || active.isEditable !== true) {
10782
11003
  return fail("No editable focused element is active; use visual_click on a form field or pass a selector to type.", start2);
10783
11004
  }
10784
11005
  await page.keyboard.type(text, { delay: typingDelay });
10785
11006
  const label = active && typeof active === "object" ? `<${active.tag || "element"}>${active.id ? `#${active.id}` : ""}` : "focused element";
10786
- return ok(`Typed "${text}" into ${label}`, start2);
11007
+ const frame = active["frame"];
11008
+ return ok(`Typed "${text}" into ${label}${frame?.kind === "frame" ? ` in frame ${frame.index}` : ""}`, start2);
10787
11009
  }
10788
11010
  case "press": {
10789
11011
  const key = text || "Enter";
@@ -10824,7 +11046,14 @@ var init_playwright_browser = __esm({
10824
11046
  return ok(`Hovered: ${resolvedSelector}${resolvedSelector !== selector ? ` (from ${selector})` : ""}`, start2);
10825
11047
  }
10826
11048
  // ── Waiting ──
10827
- case "wait":
11049
+ case "wait": {
11050
+ if (selector) {
11051
+ await page.waitForSelector(selector, { timeout: timeout2 });
11052
+ return ok(`Element appeared: ${selector}`, start2);
11053
+ }
11054
+ await page.waitForTimeout(timeout2);
11055
+ return ok(`Waited ${timeout2}ms`, start2);
11056
+ }
10828
11057
  case "waitForSelector": {
10829
11058
  if (!selector)
10830
11059
  return fail("selector is required", start2);
@@ -10893,9 +11122,13 @@ var init_playwright_browser = __esm({
10893
11122
  case "evaluate": {
10894
11123
  if (!text)
10895
11124
  return fail("text (JavaScript code) is required", start2);
10896
- const result = await page.evaluate(text);
10897
- const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
10898
- return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
11125
+ try {
11126
+ const result = await page.evaluate(text);
11127
+ const serialized = typeof result === "string" ? result : JSON.stringify(result, null, 2);
11128
+ return ok(serialized?.slice(0, 15e3) ?? "undefined", start2);
11129
+ } catch (err2) {
11130
+ return fail(evaluateFailureMessage(err2, text), start2);
11131
+ }
10899
11132
  }
10900
11133
  // ── Screenshot / PDF ──
10901
11134
  case "screenshot": {
@@ -11273,8 +11506,13 @@ ${JSON.stringify(data, null, 2)}`, start2);
11273
11506
  })()`);
11274
11507
  let clickSource = point.source || pointResult?.source || "vision";
11275
11508
  const candidate = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, false);
11276
- if (candidate) {
11277
- const candidateRecord = candidate;
11509
+ let candidateRecord = candidate;
11510
+ if (!candidateRecord) {
11511
+ candidateRecord = await findBrowserVisualCandidate(page, visualTarget, cssX, cssY, true, true, true);
11512
+ if (candidateRecord)
11513
+ await page.waitForTimeout(150);
11514
+ }
11515
+ if (candidateRecord) {
11278
11516
  const center = candidateRecord["center"];
11279
11517
  const nextX = Number(center?.x);
11280
11518
  const nextY = Number(center?.y);
@@ -11282,7 +11520,7 @@ ${JSON.stringify(data, null, 2)}`, start2);
11282
11520
  cssX = Math.max(0, Math.min(viewport.width, nextX));
11283
11521
  cssY = Math.max(0, Math.min(viewport.height, nextY));
11284
11522
  elementInfo = candidateRecord;
11285
- clickSource = `${clickSource}+dom-candidate`;
11523
+ clickSource = `${clickSource}+dom-candidate${candidateRecord["scrolledIntoView"] === true ? "+scroll" : ""}`;
11286
11524
  }
11287
11525
  }
11288
11526
  await page.mouse.move(cssX, cssY, { steps: 12 });
@@ -23994,8 +24232,8 @@ var init_explore_tools = __esm({
23994
24232
  enter_worktree: "Create isolated git worktree for safe parallel file modifications",
23995
24233
  exit_worktree: "Exit and optionally remove a git worktree (keep for merge or discard)",
23996
24234
  notebook_edit: "Edit Jupyter .ipynb notebooks at cell level (list, replace, insert, delete cells)",
23997
- browser_action: "Interactive browser: login, fill forms, click buttons, screenshot — session persists between calls; for console/page-error/network diagnostics prefer playwright_browser",
23998
- playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, focused-element typing for visual form filling, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
24235
+ browser_action: "Interactive Selenium browser: login, fill forms, click buttons, screenshot — session persists between browser_action calls only; separate runtime from playwright_browser",
24236
+ playwright_browser: "Full browser verification and visual action loop: observe_bundle, visual_click via Moondream pointing, selector/target fill, focused-element typing, screenshot, page_errors, console_logs, network_log, DOM/accessibility, storage",
23999
24237
  carbonyl_browser: "Terminal-rendered real browser automation via Carbonyl: navigate, read rendered text, click/type, sessions, daemon mode",
24000
24238
  scheduler: "Schedule tasks for automatic future execution via OS cron",
24001
24239
  cronjob: "Alias for scheduler: OS cron-backed time triggers",
@@ -284492,6 +284730,7 @@ async function ensureSession(options2 = {}) {
284492
284730
  }
284493
284731
  activeSessionId = null;
284494
284732
  activeSessionHeadless = null;
284733
+ activeSessionUrl = null;
284495
284734
  }
284496
284735
  }
284497
284736
  if (activeSessionId) {
@@ -284503,6 +284742,13 @@ async function ensureSession(options2 = {}) {
284503
284742
  }
284504
284743
  activeSessionId = null;
284505
284744
  activeSessionHeadless = null;
284745
+ activeSessionUrl = null;
284746
+ }
284747
+ if (options2.allowCreate === false) {
284748
+ return {
284749
+ error: "No active browser_action Selenium session exists for this action. browser_action is a separate browser/runtime from playwright_browser; continue the current page with playwright_browser, or call browser_action({action:'navigate', url: ...}) first.",
284750
+ sessionId: ""
284751
+ };
284506
284752
  }
284507
284753
  const headless = options2.headless ?? defaultBrowserHeadless();
284508
284754
  const res = await fetch(`${BASE_URL}/session/start`, {
@@ -284520,8 +284766,16 @@ async function ensureSession(options2 = {}) {
284520
284766
  return { error: String(data.message ?? "Failed to start browser session"), sessionId: "" };
284521
284767
  activeSessionId = data.session_id;
284522
284768
  activeSessionHeadless = headless;
284769
+ activeSessionUrl = null;
284523
284770
  return { sessionId: activeSessionId };
284524
284771
  }
284772
+ function browserActionRuntimeHint() {
284773
+ return [
284774
+ "browser_action is a separate browser/runtime from playwright_browser and uses its own Selenium/Chrome session; it does not share page state, cookies, focus, or navigation.",
284775
+ activeSessionUrl ? `Current browser_action URL: ${activeSessionUrl}` : "Current browser_action URL: unknown or not navigated.",
284776
+ "If this page was opened with playwright_browser, keep using playwright_browser actions such as dom_summary, fill, type, press, visual_click, and observe_bundle."
284777
+ ].join(" ");
284778
+ }
284525
284779
  async function apiCall(endpoint, method = "POST", body) {
284526
284780
  const options2 = {
284527
284781
  method,
@@ -284544,7 +284798,7 @@ async function apiCall(endpoint, method = "POST", body) {
284544
284798
  const res = await fetch(url, options2);
284545
284799
  return await res.json();
284546
284800
  }
284547
- var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, BrowserActionTool;
284801
+ var __dirname3, DEFAULT_PORT, SCRAPE_SCRIPT, BASE_URL, serviceProcess, activeSessionId, activeSessionHeadless, activeSessionUrl, BrowserActionTool;
284548
284802
  var init_browser_action = __esm({
284549
284803
  "packages/execution/dist/tools/browser-action.js"() {
284550
284804
  "use strict";
@@ -284557,9 +284811,10 @@ var init_browser_action = __esm({
284557
284811
  serviceProcess = null;
284558
284812
  activeSessionId = null;
284559
284813
  activeSessionHeadless = null;
284814
+ activeSessionUrl = null;
284560
284815
  BrowserActionTool = class {
284561
284816
  name = "browser_action";
284562
- description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284817
+ description = "Control a persistent headless Chrome browser session for interactive web tasks. The browser stays open between calls, maintaining cookies, login state, and history. This is a separate Selenium/Chrome runtime from playwright_browser; do not switch between the two mid-workflow unless you intentionally navigate the second tool to the same URL. Use this (not web_fetch/web_crawl) when you need to: (1) log into a website, (2) fill and submit forms, (3) click buttons or links interactively, (4) take screenshots of rendered pages, (5) navigate multi-step workflows (checkout, signup, dashboards), (6) interact with elements that require JavaScript (dropdowns, modals, infinite scroll). Actions: navigate, click, click_xy, type, screenshot, dom, scroll, scroll_up, scroll_down, back, forward, close. For browser visuals, use browser_action({action:'screenshot', width, height, output_path}) — this captures the headless browser viewport, not the desktop. Use the desktop screenshot tool only when the actual OS screen is the target. For verification of browser runtime failures, prefer playwright_browser because it exposes page_errors, console_logs, network_log, DOM/accessibility, and screenshots from the same session. IMPORTANT: Start by calling navigate with the URL — do NOT ask the user for credentials or info first. Loopback URLs (localhost, 127.0.0.1, ::1) are allowed for local development servers; private LAN and metadata URLs remain blocked. Navigate to the page, then use dom/screenshot to see what's there, then type/click to interact. Call 'close' when done to free resources. This tool does not save or download arbitrary rendered files (PDFs, archives, media) to disk — clicking a 'Download' link inside the browser does not produce a local file path for the agent. For file acquisition, use the dedicated download/file tool and validate the resulting content-type and size before treating the result as success.";
284563
284818
  parameters = {
284564
284819
  type: "object",
284565
284820
  properties: {
@@ -284629,27 +284884,38 @@ var init_browser_action = __esm({
284629
284884
  const requestedWidth = args.width == null ? void 0 : asPositiveInt2(args.width, 1280, 320, 3840);
284630
284885
  const requestedHeight = args.height == null ? void 0 : asPositiveInt2(args.height, 720, 240, 2160);
284631
284886
  const requestedScale = args.device_scale_factor == null ? void 0 : asPositiveNumber(args.device_scale_factor, 1, 0.25, 3);
284632
- const launchErr = await launchService();
284633
- if (launchErr) {
284634
- return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
284635
- }
284636
284887
  if (action === "close") {
284637
- if (activeSessionId) {
284888
+ if (activeSessionId || await probeService()) {
284638
284889
  try {
284639
284890
  await apiCall("/session/close");
284640
284891
  } catch {
284641
284892
  }
284642
284893
  activeSessionId = null;
284643
284894
  activeSessionHeadless = null;
284895
+ activeSessionUrl = null;
284644
284896
  }
284645
284897
  return { success: true, output: "Browser session closed.", durationMs: Date.now() - start2 };
284646
284898
  }
284899
+ const actionStartsSession = action === "navigate";
284900
+ if (!actionStartsSession && !activeSessionId) {
284901
+ return {
284902
+ success: false,
284903
+ output: "",
284904
+ error: `browser_action ${action || "(missing action)"} requires an active browser_action session. ` + browserActionRuntimeHint(),
284905
+ durationMs: Date.now() - start2
284906
+ };
284907
+ }
284908
+ const launchErr = await launchService();
284909
+ if (launchErr) {
284910
+ return { success: false, output: "", error: launchErr, durationMs: Date.now() - start2 };
284911
+ }
284647
284912
  const session = await ensureSession({
284648
284913
  width: requestedWidth,
284649
284914
  height: requestedHeight,
284650
284915
  deviceScaleFactor: requestedScale,
284651
284916
  headless: asOptionalBoolean2(args.headless),
284652
- forceNew: asOptionalBoolean2(args.force_new) === true
284917
+ forceNew: asOptionalBoolean2(args.force_new) === true,
284918
+ allowCreate: actionStartsSession
284653
284919
  });
284654
284920
  if (session.error) {
284655
284921
  return { success: false, output: "", error: session.error, durationMs: Date.now() - start2 };
@@ -284667,7 +284933,13 @@ var init_browser_action = __esm({
284667
284933
  }
284668
284934
  result = await apiCall("/navigate", "POST", { url: args.url });
284669
284935
  if (result.ok) {
284670
- return { success: true, output: `Navigated to ${args.url}`, durationMs: Date.now() - start2 };
284936
+ activeSessionUrl = args.url;
284937
+ return {
284938
+ success: true,
284939
+ output: `Navigated to ${args.url}
284940
+ Runtime: browser_action Selenium/Chrome session. Continue with browser_action for this page, or use playwright_browser separately after navigating it.`,
284941
+ durationMs: Date.now() - start2
284942
+ };
284671
284943
  }
284672
284944
  const navMsg = String(result.message ?? "Navigation failed");
284673
284945
  const navHint = navMsg.toLowerCase().includes("connection") || navMsg.toLowerCase().includes("refused") || navMsg.toLowerCase().includes("err_connection") ? " (the URL appears unreachable — check if the target server is running and accepting connections)" : navMsg.toLowerCase().includes("timeout") ? " (page load timed out — try again or use a different URL)" : "";
@@ -284689,7 +284961,7 @@ var init_browser_action = __esm({
284689
284961
  return {
284690
284962
  success: false,
284691
284963
  output: `Click on ${args.selector} failed: ${clickMsg}`,
284692
- error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page.`,
284964
+ error: `browser_action click failed: ${clickMsg}. Try dom_summary first to see what selectors exist on the page. ${browserActionRuntimeHint()}`,
284693
284965
  durationMs: Date.now() - start2
284694
284966
  };
284695
284967
  }
@@ -284731,7 +285003,7 @@ var init_browser_action = __esm({
284731
285003
  return {
284732
285004
  success: false,
284733
285005
  output: `Type into ${args.selector} failed: ${typeMsg}`,
284734
- error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check.`,
285006
+ error: `browser_action type failed: ${typeMsg}. Verify the element is visible and is an input/textarea — use dom_summary to check. ${browserActionRuntimeHint()}`,
284735
285007
  durationMs: Date.now() - start2
284736
285008
  };
284737
285009
  }
@@ -284872,7 +285144,7 @@ var init_browser_action = __esm({
284872
285144
  if (!pointResult || pointResult.points.length === 0) {
284873
285145
  return {
284874
285146
  success: false,
284875
- output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead.`,
285147
+ output: `Vision could not find "${target}" on the page. Try using dom_summary to find the CSS selector instead. ${browserActionRuntimeHint()}`,
284876
285148
  error: "No point backend returned normalized coordinates.",
284877
285149
  durationMs: Date.now() - start2
284878
285150
  };
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.206",
3
+ "version": "1.0.208",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.206",
9
+ "version": "1.0.208",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
@@ -4565,9 +4565,19 @@
4565
4565
  }
4566
4566
  },
4567
4567
  "node_modules/js-yaml": {
4568
- "version": "4.1.1",
4569
- "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
4570
- "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
4568
+ "version": "4.2.0",
4569
+ "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.2.0.tgz",
4570
+ "integrity": "sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==",
4571
+ "funding": [
4572
+ {
4573
+ "type": "github",
4574
+ "url": "https://github.com/sponsors/puzrin"
4575
+ },
4576
+ {
4577
+ "type": "github",
4578
+ "url": "https://github.com/sponsors/nodeca"
4579
+ }
4580
+ ],
4571
4581
  "license": "MIT",
4572
4582
  "dependencies": {
4573
4583
  "argparse": "^2.0.1"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.206",
3
+ "version": "1.0.208",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",