npm - mcp-web-inspector - Versions diffs - 0.12.0 → 0.13.0 - Mend

mcp-web-inspector 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

package/README.md +5 -5
package/dist/index.js +15 -1
package/dist/toolHandler.d.ts +2 -0
package/dist/toolHandler.js +12 -0
package/dist/tools/browser/base.d.ts +41 -2
package/dist/tools/browser/base.js +221 -16
package/dist/tools/browser/common/postAction.d.ts +12 -0
package/dist/tools/browser/common/postAction.js +158 -0
package/dist/tools/browser/content/get_html.js +19 -6
package/dist/tools/browser/content/get_text.js +24 -8
package/dist/tools/browser/inspection/check_visibility.js +2 -3
package/dist/tools/browser/inspection/compare_element_alignment.js +6 -8
package/dist/tools/browser/inspection/element_exists.js +1 -2
package/dist/tools/browser/inspection/get_computed_styles.js +2 -3
package/dist/tools/browser/inspection/inspect_ancestors.js +4 -5
package/dist/tools/browser/inspection/inspect_dom.js +20 -4
package/dist/tools/browser/inspection/measure_element.js +2 -3
package/dist/tools/browser/inspection/query_selector.js +3 -3
package/dist/tools/browser/interaction/click.js +178 -6
package/dist/tools/browser/interaction/drag.js +2 -4
package/dist/tools/browser/interaction/fill.js +2 -3
package/dist/tools/browser/interaction/hover.js +1 -2
package/dist/tools/browser/interaction/press_key.js +1 -2
package/dist/tools/browser/interaction/select.js +1 -2
package/dist/tools/browser/interaction/upload_file.js +1 -2
package/dist/tools/browser/navigation/scroll_by.js +1 -1
package/dist/tools/browser/navigation/scroll_to_element.js +1 -2
package/dist/tools/browser/waiting/wait_for_element.js +1 -2
package/dist/tools/common/types.d.ts +1 -0
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -723,7 +723,7 @@ Issues:
 Test a selector and return detailed information about all matched elements. Essential for selector debugging and finding the right element to interact with. Returns compact text format with element tag, position, text content, visibility status, and interaction capability. Shows why elements are hidden (display:none, opacity:0, zero size). Supports testid shortcuts (e.g., 'testid:submit-button'). Use limit parameter to control how many matches to show (default: 10). NEW: Use onlyVisible parameter to filter results (true=visible only, false=hidden only, undefined=all).
 - Parameters:
-  - selector (string, required): CSS selector, text selector, or testid shorthand to test (e.g., 'button.submit', 'testid:login-form', 'text=Sign In')
+  - selector (string, required): CSS selector, text selector, or testid shorthand to test (e.g., 'button.submit', 'testid:login-form', 'text=Sign In', 'dialog::button' to scope the lookup to the topmost open dialog/sheet)
   - limit (number, optional): Maximum number of elements to return detailed info for (default: 10, recommended max: 50)
   - onlyVisible (boolean, optional): Filter results by visibility: true = show only visible elements, false = show only hidden elements, undefined/not specified = show all elements (default: undefined)
   - showAttributes (string, optional): Comma-separated list of HTML attributes to display for each element (e.g., 'id,name,aria-label,href,type'). If not specified, attributes are not shown.
@@ -952,7 +952,7 @@ Scroll an element into view. Automatically handles scrolling within the nearest
 Click an element on the page
 - Parameters:
-  - selector (string, required): CSS selector for the element to click
+  - selector (string, required): CSS selector for the element to click. Supports 'testid:NAME' and 'dialog::SELECTOR' (scopes the lookup to the topmost open dialog/sheet, e.g. 'dialog::testid:confirm').
 #### `drag`
 Drag an element to a target location
@@ -965,7 +965,7 @@ Drag an element to a target location
 fill an input/textarea/contenteditable; if the selector matches a wrapper, descends up to 4 levels to a unique fillable descendant (errors if zero or multiple)
 - Parameters:
-  - selector (string, required): CSS selector for input field or its wrapper
+  - selector (string, required): CSS selector for input field or its wrapper. Supports 'testid:NAME' and 'dialog::SELECTOR' (scopes to the topmost open dialog/sheet).
   - value (string, required): Value to fill
 #### `hover`
@@ -1006,10 +1006,10 @@ Upload a file to an input[type='file'] element on the page
   - maxLength (number, optional): Maximum number of characters to return (default: 20000)
 #### `get_text`
-[may return preview+token] ⚠️ RARELY NEEDED: Get ALL visible text content from the entire page (no structure, just raw text). Most tasks need structured inspection instead. ONLY use get_text for: (1) extracting text for content analysis (word count, language detection), (2) searching for text when location is completely unknown, (3) text-only snapshots for comparison. For structured tasks, use: inspect_dom() to understand page structure, find_by_text() to locate specific text with context, query_selector() to find elements. Auto-returns text if <2000 chars (small elements); if larger, returns a preview and a one-time token to fetch the full output via confirm_output. Supports testid shortcuts.
+[may return preview+token] ⚠️ RARELY NEEDED: Get ALL visible text content from the entire page (no structure, just raw text). Most tasks need structured inspection instead. ONLY use get_text for: (1) extracting text for content analysis (word count, language detection), (2) searching for text when location is completely unknown, (3) text-only snapshots for comparison. For structured tasks, use: inspect_dom() to understand page structure, find_by_text() to locate specific text with context, query_selector() to find elements. Auto-returns text if <2000 chars (small elements); if larger, returns a preview and a one-time token to fetch the full output via confirm_output. Supports testid shortcuts and the `dialog::SELECTOR` scope to read inside the topmost open dialog/sheet.
 - Parameters:
-  - selector (string, optional): CSS selector, text selector, or testid shorthand to limit text extraction to a specific container. Omit to get text from entire page. Example: 'testid:article-body' or '#main-content'
+  - selector (string, optional): CSS selector, text selector, or testid shorthand to limit text extraction to a specific container. Omit to get text from entire page. Examples: 'testid:article-body', '#main-content', 'dialog::section' (scopes lookup to the topmost open dialog/sheet — useful when a sheet covers ambiguous page chrome). Use bare 'dialog::' for the whole topmost dialog.
   - maxLength (number, optional): Maximum number of characters to return (default: 20000)
 #### `visual_screenshot_for_humans`

package/dist/index.js CHANGED Viewed

@@ -4,7 +4,7 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
 import { createToolDefinitions } from "./tools/common/registry.js";
 import { setupRequestHandlers } from "./requestHandler.js";
 import { parseArgs } from "node:util";
-import { setSessionConfig } from "./toolHandler.js";
+import { setSessionConfig, ensureBrowser } from "./toolHandler.js";
 import { readFileSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 import { dirname, join } from "node:path";
@@ -41,6 +41,10 @@ const { values } = parseArgs({
         'cdp-port': {
             type: 'string',
         },
+        'warmup-browser': {
+            type: 'boolean',
+            default: false,
+        },
         'print-tools-json': {
             type: 'boolean',
             default: false,
@@ -93,6 +97,7 @@ async function runServer() {
         headlessDefault: Boolean(values['headless']) || (process.platform === 'linux' && !process.env.DISPLAY && !process.env.WAYLAND_DISPLAY),
         exposeSensitiveNetworkData: Boolean(values['expose-sensitive-network-data']),
         cdpPort,
+        warmupBrowser: Boolean(values['warmup-browser']),
     };
     setSessionConfig(sessionConfig);
     // Create tool definitions with session config
@@ -137,6 +142,15 @@ async function runServer() {
     // Create transport and connect
     const transport = new StdioServerTransport();
     await server.connect(transport);
+    // Optional eager browser launch. Off by default — sessions that never invoke
+    // an MCP tool shouldn't pay for Chromium startup. Useful when external
+    // clients (e.g. CDP seed/login scripts) need the browser up before any tool
+    // call. Non-blocking — failures surface on the first tool call.
+    if (sessionConfig.warmupBrowser) {
+        ensureBrowser({ headless: sessionConfig.headlessDefault }).catch(err => {
+            console.error("Eager browser warmup failed (will retry on first tool call):", err);
+        });
+    }
 }
 runServer().catch((error) => {
     console.error("Fatal error in main():", error);

package/dist/toolHandler.d.ts CHANGED Viewed

@@ -19,6 +19,8 @@ export interface NetworkRequest {
     };
 }
 type ColorSchemeOverride = 'light' | 'dark' | 'no-preference';
+export declare function hasShownNthHint(): boolean;
+export declare function markNthHintShown(): void;
 /**
  * Sets the session configuration
  */

package/dist/toolHandler.js CHANGED Viewed

@@ -14,8 +14,19 @@ let sessionConfig = {
     headlessDefault: false,
     exposeSensitiveNetworkData: false,
     cdpPort: 0,
+    warmupBrowser: false,
 };
 let colorSchemeOverride = null;
+// Session-scoped flag: the verbose "matched multiple elements" nth-selector guidance is emitted
+// only once per browser session; after the first emit, tools surface only the short ⚠ warning to
+// keep agent context lean. Read via hasShownNthHint(), set via markNthHintShown(), cleared by resetBrowserState().
+let nthHintShown = false;
+export function hasShownNthHint() {
+    return nthHintShown;
+}
+export function markNthHintShown() {
+    nthHintShown = true;
+}
 /**
  * Sets the session configuration
  */
@@ -50,6 +61,7 @@ export function resetBrowserState() {
     currentBrowserType = 'chromium';
     currentDevice = undefined;
     networkLog = [];
+    nthHintShown = false;
     clearConsoleLogs();
 }
 /**

package/dist/tools/browser/base.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { Page } from 'playwright';
+import type { Locator, Page } from 'playwright';
 import { ToolHandler, ToolContext, ToolResponse } from '../common/types.js';
 /**
  * Base class for all browser-based tools
@@ -20,10 +20,49 @@ export declare abstract class BrowserToolBase implements ToolHandler {
      *     "#radix-\:rc\:-content-123" → "id=radix-:rc:-content-123"
      * - Remove unnecessary escapes for bracket characters only (\\[ and \\])
      *   DO NOT unescape colons globally — colons in class/ID names must stay escaped in CSS.
+     *
+     * Note: the `dialog::SELECTOR` scope shortcut (e.g., `dialog::section`,
+     * `dialog::testid:close`) is NOT handled here — it is a runtime scope
+     * resolved by `createScopedLocator()`, not a syntactic rewrite.
+     *
      * @param selector The selector string
      * @returns Normalized selector
      */
     protected normalizeSelector(selector: string): string;
+    /**
+     * Build a Playwright `Locator` honoring the `dialog::SELECTOR` scope shortcut.
+     *
+     * - `dialog::section`           → topmost open dialog/sheet, then `section` inside it
+     * - `dialog::testid:close`      → topmost open dialog, then `[data-testid="close"]` inside it
+     * - `dialog::`                  → the topmost open dialog itself
+     * - anything else               → `page.locator(normalizeSelector(rawSelector))`
+     *
+     * "Topmost" is determined by the highest effective z-index — for each
+     * visible candidate dialog, we walk from the dialog up through its ancestors
+     * (stopping before <body>) and take the largest numeric z-index found on any
+     * positioned node, floored at 0 — typically the backdrop/glass-screen
+     * wrapper's. DOM order is the tiebreaker (later wins). This is more robust
+     * than picking `.last()` because portal frameworks don't always append in
+     * z-order, and modal stacking is driven by the backdrop's z-index.
+     */
+    protected createScopedLocator(page: Page, rawSelector: string): Promise<Locator>;
+    /**
+     * Detect a "user-dominating" open modal — i.e. one that a human would
+     * visually focus on and interact with to the exclusion of the rest of the
+     * page. Used by inspect_dom / get_text / get_html to auto-scope when no
+     * selector is provided, so the LLM's view matches the human's view.
+     *
+     * Strict criterion: requires `aria-modal="true"` (or native `dialog[open]`)
+     * because non-modal `[role="dialog"]` includes things like side panels and
+     * tooltips that don't dominate the page.
+     *
+     * Returns null if no active modal is open. Otherwise returns the topmost
+     * one, ranked by the same z-index walk used by `createScopedLocator()`.
+     */
+    protected detectActiveModal(page: Page): Promise<{
+        descriptor: string;
+        suggestion: string;
+    } | null>;
     /**
      * Sanitize verbose Playwright selector engine messages by removing stack traces and
      * keeping only the essential syntax error information.
@@ -105,7 +144,7 @@ export declare abstract class BrowserToolBase implements ToolHandler {
      * @param preferredVisible Whether visibility preference was used
      * @returns Formatted string or empty if only one element
      */
-    protected formatElementSelectionInfo(selector: string, elementIndex: number, totalCount: number, preferredVisible?: boolean): string;
+    protected formatElementSelectionInfo(selector: string, elementIndex: number, totalCount: number, preferredVisible?: boolean): Promise<string>;
     /**
      * Generate a warning message if the selector is a testid and there are duplicates
      *

package/dist/tools/browser/base.js CHANGED Viewed

@@ -16,6 +16,11 @@ export class BrowserToolBase {
      *     "#radix-\:rc\:-content-123" → "id=radix-:rc:-content-123"
      * - Remove unnecessary escapes for bracket characters only (\\[ and \\])
      *   DO NOT unescape colons globally — colons in class/ID names must stay escaped in CSS.
+     *
+     * Note: the `dialog::SELECTOR` scope shortcut (e.g., `dialog::section`,
+     * `dialog::testid:close`) is NOT handled here — it is a runtime scope
+     * resolved by `createScopedLocator()`, not a syntactic rewrite.
+     *
      * @param selector The selector string
      * @returns Normalized selector
      */
@@ -76,6 +81,188 @@ export class BrowserToolBase {
         cleaned = cleaned.replace(/\\{2,}(?=:)/g, '\\');
         return cleaned;
     }
+    /**
+     * Build a Playwright `Locator` honoring the `dialog::SELECTOR` scope shortcut.
+     *
+     * - `dialog::section`           → topmost open dialog/sheet, then `section` inside it
+     * - `dialog::testid:close`      → topmost open dialog, then `[data-testid="close"]` inside it
+     * - `dialog::`                  → the topmost open dialog itself
+     * - anything else               → `page.locator(normalizeSelector(rawSelector))`
+     *
+     * "Topmost" is determined by the highest effective z-index — for each
+     * visible candidate dialog, we walk from the dialog up through its ancestors
+     * (stopping before <body>) and take the largest numeric z-index found on any
+     * positioned node, floored at 0 — typically the backdrop/glass-screen
+     * wrapper's. DOM order is the tiebreaker (later wins). This is more robust
+     * than picking `.last()` because portal frameworks don't always append in
+     * z-order, and modal stacking is driven by the backdrop's z-index.
+     */
+    async createScopedLocator(page, rawSelector) {
+        const trimmed = (rawSelector ?? '').trim(); // null/undefined selector is treated as ''
+        const DIALOG_PREFIX = 'dialog::';
+        if (!trimmed.startsWith(DIALOG_PREFIX)) {
+            return page.locator(this.normalizeSelector(trimmed)); // no dialog:: prefix → plain page-level locator
+        }
+        const dialogRoots = '[role="dialog"]:not([aria-hidden="true"]),' +
+            '[role="alertdialog"]:not([aria-hidden="true"]),' +
+            'dialog[open]';
+        // Match detectActiveModal: include only user-visible candidates and
+        // rank by effective z-index. Without the visibility filter, a hidden
+        // dialog left in the DOM (display:none) could be picked over an
+        // actually-open one. Runs in the page and returns an index into the match list.
+        const result = await page.evaluate((rootsSelector) => {
+            const isUserVisible = (el) => {
+                const cs = getComputedStyle(el);
+                if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
+                    return false;
+                }
+                const rect = el.getBoundingClientRect();
+                return rect.width > 0 && rect.height > 0; // zero-size boxes count as not visible
+            };
+            const allCandidates = Array.from(document.querySelectorAll(rootsSelector)); // document order; topIndex indexes this list
+            const visibleIndices = [];
+            allCandidates.forEach((el, i) => {
+                if (isUserVisible(el))
+                    visibleIndices.push(i);
+            });
+            if (visibleIndices.length === 0)
+                return { topIndex: -1, hasVisible: false };
+            if (visibleIndices.length === 1)
+                return { topIndex: visibleIndices[0], hasVisible: true }; // single visible dialog: no ranking needed
+            const effectiveZ = (start) => {
+                let z = 0; // floor: negative z-indexes never lower the result
+                let node = start;
+                while (node && node !== document.body) { // visits the dialog itself and every ancestor before <body>
+                    const cs = getComputedStyle(node);
+                    if (cs.position !== 'static') {
+                        const parsed = parseInt(cs.zIndex, 10); // 'auto' parses to NaN and is skipped
+                        if (!isNaN(parsed)) {
+                            z = Math.max(z, parsed); // keeps the largest z-index seen, not just the nearest one
+                        }
+                    }
+                    node = node.parentElement;
+                }
+                return z;
+            };
+            let bestIdx = visibleIndices[0];
+            let bestScore = -Infinity;
+            visibleIndices.forEach((i) => {
+                // Tiebreaker: DOM order — later element is on top. The 1e6 multiplier keeps z-index dominant while there are fewer than 1,000,000 candidates.
+                const score = effectiveZ(allCandidates[i]) * 1000000 + i;
+                if (score > bestScore) {
+                    bestScore = score;
+                    bestIdx = i;
+                }
+            });
+            return { topIndex: bestIdx, hasVisible: true };
+        }, dialogRoots).catch(() => ({ topIndex: -1, hasVisible: false })); // an evaluate failure is treated as "no visible dialog"
+        // No visible dialog → return a never-matching locator so downstream
+        // callers see a clean "No elements found" instead of silently scoping
+        // to a hidden dialog left in the DOM.
+        if (!result.hasVisible) {
+            return page.locator('dialog-no-such-element-sentinel');
+        }
+        const topmostDialog = page.locator(dialogRoots).nth(result.topIndex); // NOTE(review): assumes the locator enumerates the same elements, in the same order, as document.querySelectorAll above — confirm for dialogs inside shadow DOM
+        const inner = trimmed.slice(DIALOG_PREFIX.length).trim();
+        if (!inner) {
+            return topmostDialog; // bare 'dialog::' → the dialog element itself
+        }
+        return topmostDialog.locator(this.normalizeSelector(inner));
+    }
+    /**
+     * Detect a "user-dominating" open modal — i.e. one that a human would
+     * visually focus on and interact with to the exclusion of the rest of the
+     * page. Used by inspect_dom / get_text / get_html to auto-scope when no
+     * selector is provided, so the LLM's view matches the human's view.
+     *
+     * Strict criterion: requires `aria-modal="true"` (or native `dialog[open]`)
+     * because non-modal `[role="dialog"]` includes things like side panels and
+     * tooltips that don't dominate the page.
+     *
+     * Returns null if no active modal is open. Otherwise returns the topmost
+     * one, ranked by the same z-index walk used by `createScopedLocator()`.
+     */
+    async detectActiveModal(page) {
+        const ACTIVE_MODAL_SELECTOR = '[role="dialog"][aria-modal="true"]:not([aria-hidden="true"]),' +
+            '[role="alertdialog"][aria-modal="true"]:not([aria-hidden="true"]),' +
+            'dialog[open]';
+        return await page.evaluate((rootsSelector) => {
+            const isUserVisible = (el) => {
+                const cs = getComputedStyle(el);
+                if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
+                    return false;
+                }
+                const rect = el.getBoundingClientRect();
+                return rect.width > 0 && rect.height > 0;
+            };
+            const candidates = Array.from(document.querySelectorAll(rootsSelector)).filter(isUserVisible);
+            if (candidates.length === 0)
+                return null;
+            const effectiveZ = (start) => {
+                let z = 0;
+                let node = start;
+                while (node && node !== document.body) {
+                    const cs = getComputedStyle(node);
+                    if (cs.position !== 'static') {
+                        const parsed = parseInt(cs.zIndex, 10);
+                        if (!isNaN(parsed)) {
+                            z = Math.max(z, parsed);
+                        }
+                    }
+                    node = node.parentElement;
+                }
+                return z;
+            };
+            let bestIdx = 0;
+            let bestScore = -Infinity;
+            candidates.forEach((el, i) => {
+                const score = effectiveZ(el) * 1000000 + i;
+                if (score > bestScore) {
+                    bestScore = score;
+                    bestIdx = i;
+                }
+            });
+            const top = candidates[bestIdx];
+            const tag = top.tagName.toLowerCase();
+            const role = top.getAttribute('role') || (tag === 'dialog' ? 'dialog' : '');
+            const testid = top.getAttribute('data-testid') ||
+                top.getAttribute('data-test') ||
+                top.getAttribute('data-cy');
+            const id = top.id || null;
+            const ariaLabel = top.getAttribute('aria-label');
+            const ariaLabelledBy = top.getAttribute('aria-labelledby');
+            let labelText = null;
+            if (ariaLabelledBy) {
+                const labelEl = document.getElementById(ariaLabelledBy);
+                labelText = labelEl?.textContent?.trim() || null;
+            }
+            const parts = [`<${tag}`];
+            if (role)
+                parts.push(`role="${role}"`);
+            if (testid)
+                parts.push(`data-testid="${testid}"`);
+            else if (id)
+                parts.push(`id="${id}"`);
+            if (ariaLabel)
+                parts.push(`aria-label="${ariaLabel}"`);
+            else if (labelText)
+                parts.push(`labelled="${labelText.slice(0, 60)}"`);
+            parts[parts.length - 1] += '>';
+            return {
+                descriptor: parts.join(' '),
+                suggestion: testid ? `dialog::testid:${testid}` : 'dialog::',
+            };
+        }, ACTIVE_MODAL_SELECTOR).then((result) => {
+            // Defensive: only treat as a real modal if the result is a
+            // properly-shaped object. Mocked test environments may return
+            // arbitrary values from page.evaluate() that should not trigger
+            // auto-scope.
+            if (result && typeof result === 'object' && typeof result.descriptor === 'string') {
+                return result;
+            }
+            return null;
+        }, () => null);
+    }
     /**
      * Sanitize verbose Playwright selector engine messages by removing stack traces and
      * keeping only the essential syntax error information.
@@ -249,24 +436,27 @@ export class BrowserToolBase {
         // Check for multiple elements with errorOnMultiple flag
         if (options?.errorOnMultiple && count > 1) {
             const selector = options.originalSelector || 'selector';
-            const nthHint = ''.trimEnd();
-            const warning = ''.trimEnd();
             let message = `Selector "${selector}" matched ${count} elements. Please use a more specific selector.`;
-            if (nthHint) {
-                message += `\n${nthHint}`;
-            }
-            if (warning) {
-                message += `\n${warning}`;
-            }
-            {
+            // Verbose disambiguation guidance is rate-limited per session — useful
+            // once for the agent to learn the pattern, noise on every subsequent call.
+            // After the first emit, fall back to a one-line pointer.
+            const { hasShownNthHint, markNthHintShown } = await import('../../toolHandler.js');
+            if (!hasShownNthHint()) {
                 const guidance = [
                     `1) Preferred: add a unique data-testid and select it directly (e.g., testid:submit).`,
                     `2) If you cannot change markup: append \`>> nth=<index>\` to target a specific match.`,
                 ];
-                const matchesDetails = await this.describeMatchedElements(locator, selector, count);
-                message += `\n${guidance.join('\n')}\n\nMatches:\n${matchesDetails}`;
-                throw new Error(message);
+                message += `\n${guidance.join('\n')}`;
+                markNthHintShown();
             }
+            else {
+                message += `\nUse a more specific selector (e.g. testid:..., or '>> nth=<index>').`;
+            }
+            // Per-call match details remain — they describe what's actually on the
+            // page, not generic advice.
+            const matchesDetails = await this.describeMatchedElements(locator, selector, count);
+            message += `\n\nMatches:\n${matchesDetails}`;
+            throw new Error(message);
         }
         // Handle explicit element index (1-based)
         if (options?.elementIndex !== undefined) {
@@ -316,7 +506,7 @@ export class BrowserToolBase {
      * @param preferredVisible Whether visibility preference was used
      * @returns Formatted string or empty if only one element
      */
-    formatElementSelectionInfo(selector, elementIndex, totalCount, preferredVisible = true) {
+    async formatElementSelectionInfo(selector, elementIndex, totalCount, preferredVisible = true) {
         const usesNth = selector.includes('>> nth=');
         if (totalCount <= 1) {
             // Even when a single element is ultimately targeted, discourage nth usage
@@ -326,10 +516,25 @@ export class BrowserToolBase {
             }
             return '';
         }
-        const duplicateWarning = this.getDuplicateTestIdWarning(selector, totalCount).trimEnd();
-        const nthHint = this.buildNthSelectorHint(selector, totalCount).trimEnd();
         const avoidNth = usesNth ? "💡 Tip: Avoid relying on '>> nth='; add a unique data-testid instead." : '';
-        const extraHints = [duplicateWarning, nthHint, avoidNth].filter(Boolean).join('\n');
+        // Verbose nth-selector guidance is rate-limited to one emit per session.
+        // The short ⚠ warning still surfaces every call; the multi-line hint block
+        // (duplicate-testid tip + nth-selector workaround) appears only on the
+        // first multi-match of the session — it's reference material the agent
+        // only needs once.
+        let extraHints = '';
+        const { hasShownNthHint, markNthHintShown } = await import('../../toolHandler.js');
+        if (!hasShownNthHint()) {
+            const duplicateWarning = this.getDuplicateTestIdWarning(selector, totalCount).trimEnd();
+            const nthHint = this.buildNthSelectorHint(selector, totalCount).trimEnd();
+            extraHints = [duplicateWarning, nthHint, avoidNth].filter(Boolean).join('\n');
+            if (duplicateWarning || nthHint) {
+                markNthHintShown();
+            }
+        }
+        else if (avoidNth) {
+            extraHints = avoidNth;
+        }
         const baseMessage = preferredVisible
             ? `⚠ Found ${totalCount} elements matching "${selector}", using element ${elementIndex + 1} (first visible)`
             : `⚠ Found ${totalCount} elements matching "${selector}", using element ${elementIndex + 1}`;

package/dist/tools/browser/common/postAction.d.ts CHANGED Viewed

@@ -5,3 +5,15 @@ export declare function titleUrlChangeLines(page: Page, initial?: {
     url?: string;
     title?: string;
 }): Promise<string[]>;
+export type OverlayKind = 'dialog' | 'menu' | 'listbox' | 'tooltip' | 'popup';
+export interface OverlayEntry {
+    descriptor: string;
+    kind: OverlayKind;
+    suggestion?: string;
+}
+export interface OverlaySnapshot {
+    keys: string[];
+    entries: Record<string, OverlayEntry>;
+}
+export declare function snapshotOpenOverlays(page: Page): Promise<OverlaySnapshot>;
+export declare function overlayChangeLines(before: OverlaySnapshot, after: OverlaySnapshot): string[];

package/dist/tools/browser/common/postAction.js CHANGED Viewed

@@ -44,3 +44,161 @@ export async function titleUrlChangeLines(page, initial = {}) {
     }
     return lines;
 }
+// Selector covering modal-style dialogs: elements with role="dialog" or
+// role="alertdialog" that are not marked aria-hidden="true", plus native
+// <dialog open> elements. Kept distinct from popup roots so the kind
+// classifier and the dialog:: shortcut can coexist cleanly.
+const DIALOG_ROOTS_SELECTOR = '[role="dialog"]:not([aria-hidden="true"]),' +
+    '[role="alertdialog"]:not([aria-hidden="true"]),' +
+    'dialog[open]';
+// Selector covering popup-style overlays (menus, listboxes, tooltips,
+// expanded combobox panels, Radix-style data-state="open" content). These
+// surface in click output as "↑ Menu opened" etc. so the agent learns a
+// transient panel appeared without having to re-inspect the page.
+//
+// Note on framework attributes:
+// - `[data-state="open"]` is Radix; we restrict to known panel roles or
+//   Radix's popper content wrapper to avoid false positives on triggers.
+// - `[data-headlessui-state="open"]` is set by Headless UI on EVERY component
+//   (panels AND triggers), so it must be qualified by a panel role too.
+// - Unlike the plain role alternatives, the framework-qualified alternatives
+//   carry no `:not([aria-hidden="true"])` exclusion, so an open-state panel
+//   still matches when it is marked aria-hidden.
+const POPUP_ROOTS_SELECTOR = '[role="menu"]:not([aria-hidden="true"]),' +
+    '[role="listbox"]:not([aria-hidden="true"]),' +
+    '[role="tooltip"]:not([aria-hidden="true"]),' +
+    '[data-state="open"][role="menu"],' +
+    '[data-state="open"][role="listbox"],' +
+    '[data-state="open"][data-radix-popper-content-wrapper],' +
+    '[data-headlessui-state="open"][role="menu"],' +
+    '[data-headlessui-state="open"][role="listbox"]';
+// Snapshot which overlays (dialogs + popups) are currently visible. The key
+// encodes a stable fingerprint (role+testid/id/aria-label) so the same
+// overlay identifies consistently before and after an action, even if the
+// DOM list reorders. We avoid encoding DOM index when an identifying
+// attribute is present, so a re-render at a new position doesn't read as
+// "closed + opened" noise.
+export async function snapshotOpenOverlays(page) {
+    try {
+        return await page.evaluate(({ dialogSel, popupSel }) => {
+            const isUserVisible = (el) => {
+                const cs = getComputedStyle(el);
+                if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
+                    return false;
+                }
+                const rect = el.getBoundingClientRect();
+                return rect.width > 0 && rect.height > 0;
+            };
+            const classify = (el) => {
+                const tag = el.tagName.toLowerCase();
+                const role = el.getAttribute('role') || '';
+                if (role === 'dialog' || role === 'alertdialog' || tag === 'dialog')
+                    return 'dialog';
+                if (role === 'menu')
+                    return 'menu';
+                if (role === 'listbox')
+                    return 'listbox';
+                if (role === 'tooltip')
+                    return 'tooltip';
+                return 'popup';
+            };
+            const describe = (el, kind) => {
+                const tag = el.tagName.toLowerCase();
+                const role = el.getAttribute('role') || (tag === 'dialog' ? 'dialog' : '');
+                const testid = el.getAttribute('data-testid') ||
+                    el.getAttribute('data-test') ||
+                    el.getAttribute('data-cy');
+                const id = el.id || null;
+                const ariaLabel = el.getAttribute('aria-label');
+                const ariaLabelledBy = el.getAttribute('aria-labelledby');
+                let labelText = null;
+                if (ariaLabelledBy) {
+                    const labelEl = document.getElementById(ariaLabelledBy);
+                    labelText = labelEl?.textContent?.trim() || null;
+                }
+                const parts = [`<${tag}`];
+                if (role)
+                    parts.push(`role="${role}"`);
+                if (testid)
+                    parts.push(`data-testid="${testid}"`);
+                else if (id)
+                    parts.push(`id="${id}"`);
+                if (ariaLabel)
+                    parts.push(`aria-label="${ariaLabel}"`);
+                else if (labelText)
+                    parts.push(`labelled="${labelText.slice(0, 60)}"`);
+                parts[parts.length - 1] += '>';
+                const descriptor = parts.join(' ');
+                // When an identifying attribute is present, the key is stable across
+                // re-renders. Otherwise, fall back to a normalized text fingerprint:
+                // digit runs replaced with `#` so that badge counts, timestamps, or
+                // counters in menu items don't flip the key on every re-render.
+                // Avoids the "closed + opened" false positive that a raw DOM index
+                // (or raw text snippet) would produce.
+                const stableTextFingerprint = (() => {
+                    const raw = (el.textContent || '').replace(/\s+/g, ' ').trim();
+                    return raw.replace(/\d+/g, '#').slice(0, 40);
+                })();
+                const fingerprint = testid || id || ariaLabel || labelText || stableTextFingerprint;
+                const key = `${kind}|${role}|${fingerprint}`;
+                const suggestion = kind === 'dialog' ? (testid ? `dialog::testid:${testid}` : 'dialog::') : undefined;
+                return { descriptor, key, suggestion };
+            };
+            const result = { keys: [], entries: {} };
+            const seen = new Set();
+            const collect = (selector) => {
+                Array.from(document.querySelectorAll(selector))
+                    .filter(isUserVisible)
+                    .forEach((el) => {
+                    if (seen.has(el))
+                        return;
+                    seen.add(el);
+                    const kind = classify(el);
+                    const { descriptor, key, suggestion } = describe(el, kind);
+                    // Disambiguate truly-identical overlays by appending a counter.
+                    let uniqueKey = key;
+                    let n = 1;
+                    while (result.entries[uniqueKey]) {
+                        uniqueKey = `${key}#${++n}`;
+                    }
+                    result.keys.push(uniqueKey);
+                    result.entries[uniqueKey] = { descriptor, kind, suggestion };
+                });
+            };
+            collect(dialogSel);
+            collect(popupSel);
+            return result;
+        }, { dialogSel: DIALOG_ROOTS_SELECTOR, popupSel: POPUP_ROOTS_SELECTOR });
+    }
+    catch {
+        return { keys: [], entries: {} };
+    }
+}
+// Display label for each overlay kind; used as the leading word of the
+// "opened"/"closed" lines produced by overlayChangeLines.
+const KIND_LABEL = {
+    dialog: 'Dialog',
+    menu: 'Menu',
+    listbox: 'Listbox',
+    tooltip: 'Tooltip',
+    popup: 'Popup',
+};
+// Diff two snapshots and emit human-readable change lines. Used by click to
+// surface "↑ Menu opened" / "↓ Dialog closed" etc. so the LLM doesn't have
+// to re-inspect the page after every interaction.
+export function overlayChangeLines(before, after) {
+    const beforeSet = new Set(before.keys);
+    const afterSet = new Set(after.keys);
+    const opened = after.keys.filter(k => !beforeSet.has(k));
+    const closed = before.keys.filter(k => !afterSet.has(k));
+    const lines = [];
+    for (const k of opened) {
+        const e = after.entries[k];
+        if (!e)
+            continue;
+        lines.push(`↑ ${KIND_LABEL[e.kind]} opened: ${e.descriptor}`);
+        if (e.suggestion) {
+            lines.push(`  Tip: scope reads/clicks with '${e.suggestion}SELECTOR' (e.g. ${e.suggestion}button).`);
+        }
+    }
+    for (const k of closed) {
+        const e = before.entries[k];
+        if (!e)
+            continue;
+        lines.push(`↓ ${KIND_LABEL[e.kind]} closed: ${e.descriptor}`);
+    }
+    return lines;
+}