mcp-web-inspector 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/README.md +5 -5
  2. package/dist/index.js +15 -1
  3. package/dist/toolHandler.d.ts +2 -0
  4. package/dist/toolHandler.js +12 -0
  5. package/dist/tools/browser/base.d.ts +41 -2
  6. package/dist/tools/browser/base.js +221 -16
  7. package/dist/tools/browser/common/postAction.d.ts +12 -0
  8. package/dist/tools/browser/common/postAction.js +158 -0
  9. package/dist/tools/browser/content/get_html.js +19 -6
  10. package/dist/tools/browser/content/get_text.js +24 -8
  11. package/dist/tools/browser/inspection/check_visibility.js +2 -3
  12. package/dist/tools/browser/inspection/compare_element_alignment.js +6 -8
  13. package/dist/tools/browser/inspection/element_exists.js +1 -2
  14. package/dist/tools/browser/inspection/get_computed_styles.js +2 -3
  15. package/dist/tools/browser/inspection/inspect_ancestors.js +4 -5
  16. package/dist/tools/browser/inspection/inspect_dom.js +20 -4
  17. package/dist/tools/browser/inspection/measure_element.js +2 -3
  18. package/dist/tools/browser/inspection/query_selector.js +3 -3
  19. package/dist/tools/browser/interaction/click.js +178 -6
  20. package/dist/tools/browser/interaction/drag.js +2 -4
  21. package/dist/tools/browser/interaction/fill.js +2 -3
  22. package/dist/tools/browser/interaction/hover.js +1 -2
  23. package/dist/tools/browser/interaction/press_key.js +1 -2
  24. package/dist/tools/browser/interaction/select.js +1 -2
  25. package/dist/tools/browser/interaction/upload_file.js +1 -2
  26. package/dist/tools/browser/navigation/scroll_by.js +1 -1
  27. package/dist/tools/browser/navigation/scroll_to_element.js +1 -2
  28. package/dist/tools/browser/waiting/wait_for_element.js +1 -2
  29. package/dist/tools/common/types.d.ts +1 -0
  30. package/package.json +1 -1
package/README.md CHANGED
@@ -723,7 +723,7 @@ Issues:
723
723
  Test a selector and return detailed information about all matched elements. Essential for selector debugging and finding the right element to interact with. Returns compact text format with element tag, position, text content, visibility status, and interaction capability. Shows why elements are hidden (display:none, opacity:0, zero size). Supports testid shortcuts (e.g., 'testid:submit-button'). Use limit parameter to control how many matches to show (default: 10). NEW: Use onlyVisible parameter to filter results (true=visible only, false=hidden only, undefined=all).
724
724
 
725
725
  - Parameters:
726
- - selector (string, required): CSS selector, text selector, or testid shorthand to test (e.g., 'button.submit', 'testid:login-form', 'text=Sign In')
726
+ - selector (string, required): CSS selector, text selector, or testid shorthand to test (e.g., 'button.submit', 'testid:login-form', 'text=Sign In', 'dialog::button' to scope the lookup to the topmost open dialog/sheet)
727
727
  - limit (number, optional): Maximum number of elements to return detailed info for (default: 10, recommended max: 50)
728
728
  - onlyVisible (boolean, optional): Filter results by visibility: true = show only visible elements, false = show only hidden elements, undefined/not specified = show all elements (default: undefined)
729
729
  - showAttributes (string, optional): Comma-separated list of HTML attributes to display for each element (e.g., 'id,name,aria-label,href,type'). If not specified, attributes are not shown.
@@ -952,7 +952,7 @@ Scroll an element into view. Automatically handles scrolling within the nearest
952
952
  Click an element on the page
953
953
 
954
954
  - Parameters:
955
- - selector (string, required): CSS selector for the element to click
955
+ - selector (string, required): CSS selector for the element to click. Supports 'testid:NAME' and 'dialog::SELECTOR' (scopes the lookup to the topmost open dialog/sheet, e.g. 'dialog::testid:confirm').
956
956
 
957
957
  #### `drag`
958
958
  Drag an element to a target location
@@ -965,7 +965,7 @@ Drag an element to a target location
965
965
  fill an input/textarea/contenteditable; if the selector matches a wrapper, descends up to 4 levels to a unique fillable descendant (errors if zero or multiple)
966
966
 
967
967
  - Parameters:
968
- - selector (string, required): CSS selector for input field or its wrapper
968
+ - selector (string, required): CSS selector for input field or its wrapper. Supports 'testid:NAME' and 'dialog::SELECTOR' (scopes to the topmost open dialog/sheet).
969
969
  - value (string, required): Value to fill
970
970
 
971
971
  #### `hover`
@@ -1006,10 +1006,10 @@ Upload a file to an input[type='file'] element on the page
1006
1006
  - maxLength (number, optional): Maximum number of characters to return (default: 20000)
1007
1007
 
1008
1008
  #### `get_text`
1009
- [may return preview+token] ⚠️ RARELY NEEDED: Get ALL visible text content from the entire page (no structure, just raw text). Most tasks need structured inspection instead. ONLY use get_text for: (1) extracting text for content analysis (word count, language detection), (2) searching for text when location is completely unknown, (3) text-only snapshots for comparison. For structured tasks, use: inspect_dom() to understand page structure, find_by_text() to locate specific text with context, query_selector() to find elements. Auto-returns text if <2000 chars (small elements); if larger, returns a preview and a one-time token to fetch the full output via confirm_output. Supports testid shortcuts.
1009
+ [may return preview+token] ⚠️ RARELY NEEDED: Get ALL visible text content from the entire page (no structure, just raw text). Most tasks need structured inspection instead. ONLY use get_text for: (1) extracting text for content analysis (word count, language detection), (2) searching for text when location is completely unknown, (3) text-only snapshots for comparison. For structured tasks, use: inspect_dom() to understand page structure, find_by_text() to locate specific text with context, query_selector() to find elements. Auto-returns text if <2000 chars (small elements); if larger, returns a preview and a one-time token to fetch the full output via confirm_output. Supports testid shortcuts and the `dialog::SELECTOR` scope to read inside the topmost open dialog/sheet.
1010
1010
 
1011
1011
  - Parameters:
1012
- - selector (string, optional): CSS selector, text selector, or testid shorthand to limit text extraction to a specific container. Omit to get text from entire page. Example: 'testid:article-body' or '#main-content'
1012
+ - selector (string, optional): CSS selector, text selector, or testid shorthand to limit text extraction to a specific container. Omit to get text from entire page. Examples: 'testid:article-body', '#main-content', 'dialog::section' (scopes lookup to the topmost open dialog/sheet — useful when a sheet covers ambiguous page chrome). Use bare 'dialog::' for the whole topmost dialog.
1013
1013
  - maxLength (number, optional): Maximum number of characters to return (default: 20000)
1014
1014
 
1015
1015
  #### `visual_screenshot_for_humans`
package/dist/index.js CHANGED
@@ -4,7 +4,7 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
4
4
  import { createToolDefinitions } from "./tools/common/registry.js";
5
5
  import { setupRequestHandlers } from "./requestHandler.js";
6
6
  import { parseArgs } from "node:util";
7
- import { setSessionConfig } from "./toolHandler.js";
7
+ import { setSessionConfig, ensureBrowser } from "./toolHandler.js";
8
8
  import { readFileSync } from "node:fs";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { dirname, join } from "node:path";
@@ -41,6 +41,10 @@ const { values } = parseArgs({
41
41
  'cdp-port': {
42
42
  type: 'string',
43
43
  },
44
+ 'warmup-browser': {
45
+ type: 'boolean',
46
+ default: false,
47
+ },
44
48
  'print-tools-json': {
45
49
  type: 'boolean',
46
50
  default: false,
@@ -93,6 +97,7 @@ async function runServer() {
93
97
  headlessDefault: Boolean(values['headless']) || (process.platform === 'linux' && !process.env.DISPLAY && !process.env.WAYLAND_DISPLAY),
94
98
  exposeSensitiveNetworkData: Boolean(values['expose-sensitive-network-data']),
95
99
  cdpPort,
100
+ warmupBrowser: Boolean(values['warmup-browser']),
96
101
  };
97
102
  setSessionConfig(sessionConfig);
98
103
  // Create tool definitions with session config
@@ -137,6 +142,15 @@ async function runServer() {
137
142
  // Create transport and connect
138
143
  const transport = new StdioServerTransport();
139
144
  await server.connect(transport);
145
+ // Optional eager browser launch. Off by default — sessions that never invoke
146
+ // an MCP tool shouldn't pay for Chromium startup. Useful when external
147
+ // clients (e.g. CDP seed/login scripts) need the browser up before any tool
148
+ // call. Non-blocking — failures surface on the first tool call.
149
+ if (sessionConfig.warmupBrowser) {
150
+ ensureBrowser({ headless: sessionConfig.headlessDefault }).catch(err => {
151
+ console.error("Eager browser warmup failed (will retry on first tool call):", err);
152
+ });
153
+ }
140
154
  }
141
155
  runServer().catch((error) => {
142
156
  console.error("Fatal error in main():", error);
@@ -19,6 +19,8 @@ export interface NetworkRequest {
19
19
  };
20
20
  }
21
21
  type ColorSchemeOverride = 'light' | 'dark' | 'no-preference';
22
+ export declare function hasShownNthHint(): boolean; // true once the verbose nth-selector guidance has been emitted (flag is set by markNthHintShown)
23
+ export declare function markNthHintShown(): void; // records that the verbose nth-selector guidance was emitted, so later messages can use the short form
22
24
  /**
23
25
  * Sets the session configuration
24
26
  */
@@ -14,8 +14,19 @@ let sessionConfig = {
14
14
  headlessDefault: false,
15
15
  exposeSensitiveNetworkData: false,
16
16
  cdpPort: 0,
17
+ warmupBrowser: false,
17
18
  };
18
19
  let colorSchemeOverride = null;
20
// Session-scoped latch for the verbose "matched multiple elements"
// nth-selector guidance. The long-form hint is meant to be emitted at most
// once per browser session; once the latch is set, tools surface only the
// short ⚠ warning so agent context stays lean.
let nthHintShown = false;
/**
 * Report whether the verbose nth-selector guidance has already been emitted.
 *
 * @returns {boolean} `true` once `markNthHintShown()` has been called.
 */
export function hasShownNthHint() {
    return nthHintShown;
}
/**
 * Record that the verbose nth-selector guidance has been emitted, so that
 * subsequent multi-match messages can fall back to the short form.
 *
 * @returns {void}
 */
export function markNthHintShown() {
    nthHintShown = true;
}
19
30
  /**
20
31
  * Sets the session configuration
21
32
  */
@@ -50,6 +61,7 @@ export function resetBrowserState() {
50
61
  currentBrowserType = 'chromium';
51
62
  currentDevice = undefined;
52
63
  networkLog = [];
64
+ nthHintShown = false;
53
65
  clearConsoleLogs();
54
66
  }
55
67
  /**
@@ -1,4 +1,4 @@
1
- import type { Page } from 'playwright';
1
+ import type { Locator, Page } from 'playwright';
2
2
  import { ToolHandler, ToolContext, ToolResponse } from '../common/types.js';
3
3
  /**
4
4
  * Base class for all browser-based tools
@@ -20,10 +20,49 @@ export declare abstract class BrowserToolBase implements ToolHandler {
20
20
  * "#radix-\:rc\:-content-123" → "id=radix-:rc:-content-123"
21
21
  * - Remove unnecessary escapes for bracket characters only (\\[ and \\])
22
22
  * DO NOT unescape colons globally — colons in class/ID names must stay escaped in CSS.
23
+ *
24
+ * Note: the `dialog::SELECTOR` scope shortcut (e.g., `dialog::section`,
25
+ * `dialog::testid:close`) is NOT handled here — it is a runtime scope
26
+ * resolved by `createScopedLocator()`, not a syntactic rewrite.
27
+ *
23
28
  * @param selector The selector string
24
29
  * @returns Normalized selector
25
30
  */
26
31
  protected normalizeSelector(selector: string): string;
32
+ /**
33
+ * Build a Playwright `Locator` honoring the `dialog::SELECTOR` scope shortcut.
34
+ *
35
+ * - `dialog::section` → topmost open dialog/sheet, then `section` inside it
36
+ * - `dialog::testid:close` → topmost open dialog, then `[data-testid="close"]` inside it
37
+ * - `dialog::` → the topmost open dialog itself
38
+ * - anything else → `page.locator(normalizeSelector(rawSelector))`
39
+ *
40
+ * "Topmost" is determined by the highest effective z-index — for each
41
+ * candidate dialog, we walk from the dialog up to <body> and take the largest
42
+ * z-index set on any positioned element along the way (usually the
43
+ * backdrop/glass-screen wrapper, which is what stacking actually follows).
44
+ * DOM order is the tiebreaker. This is more robust than picking `.last()`
45
+ * because portal frameworks don't always append in z-order, and modal
46
+ * stacking is driven by the backdrop's z-index, not the dialog content's.
47
+ */
48
+ protected createScopedLocator(page: Page, rawSelector: string): Promise<Locator>;
49
+ /**
50
+ * Detect a "user-dominating" open modal — i.e. one that a human would
51
+ * visually focus on and interact with to the exclusion of the rest of the
52
+ * page. Used by inspect_dom / get_text / get_html to auto-scope when no
53
+ * selector is provided, so the LLM's view matches the human's view.
54
+ *
55
+ * Strict criterion: requires `aria-modal="true"` (or native `dialog[open]`)
56
+ * because non-modal `[role="dialog"]` includes things like side panels and
57
+ * tooltips that don't dominate the page.
58
+ *
59
+ * Returns null if no active modal is open. Otherwise returns the topmost
60
+ * one, ranked by the same z-index walk used by `createScopedLocator()`.
61
+ */
62
+ protected detectActiveModal(page: Page): Promise<{
63
+ descriptor: string;
64
+ suggestion: string;
65
+ } | null>;
27
66
  /**
28
67
  * Sanitize verbose Playwright selector engine messages by removing stack traces and
29
68
  * keeping only the essential syntax error information.
@@ -105,7 +144,7 @@ export declare abstract class BrowserToolBase implements ToolHandler {
105
144
  * @param preferredVisible Whether visibility preference was used
106
145
  * @returns Formatted string or empty if only one element
107
146
  */
108
- protected formatElementSelectionInfo(selector: string, elementIndex: number, totalCount: number, preferredVisible?: boolean): string;
147
+ protected formatElementSelectionInfo(selector: string, elementIndex: number, totalCount: number, preferredVisible?: boolean): Promise<string>;
109
148
  /**
110
149
  * Generate a warning message if the selector is a testid and there are duplicates
111
150
  *
@@ -16,6 +16,11 @@ export class BrowserToolBase {
16
16
  * "#radix-\:rc\:-content-123" → "id=radix-:rc:-content-123"
17
17
  * - Remove unnecessary escapes for bracket characters only (\\[ and \\])
18
18
  * DO NOT unescape colons globally — colons in class/ID names must stay escaped in CSS.
19
+ *
20
+ * Note: the `dialog::SELECTOR` scope shortcut (e.g., `dialog::section`,
21
+ * `dialog::testid:close`) is NOT handled here — it is a runtime scope
22
+ * resolved by `createScopedLocator()`, not a syntactic rewrite.
23
+ *
19
24
  * @param selector The selector string
20
25
  * @returns Normalized selector
21
26
  */
@@ -76,6 +81,188 @@ export class BrowserToolBase {
76
81
  cleaned = cleaned.replace(/\\{2,}(?=:)/g, '\\');
77
82
  return cleaned;
78
83
  }
84
+ /**
85
+ * Build a Playwright `Locator` honoring the `dialog::SELECTOR` scope shortcut.
86
+ *
87
+ * - `dialog::section` → topmost open dialog/sheet, then `section` inside it
88
+ * - `dialog::testid:close` → topmost open dialog, then `[data-testid="close"]` inside it
89
+ * - `dialog::` → the topmost open dialog itself
90
+ * - anything else → `page.locator(normalizeSelector(rawSelector))`
91
+ *
92
+ * "Topmost" is determined by the highest effective z-index — for each
93
+ * candidate dialog, we walk up to the nearest positioned ancestor (almost
94
+ * always the backdrop/glass-screen wrapper, which is what stacking actually
95
+ * follows) and read its z-index. DOM order is the tiebreaker. This is more
96
+ * robust than picking `.last()` because portal frameworks don't always
97
+ * append in z-order, and modal stacking is driven by the backdrop's
98
+ * z-index, not the dialog content's.
99
+ */
100
+ async createScopedLocator(page, rawSelector) {
101
+ const trimmed = (rawSelector ?? '').trim();
102
+ const DIALOG_PREFIX = 'dialog::';
103
+ if (!trimmed.startsWith(DIALOG_PREFIX)) {
104
+ return page.locator(this.normalizeSelector(trimmed));
105
+ }
106
+ const dialogRoots = '[role="dialog"]:not([aria-hidden="true"]),' +
107
+ '[role="alertdialog"]:not([aria-hidden="true"]),' +
108
+ 'dialog[open]';
109
+ // Match detectActiveModal: include only user-visible candidates and
110
+ // rank by effective z-index. Without the visibility filter, a hidden
111
+ // dialog left in the DOM (display:none) could be picked over an
112
+ // actually-open one.
113
+ const result = await page.evaluate((rootsSelector) => {
114
+ const isUserVisible = (el) => {
115
+ const cs = getComputedStyle(el);
116
+ if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
117
+ return false;
118
+ }
119
+ const rect = el.getBoundingClientRect();
120
+ return rect.width > 0 && rect.height > 0;
121
+ };
122
+ const allCandidates = Array.from(document.querySelectorAll(rootsSelector));
123
+ const visibleIndices = [];
124
+ allCandidates.forEach((el, i) => {
125
+ if (isUserVisible(el))
126
+ visibleIndices.push(i);
127
+ });
128
+ if (visibleIndices.length === 0)
129
+ return { topIndex: -1, hasVisible: false };
130
+ if (visibleIndices.length === 1)
131
+ return { topIndex: visibleIndices[0], hasVisible: true };
132
+ const effectiveZ = (start) => {
133
+ let z = 0;
134
+ let node = start;
135
+ while (node && node !== document.body) {
136
+ const cs = getComputedStyle(node);
137
+ if (cs.position !== 'static') {
138
+ const parsed = parseInt(cs.zIndex, 10);
139
+ if (!isNaN(parsed)) {
140
+ z = Math.max(z, parsed);
141
+ }
142
+ }
143
+ node = node.parentElement;
144
+ }
145
+ return z;
146
+ };
147
+ let bestIdx = visibleIndices[0];
148
+ let bestScore = -Infinity;
149
+ visibleIndices.forEach((i) => {
150
+ // Tiebreaker: DOM order — later element is on top.
151
+ const score = effectiveZ(allCandidates[i]) * 1000000 + i;
152
+ if (score > bestScore) {
153
+ bestScore = score;
154
+ bestIdx = i;
155
+ }
156
+ });
157
+ return { topIndex: bestIdx, hasVisible: true };
158
+ }, dialogRoots).catch(() => ({ topIndex: -1, hasVisible: false }));
159
+ // No visible dialog → return a never-matching locator so downstream
160
+ // callers see a clean "No elements found" instead of silently scoping
161
+ // to a hidden dialog left in the DOM.
162
+ if (!result.hasVisible) {
163
+ return page.locator('dialog-no-such-element-sentinel');
164
+ }
165
+ const topmostDialog = page.locator(dialogRoots).nth(result.topIndex);
166
+ const inner = trimmed.slice(DIALOG_PREFIX.length).trim();
167
+ if (!inner) {
168
+ return topmostDialog;
169
+ }
170
+ return topmostDialog.locator(this.normalizeSelector(inner));
171
+ }
172
+ /**
173
+ * Detect a "user-dominating" open modal — i.e. one that a human would
174
+ * visually focus on and interact with to the exclusion of the rest of the
175
+ * page. Used by inspect_dom / get_text / get_html to auto-scope when no
176
+ * selector is provided, so the LLM's view matches the human's view.
177
+ *
178
+ * Strict criterion: requires `aria-modal="true"` (or native `dialog[open]`)
179
+ * because non-modal `[role="dialog"]` includes things like side panels and
180
+ * tooltips that don't dominate the page.
181
+ *
182
+ * Returns null if no active modal is open. Otherwise returns the topmost
183
+ * one, ranked by the same z-index walk used by `createScopedLocator()`.
184
+ */
185
+ async detectActiveModal(page) {
186
+ const ACTIVE_MODAL_SELECTOR = '[role="dialog"][aria-modal="true"]:not([aria-hidden="true"]),' +
187
+ '[role="alertdialog"][aria-modal="true"]:not([aria-hidden="true"]),' +
188
+ 'dialog[open]';
189
+ return await page.evaluate((rootsSelector) => {
190
+ const isUserVisible = (el) => {
191
+ const cs = getComputedStyle(el);
192
+ if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
193
+ return false;
194
+ }
195
+ const rect = el.getBoundingClientRect();
196
+ return rect.width > 0 && rect.height > 0;
197
+ };
198
+ const candidates = Array.from(document.querySelectorAll(rootsSelector)).filter(isUserVisible);
199
+ if (candidates.length === 0)
200
+ return null;
201
+ const effectiveZ = (start) => {
202
+ let z = 0;
203
+ let node = start;
204
+ while (node && node !== document.body) {
205
+ const cs = getComputedStyle(node);
206
+ if (cs.position !== 'static') {
207
+ const parsed = parseInt(cs.zIndex, 10);
208
+ if (!isNaN(parsed)) {
209
+ z = Math.max(z, parsed);
210
+ }
211
+ }
212
+ node = node.parentElement;
213
+ }
214
+ return z;
215
+ };
216
+ let bestIdx = 0;
217
+ let bestScore = -Infinity;
218
+ candidates.forEach((el, i) => {
219
+ const score = effectiveZ(el) * 1000000 + i;
220
+ if (score > bestScore) {
221
+ bestScore = score;
222
+ bestIdx = i;
223
+ }
224
+ });
225
+ const top = candidates[bestIdx];
226
+ const tag = top.tagName.toLowerCase();
227
+ const role = top.getAttribute('role') || (tag === 'dialog' ? 'dialog' : '');
228
+ const testid = top.getAttribute('data-testid') ||
229
+ top.getAttribute('data-test') ||
230
+ top.getAttribute('data-cy');
231
+ const id = top.id || null;
232
+ const ariaLabel = top.getAttribute('aria-label');
233
+ const ariaLabelledBy = top.getAttribute('aria-labelledby');
234
+ let labelText = null;
235
+ if (ariaLabelledBy) {
236
+ const labelEl = document.getElementById(ariaLabelledBy);
237
+ labelText = labelEl?.textContent?.trim() || null;
238
+ }
239
+ const parts = [`<${tag}`];
240
+ if (role)
241
+ parts.push(`role="${role}"`);
242
+ if (testid)
243
+ parts.push(`data-testid="${testid}"`);
244
+ else if (id)
245
+ parts.push(`id="${id}"`);
246
+ if (ariaLabel)
247
+ parts.push(`aria-label="${ariaLabel}"`);
248
+ else if (labelText)
249
+ parts.push(`labelled="${labelText.slice(0, 60)}"`);
250
+ parts[parts.length - 1] += '>';
251
+ return {
252
+ descriptor: parts.join(' '),
253
+ suggestion: testid ? `dialog::testid:${testid}` : 'dialog::',
254
+ };
255
+ }, ACTIVE_MODAL_SELECTOR).then((result) => {
256
+ // Defensive: only treat as a real modal if the result is a
257
+ // properly-shaped object. Mocked test environments may return
258
+ // arbitrary values from page.evaluate() that should not trigger
259
+ // auto-scope.
260
+ if (result && typeof result === 'object' && typeof result.descriptor === 'string') {
261
+ return result;
262
+ }
263
+ return null;
264
+ }, () => null);
265
+ }
79
266
  /**
80
267
  * Sanitize verbose Playwright selector engine messages by removing stack traces and
81
268
  * keeping only the essential syntax error information.
@@ -249,24 +436,27 @@ export class BrowserToolBase {
249
436
  // Check for multiple elements with errorOnMultiple flag
250
437
  if (options?.errorOnMultiple && count > 1) {
251
438
  const selector = options.originalSelector || 'selector';
252
- const nthHint = ''.trimEnd();
253
- const warning = ''.trimEnd();
254
439
  let message = `Selector "${selector}" matched ${count} elements. Please use a more specific selector.`;
255
- if (nthHint) {
256
- message += `\n${nthHint}`;
257
- }
258
- if (warning) {
259
- message += `\n${warning}`;
260
- }
261
- {
440
+ // Verbose disambiguation guidance is rate-limited per session — useful
441
+ // once for the agent to learn the pattern, noise on every subsequent call.
442
+ // After the first emit, fall back to a one-line pointer.
443
+ const { hasShownNthHint, markNthHintShown } = await import('../../toolHandler.js');
444
+ if (!hasShownNthHint()) {
262
445
  const guidance = [
263
446
  `1) Preferred: add a unique data-testid and select it directly (e.g., testid:submit).`,
264
447
  `2) If you cannot change markup: append \`>> nth=<index>\` to target a specific match.`,
265
448
  ];
266
- const matchesDetails = await this.describeMatchedElements(locator, selector, count);
267
- message += `\n${guidance.join('\n')}\n\nMatches:\n${matchesDetails}`;
268
- throw new Error(message);
449
+ message += `\n${guidance.join('\n')}`;
450
+ markNthHintShown();
269
451
  }
452
+ else {
453
+ message += `\nUse a more specific selector (e.g. testid:..., or '>> nth=<index>').`;
454
+ }
455
+ // Per-call match details remain — they describe what's actually on the
456
+ // page, not generic advice.
457
+ const matchesDetails = await this.describeMatchedElements(locator, selector, count);
458
+ message += `\n\nMatches:\n${matchesDetails}`;
459
+ throw new Error(message);
270
460
  }
271
461
  // Handle explicit element index (1-based)
272
462
  if (options?.elementIndex !== undefined) {
@@ -316,7 +506,7 @@ export class BrowserToolBase {
316
506
  * @param preferredVisible Whether visibility preference was used
317
507
  * @returns Formatted string or empty if only one element
318
508
  */
319
- formatElementSelectionInfo(selector, elementIndex, totalCount, preferredVisible = true) {
509
+ async formatElementSelectionInfo(selector, elementIndex, totalCount, preferredVisible = true) {
320
510
  const usesNth = selector.includes('>> nth=');
321
511
  if (totalCount <= 1) {
322
512
  // Even when a single element is ultimately targeted, discourage nth usage
@@ -326,10 +516,25 @@ export class BrowserToolBase {
326
516
  }
327
517
  return '';
328
518
  }
329
- const duplicateWarning = this.getDuplicateTestIdWarning(selector, totalCount).trimEnd();
330
- const nthHint = this.buildNthSelectorHint(selector, totalCount).trimEnd();
331
519
  const avoidNth = usesNth ? "💡 Tip: Avoid relying on '>> nth='; add a unique data-testid instead." : '';
332
- const extraHints = [duplicateWarning, nthHint, avoidNth].filter(Boolean).join('\n');
520
+ // Verbose nth-selector guidance is rate-limited to one emit per session.
521
+ // The short ⚠ warning still surfaces every call; the multi-line hint block
522
+ // (duplicate-testid tip + nth-selector workaround) appears only on the
523
+ // first multi-match of the session — it's reference material the agent
524
+ // only needs once.
525
+ let extraHints = '';
526
+ const { hasShownNthHint, markNthHintShown } = await import('../../toolHandler.js');
527
+ if (!hasShownNthHint()) {
528
+ const duplicateWarning = this.getDuplicateTestIdWarning(selector, totalCount).trimEnd();
529
+ const nthHint = this.buildNthSelectorHint(selector, totalCount).trimEnd();
530
+ extraHints = [duplicateWarning, nthHint, avoidNth].filter(Boolean).join('\n');
531
+ if (duplicateWarning || nthHint) {
532
+ markNthHintShown();
533
+ }
534
+ }
535
+ else if (avoidNth) {
536
+ extraHints = avoidNth;
537
+ }
333
538
  const baseMessage = preferredVisible
334
539
  ? `⚠ Found ${totalCount} elements matching "${selector}", using element ${elementIndex + 1} (first visible)`
335
540
  : `⚠ Found ${totalCount} elements matching "${selector}", using element ${elementIndex + 1}`;
@@ -5,3 +5,15 @@ export declare function titleUrlChangeLines(page: Page, initial?: {
5
5
  url?: string;
6
6
  title?: string;
7
7
  }): Promise<string[]>;
8
+ export type OverlayKind = 'dialog' | 'menu' | 'listbox' | 'tooltip' | 'popup'; // overlay category; 'popup' is the fallback when no dialog/menu/listbox/tooltip role matches
9
+ export interface OverlayEntry { // one visible overlay recorded in a snapshot
10
+ descriptor: string; // compact tag-like summary, e.g. `<div role="dialog" data-testid="confirm">`
11
+ kind: OverlayKind; // category assigned from the element's role/tag
12
+ suggestion?: string; // dialog-scope hint ('dialog::' or 'dialog::testid:NAME'); only set for kind 'dialog'
13
+ }
14
+ export interface OverlaySnapshot { // set of overlays visible at one point in time
15
+ keys: string[]; // unique overlay keys in collection order (dialogs first, then popups)
16
+ entries: Record<string, OverlayEntry>; // overlay details, indexed by the same keys
17
+ }
18
+ export declare function snapshotOpenOverlays(page: Page): Promise<OverlaySnapshot>; // resolves to an empty snapshot if the in-page evaluation fails
19
+ export declare function overlayChangeLines(before: OverlaySnapshot, after: OverlaySnapshot): string[]; // "↑ ... opened" lines for new keys, then "↓ ... closed" lines for missing keys
@@ -44,3 +44,161 @@ export async function titleUrlChangeLines(page, initial = {}) {
44
44
  }
45
45
  return lines;
46
46
  }
47
// Selector covering modal-style dialogs. Kept distinct from popup roots so
// the kind classifier and the dialog:: shortcut can coexist cleanly.
const DIALOG_ROOTS_SELECTOR = '[role="dialog"]:not([aria-hidden="true"]),' +
    '[role="alertdialog"]:not([aria-hidden="true"]),' +
    'dialog[open]';
// Selector covering popup-style overlays (menus, listboxes, tooltips,
// expanded combobox panels, Radix-style data-state="open" content). These
// surface in click output as "↑ Menu opened" etc. so the agent learns a
// transient panel appeared without having to re-inspect the page.
//
// Note on framework attributes:
// - `[data-state="open"]` is Radix; we restrict to known panel roles or
//   Radix's popper content wrapper to avoid false positives on triggers.
// - `[data-headlessui-state="open"]` is set by Headless UI on EVERY component
//   (panels AND triggers), so it must be qualified by a panel role too.
const POPUP_ROOTS_SELECTOR = '[role="menu"]:not([aria-hidden="true"]),' +
    '[role="listbox"]:not([aria-hidden="true"]),' +
    '[role="tooltip"]:not([aria-hidden="true"]),' +
    '[data-state="open"][role="menu"],' +
    '[data-state="open"][role="listbox"],' +
    '[data-state="open"][data-radix-popper-content-wrapper],' +
    '[data-headlessui-state="open"][role="menu"],' +
    '[data-headlessui-state="open"][role="listbox"]';
/**
 * Snapshot which overlays (dialogs first, then popups) are currently visible.
 *
 * Each overlay gets a key of the form `kind|role|fingerprint`. The
 * fingerprint is the first available of: test id (`data-testid`, `data-test`,
 * `data-cy`), element id, `aria-label`, resolved `aria-labelledby` text, or a
 * normalized text fingerprint (whitespace collapsed, digit runs replaced by
 * `#`, first 40 characters). No DOM index is encoded, so a re-render at a new
 * position does not read as "closed + opened". Overlays that still collide get
 * `#2`, `#3`, … appended.
 *
 * `aria-labelledby` is treated as a space-separated ID reference list; the
 * trimmed text of every referenced element that exists is joined with a
 * single space.
 *
 * @param page Playwright page to inspect
 * @returns `{ keys, entries }`; an empty snapshot if the evaluation throws
 *          (deliberate best-effort — overlay reporting must not fail the action).
 */
export async function snapshotOpenOverlays(page) {
    try {
        // The callback runs inside the page, so it must be fully self-contained.
        return await page.evaluate(({ dialogSel, popupSel }) => {
            const isUserVisible = (el) => {
                const cs = getComputedStyle(el);
                if (cs.display === 'none' || cs.visibility === 'hidden' || parseFloat(cs.opacity) === 0) {
                    return false;
                }
                const rect = el.getBoundingClientRect();
                return rect.width > 0 && rect.height > 0;
            };
            const classify = (el) => {
                const tag = el.tagName.toLowerCase();
                const role = el.getAttribute('role') || '';
                if (role === 'dialog' || role === 'alertdialog' || tag === 'dialog') {
                    return 'dialog';
                }
                if (role === 'menu') {
                    return 'menu';
                }
                if (role === 'listbox') {
                    return 'listbox';
                }
                if (role === 'tooltip') {
                    return 'tooltip';
                }
                return 'popup';
            };
            // aria-labelledby is an ID reference LIST: resolve each ID
            // separately instead of passing the whole value to getElementById().
            const resolveLabelText = (idRefs) => {
                if (!idRefs) {
                    return null;
                }
                const joined = idRefs
                    .split(/\s+/)
                    .filter(Boolean)
                    .map((refId) => document.getElementById(refId)?.textContent?.trim() || '')
                    .filter(Boolean)
                    .join(' ');
                return joined || null;
            };
            const describe = (el, kind) => {
                const tag = el.tagName.toLowerCase();
                const role = el.getAttribute('role') || (tag === 'dialog' ? 'dialog' : '');
                const testid = el.getAttribute('data-testid') ||
                    el.getAttribute('data-test') ||
                    el.getAttribute('data-cy');
                const id = el.id || null;
                const ariaLabel = el.getAttribute('aria-label');
                const labelText = resolveLabelText(el.getAttribute('aria-labelledby'));
                const parts = [`<${tag}`];
                if (role) {
                    parts.push(`role="${role}"`);
                }
                if (testid) {
                    parts.push(`data-testid="${testid}"`);
                }
                else if (id) {
                    parts.push(`id="${id}"`);
                }
                if (ariaLabel) {
                    parts.push(`aria-label="${ariaLabel}"`);
                }
                else if (labelText) {
                    parts.push(`labelled="${labelText.slice(0, 60)}"`);
                }
                parts[parts.length - 1] += '>';
                const descriptor = parts.join(' ');
                // Fallback fingerprint when no identifying attribute exists:
                // digit runs become `#` so badge counts, timestamps, or counters
                // in menu items don't flip the key on every re-render.
                const stableTextFingerprint = (el.textContent || '')
                    .replace(/\s+/g, ' ')
                    .trim()
                    .replace(/\d+/g, '#')
                    .slice(0, 40);
                const fingerprint = testid || id || ariaLabel || labelText || stableTextFingerprint;
                const key = `${kind}|${role}|${fingerprint}`;
                const suggestion = kind === 'dialog' ? (testid ? `dialog::testid:${testid}` : 'dialog::') : undefined;
                return { descriptor, key, suggestion };
            };
            const result = { keys: [], entries: {} };
            // An element can match both selector groups; record it only once.
            const seen = new Set();
            const collect = (selector) => {
                for (const el of document.querySelectorAll(selector)) {
                    if (!isUserVisible(el) || seen.has(el)) {
                        continue;
                    }
                    seen.add(el);
                    const kind = classify(el);
                    const { descriptor, key, suggestion } = describe(el, kind);
                    // Disambiguate truly-identical overlays by appending a counter.
                    let uniqueKey = key;
                    let n = 1;
                    while (result.entries[uniqueKey]) {
                        uniqueKey = `${key}#${++n}`;
                    }
                    result.keys.push(uniqueKey);
                    result.entries[uniqueKey] = { descriptor, kind, suggestion };
                }
            };
            collect(dialogSel);
            collect(popupSel);
            return result;
        }, { dialogSel: DIALOG_ROOTS_SELECTOR, popupSel: POPUP_ROOTS_SELECTOR });
    }
    catch {
        return { keys: [], entries: {} };
    }
}
172
// Human-readable label for each overlay kind, used in change lines.
const KIND_LABEL = {
    dialog: 'Dialog',
    menu: 'Menu',
    listbox: 'Listbox',
    tooltip: 'Tooltip',
    popup: 'Popup',
};
/**
 * Diff two overlay snapshots and emit human-readable change lines
 * ("↑ Menu opened: …" / "↓ Dialog closed: …"), so the caller does not have to
 * re-inspect the page after every interaction.
 *
 * Opened overlays are listed first (in `after.keys` order), then closed ones
 * (in `before.keys` order). An opened overlay that carries a `suggestion`
 * gets an extra tip line showing how to scope selectors to it.
 *
 * The tip always uses the bare `dialog::` scope prefix when the suggestion
 * starts with it. Appending `SELECTOR` / `button` to a full suggestion such
 * as `dialog::testid:confirm` would produce `dialog::testid:confirmbutton`,
 * which is not a usable selector.
 *
 * @param before Snapshot taken before the action (`{ keys, entries }`)
 * @param after Snapshot taken after the action (`{ keys, entries }`)
 * @returns {string[]} Change lines; empty when nothing opened or closed.
 */
export function overlayChangeLines(before, after) {
    const DIALOG_SCOPE_PREFIX = 'dialog::';
    const beforeSet = new Set(before.keys);
    const afterSet = new Set(after.keys);
    const opened = after.keys.filter((k) => !beforeSet.has(k));
    const closed = before.keys.filter((k) => !afterSet.has(k));
    // Unknown kinds fall back to the generic label instead of printing "undefined".
    const labelFor = (kind) => KIND_LABEL[kind] ?? KIND_LABEL.popup;
    const lines = [];
    for (const k of opened) {
        const e = after.entries[k];
        if (!e) {
            continue;
        }
        lines.push(`↑ ${labelFor(e.kind)} opened: ${e.descriptor}`);
        if (e.suggestion) {
            const scope = e.suggestion.startsWith(DIALOG_SCOPE_PREFIX)
                ? DIALOG_SCOPE_PREFIX
                : e.suggestion;
            lines.push(` Tip: scope reads/clicks with '${scope}SELECTOR' (e.g. ${scope}button).`);
        }
    }
    for (const k of closed) {
        const e = before.entries[k];
        if (!e) {
            continue;
        }
        lines.push(`↓ ${labelFor(e.kind)} closed: ${e.descriptor}`);
    }
    return lines;
}