chrometools-mcp 3.5.0 → 3.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,29 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [3.5.2] - 2026-02-16
6
+
7
+ ### Added
8
+ - **Modal/Dialog detection (React Portals)** — `analyzePage` now detects modals rendered via React Portals (antd, MUI, Bootstrap, Chakra, Element UI, Headless UI, Radix, Mantine). ModalModel class in Element Model system with `role="dialog"` / `aria-modal="true"` matching. Portal wrapper ancestors are force-included in APOM tree with compact format. Modal metadata includes title and action buttons
9
+ - **TxtInp `clear` action** — TextInput model now supports `executeModelAction(action: "clear")` for clearing pre-filled form fields
10
+
11
+ ### Fixed
12
+ - **React controlled input clearing** — `type(clearFirst: true)` now works correctly with React/Vue/Angular controlled inputs (antd `<Input>`, MUI `<TextField>`, etc.). Uses native `HTMLInputElement.prototype.value` setter to bypass framework value trackers that ignored programmatic `el.value = ''` changes. Applied to both TextInputModel and TextareaModel
13
+ - **"ModelRegistry is not defined" error** — Fixed sporadic ReferenceError when calling `executeModelAction` or `click` after page navigation. Bare `ModelRegistry` identifier was inaccessible after `eval()` in strict mode contexts; changed to `window.ModelRegistry` reference
14
+ - **Modal output bloat** — ModalModel now only matches actual dialog elements (`role="dialog"` or `aria-modal="true"`), not framework wrapper divs (`ant-modal-root`, `ant-modal-wrap`). Reduces `modalCount` from 3 to 1 per modal and forces wrapper ancestors to compact container format
15
+
16
+ ## [3.5.1] - 2026-02-16
17
+
18
+ ### Fixed
19
+ - **APOM selector uniqueness** — `generateSelector()` in APOM tree converter now verifies CSS selector uniqueness against the entire document instead of just parent element. Fixes critical bug where `click(id: "button_X")` could click the wrong element (e.g., navigation button instead of action button) when multiple elements shared the same class like `.ant-btn`
20
+ - **findElementsByText click timeout** — `executeElementAction` click now uses adaptive strategy with 5s timeout and JS fallback instead of raw Puppeteer `element.click()` which could hang indefinitely on elements inside complex layouts (antd Tabs, scrollable containers)
21
+ - **findElementsByText non-unique selectors** — `getUniqueSelectorInPage` fallback now checks selector uniqueness at each level of path building (max depth 5→8), preventing clicks on wrong elements when multiple matches exist
22
+
23
+ ### Changed
24
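+ - **Screenshot defaults optimized** — Default format changed from PNG/auto to JPEG quality 40 for screenshots returned in context, reducing token usage from ~15-25k to ~5-10k tokens per screenshot (the save-to-file screenshot tool keeps its higher-quality defaults: quality 80, format auto)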
25
+ - **Action screenshots compressed** — Screenshots from click/findElementsByText/hover with `screenshot: true` now use lightweight JPEG (quality 40, maxWidth 800) instead of raw PNG, dramatically reducing context consumption
26
+ - **Jimp warmup** — Pre-warms Jimp image processor at server startup (non-blocking, after transport connect) to avoid cold-start delays on first screenshot
27
+
5
28
  ## [3.5.0] - 2026-02-16
6
29
 
7
30
  ### Added
package/README.md CHANGED
@@ -8,7 +8,7 @@
8
8
 
9
9
  **For AI Agents & Developers:**
10
10
  - 🎯 **56+ specialized tools** for browser automation - from simple clicks to Figma comparisons
11
- - 🧠 **APOM (Agent Page Object Model)** - AI-friendly page representation (~8-10k tokens vs 15-25k for screenshots)
11
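+ - 🧠 **APOM (Agent Page Object Model)** - AI-friendly structured page representation with unique element IDs and current form values (~8-10k tokens; a compressed screenshot costs ~5-10k tokens but carries no structured data)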
12
12
  - 🔄 **Persistent browser sessions** - pages stay open between commands for iterative workflows
13
13
  - ⚡ **Framework-aware** - handles React, Vue, Angular events and state updates automatically
14
14
  - 📸 **Visual testing** - compare designs pixel-by-pixel with Figma integration
@@ -322,7 +322,7 @@ executeScenario({ name: "login_flow", parameters: { email: "user@test.com" } })
322
322
  1. ✅ **`analyzePage()`** - PRIMARY tool for reading page content
323
323
  - Gets forms, inputs, buttons, links with current values
324
324
  - Use `refresh: true` after interactions to see updated state
325
- - Efficient: 2-5k tokens vs screenshot 15-25k
325
+ - Efficient: 2-5k tokens vs screenshot 5-10k
326
326
  2. ✅ **`findElementsByText()`** - Find specific elements by visible text
327
327
  3. ✅ **`getElement()`** - Get HTML of specific element
328
328
  4. ⚠️ **`executeScript()`** - LAST RESORT, only if above failed
@@ -335,6 +335,14 @@ executeScenario({ name: "login_flow", parameters: { email: "user@test.com" } })
335
335
  - Example: `executeModelAction({id: "input_34", action: "check"})`
336
336
  - Example: `executeModelAction({selector: ".datepicker", action: "SetDate", params: {date: "2024-03-15"}})`
337
337
  - See `models/` directory for available models and actions
338
+ - Available models: TxtInp, Sel, Btn, Chk, Radio, TxtArea, Link, Range, DatePicker, DateInp, FileInp, ColorInp, **Modal**, default
339
+
340
+ #### Modal/Dialog Support
341
+ - **Automatic detection**: APOM detects modals rendered via React Portals (antd, MUI, Bootstrap, Chakra, Mantine, Element UI, Headless UI, Radix)
342
+ - **Detection methods**: `role="dialog"`, `aria-modal="true"`, framework-specific CSS classes
343
+ - **Animation-proof**: Modal elements are included even during CSS appear animations (opacity: 0)
344
+ - **Rich metadata**: Modal nodes include `title` and `actions` (button labels) in metadata
345
+ - **In APOM tree**: Modals appear as `type: "dialog"` with `model: "Modal"`, containing all interactive children
338
346
 
339
347
  **Why specialized tools matter:**
340
348
  - ✅ Trigger proper browser events (click, input, change)
@@ -397,7 +405,7 @@ executeScenario({ name: "login_flow", parameters: { email: "user@test.com" } })
397
405
  - `useLegacyFormat` (optional): Return legacy format instead of APOM (default: false - APOM is the default)
398
406
  - `registerElements` (optional): Auto-register elements for ID-based usage (default: true) - `groupBy` (optional): 'type' or 'flat' - how to group elements (default: 'type') - **Why better than screenshot**:
399
407
  - Shows actual data (form values, validation errors) not just visual
400
- - Uses 2-5k tokens vs screenshot 15-25k tokens
408
+ - Uses 2-5k tokens vs screenshot 5-10k tokens
401
409
  - Returns structured data with **unique element IDs** for easy interaction
402
410
  - **Detects UI frameworks** (MUI, Ant Design, Chakra, Bootstrap, Vuetify, Semantic UI) - **Extracts dropdown options** from both native `<select>` and custom UI components - **Returns**:
403
411
  - **APOM format** (default): Tree-structured Page Object Model with unique IDs - `tree` - Hierarchical tree of page elements (optimized: ~82% smaller than flat format)
@@ -674,11 +682,11 @@ Capture optimized screenshot of specific element with smart compression and auto
674
682
  - `padding` (optional): Padding in pixels (default: 0)
675
683
  - `maxWidth` (optional): Max width for auto-scaling (default: 1024, null for original size)
676
684
  - `maxHeight` (optional): Max height for auto-scaling (default: 8000, null for original size)
677
- - `quality` (optional): JPEG quality 1-100 (default: 80)
678
- - `format` (optional): 'png', 'jpeg', or 'auto' (default: 'auto')
685
+ - `quality` (optional): JPEG quality 1-100 (default: 40)
686
+ - `format` (optional): 'png', 'jpeg', or 'auto' (default: 'jpeg')
679
687
  - **Use case**: Visual documentation, bug reports
680
- - **Returns**: Optimized image with metadata
681
- - **Default behavior**: Auto-scales to 1024px width and 8000px height (API limit) and uses smart compression to reduce AI token usage
688
+ - **Returns**: Optimized image with metadata (~5-10k tokens)
689
+ - **Default behavior**: JPEG at quality 40, auto-scales to 1024px width and 8000px height (API limit). For higher quality, explicitly set `quality` and `format` parameters
682
690
  - **Automatic compression**: If image exceeds 3 MB, automatically reduces quality or scales down to fit within limit
683
691
  - **For original quality**: Set `maxWidth: null`, `maxHeight: null` and `format: 'png'` (still enforces 3 MB limit)
684
692
 
@@ -692,7 +700,7 @@ Save optimized screenshot to filesystem without returning in context, with autom
692
700
  - `maxHeight` (optional): Max height for auto-scaling (default: 8000, null for original)
693
701
  - `quality` (optional): JPEG quality 1-100 (default: 80)
694
702
  - `format` (optional): 'png', 'jpeg', or 'auto' (default: 'auto')
695
- - **Use case**: Baseline screenshots, file storage
703
+ - **Use case**: Baseline screenshots, file storage (higher quality defaults than `screenshot` tool)
696
704
  - **Returns**: File path and metadata (not image data)
697
705
  - **Default behavior**: Auto-scales and compresses to save disk space
698
706
  - **Automatic compression**: If image exceeds 3 MB, automatically reduces quality or scales down to fit within limit
@@ -462,8 +462,10 @@ function getUniqueSelectorInPage(element) {
462
462
  }
463
463
 
464
464
  // 8. Fallback: nth-of-type with path
465
+ // Build path up to 8 levels, verifying uniqueness
465
466
  let current = element;
466
467
  const path = [];
468
+ const MAX_PATH_DEPTH = 8;
467
469
 
468
470
  while (current && current.tagName) {
469
471
  let selector = current.tagName.toLowerCase();
@@ -497,10 +499,21 @@ function getUniqueSelectorInPage(element) {
497
499
  }
498
500
 
499
501
  path.unshift(selector);
502
+
503
+ // Check if current path is already unique
504
+ try {
505
+ const candidateSelector = path.join(' > ');
506
+ if (document.querySelectorAll(candidateSelector).length === 1) {
507
+ return candidateSelector;
508
+ }
509
+ } catch (e) {
510
+ // Invalid selector, continue building path
511
+ }
512
+
500
513
  current = current.parentElement;
501
514
 
502
- // Stop at body or after 5 levels
503
- if (!current || current.tagName === 'BODY' || path.length >= 5) {
515
+ // Stop at body or after max depth
516
+ if (!current || current.tagName === 'BODY' || path.length >= MAX_PATH_DEPTH) {
504
517
  break;
505
518
  }
506
519
  }
package/index.js CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env node
2
2
 
3
3
  import {Server} from "@modelcontextprotocol/sdk/server/index.js";
4
4
  import {StdioServerTransport} from "@modelcontextprotocol/sdk/server/stdio.js";
@@ -584,7 +584,7 @@ async function executeToolInternal(name, args) {
584
584
 
585
585
  // Initialize registry if needed
586
586
  const registry = window.__MODEL_REGISTRY__ || (() => {
587
- const reg = new ModelRegistry();
587
+ const reg = new window.ModelRegistry();
588
588
  if (window.ELEMENT_MODELS_CLASSES) {
589
589
  reg.registerAll(window.ELEMENT_MODELS_CLASSES);
590
590
  }
@@ -2334,7 +2334,7 @@ Start coding now.`;
2334
2334
  return {
2335
2335
  content: [
2336
2336
  { type: 'text', text: JSON.stringify(response, null, 2) },
2337
- { type: 'image', data: actionResult.screenshot, mimeType: 'image/png' }
2337
+ { type: 'image', data: actionResult.screenshot, mimeType: actionResult.screenshotMimeType || 'image/png' }
2338
2338
  ]
2339
2339
  };
2340
2340
  }
@@ -2760,7 +2760,7 @@ Start coding now.`;
2760
2760
  return {
2761
2761
  content: [
2762
2762
  { type: 'text', text: JSON.stringify(response, null, 2) },
2763
- { type: 'image', data: actionResult.screenshot, mimeType: 'image/png' }
2763
+ { type: 'image', data: actionResult.screenshot, mimeType: actionResult.screenshotMimeType || 'image/png' }
2764
2764
  ]
2765
2765
  };
2766
2766
  }
@@ -3938,6 +3938,19 @@ async function main() {
3938
3938
 
3939
3939
  console.error("chrometools-mcp server running on stdio");
3940
3940
  console.error("Browser will be initialized on first openBrowser call");
3941
+
3942
+ // Pre-warm Jimp AFTER server is connected (non-blocking)
3943
+ // Jimp v0.22 constructor is thenable - awaiting it before server.connect()
3944
+ // would block transport and cause MCP client timeout
3945
+ (async () => {
3946
+ try {
3947
+ const img = await new Jimp(1, 1, 0x000000ff);
3948
+ await img.getBufferAsync(Jimp.MIME_JPEG);
3949
+ console.error("[chrometools-mcp] Jimp pre-warmed");
3950
+ } catch (e) {
3951
+ console.error("[chrometools-mcp] Jimp pre-warm failed:", e.message);
3952
+ }
3953
+ })();
3941
3954
  }
3942
3955
 
3943
3956
  main().catch((error) => {
@@ -42,13 +42,22 @@ export class TextInputModel extends BaseInputModel {
42
42
  try {
43
43
  // Method 1: Try Puppeteer typing (works for most cases)
44
44
  try {
45
- // Focus and clear using JS (most reliable)
45
+ // Focus and clear using native setter (works with React/Vue/Angular controlled inputs)
46
46
  await withTimeout(
47
47
  () => this.element.evaluate((el, shouldClear) => {
48
48
  el.focus();
49
49
  el.click();
50
50
  if (shouldClear) {
51
- el.value = '';
51
+ // Use native HTMLInputElement setter to bypass React's value tracker
52
+ // React overrides the value setter and ignores programmatic changes via el.value = ''
53
+ const nativeSetter = Object.getOwnPropertyDescriptor(
54
+ window.HTMLInputElement.prototype, 'value'
55
+ )?.set;
56
+ if (nativeSetter) {
57
+ nativeSetter.call(el, '');
58
+ } else {
59
+ el.value = '';
60
+ }
52
61
  el.dispatchEvent(new Event('input', { bubbles: true }));
53
62
  el.dispatchEvent(new Event('change', { bubbles: true }));
54
63
  }
@@ -78,11 +87,20 @@ export class TextInputModel extends BaseInputModel {
78
87
  // Fall through to JS method
79
88
  }
80
89
 
81
- // Method 2: Fallback to direct JS value setting
90
+ // Method 2: Fallback to direct JS value setting (with React-compatible native setter)
82
91
  await withTimeout(
83
92
  () => this.element.evaluate((el, newValue, shouldClear) => {
84
93
  el.focus();
85
- el.value = shouldClear ? newValue : el.value + newValue;
94
+ const finalValue = shouldClear ? newValue : el.value + newValue;
95
+ // Use native setter to bypass React's value tracker
96
+ const nativeSetter = Object.getOwnPropertyDescriptor(
97
+ window.HTMLInputElement.prototype, 'value'
98
+ )?.set;
99
+ if (nativeSetter) {
100
+ nativeSetter.call(el, finalValue);
101
+ } else {
102
+ el.value = finalValue;
103
+ }
86
104
  el.dispatchEvent(new Event('input', { bubbles: true }));
87
105
  el.dispatchEvent(new Event('change', { bubbles: true }));
88
106
  }, value, clearFirst),
@@ -54,7 +54,15 @@ export class TextareaModel extends BaseInputModel {
54
54
  await withTimeout(
55
55
  () => this.element.evaluate(el => {
56
56
  el.focus();
57
- el.value = '';
57
+ // Use native setter to bypass React's value tracker
58
+ const nativeSetter = Object.getOwnPropertyDescriptor(
59
+ window.HTMLTextAreaElement.prototype, 'value'
60
+ )?.set;
61
+ if (nativeSetter) {
62
+ nativeSetter.call(el, '');
63
+ } else {
64
+ el.value = '';
65
+ }
58
66
  el.dispatchEvent(new Event('input', { bubbles: true }));
59
67
  el.dispatchEvent(new Event('change', { bubbles: true }));
60
68
  }),
@@ -82,11 +90,19 @@ export class TextareaModel extends BaseInputModel {
82
90
  // Fall through to JS method
83
91
  }
84
92
 
85
- // Method 2: Fallback to direct JS value setting
93
+ // Method 2: Fallback to direct JS value setting (with React-compatible native setter)
86
94
  await withTimeout(
87
95
  () => this.element.evaluate((el, newValue, shouldClear) => {
88
96
  el.focus();
89
- el.value = shouldClear ? newValue : el.value + newValue;
97
+ const finalValue = shouldClear ? newValue : el.value + newValue;
98
+ const nativeSetter = Object.getOwnPropertyDescriptor(
99
+ window.HTMLTextAreaElement.prototype, 'value'
100
+ )?.set;
101
+ if (nativeSetter) {
102
+ nativeSetter.call(el, finalValue);
103
+ } else {
104
+ el.value = finalValue;
105
+ }
90
106
  el.dispatchEvent(new Event('input', { bubbles: true }));
91
107
  el.dispatchEvent(new Event('change', { bubbles: true }));
92
108
  }, value, clearFirst),
package/models/index.js CHANGED
@@ -19,7 +19,7 @@ class TextInputModel extends ElementModel {
19
19
  }
20
20
 
21
21
  getActions() {
22
- return ['type', 'click', 'hover', 'screenshot'];
22
+ return ['type', 'clear', 'click', 'hover', 'screenshot'];
23
23
  }
24
24
 
25
25
  matches(element, elementType) {
@@ -32,6 +32,7 @@ class TextInputModel extends ElementModel {
32
32
  getActionHandler(actionName) {
33
33
  const handlers = {
34
34
  'type': 'executeTypeAction',
35
+ 'clear': 'executeTypeAction',
35
36
  'click': 'executeClickAction',
36
37
  'hover': 'executeHoverAction',
37
38
  'screenshot': 'executeScreenshotAction'
@@ -381,6 +382,42 @@ class ColorInputModel extends ElementModel {
381
382
  }
382
383
  }
383
384
 
385
+ /**
386
+ * Modal/Dialog Model
387
+ * Handles: Modal dialogs, popups, overlays (React Portals, framework modals)
388
+ * Detects elements rendered via portals outside the main React tree
389
+ */
390
+ class ModalModel extends ElementModel {
391
+ getName() {
392
+ return 'Modal';
393
+ }
394
+
395
+ getActions() {
396
+ return ['screenshot', 'close', 'scrollTo'];
397
+ }
398
+
399
+ getPriority() {
400
+ return 200; // High priority — check before containers
401
+ }
402
+
403
+ matches(element, elementType) {
404
+ // Only match actual dialog elements, not framework wrappers
405
+ // Framework wrappers are detected separately for portal inclusion
406
+ if (element.getAttribute('role') === 'dialog') return true;
407
+ if (element.getAttribute('aria-modal') === 'true') return true;
408
+ return false;
409
+ }
410
+
411
+ getActionHandler(actionName) {
412
+ const handlers = {
413
+ 'screenshot': 'executeScreenshotAction',
414
+ 'close': 'executeClickAction',
415
+ 'scrollTo': 'executeScrollToAction'
416
+ };
417
+ return handlers[actionName] || null;
418
+ }
419
+ }
420
+
384
421
  /**
385
422
  * Default Model (fallback for non-interactive elements)
386
423
  */
@@ -424,6 +461,7 @@ const MODELS = [
424
461
  DateInputModel,
425
462
  FileInputModel,
426
463
  ColorInputModel,
464
+ ModalModel,
427
465
  DefaultModel
428
466
  ];
429
467
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "chrometools-mcp",
3
- "version": "3.5.0",
3
+ "version": "3.5.2",
4
4
  "description": "MCP (Model Context Protocol) server for Chrome automation using Puppeteer. Persistent browser sessions, UI framework detection (MUI, Ant Design, etc.), Page Object support, visual testing, Figma comparison. Works seamlessly in WSL, Linux, macOS, and Windows.",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -16,7 +16,7 @@ function initializeModelRegistry() {
16
16
  }
17
17
 
18
18
  // Create and populate registry
19
- const registry = new ModelRegistry();
19
+ const registry = new (window.ModelRegistry || ModelRegistry)();
20
20
 
21
21
  // Register all models (order doesn't matter, priority is handled internally)
22
22
  if (typeof window !== 'undefined' && window.ELEMENT_MODELS_CLASSES) {
@@ -73,12 +73,51 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
73
73
  let idCounter = 0;
74
74
  const elementIds = new WeakMap();
75
75
  const interactiveElements = new WeakSet();
76
+ const modalElements = new WeakSet(); // Elements inside modal portals (skip visibility checks)
77
+ const modalAncestors = new WeakSet(); // Portal wrapper ancestors (force compact format)
76
78
 
77
79
  // First pass: mark all interactive elements
78
80
  if (interactiveOnly) {
79
81
  markInteractiveElements(document.body);
80
82
  }
81
83
 
84
+ // Second pass: detect modal/dialog portals and force-mark for inclusion
85
+ // Modals are rendered via React Portals outside the main tree and may have
86
+ // opacity: 0 during animation — force-include them and all their descendants
87
+ if (interactiveOnly) {
88
+ // Framework-specific portal container patterns (used only for detection, not model assignment)
89
+ const portalPatterns = [
90
+ 'ant-modal-root', 'ant-modal-wrap',
91
+ 'MuiDialog-root', 'MuiModal-root',
92
+ 'modal-dialog',
93
+ 'chakra-modal__content-container',
94
+ 'el-dialog__wrapper', 'el-overlay-dialog',
95
+ 'headlessui-dialog',
96
+ 'radix-dialog',
97
+ 'mantine-Modal-root',
98
+ ];
99
+ function isPortalElement(el) {
100
+ if (el.getAttribute('role') === 'dialog') return true;
101
+ if (el.getAttribute('aria-modal') === 'true') return true;
102
+ const classes = el.className || '';
103
+ if (typeof classes !== 'string') return false;
104
+ return portalPatterns.some(p => classes.includes(p));
105
+ }
106
+
107
+ // Scan body direct children for framework-specific portal roots
108
+ for (const child of document.body.children) {
109
+ if (isPortalElement(child)) {
110
+ forceMarkModalTree(child);
111
+ }
112
+ }
113
+ // Also find deeper dialog elements (some frameworks nest portals)
114
+ document.querySelectorAll('[role="dialog"], [aria-modal="true"]').forEach(el => {
115
+ if (!modalElements.has(el)) {
116
+ forceMarkModalTree(el);
117
+ }
118
+ });
119
+ }
120
+
82
121
  // Build tree from body
83
122
  result.tree = buildNode(document.body, null, 0, []);
84
123
 
@@ -408,6 +447,31 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
408
447
  interactiveElements.add(document.body);
409
448
  }
410
449
 
450
+ /**
451
+ * Force-mark modal portal element and all its descendants for inclusion in APOM tree.
452
+ * Modal portals (React Portals) are rendered outside the main tree and may have
453
+ * opacity: 0 during CSS animations — this ensures they're always included.
454
+ */
455
+ function forceMarkModalTree(element) {
456
+ modalElements.add(element);
457
+ interactiveElements.add(element);
458
+ // Mark all descendants — inputs, buttons, etc. inside the modal
459
+ element.querySelectorAll('*').forEach(el => {
460
+ modalElements.add(el);
461
+ interactiveElements.add(el);
462
+ });
463
+ // Mark ancestors up to body (so the path from body to modal is traversed)
464
+ // Must add to modalElements for isVisible() bypass (0x0 dimensions, opacity:0)
465
+ // Also mark as modalAncestors to force compact format (portal wrappers are pass-through)
466
+ let current = element.parentElement;
467
+ while (current && current !== document.body) {
468
+ modalElements.add(current);
469
+ modalAncestors.add(current);
470
+ interactiveElements.add(current);
471
+ current = current.parentElement;
472
+ }
473
+ }
474
+
411
475
  /**
412
476
  * Check if element is in viewport
413
477
  */
@@ -431,7 +495,11 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
431
495
  */
432
496
  function isVisible(el) {
433
497
  // Check dimensions first (works for fixed position elements)
434
- if (el.offsetWidth === 0 || el.offsetHeight === 0) return false;
498
+ // Exception: modal portal wrapper divs may have 0x0 dimensions
499
+ // while their visible content (dialog, inputs, buttons) does not
500
+ if (el.offsetWidth === 0 || el.offsetHeight === 0) {
501
+ if (!modalElements.has(el)) return false;
502
+ }
435
503
 
436
504
  // Check computed styles
437
505
  const style = window.getComputedStyle(el);
@@ -439,11 +507,12 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
439
507
  return false;
440
508
  }
441
509
 
442
- // Check opacity, but allow exceptions for inputs that are commonly styled with opacity:0
443
- // (checkboxes, radios, file inputs often use opacity:0 with custom visual overlay)
510
+ // Check opacity, but allow exceptions for:
511
+ // - inputs styled with opacity:0 (checkboxes, radios, file inputs with custom overlay)
512
+ // - elements inside modal portals (opacity: 0 during CSS appear animation)
444
513
  const tag = el.tagName.toLowerCase();
445
514
  const isStylableInput = tag === 'input' && ['checkbox', 'radio', 'file'].includes(el.type);
446
- if (style.opacity === '0' && !isStylableInput) {
515
+ if (style.opacity === '0' && !isStylableInput && !modalElements.has(el)) {
447
516
  return false;
448
517
  }
449
518
 
@@ -457,7 +526,8 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
457
526
 
458
527
  // Additional check: element should be in viewport or have offsetParent
459
528
  // This handles elements inside position:fixed containers (Angular Material)
460
- return el.offsetParent !== null || style.position === 'fixed' || style.position === 'sticky';
529
+ // Exception: modal portal elements may lack offsetParent
530
+ return el.offsetParent !== null || style.position === 'fixed' || style.position === 'sticky' || modalElements.has(el);
461
531
  }
462
532
 
463
533
  /**
@@ -505,8 +575,31 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
505
575
  }
506
576
  }
507
577
 
578
+ // Modal containers: promote to interactive node with metadata
579
+ if (modelName === 'Modal') {
580
+ elementType.isInteractive = true;
581
+ elementType.type = 'dialog';
582
+ // Extract modal title
583
+ const titleEl = element.querySelector(
584
+ '.ant-modal-title, .MuiDialogTitle-root, [class*="modal-title"], [class*="dialog-title"], .modal-header h5, .modal-header h4'
585
+ );
586
+ const titleText = titleEl ? titleEl.textContent.trim().substring(0, 100) : null;
587
+ // Extract action buttons
588
+ const buttons = element.querySelectorAll('button');
589
+ const actions = Array.from(buttons)
590
+ .map(b => b.textContent.trim())
591
+ .filter(t => t && t.length > 0 && t.length < 30);
592
+ elementType.metadata = {
593
+ ...(elementType.metadata || {}),
594
+ ...(titleText ? { title: titleText } : {}),
595
+ ...(actions.length ? { actions: actions.slice(0, 5) } : {})
596
+ };
597
+ }
598
+
508
599
  // Build node - minimize non-interactive parents
509
- const isInteractive = elementType.isInteractive;
600
+ // Modal ancestors (portal wrappers) are forced to compact format —
601
+ // they have onclick handlers (close on outside click) but are just pass-through containers
602
+ const isInteractive = elementType.isInteractive && !modalAncestors.has(element);
510
603
 
511
604
  // Build node structure based on mode
512
605
  let node;
@@ -562,16 +655,14 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
562
655
 
563
656
  // Update metadata counters
564
657
  result.metadata.totalElements++;
565
- if (elementType.isInteractive) {
658
+ if (isInteractive) {
566
659
  result.metadata.interactiveCount++;
567
660
  }
568
661
  if (elementType.type === 'form') {
569
662
  result.metadata.formCount++;
570
663
  }
571
- if (position && (position.type === 'fixed' || position.type === 'absolute')) {
572
- if (position.zIndex && position.zIndex >= 100) {
573
- result.metadata.modalCount++;
574
- }
664
+ if (modelName === 'Modal') {
665
+ result.metadata.modalCount++;
575
666
  }
576
667
  if (depth > result.metadata.maxDepth) {
577
668
  result.metadata.maxDepth = depth;
@@ -1003,30 +1094,29 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
1003
1094
  if (stableClass) {
1004
1095
  const escapedClass = CSS.escape(stableClass);
1005
1096
  const classSelector = `.${escapedClass}`;
1006
- // Verify it's unique within parent context
1007
- if (element.parentElement) {
1008
- try {
1009
- const matches = element.parentElement.querySelectorAll(classSelector);
1010
- if (matches.length === 1 && matches[0] === element) {
1011
- return classSelector;
1012
- }
1013
- } catch (e) {
1014
- // Invalid selector, continue to path-based approach
1097
+ // Verify it's unique in the ENTIRE document (not just parent)
1098
+ try {
1099
+ const matches = document.querySelectorAll(classSelector);
1100
+ if (matches.length === 1 && matches[0] === element) {
1101
+ return classSelector;
1015
1102
  }
1103
+ } catch (e) {
1104
+ // Invalid selector, continue to path-based approach
1016
1105
  }
1017
1106
  }
1018
1107
 
1019
- // Build path from parent
1108
+ // Build path from element to body, checking uniqueness at each level
1020
1109
  const path = [];
1021
1110
  let current = element;
1111
+ const MAX_PATH_DEPTH = 8;
1022
1112
 
1023
- while (current && current !== document.body) {
1113
+ while (current && current !== document.body && path.length < MAX_PATH_DEPTH) {
1024
1114
  let selector = current.tagName.toLowerCase();
1025
1115
 
1026
1116
  // Add stable class if available (escaped for CSS selector safety)
1027
- const stableClass = getStableClassName(current);
1028
- if (stableClass) {
1029
- selector += `.${CSS.escape(stableClass)}`;
1117
+ const cls = getStableClassName(current);
1118
+ if (cls) {
1119
+ selector += `.${CSS.escape(cls)}`;
1030
1120
  }
1031
1121
 
1032
1122
  // Add nth-of-type if needed
@@ -1041,6 +1131,15 @@ function buildAPOMTree(interactiveOnly = true, viewportOnly = false) {
1041
1131
  }
1042
1132
 
1043
1133
  path.unshift(selector);
1134
+
1135
+ // Check if current path is already unique in the document
1136
+ try {
1137
+ const candidateSelector = path.join(' > ');
1138
+ if (document.querySelectorAll(candidateSelector).length === 1) {
1139
+ return candidateSelector;
1140
+ }
1141
+ } catch (e) { /* continue building path */ }
1142
+
1044
1143
  current = current.parentElement;
1045
1144
  }
1046
1145
 
@@ -92,7 +92,7 @@ export const toolDefinitions = [
92
92
  },
93
93
  {
94
94
  name: "screenshot",
95
- description: "Capture element image (15-25k tokens). Use analyzePage for form data/validation (8-10k tokens).",
95
+ description: "Capture element image (5-10k tokens). Use analyzePage for form data/validation (8-10k tokens).",
96
96
  inputSchema: {
97
97
  type: "object",
98
98
  properties: {
@@ -100,8 +100,8 @@ export const toolDefinitions = [
100
100
  padding: { type: "number", description: "Padding px (default: 0)" },
101
101
  maxWidth: { type: "number", description: "Max width px (default: 1024, null=original)" },
102
102
  maxHeight: { type: "number", description: "Max height px (default: 8000, null=original)" },
103
- quality: { type: "number", minimum: 1, maximum: 100, description: "JPEG quality (default: 80)" },
104
- format: { type: "string", enum: ["png", "jpeg", "auto"], description: "Format (default: auto)" },
103
+ quality: { type: "number", minimum: 1, maximum: 100, description: "JPEG quality (default: 40)" },
104
+ format: { type: "string", enum: ["png", "jpeg", "auto"], description: "Format (default: jpeg)" },
105
105
  },
106
106
  required: ["selector"],
107
107
  },
@@ -115,8 +115,8 @@ export const ScreenshotSchema = z.object({
115
115
  padding: z.number().optional().describe("Padding around element in pixels (default: 0)"),
116
116
  maxWidth: z.number().nullable().optional().describe("Maximum width in pixels, auto-scales if larger (default: 1024, set to null for original size)"),
117
117
  maxHeight: z.number().nullable().optional().describe("Maximum height in pixels, auto-scales if larger (default: 8000 for API limit, set to null for original size)"),
118
- quality: z.number().min(1).max(100).optional().describe("JPEG quality 1-100 (default: 80, only applies to JPEG format)"),
119
- format: z.enum(['png', 'jpeg', 'auto']).optional().describe("Image format: 'png', 'jpeg', or 'auto' (default: 'auto' - chooses based on size)"),
118
+ quality: z.number().min(1).max(100).optional().describe("JPEG quality 1-100 (default: 40)"),
119
+ format: z.enum(['png', 'jpeg', 'auto']).optional().describe("Image format (default: 'jpeg')"),
120
120
  }).refine(data => (data.id && !data.selector) || (!data.id && data.selector), {
121
121
  message: "Either 'id' or 'selector' must be provided, but not both"
122
122
  });
@@ -5,6 +5,7 @@
5
5
 
6
6
  import { runPostClickDiagnostics, formatDiagnosticsForAI } from '../post-click-diagnostics.js';
7
7
  import { generateClickHints } from '../hints-generator.js';
8
+ import { processScreenshot } from '../screenshot-processor.js';
8
9
 
9
10
  /**
10
11
  * Execute click action on element with adaptive strategy
@@ -186,10 +187,16 @@ export async function executeClickAction(page, element, options = {}) {
186
187
  { type: "text", text: `Clicked: ${identifier}${hintsText}${diagnosticsText}` }
187
188
  ];
188
189
 
189
- // Only add screenshot if requested
190
+ // Only add screenshot if requested — lightweight JPEG for action confirmation
190
191
  if (screenshot === true) {
191
- const screenshotData = await page.screenshot({ encoding: 'base64', fullPage: false });
192
- content.push({ type: "image", data: screenshotData, mimeType: "image/png" });
192
+ const screenshotBuffer = await page.screenshot({ encoding: 'binary', fullPage: false });
193
+ const processed = await processScreenshot(screenshotBuffer, {
194
+ maxWidth: 800,
195
+ maxHeight: 4000,
196
+ quality: 40,
197
+ format: 'jpeg',
198
+ });
199
+ content.push({ type: "image", data: processed.buffer.toString('base64'), mimeType: processed.mimeType });
193
200
  }
194
201
 
195
202
  return { content };
@@ -14,8 +14,8 @@ import { processScreenshot } from '../image-processing.js';
14
14
  * @param {number} options.padding - Padding around element in pixels (default: 0)
15
15
  * @param {number|null} options.maxWidth - Max width for scaling (default: 1024, null for original)
16
16
  * @param {number|null} options.maxHeight - Max height for scaling (default: 8000, null for original)
17
- * @param {number} options.quality - JPEG quality 1-100 (default: 80)
18
- * @param {string} options.format - Image format: 'png', 'jpeg', 'auto' (default: 'auto')
17
+ * @param {number} options.quality - JPEG quality 1-100 (default: 40)
18
+ * @param {string} options.format - Image format: 'png', 'jpeg', 'auto' (default: 'jpeg')
19
19
  * @returns {Promise<Object>} Result with content array (text + image)
20
20
  */
21
21
  export async function executeScreenshotAction(page, element, options = {}) {
@@ -24,8 +24,8 @@ export async function executeScreenshotAction(page, element, options = {}) {
24
24
  padding = 0,
25
25
  maxWidth = 1024,
26
26
  maxHeight = 8000,
27
- quality = 80,
28
- format = 'auto'
27
+ quality = 40,
28
+ format = 'jpeg'
29
29
  } = options;
30
30
 
31
31
  // Scroll to element to ensure it's in viewport
@@ -1,3 +1,25 @@
1
+ import { processScreenshot } from './screenshot-processor.js';
2
+
3
+ // Lightweight action screenshot: small JPEG for confirming actions worked
4
+ // These are "report" screenshots, not for detailed analysis
5
+ async function takeActionScreenshot(page, clip) {
6
+ const screenshotBuffer = await page.screenshot({
7
+ encoding: 'binary',
8
+ fullPage: false,
9
+ ...(clip ? { clip } : {})
10
+ });
11
+ const processed = await processScreenshot(screenshotBuffer, {
12
+ maxWidth: 800,
13
+ maxHeight: 4000,
14
+ quality: 40,
15
+ format: 'jpeg',
16
+ });
17
+ return {
18
+ data: processed.buffer.toString('base64'),
19
+ mimeType: processed.mimeType,
20
+ };
21
+ }
22
+
1
23
  // Helper function to execute actions on elements
2
24
  export async function executeElementAction(page, selector, action) {
3
25
  if (!action || !action.type) {
@@ -17,13 +39,27 @@ export async function executeElementAction(page, selector, action) {
17
39
 
18
40
  switch (action.type) {
19
41
  case 'click':
20
- await element.click();
42
+ // Scroll element into view first (direct JS, avoids Puppeteer's scrollIntoViewIfNeeded hang)
43
+ await element.evaluate(el => el.scrollIntoView({ behavior: 'instant', block: 'center' }));
44
+
45
+ // Click with timeout + JS fallback (Puppeteer's click can hang in complex layouts)
46
+ try {
47
+ await Promise.race([
48
+ element.click(),
49
+ new Promise((_, reject) => setTimeout(() => reject(new Error('click timeout')), 5000))
50
+ ]);
51
+ } catch (e) {
52
+ // Fallback to JS click (bypasses Puppeteer's coordinate-based click)
53
+ await element.evaluate(el => el.click());
54
+ }
55
+
21
56
  await new Promise(resolve => setTimeout(resolve, action.waitAfter || 1500));
22
57
  result.message = `Clicked on ${selector}`;
23
58
 
24
59
  if (action.screenshot) {
25
- const screenshot = await page.screenshot({ encoding: 'base64', fullPage: false });
26
- result.screenshot = screenshot;
60
+ const { data, mimeType } = await takeActionScreenshot(page);
61
+ result.screenshot = data;
62
+ result.screenshotMimeType = mimeType;
27
63
  }
28
64
  break;
29
65
 
@@ -38,8 +74,9 @@ export async function executeElementAction(page, selector, action) {
38
74
  result.message = `Typed "${action.text}" into ${selector}`;
39
75
 
40
76
  if (action.screenshot) {
41
- const screenshot = await page.screenshot({ encoding: 'base64', fullPage: false });
42
- result.screenshot = screenshot;
77
+ const { data, mimeType } = await takeActionScreenshot(page);
78
+ result.screenshot = data;
79
+ result.screenshotMimeType = mimeType;
43
80
  }
44
81
  break;
45
82
 
@@ -65,9 +102,10 @@ export async function executeElementAction(page, selector, action) {
65
102
  width: Math.max(box.width, 1),
66
103
  height: Math.max(box.height, 1)
67
104
  };
68
- const screenshot = await page.screenshot({ clip, encoding: 'base64' });
105
+ const { data: screenshotData, mimeType: screenshotMime } = await takeActionScreenshot(page, clip);
69
106
  result.message = `Captured screenshot of ${selector}`;
70
- result.screenshot = screenshot;
107
+ result.screenshot = screenshotData;
108
+ result.screenshotMimeType = screenshotMime;
71
109
  break;
72
110
 
73
111
  case 'hover':
@@ -76,8 +114,9 @@ export async function executeElementAction(page, selector, action) {
76
114
  result.message = `Hovered over ${selector}`;
77
115
 
78
116
  if (action.screenshot) {
79
- const screenshot = await page.screenshot({ encoding: 'base64', fullPage: false });
80
- result.screenshot = screenshot;
117
+ const { data, mimeType } = await takeActionScreenshot(page);
118
+ result.screenshot = data;
119
+ result.screenshotMimeType = mimeType;
81
120
  }
82
121
  break;
83
122
 
@@ -101,8 +140,9 @@ export async function executeElementAction(page, selector, action) {
101
140
  result.message = `Applied styles to ${selector}`;
102
141
 
103
142
  if (action.screenshot) {
104
- const screenshot = await page.screenshot({ encoding: 'base64', fullPage: false });
105
- result.screenshot = screenshot;
143
+ const { data, mimeType } = await takeActionScreenshot(page);
144
+ result.screenshot = data;
145
+ result.screenshotMimeType = mimeType;
106
146
  }
107
147
  break;
108
148