optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,825 @@
1
+ import asyncio
2
+ import logging
3
+ import time
4
+ from typing import TYPE_CHECKING
5
+
6
+ from cdp_use.cdp.accessibility.commands import GetFullAXTreeReturns
7
+ from cdp_use.cdp.accessibility.types import AXNode
8
+ from cdp_use.cdp.dom.types import Node
9
+ from cdp_use.cdp.target import TargetID
10
+
11
+ from browser_use.dom.enhanced_snapshot import (
12
+ REQUIRED_COMPUTED_STYLES,
13
+ build_snapshot_lookup,
14
+ )
15
+ from browser_use.dom.serializer.serializer import DOMTreeSerializer
16
+ from browser_use.dom.views import (
17
+ CurrentPageTargets,
18
+ DOMRect,
19
+ EnhancedAXNode,
20
+ EnhancedAXProperty,
21
+ EnhancedDOMTreeNode,
22
+ NodeType,
23
+ SerializedDOMState,
24
+ TargetAllTrees,
25
+ )
26
+ from browser_use.observability import observe_debug
27
+
28
+ if TYPE_CHECKING:
29
+ from browser_use.browser.session import BrowserSession
30
+
31
+ # Note: iframe limits are now configurable via BrowserProfile.max_iframes and BrowserProfile.max_iframe_depth
32
+
33
+
34
+ class DomService:
35
+ """
36
+ Service for getting the DOM tree and other DOM-related information.
37
+
38
+ Either browser or page must be provided.
39
+
40
+ TODO: currently we start a new websocket connection PER STEP, we should definitely keep this persistent
41
+ """
42
+
43
+ logger: logging.Logger
44
+
45
+ def __init__(
46
+ self,
47
+ browser_session: 'BrowserSession',
48
+ logger: logging.Logger | None = None,
49
+ cross_origin_iframes: bool = False,
50
+ paint_order_filtering: bool = True,
51
+ max_iframes: int = 100,
52
+ max_iframe_depth: int = 5,
53
+ ):
54
+ self.browser_session = browser_session
55
+ self.logger = logger or browser_session.logger
56
+ self.cross_origin_iframes = cross_origin_iframes
57
+ self.paint_order_filtering = paint_order_filtering
58
+ self.max_iframes = max_iframes
59
+ self.max_iframe_depth = max_iframe_depth
60
+
61
+ async def __aenter__(self):
62
+ return self
63
+
64
+ async def __aexit__(self, exc_type, exc_value, traceback):
65
+ pass # no need to cleanup anything, browser_session auto handles cleaning up session cache
66
+
67
async def _get_targets_for_page(self, target_id: TargetID | None = None) -> CurrentPageTargets:
    """Look up the page target and the cross-origin iframe targets attributed to it.

    Args:
        target_id: Target to look up. When None, the session's ``current_target_id`` is used.

    Returns:
        A CurrentPageTargets whose ``page_session`` is the matching target info and whose
        ``iframe_sessions`` lists the target infos of cross-origin iframes whose parent
        target is ``target_id``.

    Raises:
        ValueError: If no target ID is available, or no listed target has that ID.
    """
    session = self.browser_session
    listing = await session.cdp_client.send.Target.getTargets()

    if target_id is None:
        # No explicit target: fall back to whatever the session currently points at.
        target_id = session.current_target_id
        if not target_id:
            raise ValueError('No current target ID set in browser session')

    def find_target(wanted_id):
        """Return the first listed target info with this ID, or None."""
        for info in listing['targetInfos']:
            if info['targetId'] == wanted_id:
                return info
        return None

    page_target = find_target(target_id)
    if not page_target:
        raise ValueError(f'No target found for target ID: {target_id}')

    # Frame inventory of the session; used to discover iframes that own a separate target.
    frames_by_id, _ = await session.get_all_frames()

    child_iframe_targets = []
    for frame in frames_by_id.values():
        own_target_id = frame.get('frameTargetId')
        # Only cross-origin frames that have their own target are of interest.
        if not (frame.get('isCrossOrigin') and own_target_id):
            continue

        # Without an explicit parent entry the frame's own target stands in as its parent.
        owner_id = frame.get('parentTargetId', own_target_id)
        if owner_id != target_id:
            continue

        match = find_target(own_target_id)
        if match:
            child_iframe_targets.append(match)

    return CurrentPageTargets(
        page_session=page_target,
        iframe_sessions=child_iframe_targets,
    )
109
+
110
def _build_enhanced_ax_node(self, ax_node: AXNode) -> EnhancedAXNode:
    """Convert a raw CDP accessibility node dict into an EnhancedAXNode.

    Args:
        ax_node: Accessibility node as returned by CDP. ``nodeId`` and ``ignored`` are
            required keys; ``role``, ``name``, ``description``, ``properties`` and
            ``childIds`` are optional.

    Returns:
        The EnhancedAXNode. ``properties`` and ``child_ids`` are None when the source
        value is missing or empty.
    """
    raw_properties = ax_node.get('properties')
    parsed_properties: list[EnhancedAXProperty] | None = None
    if raw_properties:
        parsed_properties = []
        for raw in raw_properties:
            try:
                # Construction doubles as validation: Chrome sometimes reports property
                # names that do not fit the enum, and those raise ValueError here.
                parsed = EnhancedAXProperty(
                    name=raw['name'],
                    value=raw.get('value', {}).get('value', None),
                    # related_nodes=[],  # TODO: add related nodes
                )
            except ValueError:
                continue
            parsed_properties.append(parsed)

    def nested_value(key):
        """Read ``ax_node[key]['value']``, yielding None when either level is absent."""
        return ax_node.get(key, {}).get('value', None)

    return EnhancedAXNode(
        ax_node_id=ax_node['nodeId'],
        ignored=ax_node['ignored'],
        role=nested_value('role'),
        name=nested_value('name'),
        description=nested_value('description'),
        properties=parsed_properties,
        # An empty or missing child list is normalised to None.
        child_ids=ax_node.get('childIds') or None,
    )
137
+
138
async def _get_viewport_ratio(self, target_id: TargetID) -> float:
    """Return the device pixel ratio of a target, derived from CDP layout metrics.

    The ratio is ``visualViewport.clientWidth / cssVisualViewport.clientWidth``.

    Args:
        target_id: Target whose layout metrics are queried.

    Returns:
        The ratio as a float; 1.0 when the CSS width is not positive or when the metrics
        query or the computation raises.
    """
    cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)

    try:
        layout = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

        device_viewport = layout.get('visualViewport', {})
        # CSS-pixel viewports (what JavaScript sees) rather than device pixels; this is
        # what avoids the coordinate mismatch on high-DPI displays.
        css_viewport = layout.get('cssVisualViewport', {})
        css_layout = layout.get('cssLayoutViewport', {})

        # Fallback chain for the CSS width: visual viewport, then layout viewport, then 1920.
        fallback_width = css_viewport.get('clientWidth', css_layout.get('clientWidth', 1920.0))

        device_px = device_viewport.get('clientWidth', fallback_width)
        css_px = css_viewport.get('clientWidth', fallback_width)

        if css_px > 0:
            ratio = device_px / css_px
        else:
            ratio = 1.0
        return float(ratio)
    except Exception as e:
        self.logger.debug(f'Viewport size detection failed: {e}')
        # Treat the display as 1:1 when detection fails.
        return 1.0
166
+
167
@classmethod
def is_element_visible_according_to_all_parents(
    cls, node: EnhancedDOMTreeNode, html_frames: list[EnhancedDOMTreeNode]
) -> bool:
    """Decide whether ``node`` is visible with respect to every enclosing frame.

    Args:
        node: Element to test. Without a snapshot node or bounds it is reported as not visible.
        html_frames: Enclosing frame nodes (IFRAME/FRAME elements and HTML document
            elements), outermost first; they are walked innermost first.

    Returns:
        False when the element is styled ``display: none``, ``visibility: hidden`` or
        with an opacity <= 0, has no bounds, or lies outside any enclosing HTML frame's
        viewport (with 1000px of vertical slack). True otherwise.

    NOTE(review): ``node.snapshot_node.bounds`` is adjusted in place while walking the
    frames, so the node's stored bounds change as a side effect of this call. Whether
    other code relies on that is not visible from here; confirm before changing it.
    """
    snapshot = node.snapshot_node
    if not snapshot:
        return False

    styles = snapshot.computed_styles or {}
    display = styles.get('display', '').lower()
    visibility = styles.get('visibility', '').lower()
    opacity = styles.get('opacity', '1')

    if display == 'none':
        return False
    if visibility == 'hidden':
        return False

    try:
        fully_transparent = float(opacity) <= 0
    except (ValueError, TypeError):
        # An unparsable opacity is treated as "not transparent".
        fully_transparent = False
    if fully_transparent:
        return False

    # The element's local bounds, in its own frame's coordinate system (same object as
    # the snapshot's bounds, not a copy).
    bounds = snapshot.bounds
    if not bounds:
        return False  # no bounds -> not visible

    # Walk the frames innermost-first. A frame is either an iframe element or a document
    # (HTML) element: for a document, check that the current bounds intersect its viewport
    # (taking scroll into account); for an iframe, move the current bounds by its offset.
    for frame in reversed(html_frames):
        is_element = frame.node_type == NodeType.ELEMENT_NODE
        frame_snapshot = frame.snapshot_node

        if (
            is_element
            and frame.node_name.upper() in ('IFRAME', 'FRAME')
            and frame_snapshot
            and frame_snapshot.bounds
        ):
            # negate the values added in `_construct_enhanced_node`
            bounds.x += frame_snapshot.bounds.x
            bounds.y += frame_snapshot.bounds.y

        if (
            is_element
            and frame.node_name == 'HTML'
            and frame_snapshot
            and frame_snapshot.scrollRects
            and frame_snapshot.clientRects
        ):
            scroll = frame_snapshot.scrollRects
            client = frame_snapshot.clientRects

            # Viewport of the frame in frame coordinates: origin at (0, 0), sized by clientRects.
            view_left = 0
            view_top = 0
            view_right = client.width
            view_bottom = client.height

            # Element position relative to the viewport: subtract the scroll offset
            # (scrolling down makes scrollRects.y positive).
            rel_x = bounds.x - scroll.x
            rel_y = bounds.y - scroll.y

            # Horizontal overlap is strict; vertically, 1000px of slack is allowed
            # above and below the viewport.
            inside_frame = (
                rel_x < view_right
                and rel_x + bounds.width > view_left
                and rel_y < view_bottom + 1000
                and rel_y + bounds.height > view_top - 1000
            )
            if not inside_frame:
                return False

            # Carry the scroll adjustment forward so outer frames see consistent coordinates.
            bounds.x -= scroll.x
            bounds.y -= scroll.y

    # Passed every enclosing frame: visible in the main viewport and all containing iframes.
    return True
253
+
254
async def _get_ax_tree_for_all_frames(self, target_id: TargetID) -> GetFullAXTreeReturns:
    """Fetch the accessibility tree of every frame of a target and merge the nodes.

    Args:
        target_id: Target whose frame tree is walked.

    Returns:
        A dict with a single ``nodes`` key holding the AX nodes of all frames,
        concatenated in frame-tree pre-order.
    """
    session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)
    frame_tree = await session.cdp_client.send.Page.getFrameTree(session_id=session.session_id)

    # Iterative pre-order walk of the frame tree (parent first, then children in order).
    frame_ids: list[str] = []
    stack = [frame_tree['frameTree']]
    while stack:
        current = stack.pop()
        frame_ids.append(current['frame']['id'])
        # Push children reversed so the first child is popped (visited) first.
        stack.extend(reversed(current.get('childFrames') or []))

    # One getFullAXTree request per frame, awaited together.
    requests = [
        session.cdp_client.send.Accessibility.getFullAXTree(
            params={'frameId': frame_id}, session_id=session.session_id
        )
        for frame_id in frame_ids
    ]
    per_frame_trees = await asyncio.gather(*requests)

    # Flatten all per-frame node lists into one list.
    merged = [ax_node for tree in per_frame_trees for ax_node in tree['nodes']]
    return {'nodes': merged}
290
+
291
async def _get_all_trees(self, target_id: TargetID) -> TargetAllTrees:
    """Fetch the DOM snapshot, DOM tree, AX tree and device pixel ratio for a target.

    The four requests run concurrently with a 10 second budget. Requests that are still
    pending after that are cancelled and re-issued once with a 2 second budget. Requests
    that finished with an exception are NOT retried.

    Args:
        target_id: Target to capture.

    Returns:
        A TargetAllTrees bundling the snapshot (truncated to at most ``self.max_iframes``
        documents), the DOM tree, the merged AX tree, the device pixel ratio and a
        ``cdp_timing`` dict with the total wall-clock time of the CDP calls.

    Raises:
        TimeoutError: If any of the four requests raised or did not finish in time.
    """
    cdp_session = await self.browser_session.get_or_create_cdp_session(target_id=target_id, focus=False)
    send = cdp_session.cdp_client.send
    session_id = cdp_session.session_id

    # Best-effort probe of document.readyState. The result is not used and nothing
    # waits on it; a failure only means the page might not be ready yet.
    try:
        await send.Runtime.evaluate(params={'expression': 'document.readyState'}, session_id=session_id)
    except Exception as e:
        self.logger.debug(f'document.readyState probe failed: {e}')

    # DEBUG: Log before capturing snapshot
    self.logger.debug(f'🔍 DEBUG: Capturing DOM snapshot for target {target_id}')

    # Read the actual scroll positions of same-origin iframes before the snapshot.
    # They are only logged for debugging; failures are non-fatal.
    try:
        scroll_result = await send.Runtime.evaluate(
            params={
                'expression': """
                (() => {
                    const scrollData = {};
                    const iframes = document.querySelectorAll('iframe');
                    iframes.forEach((iframe, index) => {
                        try {
                            const doc = iframe.contentDocument || iframe.contentWindow.document;
                            if (doc) {
                                scrollData[index] = {
                                    scrollTop: doc.documentElement.scrollTop || doc.body.scrollTop || 0,
                                    scrollLeft: doc.documentElement.scrollLeft || doc.body.scrollLeft || 0
                                };
                            }
                        } catch (e) {
                            // Cross-origin iframe, can't access
                        }
                    });
                    return scrollData;
                })()
                """,
                'returnByValue': True,
            },
            session_id=session_id,
        )
        if scroll_result and 'result' in scroll_result and 'value' in scroll_result['result']:
            iframe_scroll_positions = scroll_result['result']['value']
            for idx, scroll_data in iframe_scroll_positions.items():
                self.logger.debug(
                    f'🔍 DEBUG: Iframe {idx} actual scroll position - scrollTop={scroll_data.get("scrollTop", 0)}, scrollLeft={scroll_data.get("scrollLeft", 0)}'
                )
    except Exception as e:
        self.logger.debug(f'Failed to get iframe scroll positions: {e}')

    def create_snapshot_request():
        """Build the DOMSnapshot.captureSnapshot coroutine."""
        return send.DOMSnapshot.captureSnapshot(
            params={
                'computedStyles': REQUIRED_COMPUTED_STYLES,
                'includePaintOrder': True,
                'includeDOMRects': True,
                'includeBlendedBackgroundColors': False,
                'includeTextColorOpacities': False,
            },
            session_id=session_id,
        )

    def create_dom_tree_request():
        """Build the DOM.getDocument coroutine (full depth, piercing iframes/shadow roots)."""
        return send.DOM.getDocument(params={'depth': -1, 'pierce': True}, session_id=session_id)

    # One coroutine factory per result key, shared by the first attempt and the retry.
    request_factories = {
        'snapshot': create_snapshot_request,
        'dom_tree': create_dom_tree_request,
        'ax_tree': lambda: self._get_ax_tree_for_all_frames(target_id),
        'device_pixel_ratio': lambda: self._get_viewport_ratio(target_id),
    }

    start = time.time()

    tasks = {key: asyncio.create_task(factory()) for key, factory in request_factories.items()}

    # First attempt: everything in parallel, 10 second budget.
    _, pending = await asyncio.wait(tasks.values(), timeout=10.0)

    if pending:
        for task in pending:
            task.cancel()

        # Re-issue only the requests that timed out.
        for key, task in list(tasks.items()):
            if task in pending:
                tasks[key] = asyncio.create_task(request_factories[key]())

        # Second attempt with a shorter budget; the list is non-empty because every
        # timed-out task was just replaced by a fresh, unfinished one.
        retried = [task for task in tasks.values() if not task.done()]
        _, still_pending = await asyncio.wait(retried, timeout=2.0)
        for task in still_pending:
            task.cancel()

    # Collect results, tracking which requests failed or never finished.
    results = {}
    failed = []
    for key, task in tasks.items():
        if task.done() and not task.cancelled():
            try:
                results[key] = task.result()
            except Exception as e:
                self.logger.warning(f'CDP request {key} failed with exception: {e}')
                failed.append(key)
        else:
            self.logger.warning(f'CDP request {key} timed out')
            failed.append(key)

    # All four results are required.
    if failed:
        raise TimeoutError(f'CDP requests failed or timed out: {", ".join(failed)}')

    snapshot = results['snapshot']
    dom_tree = results['dom_tree']
    ax_tree = results['ax_tree']
    device_pixel_ratio = results['device_pixel_ratio']
    cdp_timing = {'cdp_calls_total': time.time() - start}

    # Cap the number of snapshot documents (one per frame) and log what was captured.
    if snapshot and 'documents' in snapshot:
        original_doc_count = len(snapshot['documents'])
        if original_doc_count > self.max_iframes:
            self.logger.warning(
                f'⚠️ Limiting processing of {original_doc_count} iframes on page to only first {self.max_iframes} to prevent crashes!'
            )
            snapshot['documents'] = snapshot['documents'][: self.max_iframes]

        total_nodes = sum(len(doc.get('nodes', [])) for doc in snapshot['documents'])
        self.logger.debug(f'🔍 DEBUG: Snapshot contains {len(snapshot["documents"])} frames with {total_nodes} total nodes')
        # Document 0 is the main document; everything after it is an iframe.
        for doc_idx, doc in enumerate(snapshot['documents']):
            if doc_idx > 0:
                self.logger.debug(
                    f'🔍 DEBUG: Iframe #{doc_idx} {doc.get("frameId", "no-frame-id")} {doc.get("url", "no-url")} has {len(doc.get("nodes", []))} nodes'
                )

    return TargetAllTrees(
        snapshot=snapshot,
        dom_tree=dom_tree,
        ax_tree=ax_tree,
        device_pixel_ratio=device_pixel_ratio,
        cdp_timing=cdp_timing,
    )
449
+
450
+ @observe_debug(ignore_input=True, ignore_output=True, name='get_dom_tree')
451
+ async def get_dom_tree(
452
+ self,
453
+ target_id: TargetID,
454
+ initial_html_frames: list[EnhancedDOMTreeNode] | None = None,
455
+ initial_total_frame_offset: DOMRect | None = None,
456
+ iframe_depth: int = 0,
457
+ ) -> EnhancedDOMTreeNode:
458
+ """Get the DOM tree for a specific target.
459
+
460
+ Args:
461
+ target_id: Target ID of the page to get the DOM tree for.
462
+ initial_html_frames: List of HTML frame nodes encountered so far
463
+ initial_total_frame_offset: Accumulated coordinate offset
464
+ iframe_depth: Current depth of iframe nesting to prevent infinite recursion
465
+ """
466
+
467
+ trees = await self._get_all_trees(target_id)
468
+
469
+ dom_tree = trees.dom_tree
470
+ ax_tree = trees.ax_tree
471
+ snapshot = trees.snapshot
472
+ device_pixel_ratio = trees.device_pixel_ratio
473
+
474
+ ax_tree_lookup: dict[int, AXNode] = {
475
+ ax_node['backendDOMNodeId']: ax_node for ax_node in ax_tree['nodes'] if 'backendDOMNodeId' in ax_node
476
+ }
477
+
478
+ enhanced_dom_tree_node_lookup: dict[int, EnhancedDOMTreeNode] = {}
479
+ """ NodeId (NOT backend node id) -> enhanced dom tree node""" # way to get the parent/content node
480
+
481
+ # Parse snapshot data with everything calculated upfront
482
+ snapshot_lookup = build_snapshot_lookup(snapshot, device_pixel_ratio)
483
+
484
+ async def _construct_enhanced_node(
485
+ node: Node, html_frames: list[EnhancedDOMTreeNode] | None, total_frame_offset: DOMRect | None
486
+ ) -> EnhancedDOMTreeNode:
487
+ """
488
+ Recursively construct enhanced DOM tree nodes.
489
+
490
+ Args:
491
+ node: The DOM node to construct
492
+ html_frames: List of HTML frame nodes encountered so far
493
+ accumulated_iframe_offset: Accumulated coordinate translation from parent iframes (includes scroll corrections)
494
+ """
495
+
496
+ # Initialize lists if not provided
497
+ if html_frames is None:
498
+ html_frames = []
499
+
500
+ # to get rid of the pointer references
501
+ if total_frame_offset is None:
502
+ total_frame_offset = DOMRect(x=0.0, y=0.0, width=0.0, height=0.0)
503
+ else:
504
+ total_frame_offset = DOMRect(
505
+ total_frame_offset.x, total_frame_offset.y, total_frame_offset.width, total_frame_offset.height
506
+ )
507
+
508
+ # memoize the mf (I don't know if some nodes are duplicated)
509
+ if node['nodeId'] in enhanced_dom_tree_node_lookup:
510
+ return enhanced_dom_tree_node_lookup[node['nodeId']]
511
+
512
+ ax_node = ax_tree_lookup.get(node['backendNodeId'])
513
+ if ax_node:
514
+ enhanced_ax_node = self._build_enhanced_ax_node(ax_node)
515
+ else:
516
+ enhanced_ax_node = None
517
+
518
+ # To make attributes more readable
519
+ attributes: dict[str, str] | None = None
520
+ if 'attributes' in node and node['attributes']:
521
+ attributes = {}
522
+ for i in range(0, len(node['attributes']), 2):
523
+ attributes[node['attributes'][i]] = node['attributes'][i + 1]
524
+
525
+ shadow_root_type = None
526
+ if 'shadowRootType' in node and node['shadowRootType']:
527
+ try:
528
+ shadow_root_type = node['shadowRootType']
529
+ except ValueError:
530
+ pass
531
+
532
+ # Get snapshot data and calculate absolute position
533
+ snapshot_data = snapshot_lookup.get(node['backendNodeId'], None)
534
+ absolute_position = None
535
+ if snapshot_data and snapshot_data.bounds:
536
+ absolute_position = DOMRect(
537
+ x=snapshot_data.bounds.x + total_frame_offset.x,
538
+ y=snapshot_data.bounds.y + total_frame_offset.y,
539
+ width=snapshot_data.bounds.width,
540
+ height=snapshot_data.bounds.height,
541
+ )
542
+
543
+ dom_tree_node = EnhancedDOMTreeNode(
544
+ node_id=node['nodeId'],
545
+ backend_node_id=node['backendNodeId'],
546
+ node_type=NodeType(node['nodeType']),
547
+ node_name=node['nodeName'],
548
+ node_value=node['nodeValue'],
549
+ attributes=attributes or {},
550
+ is_scrollable=node.get('isScrollable', None),
551
+ frame_id=node.get('frameId', None),
552
+ session_id=self.browser_session.agent_focus.session_id if self.browser_session.agent_focus else None,
553
+ target_id=target_id,
554
+ content_document=None,
555
+ shadow_root_type=shadow_root_type,
556
+ shadow_roots=None,
557
+ parent_node=None,
558
+ children_nodes=None,
559
+ ax_node=enhanced_ax_node,
560
+ snapshot_node=snapshot_data,
561
+ is_visible=None,
562
+ absolute_position=absolute_position,
563
+ )
564
+
565
+ enhanced_dom_tree_node_lookup[node['nodeId']] = dom_tree_node
566
+
567
+ if 'parentId' in node and node['parentId']:
568
+ dom_tree_node.parent_node = enhanced_dom_tree_node_lookup[
569
+ node['parentId']
570
+ ] # parents should always be in the lookup
571
+
572
+ # Check if this is an HTML frame node and add it to the list
573
+ updated_html_frames = html_frames.copy()
574
+ if node['nodeType'] == NodeType.ELEMENT_NODE.value and node['nodeName'] == 'HTML' and node.get('frameId') is not None:
575
+ updated_html_frames.append(dom_tree_node)
576
+
577
+ # and adjust the total frame offset by scroll
578
+ if snapshot_data and snapshot_data.scrollRects:
579
+ total_frame_offset.x -= snapshot_data.scrollRects.x
580
+ total_frame_offset.y -= snapshot_data.scrollRects.y
581
+ # DEBUG: Log iframe scroll information
582
+ self.logger.debug(
583
+ f'🔍 DEBUG: HTML frame scroll - scrollY={snapshot_data.scrollRects.y}, scrollX={snapshot_data.scrollRects.x}, frameId={node.get("frameId")}, nodeId={node["nodeId"]}'
584
+ )
585
+
586
+ # Calculate new iframe offset for content documents, accounting for iframe scroll
587
+ if (
588
+ (node['nodeName'].upper() == 'IFRAME' or node['nodeName'].upper() == 'FRAME')
589
+ and snapshot_data
590
+ and snapshot_data.bounds
591
+ ):
592
+ if snapshot_data.bounds:
593
+ updated_html_frames.append(dom_tree_node)
594
+
595
+ total_frame_offset.x += snapshot_data.bounds.x
596
+ total_frame_offset.y += snapshot_data.bounds.y
597
+
598
+ if 'contentDocument' in node and node['contentDocument']:
599
+ dom_tree_node.content_document = await _construct_enhanced_node(
600
+ node['contentDocument'], updated_html_frames, total_frame_offset
601
+ )
602
+ dom_tree_node.content_document.parent_node = dom_tree_node
603
+ # forcefully set the parent node to the content document node (helps traverse the tree)
604
+
605
+ if 'shadowRoots' in node and node['shadowRoots']:
606
+ dom_tree_node.shadow_roots = []
607
+ for shadow_root in node['shadowRoots']:
608
+ shadow_root_node = await _construct_enhanced_node(shadow_root, updated_html_frames, total_frame_offset)
609
+ # forcefully set the parent node to the shadow root node (helps traverse the tree)
610
+ shadow_root_node.parent_node = dom_tree_node
611
+ dom_tree_node.shadow_roots.append(shadow_root_node)
612
+
613
+ if 'children' in node and node['children']:
614
+ dom_tree_node.children_nodes = []
615
+ # Build set of shadow root node IDs to filter them out from children
616
+ shadow_root_node_ids = set()
617
+ if 'shadowRoots' in node and node['shadowRoots']:
618
+ for shadow_root in node['shadowRoots']:
619
+ shadow_root_node_ids.add(shadow_root['nodeId'])
620
+
621
+ for child in node['children']:
622
+ # Skip shadow roots - they should only be in shadow_roots list
623
+ if child['nodeId'] in shadow_root_node_ids:
624
+ continue
625
+ dom_tree_node.children_nodes.append(
626
+ await _construct_enhanced_node(child, updated_html_frames, total_frame_offset)
627
+ )
628
+
629
+ # Set visibility using the collected HTML frames
630
+ dom_tree_node.is_visible = self.is_element_visible_according_to_all_parents(dom_tree_node, updated_html_frames)
631
+
632
+ # DEBUG: Log visibility info for form elements in iframes
633
+ if dom_tree_node.tag_name and dom_tree_node.tag_name.upper() in ['INPUT', 'SELECT', 'TEXTAREA', 'LABEL']:
634
+ attrs = dom_tree_node.attributes or {}
635
+ elem_id = attrs.get('id', '')
636
+ elem_name = attrs.get('name', '')
637
+ if (
638
+ 'city' in elem_id.lower()
639
+ or 'city' in elem_name.lower()
640
+ or 'state' in elem_id.lower()
641
+ or 'state' in elem_name.lower()
642
+ or 'zip' in elem_id.lower()
643
+ or 'zip' in elem_name.lower()
644
+ ):
645
+ self.logger.debug(
646
+ f"🔍 DEBUG: Form element {dom_tree_node.tag_name} id='{elem_id}' name='{elem_name}' - visible={dom_tree_node.is_visible}, bounds={dom_tree_node.snapshot_node.bounds if dom_tree_node.snapshot_node else 'NO_SNAPSHOT'}"
647
+ )
648
+
649
+ # handle cross origin iframe (just recursively call the main function with the proper target if it exists in iframes)
650
+ # only do this if the iframe is visible (otherwise it's not worth it)
651
+
652
+ if (
653
+ # TODO: hacky way to disable cross origin iframes for now
654
+ self.cross_origin_iframes and node['nodeName'].upper() == 'IFRAME' and node.get('contentDocument', None) is None
655
+ ): # None meaning there is no content
656
+ # Check iframe depth to prevent infinite recursion
657
+ if iframe_depth >= self.max_iframe_depth:
658
+ self.logger.debug(
659
+ f'Skipping iframe at depth {iframe_depth} to prevent infinite recursion (max depth: {self.max_iframe_depth})'
660
+ )
661
+ else:
662
+ # Check if iframe is visible and large enough (>= 50px in both dimensions)
663
+ should_process_iframe = False
664
+
665
+ # First check if the iframe element itself is visible
666
+ if dom_tree_node.is_visible:
667
+ # Check iframe dimensions
668
+ if dom_tree_node.snapshot_node and dom_tree_node.snapshot_node.bounds:
669
+ bounds = dom_tree_node.snapshot_node.bounds
670
+ width = bounds.width
671
+ height = bounds.height
672
+
673
+ # Only process if iframe is at least 50px in both dimensions
674
+ if width >= 50 and height >= 50:
675
+ should_process_iframe = True
676
+ self.logger.debug(f'Processing cross-origin iframe: visible=True, width={width}, height={height}')
677
+ else:
678
+ self.logger.debug(
679
+ f'Skipping small cross-origin iframe: width={width}, height={height} (needs >= 50px)'
680
+ )
681
+ else:
682
+ self.logger.debug('Skipping cross-origin iframe: no bounds available')
683
+ else:
684
+ self.logger.debug('Skipping invisible cross-origin iframe')
685
+
686
+ if should_process_iframe:
687
+ # Use get_all_frames to find the iframe's target
688
+ frame_id = node.get('frameId', None)
689
+ if frame_id:
690
+ all_frames, _ = await self.browser_session.get_all_frames()
691
+ frame_info = all_frames.get(frame_id)
692
+ iframe_document_target = None
693
+ if frame_info and frame_info.get('frameTargetId'):
694
+ # Get the target info for this iframe
695
+ targets = await self.browser_session.cdp_client.send.Target.getTargets()
696
+ iframe_document_target = next(
697
+ (t for t in targets['targetInfos'] if t['targetId'] == frame_info['frameTargetId']), None
698
+ )
699
+ else:
700
+ iframe_document_target = None
701
+ # if target actually exists in one of the frames, just recursively build the dom tree for it
702
+ if iframe_document_target:
703
+ self.logger.debug(
704
+ f'Getting content document for iframe {node.get("frameId", None)} at depth {iframe_depth + 1}'
705
+ )
706
+ content_document = await self.get_dom_tree(
707
+ target_id=iframe_document_target.get('targetId'),
708
+ # TODO: experiment with this values -> not sure whether the whole cross origin iframe should be ALWAYS included as soon as some part of it is visible or not.
709
+ # Current config: if the cross origin iframe is AT ALL visible, then just include everything inside of it!
710
+ # initial_html_frames=updated_html_frames,
711
+ initial_total_frame_offset=total_frame_offset,
712
+ iframe_depth=iframe_depth + 1,
713
+ )
714
+
715
+ dom_tree_node.content_document = content_document
716
+ dom_tree_node.content_document.parent_node = dom_tree_node
717
+
718
+ return dom_tree_node
719
+
720
+ enhanced_dom_tree_node = await _construct_enhanced_node(dom_tree['root'], initial_html_frames, initial_total_frame_offset)
721
+
722
+ return enhanced_dom_tree_node
723
+
724
@observe_debug(ignore_input=True, ignore_output=True, name='get_serialized_dom_tree')
async def get_serialized_dom_tree(
    self, previous_cached_state: SerializedDOMState | None = None
) -> tuple[SerializedDOMState, EnhancedDOMTreeNode, dict[str, float]]:
    """Build the enhanced DOM tree for the current target and serialize it.

    Args:
        previous_cached_state: Earlier serialized state; passed straight through
            to ``DOMTreeSerializer``.

    Returns:
        Tuple of (serialized_dom_state, enhanced_dom_tree_root, timing_info).
        ``timing_info`` is the timing dict returned by the serializer with an extra
        ``'serialize_dom_tree_total'`` entry holding the wall-clock seconds spent
        constructing and running the serializer (tree building is not included).
    """
    # The session must already point at a concrete target; there is no fallback here.
    assert self.browser_session.current_target_id is not None
    dom_root = await self.get_dom_tree(target_id=self.browser_session.current_target_id)

    # Only the serialization step is timed.
    started_at = time.time()
    serializer = DOMTreeSerializer(dom_root, previous_cached_state, paint_order_filtering=self.paint_order_filtering)
    dom_state, serializer_timing = serializer.serialize_accessible_elements()
    elapsed = time.time() - started_at

    # The total is written last, so it overrides any same-named key from the serializer.
    timing_info = {**serializer_timing, 'serialize_dom_tree_total': elapsed}

    return dom_state, dom_root, timing_info
750
+
751
@staticmethod
def detect_pagination_buttons(selector_map: dict[int, EnhancedDOMTreeNode]) -> list[dict[str, str | int | bool]]:
    """Detect pagination buttons from the selector map.

    Every clickable node is classified by substring-matching its lowercased
    visible text, ``aria-label``, ``title`` and ``class`` against keyword lists.
    Nodes without a snapshot, or whose snapshot is not clickable, are ignored.

    Args:
        selector_map: Map of element indices to EnhancedDOMTreeNode

    Returns:
        List of pagination button information dicts with:
        - button_type: 'next', 'prev', 'first', 'last', 'page_number'
        - backend_node_id: The node's key in ``selector_map``
        - text: Button text, falling back to the (lowercased) aria-label, then title
        - selector: XPath selector
        - is_disabled: Whether the button appears disabled
    """
    # Ordered by precedence: the first group with a matching pattern wins.
    # Because 'next'/'prev' are tested first, the '»'/'«' entries listed under
    # 'last'/'first' can never be the deciding match.
    pattern_groups: tuple[tuple[str, tuple[str, ...]], ...] = (
        ('next', ('next', '>', '»', '→', 'siguiente', 'suivant', 'weiter', 'volgende')),
        ('prev', ('prev', 'previous', '<', '«', '←', 'anterior', 'précédent', 'zurück', 'vorige')),
        ('first', ('first', '⇤', '«', 'primera', 'première', 'erste', 'eerste')),
        ('last', ('last', '⇥', '»', 'última', 'dernier', 'letzte', 'laatste')),
    )

    pagination_buttons: list[dict[str, str | int | bool]] = []

    for index, node in selector_map.items():
        # Skip non-clickable elements
        if not node.snapshot_node or not node.snapshot_node.is_clickable:
            continue

        # Tolerate nodes that carry no attribute dict at all.
        attrs = node.attributes or {}

        # Fetch the subtree text once; it feeds both matching and the reported label.
        display_text = node.get_all_children_text().strip()
        text = display_text.lower()
        aria_label = attrs.get('aria-label', '').lower()
        title = attrs.get('title', '').lower()
        class_name = attrs.get('class', '').lower()
        role = attrs.get('role', '').lower()

        # Combine all text sources for pattern matching
        all_text = f'{text} {aria_label} {title} {class_name}'.strip()

        # `disabled` is an HTML boolean attribute: its presence alone disables the
        # element, whatever its value (usually '' or 'disabled'). Comparing the
        # value to 'true' would miss a plain `<button disabled>`.
        is_disabled = 'disabled' in attrs or attrs.get('aria-disabled') == 'true' or 'disabled' in class_name

        button_type: str | None = next(
            (kind for kind, patterns in pattern_groups if any(pattern in all_text for pattern in patterns)),
            None,
        )
        # Numeric page buttons (single or double digit) are only considered when
        # no keyword matched.
        if button_type is None and text.isdigit() and len(text) <= 2 and role in ('button', 'link', ''):
            button_type = 'page_number'

        if button_type:
            pagination_buttons.append(
                {
                    'button_type': button_type,
                    'backend_node_id': index,
                    'text': display_text or aria_label or title,
                    'selector': node.xpath,
                    'is_disabled': is_disabled,
                }
            )

    return pagination_buttons