optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1675 @@
1
+ import asyncio
2
+ import enum
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import Generic, TypeVar
7
+
8
+ try:
9
+ from lmnr import Laminar # type: ignore
10
+ except ImportError:
11
+ Laminar = None # type: ignore
12
+ from pydantic import BaseModel
13
+
14
+ from browser_use.agent.views import ActionModel, ActionResult
15
+ from browser_use.browser import BrowserSession
16
+ from browser_use.browser.events import (
17
+ ClickElementEvent,
18
+ CloseTabEvent,
19
+ GetDropdownOptionsEvent,
20
+ GoBackEvent,
21
+ NavigateToUrlEvent,
22
+ ScrollEvent,
23
+ ScrollToTextEvent,
24
+ SendKeysEvent,
25
+ SwitchTabEvent,
26
+ TypeTextEvent,
27
+ UploadFileEvent,
28
+ )
29
+ from browser_use.browser.views import BrowserError
30
+ from browser_use.dom.service import EnhancedDOMTreeNode
31
+ from browser_use.filesystem.file_system import FileSystem
32
+ from browser_use.llm.base import BaseChatModel
33
+ from browser_use.llm.messages import SystemMessage, UserMessage
34
+ from browser_use.observability import observe_debug
35
+ from browser_use.tools.registry.service import Registry
36
+ from browser_use.tools.utils import get_click_description
37
+ from browser_use.tools.views import (
38
+ ClickElementAction,
39
+ CloseTabAction,
40
+ DoneAction,
41
+ ExtractAction,
42
+ GetDropdownOptionsAction,
43
+ InputTextAction,
44
+ NavigateAction,
45
+ NoParamsAction,
46
+ ScrollAction,
47
+ SearchAction,
48
+ SelectDropdownOptionAction,
49
+ SendKeysAction,
50
+ StructuredOutputAction,
51
+ SwitchTabAction,
52
+ UploadFileAction,
53
+ )
54
+ from browser_use.utils import time_execution_sync
55
+
56
logger = logging.getLogger(__name__)

# These event models carry forward references to EnhancedDOMTreeNode, so they
# are rebuilt here, once every import above has completed.
for _event_model in (ClickElementEvent, TypeTextEvent, ScrollEvent, UploadFileEvent):
    _event_model.model_rebuild()
del _event_model  # keep the module namespace identical to the unrolled form

# Type parameter for the context object that Tools / Registry are generic over.
Context = TypeVar('Context')

# Type parameter for a pydantic output model.
T = TypeVar('T', bound=BaseModel)
68
+
69
+
70
+ def _detect_sensitive_key_name(text: str, sensitive_data: dict[str, str | dict[str, str]] | None) -> str | None:
71
+ """Detect which sensitive key name corresponds to the given text value."""
72
+ if not sensitive_data or not text:
73
+ return None
74
+
75
+ # Collect all sensitive values and their keys
76
+ for domain_or_key, content in sensitive_data.items():
77
+ if isinstance(content, dict):
78
+ # New format: {domain: {key: value}}
79
+ for key, value in content.items():
80
+ if value and value == text:
81
+ return key
82
+ elif content: # Old format: {key: value}
83
+ if content == text:
84
+ return domain_or_key
85
+
86
+ return None
87
+
88
+
89
def handle_browser_error(e: BrowserError) -> ActionResult:
    """Convert a ``BrowserError`` into an ``ActionResult``, or re-raise it.

    * ``long_term_memory`` unset: a warning is logged and ``e`` is re-raised.
    * only ``long_term_memory`` set: it becomes the result's ``error``.
    * both memories set: ``short_term_memory`` becomes ``extracted_content``
      (flagged ``include_extracted_content_only_once``) and
      ``long_term_memory`` becomes ``error``.
    """
    if e.long_term_memory is None:
        # Nothing suitable to hand back to the LLM - keep the original exception.
        logger.warning(
            '⚠️ A BrowserError was raised without long_term_memory - always set long_term_memory when raising BrowserError to propagate right messages to LLM.'
        )
        raise e

    if e.short_term_memory is None:
        return ActionResult(error=e.long_term_memory)

    return ActionResult(
        extracted_content=e.short_term_memory,
        error=e.long_term_memory,
        include_extracted_content_only_once=True,
    )
102
+
103
+
104
+ class Tools(Generic[Context]):
105
+ def __init__(
106
+ self,
107
+ exclude_actions: list[str] = [],
108
+ output_model: type[T] | None = None,
109
+ display_files_in_done_text: bool = True,
110
+ ):
111
+ self.registry = Registry[Context](exclude_actions)
112
+ self.display_files_in_done_text = display_files_in_done_text
113
+
114
+ """Register all default browser actions"""
115
+
116
+ self._register_done_action(output_model)
117
+
118
+ # Basic Navigation Actions
119
@self.registry.action('', param_model=SearchAction)
async def search(params: SearchAction, browser_session: BrowserSession):
    # Run a web search by navigating the current tab to the results page of
    # the requested engine (duckduckgo, google or bing; case-insensitive).
    from urllib.parse import quote_plus

    # URL-encode the query before embedding it into the search URL.
    quoted = quote_plus(params.query)

    url_by_engine = {
        'duckduckgo': f'https://duckduckgo.com/?q={quoted}',
        'google': f'https://www.google.com/search?q={quoted}&udm=14',
        'bing': f'https://www.bing.com/search?q={quoted}',
    }

    search_url = url_by_engine.get(params.engine.lower())
    if search_url is None:
        return ActionResult(error=f'Unsupported search engine: {params.engine}. Options: duckduckgo, google, bing')

    try:
        # The search always reuses the current tab (new_tab=False).
        nav_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=search_url, new_tab=False))
        await nav_event
        await nav_event.event_result(raise_if_any=True, raise_if_none=False)

        memory = f"Searched {params.engine.title()} for '{params.query}'"
        logger.info(f'🔍 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        # Failures are reported to the caller as an error result, never raised.
        logger.error(f'Failed to search {params.engine}: {e}')
        return ActionResult(error=f'Failed to search {params.engine} for "{params.query}": {str(e)}')
161
+
162
@self.registry.action('', param_model=NavigateAction)
async def navigate(params: NavigateAction, browser_session: BrowserSession):
    # Navigate to params.url, in a new tab when params.new_tab is set.
    # Every failure is turned into an ActionResult(error=...); nothing is re-raised.
    try:
        nav_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=params.url, new_tab=params.new_tab))
        await nav_event
        await nav_event.event_result(raise_if_any=True, raise_if_none=False)

        if params.new_tab:
            memory = f'Opened new tab with URL {params.url}'
            msg = f'🔗 Opened new tab with url {params.url}'
        else:
            memory = f'Navigated to {params.url}'
            msg = f'🔗 {memory}'

        logger.info(msg)
        return ActionResult(extracted_content=msg, long_term_memory=memory)
    except Exception as e:
        error_msg = str(e)
        # Log the raw error first so it is available for debugging.
        browser_session.logger.error(f'❌ Navigation failed: {error_msg}')

        # Case 1: the CDP client was never initialised.
        if isinstance(e, RuntimeError) and 'CDP client not initialized' in error_msg:
            browser_session.logger.error('❌ Browser connection failed - CDP client not properly initialized')
            return ActionResult(error=f'Browser connection error: {error_msg}')

        # Case 2: the message contains one of the network error markers.
        network_markers = (
            'ERR_NAME_NOT_RESOLVED',
            'ERR_INTERNET_DISCONNECTED',
            'ERR_CONNECTION_REFUSED',
            'ERR_TIMED_OUT',
            'net::',
        )
        if any(marker in error_msg for marker in network_markers):
            site_unavailable_msg = f'Navigation failed - site unavailable: {params.url}'
            browser_session.logger.warning(f'⚠️ {site_unavailable_msg} - {error_msg}')
            return ActionResult(error=site_unavailable_msg)

        # Case 3: anything else.
        return ActionResult(error=f'Navigation failed: {str(e)}')
208
+
209
@self.registry.action('', param_model=NoParamsAction)
async def go_back(_: NoParamsAction, browser_session: BrowserSession):
    # Dispatch a GoBackEvent and wait for it; failures become an error result.
    try:
        back_event = browser_session.event_bus.dispatch(GoBackEvent())
        await back_event

        memory = 'Navigated back'
        logger.info(f'🔙 {memory}')
        return ActionResult(extracted_content=memory)
    except Exception as e:
        logger.error(f'Failed to dispatch GoBackEvent: {type(e).__name__}: {e}')
        return ActionResult(error=f'Failed to go back: {str(e)}')
222
+
223
@self.registry.action('')
async def wait(seconds: int = 3):
    # Pause the agent. The actual sleep is `seconds - 3`, clamped to the
    # range 0..30, while the log line and the returned memory both report the
    # requested `seconds`.
    # NOTE(review): an earlier comment here said the 3-second reduction (meant
    # to offset LLM call latency) "doesn't make sense" and was reverted, yet the
    # subtraction is still applied - confirm which behaviour is intended.
    reduced = max(seconds - 3, 0)
    actual_seconds = min(reduced, 30)

    memory = f'Waited for {seconds} seconds'
    plural_suffix = '' if seconds == 1 else 's'
    logger.info(f'🕒 waited for {seconds} second{plural_suffix}')

    await asyncio.sleep(actual_seconds)
    return ActionResult(extracted_content=memory, long_term_memory=memory)
235
+
236
+ # Element Interaction Actions
237
+
238
@self.registry.action(
    '',
    param_model=ClickElementAction,
)
async def click(params: ClickElementAction, browser_session: BrowserSession):
    # Click the element at params.index.
    #
    # Outcomes:
    #   * index 0              -> error result (explicit check, see below)
    #   * index not resolvable -> result carrying a "not available" message
    #   * handler reports a 'validation_error' -> error result; for <select>
    #     elements the dropdown options are returned instead when that works
    #   * BrowserError         -> delegated to handle_browser_error
    #   * any other exception  -> error result
    #   * success              -> 'Clicked <description>' plus handler metadata
    try:
        # Explicit validation instead of `assert`: assert statements are
        # removed when Python runs with -O, which would silently disable this
        # check. The error text equals what the assert-based path produced.
        if params.index == 0:
            return ActionResult(
                error=f'Failed to click element {params.index}: Cannot click on element with index 0. If there are no interactive elements use wait(), refresh(), etc. to troubleshoot'
            )

        # Look up the node from the selector map
        node = await browser_session.get_element_by_index(params.index)
        if node is None:
            msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
            logger.warning(f'⚠️ {msg}')
            return ActionResult(extracted_content=msg)

        # Describe the element before clicking it.
        element_desc = get_click_description(node)

        # Highlight the element being clicked (fire-and-forget, not awaited)
        asyncio.create_task(browser_session.highlight_interaction_element(node))

        event = browser_session.event_bus.dispatch(ClickElementEvent(node=node))
        await event
        # Wait for the handler to complete and collect any exception or metadata
        click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)

        # The handler may report a validation error (e.g. clicking a <select> or file input)
        if isinstance(click_metadata, dict) and 'validation_error' in click_metadata:
            error_msg = click_metadata['validation_error']
            # For a select element, try to return its dropdown options as a helpful shortcut
            if 'Cannot click on <select> elements.' in error_msg:
                try:
                    return await dropdown_options(
                        params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session
                    )
                except Exception as dropdown_error:
                    logger.debug(
                        f'Failed to get dropdown options as shortcut during click on dropdown: {type(dropdown_error).__name__}: {dropdown_error}'
                    )
            return ActionResult(error=error_msg)

        # Build memory with element info
        memory = f'Clicked {element_desc}'
        logger.info(f'🖱️ {memory}')

        # Pass the handler's metadata through when it is a dict
        return ActionResult(
            extracted_content=memory,
            metadata=click_metadata if isinstance(click_metadata, dict) else None,
        )
    except BrowserError as e:
        return handle_browser_error(e)
    except Exception as e:
        error_msg = f'Failed to click element {params.index}: {str(e)}'
        return ActionResult(error=error_msg)
296
+
297
@self.registry.action(
    '',
    param_model=InputTextAction,
)
async def input(
    params: InputTextAction,
    browser_session: BrowserSession,
    has_sensitive_data: bool = False,
    sensitive_data: dict[str, str | dict[str, str]] | None = None,
):
    # Type params.text into the element at params.index.
    # When has_sensitive_data is set, the typed text is kept out of the result
    # and the log line; only the matching sensitive key name (if one is found)
    # or a generic placeholder is reported.
    target = await browser_session.get_element_by_index(params.index)
    if target is None:
        msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {msg}')
        return ActionResult(extracted_content=msg)

    # Highlight the element being typed into (fire-and-forget, not awaited)
    asyncio.create_task(browser_session.highlight_interaction_element(target))

    try:
        # Work out which sensitive key, if any, this text corresponds to.
        sensitive_key_name = (
            _detect_sensitive_key_name(params.text, sensitive_data) if has_sensitive_data and sensitive_data else None
        )

        type_event = browser_session.event_bus.dispatch(
            TypeTextEvent(
                node=target,
                text=params.text,
                clear=params.clear,
                is_sensitive=has_sensitive_data,
                sensitive_key_name=sensitive_key_name,
            )
        )
        await type_event
        input_metadata = await type_event.event_result(raise_if_any=True, raise_if_none=False)

        # Pick the result message and the log message.
        if not has_sensitive_data:
            msg = log_msg = f"Typed '{params.text}'"
        elif sensitive_key_name:
            msg = f'Typed {sensitive_key_name}'
            log_msg = f'Typed <{sensitive_key_name}>'
        else:
            msg = 'Typed sensitive data'
            log_msg = 'Typed <sensitive>'

        logger.debug(log_msg)

        # Pass the handler's metadata through when it is a dict
        return ActionResult(
            extracted_content=msg,
            long_term_memory=msg,
            metadata=input_metadata if isinstance(input_metadata, dict) else None,
        )
    except BrowserError as e:
        return handle_browser_error(e)
    except Exception as e:
        # Log the full error for debugging
        logger.error(f'Failed to dispatch TypeTextEvent: {type(e).__name__}: {e}')
        return ActionResult(error=f'Failed to type text into element {params.index}: {e}')
363
+
364
@self.registry.action(
    '',
    param_model=UploadFileAction,
)
async def upload_file(
    params: UploadFileAction, browser_session: BrowserSession, available_file_paths: list[str], file_system: FileSystem
):
    # Upload params.path through a file input at or near element params.index.
    #
    # Steps:
    #   1. Validate the path: it must be in available_file_paths, in the
    #      session's downloaded files, or managed by the FileSystem service.
    #      For non-local browsers an unknown path is let through unchanged.
    #   2. For local browsers, require the file to exist on disk.
    #   3. Locate a file input near the indexed element; otherwise fall back
    #      to the file input whose Y position is closest to the scroll offset.
    #   4. Dispatch UploadFileEvent.
    #
    # Raises BrowserError when the path is unknown and no FileSystem directory
    # is available (local browser), when no file input exists on the page, or
    # when the upload event fails.

    # --- 1. Path validation ---------------------------------------------------
    if params.path not in available_file_paths and params.path not in browser_session.downloaded_files:
        if file_system and file_system.get_dir():
            # FileSystem files are addressed by bare filename.
            if file_system.get_file(params.path):
                # Managed by FileSystem: expand to the full path.
                file_system_path = str(file_system.get_dir() / params.path)
                params = UploadFileAction(index=params.index, path=file_system_path)
            elif browser_session.is_local:
                msg = f'File path {params.path} is not available. To fix: The user must add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
                logger.error(f'❌ {msg}')
                return ActionResult(error=msg)
            # Remote browser: a remote-accessible absolute path is allowed.
        elif browser_session.is_local:
            # NOTE(review): this branch raises while the sibling branch above
            # returns an error result for the same message - kept as is.
            msg = f'File path {params.path} is not available. To fix: The user must add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
            raise BrowserError(message=msg, long_term_memory=msg)
        # Remote browser: a remote-accessible absolute path is allowed.

    # --- 2. Local existence check ---------------------------------------------
    if browser_session.is_local and not os.path.exists(params.path):
        return ActionResult(error=f'File {params.path} does not exist')

    # --- 3. Locate the file input ---------------------------------------------
    selector_map = await browser_session.get_selector_map()
    if params.index not in selector_map:
        return ActionResult(error=f'Element with index {params.index} does not exist.')

    node = selector_map[params.index]

    def find_file_input_near_element(
        node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
    ) -> EnhancedDOMTreeNode | None:
        """Find the closest file input to the selected element.

        Starting at ``node`` and walking up at most ``max_height`` ancestors,
        each level checks the node itself, its descendants (down to
        ``max_descendant_depth``), then its siblings and their descendants.
        """

        def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
            # Depth-limited pre-order search for a file input under ``n``.
            if depth < 0:
                return None
            if browser_session.is_file_input(n):
                return n
            for child in n.children_nodes or []:
                result = find_file_input_in_descendants(child, depth - 1)
                if result:
                    return result
            return None

        current = node
        for _ in range(max_height + 1):
            # Check the current node itself
            if browser_session.is_file_input(current):
                return current
            # Check all descendants of the current node
            result = find_file_input_in_descendants(current, max_descendant_depth)
            if result:
                return result
            # Check all siblings and their descendants
            if current.parent_node:
                for sibling in current.parent_node.children_nodes or []:
                    if sibling is current:
                        continue
                    if browser_session.is_file_input(sibling):
                        return sibling
                    result = find_file_input_in_descendants(sibling, max_descendant_depth)
                    if result:
                        return result
            current = current.parent_node
            if not current:
                break
        return None

    file_input_node = find_file_input_near_element(node)

    # Highlight the file input element if found (fire-and-forget, not awaited)
    if file_input_node:
        asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))

    # Fallback: pick the file input closest to the current scroll position.
    if file_input_node is None:
        logger.info(
            f'No file upload element found near index {params.index}, searching for closest file input to scroll position'
        )

        cdp_session = await browser_session.get_or_create_cdp_session()
        try:
            scroll_info = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={'expression': 'window.scrollY || window.pageYOffset || 0'}, session_id=cdp_session.session_id
            )
            current_scroll_y = scroll_info.get('result', {}).get('value', 0)
        except Exception:
            # Best effort: treat the page as unscrolled if the query fails.
            current_scroll_y = 0

        closest_file_input = None
        min_distance = float('inf')

        # Only the elements are needed here, not their selector indices.
        for element in selector_map.values():
            if not browser_session.is_file_input(element) or not element.absolute_position:
                continue
            distance = abs(element.absolute_position.y - current_scroll_y)
            if distance < min_distance:
                min_distance = distance
                closest_file_input = element

        if closest_file_input:
            file_input_node = closest_file_input
            logger.info(f'Found file input closest to scroll position (distance: {min_distance}px)')
            # Highlight the fallback file input element (fire-and-forget, not awaited)
            asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))
        else:
            msg = 'No file upload element found on the page'
            logger.error(msg)
            raise BrowserError(msg)
            # TODO: figure out why this fails sometimes + add fallback hail mary, just look for any file input on page

    # --- 4. Dispatch the upload -------------------------------------------------
    try:
        event = browser_session.event_bus.dispatch(UploadFileEvent(node=file_input_node, file_path=params.path))
        await event
        await event.event_result(raise_if_any=True, raise_if_none=False)
        msg = f'Successfully uploaded file to index {params.index}'
        logger.info(f'📁 {msg}')
        return ActionResult(
            extracted_content=msg,
            long_term_memory=f'Uploaded file {params.path} to element {params.index}',
        )
    except Exception as e:
        logger.error(f'Failed to upload file: {e}')
        # Chain explicitly so the underlying failure is recorded as the cause.
        raise BrowserError(f'Failed to upload file: {e}') from e
519
+
520
+ # Tab Management Actions
521
+
522
@self.registry.action(
    'Switch to another open tab by tab_id. Tab IDs are shown in browser state tabs list (last 4 chars of target_id). Use when you need to work with content in a different tab.',
    param_model=SwitchTabAction,
)
async def switch(params: SwitchTabAction, browser_session: BrowserSession):
    # Switch focus to the tab identified by params.tab_id.
    # Never returns an error result: on any exception a warning is logged and
    # an "Attempted to switch" memory is returned instead.
    try:
        target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)

        switch_event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
        await switch_event
        # Handler errors are deliberately not raised here.
        new_target_id = await switch_event.event_result(raise_if_any=False, raise_if_none=False)

        # Prefer the id reported by the handler (last 4 chars), else the requested one.
        shown_tab = new_target_id[-4:] if new_target_id else params.tab_id
        memory = f'Switched to tab #{shown_tab}'

        logger.info(f'🔄 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        logger.warning(f'Tab switch may have failed: {e}')
        memory = f'Attempted to switch to tab #{params.tab_id}'
        return ActionResult(extracted_content=memory, long_term_memory=memory)
546
+
547
@self.registry.action(
    'Close a tab by tab_id. Tab IDs are shown in browser state tabs list (last 4 chars of target_id). Use to clean up tabs you no longer need.',
    param_model=CloseTabAction,
)
async def close(params: CloseTabAction, browser_session: BrowserSession):
    # Close the tab identified by params.tab_id.
    # Never returns an error result: if anything raises (for example a stale
    # tab id), the tab is reported as already closed or invalid.
    try:
        target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)

        close_event = browser_session.event_bus.dispatch(CloseTabEvent(target_id=target_id))
        await close_event
        # Handler errors are deliberately not raised here.
        await close_event.event_result(raise_if_any=False, raise_if_none=False)

        memory = f'Closed tab #{params.tab_id}'
        logger.info(f'🗑️ {memory}')
    except Exception as e:
        logger.warning(f'Tab {params.tab_id} may already be closed: {e}')
        memory = f'Tab #{params.tab_id} closed (was already closed or invalid)'
        return ActionResult(
            extracted_content=memory,
            long_term_memory=memory,
        )
    return ActionResult(
        extracted_content=memory,
        long_term_memory=memory,
    )
575
+
576
+ # Content Actions
577
+
578
+ # TODO: Refactor to use events instead of direct page access
579
+ # This action is temporarily disabled as it needs refactoring to use events
580
+
581
@self.registry.action(
    """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if truncated. If fails, use find_text instead.""",
)
async def extract(
    params: ExtractAction,
    browser_session: BrowserSession,
    page_extraction_llm: BaseChatModel,
    file_system: FileSystem,
):
    """Answer a query from the current page's markdown using the extraction LLM.

    The page is converted to markdown, optionally skipped forward by
    ``start_from_char``, truncated to ``MAX_CHAR_LIMIT`` characters (preferring a
    paragraph or sentence boundary), and sent to ``page_extraction_llm`` together
    with the query. Results shorter than 1000 characters are kept in long-term
    memory directly; longer ones are saved through ``file_system`` and only
    referenced from memory.

    Args:
        params: ``ExtractAction`` instance or a plain dict with the keys
            ``query``, ``extract_links`` and ``start_from_char``.
        browser_session: Session whose current page is read.
        page_extraction_llm: Model invoked with the system and user messages.
        file_system: Used to persist results too long for memory.

    Returns:
        An ``ActionResult`` with the extracted content, or with ``error`` set
        when ``start_from_char`` is past the end of the content.

    Raises:
        RuntimeError: If markdown extraction fails, or if the LLM call (120 s
            timeout), URL lookup or result persistence fails.
    """
    # Constants
    MAX_CHAR_LIMIT = 30000
    query = params['query'] if isinstance(params, dict) else params.query
    extract_links = params['extract_links'] if isinstance(params, dict) else params.extract_links
    start_from_char = params['start_from_char'] if isinstance(params, dict) else params.start_from_char

    # Extract clean markdown using the unified method
    try:
        from browser_use.dom.markdown_extractor import extract_clean_markdown

        content, content_stats = await extract_clean_markdown(
            browser_session=browser_session, extract_links=extract_links
        )
    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}') from e

    # Original content length for processing
    final_filtered_length = content_stats['final_filtered_chars']

    if start_from_char > 0:
        if start_from_char >= len(content):
            return ActionResult(
                error=f'start_from_char ({start_from_char}) exceeds content length {final_filtered_length} characters.'
            )
        content = content[start_from_char:]
        content_stats['started_from_char'] = start_from_char

    # Smart truncation with context preservation
    truncated = False
    if len(content) > MAX_CHAR_LIMIT:
        # Try to truncate at a natural break point (paragraph, sentence)
        truncate_at = MAX_CHAR_LIMIT

        # Look for paragraph break within last 500 chars of limit
        paragraph_break = content.rfind('\n\n', MAX_CHAR_LIMIT - 500, MAX_CHAR_LIMIT)
        if paragraph_break > 0:
            truncate_at = paragraph_break
        else:
            # Look for sentence break within last 200 chars of limit
            sentence_break = content.rfind('.', MAX_CHAR_LIMIT - 200, MAX_CHAR_LIMIT)
            if sentence_break > 0:
                truncate_at = sentence_break + 1

        content = content[:truncate_at]
        truncated = True
        # Offset is relative to the full page, not to the slice taken above.
        next_start = (start_from_char or 0) + truncate_at
        content_stats['truncated_at_char'] = truncate_at
        content_stats['next_start_char'] = next_start

    # Add content statistics to the result
    original_html_length = content_stats['original_html_chars']
    initial_markdown_length = content_stats['initial_markdown_chars']
    chars_filtered = content_stats['filtered_chars_removed']

    stats_summary = f"""Content processed: {original_html_length:,} HTML chars → {initial_markdown_length:,} initial markdown → {final_filtered_length:,} filtered markdown"""
    if start_from_char > 0:
        stats_summary += f' (started from char {start_from_char:,})'
    if truncated:
        stats_summary += f' → {len(content):,} final chars (truncated, use start_from_char={content_stats["next_start_char"]} to continue)'
    elif chars_filtered > 0:
        stats_summary += f' (filtered {chars_filtered:,} chars of noise)'

    system_prompt = """
You are an expert at extracting data from the markdown of a webpage.

<input>
You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
</input>

<instructions>
- You are tasked to extract information from the webpage that is relevant to the query.
- You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
- If the information relevant to the query is not available in the page, your response should mention that.
- If the query asks for all items, products, etc., make sure to directly list all of them.
- If the content was truncated and you need more information, note that the user can use start_from_char parameter to continue from where truncation occurred.
</instructions>

<output>
- Your output should present ALL the information relevant to the query in a concise way.
- Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
</output>
""".strip()

    prompt = f'<query>\n{query}\n</query>\n\n<content_stats>\n{stats_summary}\n</content_stats>\n\n<webpage_content>\n{content}\n</webpage_content>'

    try:
        response = await asyncio.wait_for(
            page_extraction_llm.ainvoke([SystemMessage(content=system_prompt), UserMessage(content=prompt)]),
            timeout=120.0,
        )

        current_url = await browser_session.get_current_page_url()
        extracted_content = (
            f'<url>\n{current_url}\n</url>\n<query>\n{query}\n</query>\n<result>\n{response.completion}\n</result>'
        )

        # Simple memory handling
        MAX_MEMORY_LENGTH = 1000
        if len(extracted_content) < MAX_MEMORY_LENGTH:
            memory = extracted_content
            include_extracted_content_only_once = False
        else:
            file_name = await file_system.save_extracted_content(extracted_content)
            memory = f'Query: {query}\nContent in {file_name} and once in <read_state>.'
            include_extracted_content_only_once = True

        logger.info(f'📄 {memory}')
        return ActionResult(
            extracted_content=extracted_content,
            include_extracted_content_only_once=include_extracted_content_only_once,
            long_term_memory=memory,
        )
    except Exception as e:
        logger.debug(f'Error extracting content: {e}')
        # A timeout from asyncio.wait_for carries no message, so str(e) alone
        # would surface as an empty error; fall back to the exception type name.
        raise RuntimeError(str(e) or type(e).__name__) from e
705
+
706
@self.registry.action(
    """Scroll by pages (down=True/False, pages=0.5-10.0, default 1.0). Use index for scroll containers (dropdowns/custom UI). High pages (10) reaches bottom. Multi-page scrolls sequentially. Viewport-based height, fallback 1000px/page.""",
    param_model=ScrollAction,
)
async def scroll(params: ScrollAction, browser_session: BrowserSession):
    """Scroll the page, or the element at ``params.index``, by a number of pages.

    One "page" is the viewport height reported by ``Page.getLayoutMetrics``
    (1000px when that lookup fails). Requests of one page or more are issued as
    one ``ScrollEvent`` per full page plus one for any fractional remainder;
    individual failures in that sequence are logged and skipped. Requests under
    one page are issued as a single event.

    Args:
        params: ``down`` selects the direction, ``pages`` the distance, and
            ``index`` an optional element (``None`` or ``0`` means the page).
        browser_session: Session providing the event bus and CDP access.

    Returns:
        An ``ActionResult`` describing the scroll, or one with ``error`` set
        when the indexed element is missing or the action fails.
    """
    try:
        # Look up the node from the selector map if index is provided
        # Special case: index 0 means scroll the whole page (root/body element)
        node = None
        if params.index is not None and params.index != 0:
            node = await browser_session.get_element_by_index(params.index)
            if node is None:
                # Element does not exist
                msg = f'Element index {params.index} not found in browser state'
                return ActionResult(error=msg)

        direction = 'down' if params.down else 'up'
        target = f'element {params.index}' if params.index is not None and params.index != 0 else ''

        # Get actual viewport height for more accurate scrolling
        try:
            cdp_session = await browser_session.get_or_create_cdp_session()
            metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

            # Use cssVisualViewport for the most accurate representation
            css_viewport = metrics.get('cssVisualViewport', {})
            css_layout_viewport = metrics.get('cssLayoutViewport', {})

            # Get viewport height, prioritizing cssVisualViewport
            viewport_height = int(css_viewport.get('clientHeight') or css_layout_viewport.get('clientHeight', 1000))

            logger.debug(f'Detected viewport height: {viewport_height}px')
        except Exception as e:
            viewport_height = 1000  # Fallback to 1000px
            logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}')

        # For multiple pages (>=1.0), scroll one page at a time to ensure each scroll completes
        if params.pages >= 1.0:
            import asyncio

            num_full_pages = int(params.pages)
            remaining_fraction = params.pages - num_full_pages

            completed_scrolls = 0

            # Scroll one page at a time. The direction travels in the event
            # itself, so the amount is always the positive viewport height.
            for i in range(num_full_pages):
                try:
                    event = browser_session.event_bus.dispatch(
                        ScrollEvent(direction=direction, amount=viewport_height, node=node)
                    )
                    await event
                    await event.event_result(raise_if_any=True, raise_if_none=False)
                    completed_scrolls += 1

                    # Small delay to ensure scroll completes before next one
                    await asyncio.sleep(0.3)

                except Exception as e:
                    logger.warning(f'Scroll {i + 1}/{num_full_pages} failed: {e}')
                    # Continue with remaining scrolls even if one fails

            # Handle fractional page if present
            if remaining_fraction > 0:
                try:
                    pixels = int(remaining_fraction * viewport_height)

                    event = browser_session.event_bus.dispatch(
                        ScrollEvent(direction=direction, amount=pixels, node=node)
                    )
                    await event
                    await event.event_result(raise_if_any=True, raise_if_none=False)
                    completed_scrolls += remaining_fraction

                except Exception as e:
                    logger.warning(f'Fractional scroll failed: {e}')

            if params.pages == 1.0:
                long_term_memory = f'Scrolled {direction} {target} {viewport_height}px'
            else:
                long_term_memory = f'Scrolled {direction} {target} {completed_scrolls:.1f} pages'
        else:
            # For fractional pages <1.0, do single scroll
            pixels = int(params.pages * viewport_height)
            event = browser_session.event_bus.dispatch(
                ScrollEvent(direction=direction, amount=pixels, node=node)
            )
            await event
            await event.event_result(raise_if_any=True, raise_if_none=False)
            long_term_memory = f'Scrolled {direction} {target} {params.pages} pages'

        # An empty target leaves two adjacent spaces in the message; collapse them.
        long_term_memory = long_term_memory.replace('  ', ' ')

        msg = f'🔍 {long_term_memory}'
        logger.info(msg)
        return ActionResult(extracted_content=msg, long_term_memory=long_term_memory)
    except Exception as e:
        logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}')
        error_msg = 'Failed to execute scroll action.'
        return ActionResult(error=error_msg)
810
+
811
@self.registry.action(
    '',
    param_model=SendKeysAction,
)
async def send_keys(params: SendKeysAction, browser_session: BrowserSession):
    """Dispatch ``params.keys`` as a ``SendKeysEvent`` and report the outcome.

    Returns an ``ActionResult`` naming the keys sent, or one with ``error`` set
    when dispatching or handling the event raises.
    """
    bus = browser_session.event_bus
    try:
        pending = bus.dispatch(SendKeysEvent(keys=params.keys))
        await pending
        await pending.event_result(raise_if_any=True, raise_if_none=False)

        memory = f'Sent keys: {params.keys}'
        logger.info(f'⌨️ {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as exc:
        logger.error(f'Failed to dispatch SendKeysEvent: {type(exc).__name__}: {exc}')
        return ActionResult(error=f'Failed to send keys: {exc}')
829
+
830
@self.registry.action('Scroll to text.')
async def find_text(text: str, browser_session: BrowserSession):  # type: ignore
    """Scroll the page to ``text`` via a ``ScrollToTextEvent``.

    Any exception raised while waiting for the event result is reported as
    "not found" in the returned ``ActionResult`` rather than as an error.
    """
    scroll_event = browser_session.event_bus.dispatch(ScrollToTextEvent(text=text))

    try:
        # The handler returns None on success or raises an exception if text not found
        await scroll_event.event_result(raise_if_any=True, raise_if_none=False)
        memory = f'Scrolled to text: {text}'
        logger.info(f'🔍 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception:
        # Text not found
        not_found = f"Text '{text}' not found or not visible on page"
        logger.info(not_found)
        return ActionResult(
            extracted_content=not_found,
            long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
        )
850
+
851
@self.registry.action(
    'Get a screenshot of the current viewport. Use when: visual inspection needed, layout unclear, element positions uncertain, debugging UI issues, or verifying page state. Screenshot is included in the next browser_state No parameters are needed.',
    param_model=NoParamsAction,
)
async def screenshot(_: NoParamsAction):
    """Request that a screenshot be included in the next observation"""
    memory = 'Requested screenshot for next observation'
    logger.info(f'📸 {memory}')

    # The metadata flag is what signals that a screenshot should be included.
    request_flag = {'include_screenshot': True}
    return ActionResult(extracted_content=memory, metadata=request_flag)
866
+
867
+ # Dropdown Actions
868
+
869
@self.registry.action(
    '',
    param_model=GetDropdownOptionsAction,
)
async def dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession):
    """Get all options from a native dropdown or ARIA menu"""
    # Resolve the indexed element from the selector map first.
    element = await browser_session.get_element_by_index(params.index)
    if element is None:
        unavailable = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {unavailable}')
        return ActionResult(extracted_content=unavailable)

    # Hand the lookup to the GetDropdownOptionsEvent handler and wait for its data.
    pending = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=element))
    options = await pending.event_result(timeout=3.0, raise_if_none=True, raise_if_any=True)
    if not options:
        raise ValueError('Failed to get dropdown options - no data returned')

    # The handler supplies ready-made short- and long-term memory strings.
    short_term = options['short_term_memory']
    long_term = options['long_term_memory']
    return ActionResult(
        extracted_content=short_term,
        long_term_memory=long_term,
        include_extracted_content_only_once=True,
    )
896
+
897
@self.registry.action(
    'Set the option of a <select> element.',
    param_model=SelectDropdownOptionAction,
)
async def select_dropdown(params: SelectDropdownOptionAction, browser_session: BrowserSession):
    """Select dropdown option by the text of the option you want to select"""
    # Resolve the indexed element from the selector map first.
    element = await browser_session.get_element_by_index(params.index)
    if element is None:
        unavailable = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {unavailable}')
        return ActionResult(extracted_content=unavailable)

    # Hand the selection to the SelectDropdownOptionEvent handler.
    from browser_use.browser.events import SelectDropdownOptionEvent

    pending = browser_session.event_bus.dispatch(SelectDropdownOptionEvent(node=element, text=params.text))
    outcome = await pending.event_result()
    if not outcome:
        raise ValueError('Failed to select dropdown option - no data returned')

    # Success is reported by the handler as the string 'true'.
    if outcome.get('success') == 'true':
        return ActionResult(
            extracted_content=outcome.get('message', f'Selected option: {params.text}'),
            include_in_memory=True,
            long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
        )

    # Handle structured error response
    # TODO: raise BrowserError instead of returning ActionResult
    has_structured_error = 'short_term_memory' in outcome and 'long_term_memory' in outcome
    if has_structured_error:
        return ActionResult(
            extracted_content=outcome['short_term_memory'],
            long_term_memory=outcome['long_term_memory'],
            include_extracted_content_only_once=True,
        )

    # Fallback to regular error
    return ActionResult(error=outcome.get('error', f'Failed to select option: {params.text}'))
941
+
942
+ # File System Actions
943
+
944
@self.registry.action(
    'Write content to a file in the local file system. Use this to create new files or overwrite entire file contents. For targeted edits within existing files, use replace_file instead. Supports alphanumeric filename and file extension formats: .txt, .md, .json, .jsonl, .csv, .pdf. For PDF files, write content in markdown format and it will be automatically converted to a properly formatted PDF document.'
)
async def write_file(
    file_name: str,
    content: str,
    file_system: FileSystem,
    append: bool = False,
    trailing_newline: bool = True,
    leading_newline: bool = False,
):
    """Write or append ``content`` to ``file_name`` through ``file_system``.

    A newline is added after and/or before the content according to
    ``trailing_newline`` and ``leading_newline``. The status string returned by
    the file system becomes both the extracted content and the memory entry.
    """
    prefix = '\n' if leading_newline else ''
    suffix = '\n' if trailing_newline else ''
    payload = f'{prefix}{content}{suffix}'

    store = file_system.append_file if append else file_system.write_file
    result = await store(file_name, payload)

    # Log the full path where the file is stored
    location = file_system.get_dir() / file_name
    logger.info(f'💾 {result} File location: {location}')

    return ActionResult(extracted_content=result, long_term_memory=result)
969
+
970
@self.registry.action(
    'Replace specific text within a file by searching for old_str and replacing with new_str. Use this for targeted edits like updating todo checkboxes or modifying specific lines without rewriting the entire file.'
)
async def replace_file(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
    """Replace ``old_str`` with ``new_str`` in ``file_name`` and report the status string."""
    outcome = await file_system.replace_file_str(file_name, old_str, new_str)
    logger.info(f'💾 {outcome}')
    return ActionResult(extracted_content=outcome, long_term_memory=outcome)
977
+
978
@self.registry.action(
    'Read the complete content of a file. Use this to view file contents before editing or to retrieve data from files. Supports text files (txt, md, json, csv, jsonl), documents (pdf, docx), and images (jpg, png).'
)
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
    """Read ``file_name`` and return its content with a size-bounded memory entry.

    Files listed in ``available_file_paths`` are read as external files. The
    memory entry is a short note for images, the leading whole lines that fit
    in 1000 characters for long text, and the full message otherwise.
    """
    is_external = bool(available_file_paths) and file_name in available_file_paths
    if is_external:
        structured_result = await file_system.read_file_structured(file_name, external_file=True)
    else:
        structured_result = await file_system.read_file_structured(file_name)

    result = structured_result['message']
    images = structured_result.get('images')

    MAX_MEMORY_SIZE = 1000
    if images:
        # For images, create a shorter memory message
        memory = f'Read image file {file_name}'
    elif len(result) <= MAX_MEMORY_SIZE:
        memory = result
    else:
        all_lines = result.splitlines()
        kept: list[str] = []
        used = 0  # length of the preview so far, including newlines
        for text_line in all_lines:
            if used + len(text_line) >= MAX_MEMORY_SIZE:
                break
            kept.append(text_line)
            used += len(text_line) + 1
        preview = ''.join(f'{text_line}\n' for text_line in kept)
        omitted = len(all_lines) - len(kept)
        memory = f'{preview}{omitted} more lines...' if omitted > 0 else preview

    logger.info(f'💾 {memory}')
    return ActionResult(
        extracted_content=result,
        long_term_memory=memory,
        images=images,
        include_extracted_content_only_once=True,
    )
1015
+
1016
@self.registry.action(
    """Execute browser JavaScript. Best practice: wrap in IIFE (function(){...})() with try-catch for safety. Use ONLY browser APIs (document, window, DOM). NO Node.js APIs (fs, require, process). Example: (function(){try{const el=document.querySelector('#id');return el?el.value:'not found'}catch(e){return 'Error: '+e.message}})() Avoid comments. Use for hover, drag, zoom, custom selectors, extract/filter links, shadow DOM, or analysing page structure. Limit output size.""",
)
async def evaluate(code: str, browser_session: BrowserSession):
    """Run ``code`` in the page through CDP ``Runtime.evaluate`` and return its value as text.

    The code is first passed through ``_validate_and_fix_javascript``. JavaScript
    exceptions and CDP failures come back as an ``ActionResult`` with ``error``
    set. Base64 ``data:image`` URLs in the result are moved into ``metadata`` and
    replaced by ``[Image]``, after which the text is capped at 20000 characters.
    """
    cdp_session = await browser_session.get_or_create_cdp_session()

    try:
        # Validate and potentially fix JavaScript code before execution
        validated_code = self._validate_and_fix_javascript(code)

        # Always use awaitPromise=True - it's ignored for non-promises
        eval_params = {'expression': validated_code, 'returnByValue': True, 'awaitPromise': True}
        result = await cdp_session.cdp_client.send.Runtime.evaluate(
            params=eval_params,
            session_id=cdp_session.session_id,
        )

        # Check for JavaScript execution errors
        if result.get('exceptionDetails'):
            exception = result['exceptionDetails']
            error_msg = f'JavaScript execution error: {exception.get("text", "Unknown error")}'

            # Enhanced error message with debugging info
            code_preview = validated_code[:500] + ('...' if len(validated_code) > 500 else '')
            enhanced_msg = (
                'JavaScript Execution Failed:\n'
                f'{error_msg}\n'
                '\n'
                'Validated Code (after quote fixing):\n'
                f'{code_preview}\n'
            )

            logger.debug(enhanced_msg)
            return ActionResult(error=enhanced_msg)

        result_data = result.get('result', {})

        # Check for wasThrown flag (backup error detection)
        if result_data.get('wasThrown'):
            thrown_msg = f'JavaScript code: {code} execution failed (wasThrown=true)'
            logger.debug(thrown_msg)
            return ActionResult(error=thrown_msg)

        value = result_data.get('value')

        # Turn the returned value into text according to its type.
        if isinstance(value, (dict, list)):
            # Complex objects - should be serialized by returnByValue
            try:
                result_text = json.dumps(value, ensure_ascii=False)
            except (TypeError, ValueError):
                # Fallback for non-serializable objects
                result_text = str(value)
        elif value is not None:
            # Primitive values (string, number, boolean)
            result_text = str(value)
        elif 'value' in result_data:
            # An explicit null result is rendered the way str(None) would be.
            result_text = 'None'
        else:
            result_text = 'undefined'

        import re

        found_images = re.findall(r'(data:image/[^;]+;base64,[A-Za-z0-9+/=]+)', result_text)

        # Store images in metadata so they can be added as ContentPartImageParam
        metadata = {'images': found_images} if found_images else None

        # Replace image data in result text with shorter placeholder
        for image_url in found_images:
            result_text = result_text.replace(image_url, '[Image]')

        # Apply length limit with better truncation (after image extraction)
        if len(result_text) > 20000:
            result_text = result_text[:19950] + '\n... [Truncated after 20000 characters]'

        # Don't log the code - it's already visible in the user's cell
        logger.debug(f'JavaScript executed successfully, result length: {len(result_text)}')

        # Return only the result, not the code (code is already in user's cell)
        return ActionResult(extracted_content=result_text, metadata=metadata)

    except Exception as exc:
        # CDP communication or other system errors
        failure = f'Failed to execute JavaScript: {type(exc).__name__}: {exc}'
        logger.debug(f'JavaScript code that failed: {code[:200]}...')
        return ActionResult(error=failure)
1109
+
1110
+ def _validate_and_fix_javascript(self, code: str) -> str:
1111
+ """Validate and fix common JavaScript issues before execution"""
1112
+
1113
+ import re
1114
+
1115
+ # Pattern 1: Fix double-escaped quotes (\\\" → \")
1116
+ fixed_code = re.sub(r'\\"', '"', code)
1117
+
1118
+ # Pattern 2: Fix over-escaped regex patterns (\\\\d → \\d)
1119
+ # Common issue: regex gets double-escaped during parsing
1120
+ fixed_code = re.sub(r'\\\\([dDsSwWbBnrtfv])', r'\\\1', fixed_code)
1121
+ fixed_code = re.sub(r'\\\\([.*+?^${}()|[\]])', r'\\\1', fixed_code)
1122
+
1123
+ # Pattern 3: Fix XPath expressions with mixed quotes
1124
+ xpath_pattern = r'document\.evaluate\s*\(\s*"([^"]*\'[^"]*)"'
1125
+
1126
+ def fix_xpath_quotes(match):
1127
+ xpath_with_quotes = match.group(1)
1128
+ return f'document.evaluate(`{xpath_with_quotes}`,'
1129
+
1130
+ fixed_code = re.sub(xpath_pattern, fix_xpath_quotes, fixed_code)
1131
+
1132
+ # Pattern 4: Fix querySelector/querySelectorAll with mixed quotes
1133
+ selector_pattern = r'(querySelector(?:All)?)\s*\(\s*"([^"]*\'[^"]*)"'
1134
+
1135
+ def fix_selector_quotes(match):
1136
+ method_name = match.group(1)
1137
+ selector_with_quotes = match.group(2)
1138
+ return f'{method_name}(`{selector_with_quotes}`)'
1139
+
1140
+ fixed_code = re.sub(selector_pattern, fix_selector_quotes, fixed_code)
1141
+
1142
+ # Pattern 5: Fix closest() calls with mixed quotes
1143
+ closest_pattern = r'\.closest\s*\(\s*"([^"]*\'[^"]*)"'
1144
+
1145
+ def fix_closest_quotes(match):
1146
+ selector_with_quotes = match.group(1)
1147
+ return f'.closest(`{selector_with_quotes}`)'
1148
+
1149
+ fixed_code = re.sub(closest_pattern, fix_closest_quotes, fixed_code)
1150
+
1151
+ # Pattern 6: Fix .matches() calls with mixed quotes (similar to closest)
1152
+ matches_pattern = r'\.matches\s*\(\s*"([^"]*\'[^"]*)"'
1153
+
1154
+ def fix_matches_quotes(match):
1155
+ selector_with_quotes = match.group(1)
1156
+ return f'.matches(`{selector_with_quotes}`)'
1157
+
1158
+ fixed_code = re.sub(matches_pattern, fix_matches_quotes, fixed_code)
1159
+
1160
+ # Note: Removed getAttribute fix - attribute names rarely have mixed quotes
1161
+ # getAttribute typically uses simple names like "data-value", not complex selectors
1162
+
1163
+ # Log changes made
1164
+ changes_made = []
1165
+ if r'\"' in code and r'\"' not in fixed_code:
1166
+ changes_made.append('fixed escaped quotes')
1167
+ if '`' in fixed_code and '`' not in code:
1168
+ changes_made.append('converted mixed quotes to template literals')
1169
+
1170
+ if changes_made:
1171
+ logger.debug(f'JavaScript fixes applied: {", ".join(changes_made)}')
1172
+
1173
+ return fixed_code
1174
+
1175
def _register_done_action(self, output_model: type[T] | None, display_files_in_done_text: bool = True):
    """Register the ``done`` action, structured or plain depending on ``output_model``.

    With an ``output_model`` the action takes ``StructuredOutputAction[output_model]``
    and returns the dumped data as JSON; ``display_files_in_done_text`` is stored
    on the instance in this case only. Without one the action takes ``DoneAction``,
    returns its text, and resolves ``files_to_display`` into attachments.
    """
    if output_model is None:

        @self.registry.action(
            'Complete task.',
            param_model=DoneAction,
        )
        async def done(params: DoneAction, file_system: FileSystem):
            user_message = params.text

            # Memory keeps only the first characters of the final text.
            len_max_memory = 100
            overflow = len(params.text) - len_max_memory
            memory = f'Task completed: {params.success} - {params.text[:len_max_memory]}'
            if overflow > 0:
                memory += f' - {overflow} more characters'

            # Requested files that actually have displayable content.
            displayable = [
                (file_name, file_content)
                for file_name in (params.files_to_display or [])
                if (file_content := file_system.display_file(file_name))
            ]

            if params.files_to_display and self.display_files_in_done_text:
                if displayable:
                    user_message += '\n\nAttachments:'
                    user_message += ''.join(
                        f'\n\n{file_name}:\n{file_content}' for file_name, file_content in displayable
                    )
                else:
                    logger.warning('Agent wanted to display files but none were found')

            attachments = [str(file_system.get_dir() / file_name) for file_name, _ in displayable]

            return ActionResult(
                is_done=True,
                success=params.success,
                extracted_content=user_message,
                long_term_memory=memory,
                attachments=attachments,
            )

        return

    self.display_files_in_done_text = display_files_in_done_text

    @self.registry.action(
        'Complete task with structured output.',
        param_model=StructuredOutputAction[output_model],
    )
    async def done(params: StructuredOutputAction):
        # Only the data payload is dumped; success is reported separately.
        dumped = params.data.model_dump()

        # Enums are not serializable, convert to their values
        output_dict = {
            key: (value.value if isinstance(value, enum.Enum) else value) for key, value in dumped.items()
        }

        return ActionResult(
            is_done=True,
            success=params.success,
            extracted_content=json.dumps(output_dict, ensure_ascii=False),
            long_term_memory=f'Task completed. Success Status: {params.success}',
        )
1243
+
1244
def use_structured_output_action(self, output_model: type[T]):
    """Register the structured ``done`` action for ``output_model``."""
    self._register_done_action(output_model=output_model)
1246
+
1247
+ # Register ---------------------------------------------------------------
1248
+
1249
def action(self, description: str, **kwargs):
    """Decorator for registering custom actions

    @param description: Tells the LLM what the function does (better description == better function calling)

    All keyword arguments are forwarded unchanged to ``self.registry.action``.
    """
    register = self.registry.action
    return register(description, **kwargs)
1255
+
1256
+ # Act --------------------------------------------------------------------
1257
@observe_debug(ignore_input=True, ignore_output=True, name='act')
@time_execution_sync('--act')
async def act(
    self,
    action: ActionModel,
    browser_session: BrowserSession,
    page_extraction_llm: BaseChatModel | None = None,
    sensitive_data: dict[str, str | dict[str, str]] | None = None,
    available_file_paths: list[str] | None = None,
    file_system: FileSystem | None = None,
) -> ActionResult:
    """Execute an action

    The first field of ``action`` that is set and not ``None`` is executed through
    the registry, and its result is normalized to an ``ActionResult``. Exceptions
    from the action are logged and converted into an ``ActionResult`` with
    ``error`` set. An empty ``ActionResult`` is returned when no field is set.

    Raises:
        ValueError: If the action returns something other than ``str``,
            ``ActionResult`` or ``None``.
    """
    # NOTE(review): `time_execution_sync` decorates a coroutine function here -
    # confirm it measures the awaited run rather than just coroutine creation.
    requested = action.model_dump(exclude_unset=True)

    for action_name, params in requested.items():
        if params is None:
            continue

        if Laminar is None:
            # No-op context manager when lmnr is not available
            from contextlib import nullcontext

            span_context = nullcontext()
        else:
            span_context = Laminar.start_as_current_span(
                name=action_name,
                input={
                    'action': action_name,
                    'params': params,
                },
                span_type='TOOL',
            )

        with span_context:
            try:
                result = await self.registry.execute_action(
                    action_name=action_name,
                    params=params,
                    browser_session=browser_session,
                    page_extraction_llm=page_extraction_llm,
                    file_system=file_system,
                    sensitive_data=sensitive_data,
                    available_file_paths=available_file_paths,
                )
            except BrowserError as exc:
                logger.error(f'❌ Action {action_name} failed with BrowserError: {str(exc)}')
                result = handle_browser_error(exc)
            except TimeoutError as exc:
                logger.error(f'❌ Action {action_name} failed with TimeoutError: {str(exc)}')
                result = ActionResult(error=f'{action_name} was not executed due to timeout.')
            except Exception as exc:
                # Log the original exception message for observability
                logger.error(f"Action '{action_name}' failed with error: {str(exc)}")
                result = ActionResult(error=str(exc))

            if Laminar is not None:
                Laminar.set_span_output(result)

        # Normalize whatever the action produced into an ActionResult.
        if isinstance(result, ActionResult):
            return result
        if isinstance(result, str):
            return ActionResult(extracted_content=result)
        if result is None:
            return ActionResult()
        raise ValueError(f'Invalid action result type: {type(result)} of {result}')

    return ActionResult()
1322
+
1323
def __getattr__(self, name: str):
    """
    Enable direct action calls like tools.navigate(url=..., browser_session=...).
    This provides a simpler API for tests and direct usage while maintaining backward compatibility.

    Python calls this only when normal attribute lookup fails. For the name of a
    registered action it returns an async wrapper that builds the action's
    parameter model from the keyword arguments and runs it through ``act()``.

    Raises:
        AttributeError: If ``name`` is not a registered action, or if ``name``
            is ``registry`` itself (which means the registry is not set yet).
    """
    # Reaching this point for `registry` means the attribute is missing, e.g. on
    # an instance created without __init__. The `self.registry` lookup below
    # would then re-enter __getattr__ without end, so fail fast instead.
    if name == 'registry':
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    # Check if this is a registered action
    if name in self.registry.registry.actions:
        from typing import Union

        from pydantic import create_model

        action = self.registry.registry.actions[name]

        # Create a wrapper that calls act() to ensure consistent error handling and result normalization
        async def action_wrapper(**kwargs):
            # Extract browser_session (required argument for act())
            browser_session = kwargs.get('browser_session')

            # Separate action params from special params (injected dependencies)
            special_param_names = {
                'browser_session',
                'page_extraction_llm',
                'file_system',
                'available_file_paths',
                'sensitive_data',
            }

            # Extract action params (params for the action itself)
            action_params = {k: v for k, v in kwargs.items() if k not in special_param_names}

            # Extract special params (injected dependencies) - browser_session is passed separately
            special_kwargs = {k: v for k, v in kwargs.items() if k in special_param_names and k != 'browser_session'}

            # Create the param instance
            params_instance = action.param_model(**action_params)

            # Dynamically create an ActionModel with this action
            # Use Union for type compatibility with create_model
            DynamicActionModel = create_model(
                'DynamicActionModel',
                __base__=ActionModel,
                **{name: (Union[action.param_model, None], None)},  # type: ignore
            )

            # Create the action model instance
            action_model = DynamicActionModel(**{name: params_instance})

            # Call act() which has all the error handling, result normalization, and observability
            return await self.act(action=action_model, browser_session=browser_session, **special_kwargs)  # type: ignore

        return action_wrapper

    # If not an action, raise AttributeError for normal Python behavior
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
1378
+
1379
+
1380
# Alias for backwards compatibility: `Controller` is bound to the very same class object as `Tools`
Controller = Tools
1382
+
1383
+
1384
+ class CodeAgentTools(Tools[Context]):
1385
+ """Specialized Tools for CodeAgent agent optimized for Python-based browser automation.
1386
+
1387
+ Includes:
1388
+ - All browser interaction tools (click, input, scroll, navigate, etc.)
1389
+ - JavaScript evaluation
1390
+ - Tab management (switch, close)
1391
+ - Navigation actions (go_back)
1392
+ - Upload file support
1393
+ - Dropdown interactions
1394
+
1395
+ Excludes (optimized for code-use mode):
1396
+ - extract: Use Python + evaluate() instead
1397
+ - find_text: Use Python string operations
1398
+ - screenshot: Not needed in code-use mode
1399
+ - search: Use navigate() directly
1400
+ - File system actions (write_file, read_file, replace_file): Use Python file operations instead
1401
+ """
1402
+
1403
def __init__(
    self,
    exclude_actions: list[str] | None = None,
    output_model: type[T] | None = None,
    display_files_in_done_text: bool = True,
):
    """Build the CodeAgent tool set.

    Args:
        exclude_actions: Action names to leave out. When None, the CodeAgent defaults
            below are used; an explicit list (even an empty one) is passed through as given.
        output_model: Optional structured-output model, forwarded to the parent initializer
            and to the done-action registration.
        display_files_in_done_text: Forwarded to the parent initializer and to the
            done-action registration.
    """
    # Interaction actions (scroll, select_dropdown, dropdown_options, click, input, switch,
    # send_keys, close, go_back, upload_file) are deliberately kept for code-use mode.
    code_agent_default_exclusions = [
        'extract',  # use Python + evaluate() instead
        'find_text',  # use Python string ops instead
        'screenshot',  # not needed in code-use mode
        'search',  # use navigate() directly
        # File system actions - CodeAgent should use Python file operations
        'write_file',
        'read_file',
        'replace_file',
    ]
    effective_exclusions = code_agent_default_exclusions if exclude_actions is None else exclude_actions

    super().__init__(
        exclude_actions=effective_exclusions,
        output_model=output_model,
        display_files_in_done_text=display_files_in_done_text,
    )

    # Override done action for CodeAgent with enhanced file handling
    self._register_code_use_done_action(output_model, display_files_in_done_text)
1440
+
1441
def _register_code_use_done_action(self, output_model: type[T] | None, display_files_in_done_text: bool = True):
    """Register the CodeAgent overrides of the `done` and `upload_file` actions.

    The `done` override can attach files that exist on disk even when they are not managed by
    the FileSystem service. The `upload_file` override validates paths against the
    `available_file_paths` whitelist only when one is provided.

    Args:
        output_model: When not None, nothing is registered and the parent's actions stay in place.
        display_files_in_done_text: Not read by this method; the `done` closure consults
            `self.display_files_in_done_text` instead (presumably set by the parent
            initializer - confirm).

    NOTE(review): the early return for structured output also skips the `upload_file`
    override, not only `done` - confirm this is intended.
    """
    if output_model is not None:
        # Structured output done - use parent's implementation
        return

    # Override the done action with enhanced version
    @self.registry.action(
        'Complete task.',
        param_model=DoneAction,
    )
    async def done(params: DoneAction, file_system: FileSystem):
        user_message = params.text

        # Long-term memory keeps only the first 100 characters plus a count of what was cut
        len_text = len(params.text)
        len_max_memory = 100
        memory = f'Task completed: {params.success} - {params.text[:len_max_memory]}'
        if len_text > len_max_memory:
            memory += f' - {len_text - len_max_memory} more characters'

        # A file is attached when the FileSystem can display it or when it exists on disk;
        # only FileSystem-displayable files contribute text to the message.
        attachments = []
        displayed_sections = []
        for file_name in params.files_to_display or []:
            file_content = file_system.display_file(file_name)
            if file_content:
                displayed_sections.append(f'\n\n{file_name}:\n{file_content}')
                attachments.append(file_name)
            elif os.path.exists(file_name):
                # File exists on disk but not in FileSystem - just add to attachments
                attachments.append(file_name)

        if params.files_to_display and self.display_files_in_done_text:
            if displayed_sections:
                user_message += '\n\nAttachments:'
                user_message += ''.join(displayed_sections)
            else:
                logger.warning('Agent wanted to display files but none were found')

        # Convert relative paths to absolute paths - handle both FileSystem-managed and regular files
        resolved_attachments = []
        for file_name in attachments:
            if os.path.isabs(file_name):
                # Already absolute
                resolved_attachments.append(file_name)
            elif file_system.get_file(file_name):
                # Managed by FileSystem
                resolved_attachments.append(str(file_system.get_dir() / file_name))
            elif os.path.exists(file_name):
                # Regular file in current directory
                resolved_attachments.append(os.path.abspath(file_name))
            else:
                # File doesn't exist, but include the path anyway for error visibility
                resolved_attachments.append(str(file_system.get_dir() / file_name))

        return ActionResult(
            is_done=True,
            success=params.success,
            extracted_content=user_message,
            long_term_memory=memory,
            attachments=resolved_attachments,
        )

    # Override upload_file for code agent with relaxed path validation
    @self.registry.action(
        'Upload a file to a file input element. For code-use mode, any file accessible from the current directory can be uploaded.',
        param_model=UploadFileAction,
    )
    async def upload_file(
        params: UploadFileAction,
        browser_session: BrowserSession,
        available_file_paths: list[str],
        file_system: FileSystem,
    ):
        # Path validation logic for code-use mode:
        # 1. If available_file_paths provided (security mode), enforce it as a whitelist
        # 2. If no whitelist, for local browsers just check file exists
        # 3. For remote browsers, allow any path (assume it exists remotely)

        # With a whitelist, a path is accepted when it is whitelisted or was downloaded in this
        # session. downloaded_files is only consulted after the whitelist check fails.
        if (
            available_file_paths
            and params.path not in available_file_paths
            and params.path not in browser_session.downloaded_files
        ):
            # Finally, check if it's a file in the FileSystem service (if provided).
            # The path should be just the filename for FileSystem files.
            file_obj = None
            if file_system is not None and file_system.get_dir():
                file_obj = file_system.get_file(params.path)

            if file_obj:
                # File is managed by FileSystem, construct the full path
                file_system_path = str(file_system.get_dir() / params.path)
                params = UploadFileAction(index=params.index, path=file_system_path)
            elif browser_session.is_local:
                # Remote browsers may receive a remote-accessible path; local ones must be whitelisted
                msg = f'File path {params.path} is not available. To fix: add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
                logger.error(f'❌ {msg}')
                return ActionResult(error=msg)

        # For local browsers, ensure the file exists on the local filesystem
        if browser_session.is_local and not os.path.exists(params.path):
            msg = f'File {params.path} does not exist'
            return ActionResult(error=msg)

        # Get the selector map to find the node
        selector_map = await browser_session.get_selector_map()
        if params.index not in selector_map:
            msg = f'Element with index {params.index} does not exist.'
            return ActionResult(error=msg)

        node = selector_map[params.index]

        # Helper function to find file input near the selected element
        def find_file_input_near_element(
            node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
        ) -> EnhancedDOMTreeNode | None:
            """Find the closest file input to the selected element.

            Starting at `node` and climbing at most `max_height` ancestors, each level checks
            the node itself, its descendants, then its siblings and their descendants, looking
            at most `max_descendant_depth` levels down.
            """

            def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
                # Depth-first search over n and its subtree, bounded by depth
                if depth < 0:
                    return None
                if browser_session.is_file_input(n):
                    return n
                for child in n.children_nodes or []:
                    found = find_file_input_in_descendants(child, depth - 1)
                    if found:
                        return found
                return None

            current = node
            for _ in range(max_height + 1):
                # Check the current node itself
                if browser_session.is_file_input(current):
                    return current
                # Check all descendants of the current node
                found = find_file_input_in_descendants(current, max_descendant_depth)
                if found:
                    return found
                # Check all siblings and their descendants
                if current.parent_node:
                    for sibling in current.parent_node.children_nodes or []:
                        if sibling is current:
                            continue
                        if browser_session.is_file_input(sibling):
                            return sibling
                        found = find_file_input_in_descendants(sibling, max_descendant_depth)
                        if found:
                            return found
                current = current.parent_node
                if not current:
                    break
            return None

        # Try to find a file input element near the selected element
        file_input_node = find_file_input_near_element(node)

        # Highlight the file input element if found (truly non-blocking)
        # NOTE(review): the task reference is not retained; asyncio recommends keeping one
        # so the task cannot be garbage-collected mid-execution - confirm this is acceptable.
        if file_input_node:
            asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))

        # If not found near the selected element, fallback to finding the closest file input to current scroll position
        if file_input_node is None:
            logger.info(
                f'No file upload element found near index {params.index}, searching for closest file input to scroll position'
            )

            # Get current scroll position; failures are deliberately tolerated and treated as top of page
            cdp_session = await browser_session.get_or_create_cdp_session()
            try:
                scroll_info = await cdp_session.cdp_client.send.Runtime.evaluate(
                    params={'expression': 'window.scrollY || window.pageYOffset || 0'}, session_id=cdp_session.session_id
                )
                current_scroll_y = scroll_info.get('result', {}).get('value', 0)
            except Exception:
                current_scroll_y = 0

            # Find all file inputs in the selector map and pick the closest one to scroll position.
            # Inputs without an absolute position are skipped; the first of equally close inputs wins.
            closest_file_input = None
            min_distance = float('inf')

            for element in selector_map.values():
                if browser_session.is_file_input(element) and element.absolute_position:
                    distance = abs(element.absolute_position.y - current_scroll_y)
                    if distance < min_distance:
                        min_distance = distance
                        closest_file_input = element

            if closest_file_input:
                file_input_node = closest_file_input
                logger.info(f'Found file input closest to scroll position (distance: {min_distance}px)')
                # Highlight the fallback file input element (truly non-blocking)
                asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))
            else:
                msg = 'No file upload element found on the page'
                logger.error(msg)
                raise BrowserError(msg)
                # TODO: figure out why this fails sometimes + add fallback hail mary, just look for any file input on page

        # Dispatch upload file event with the file input node
        try:
            event = browser_session.event_bus.dispatch(UploadFileEvent(node=file_input_node, file_path=params.path))
            await event
            await event.event_result(raise_if_any=True, raise_if_none=False)
            msg = f'Successfully uploaded file to index {params.index}'
            logger.info(f'📁 {msg}')
            return ActionResult(
                extracted_content=msg,
                long_term_memory=f'Uploaded file {params.path} to element {params.index}',
            )
        except Exception as e:
            logger.error(f'Failed to upload file: {e}')
            # Chain explicitly so the original failure stays attached as __cause__
            raise BrowserError(f'Failed to upload file: {e}') from e