optexity-browser-use 0.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_use/__init__.py +157 -0
- browser_use/actor/__init__.py +11 -0
- browser_use/actor/element.py +1175 -0
- browser_use/actor/mouse.py +134 -0
- browser_use/actor/page.py +561 -0
- browser_use/actor/playground/flights.py +41 -0
- browser_use/actor/playground/mixed_automation.py +54 -0
- browser_use/actor/playground/playground.py +236 -0
- browser_use/actor/utils.py +176 -0
- browser_use/agent/cloud_events.py +282 -0
- browser_use/agent/gif.py +424 -0
- browser_use/agent/judge.py +170 -0
- browser_use/agent/message_manager/service.py +473 -0
- browser_use/agent/message_manager/utils.py +52 -0
- browser_use/agent/message_manager/views.py +98 -0
- browser_use/agent/prompts.py +413 -0
- browser_use/agent/service.py +2316 -0
- browser_use/agent/system_prompt.md +185 -0
- browser_use/agent/system_prompt_flash.md +10 -0
- browser_use/agent/system_prompt_no_thinking.md +183 -0
- browser_use/agent/views.py +743 -0
- browser_use/browser/__init__.py +41 -0
- browser_use/browser/cloud/cloud.py +203 -0
- browser_use/browser/cloud/views.py +89 -0
- browser_use/browser/events.py +578 -0
- browser_use/browser/profile.py +1158 -0
- browser_use/browser/python_highlights.py +548 -0
- browser_use/browser/session.py +3225 -0
- browser_use/browser/session_manager.py +399 -0
- browser_use/browser/video_recorder.py +162 -0
- browser_use/browser/views.py +200 -0
- browser_use/browser/watchdog_base.py +260 -0
- browser_use/browser/watchdogs/__init__.py +0 -0
- browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
- browser_use/browser/watchdogs/crash_watchdog.py +335 -0
- browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
- browser_use/browser/watchdogs/dom_watchdog.py +817 -0
- browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
- browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
- browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
- browser_use/browser/watchdogs/popups_watchdog.py +143 -0
- browser_use/browser/watchdogs/recording_watchdog.py +126 -0
- browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
- browser_use/browser/watchdogs/security_watchdog.py +280 -0
- browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
- browser_use/cli.py +2359 -0
- browser_use/code_use/__init__.py +16 -0
- browser_use/code_use/formatting.py +192 -0
- browser_use/code_use/namespace.py +665 -0
- browser_use/code_use/notebook_export.py +276 -0
- browser_use/code_use/service.py +1340 -0
- browser_use/code_use/system_prompt.md +574 -0
- browser_use/code_use/utils.py +150 -0
- browser_use/code_use/views.py +171 -0
- browser_use/config.py +505 -0
- browser_use/controller/__init__.py +3 -0
- browser_use/dom/enhanced_snapshot.py +161 -0
- browser_use/dom/markdown_extractor.py +169 -0
- browser_use/dom/playground/extraction.py +312 -0
- browser_use/dom/playground/multi_act.py +32 -0
- browser_use/dom/serializer/clickable_elements.py +200 -0
- browser_use/dom/serializer/code_use_serializer.py +287 -0
- browser_use/dom/serializer/eval_serializer.py +478 -0
- browser_use/dom/serializer/html_serializer.py +212 -0
- browser_use/dom/serializer/paint_order.py +197 -0
- browser_use/dom/serializer/serializer.py +1170 -0
- browser_use/dom/service.py +825 -0
- browser_use/dom/utils.py +129 -0
- browser_use/dom/views.py +906 -0
- browser_use/exceptions.py +5 -0
- browser_use/filesystem/__init__.py +0 -0
- browser_use/filesystem/file_system.py +619 -0
- browser_use/init_cmd.py +376 -0
- browser_use/integrations/gmail/__init__.py +24 -0
- browser_use/integrations/gmail/actions.py +115 -0
- browser_use/integrations/gmail/service.py +225 -0
- browser_use/llm/__init__.py +155 -0
- browser_use/llm/anthropic/chat.py +242 -0
- browser_use/llm/anthropic/serializer.py +312 -0
- browser_use/llm/aws/__init__.py +36 -0
- browser_use/llm/aws/chat_anthropic.py +242 -0
- browser_use/llm/aws/chat_bedrock.py +289 -0
- browser_use/llm/aws/serializer.py +257 -0
- browser_use/llm/azure/chat.py +91 -0
- browser_use/llm/base.py +57 -0
- browser_use/llm/browser_use/__init__.py +3 -0
- browser_use/llm/browser_use/chat.py +201 -0
- browser_use/llm/cerebras/chat.py +193 -0
- browser_use/llm/cerebras/serializer.py +109 -0
- browser_use/llm/deepseek/chat.py +212 -0
- browser_use/llm/deepseek/serializer.py +109 -0
- browser_use/llm/exceptions.py +29 -0
- browser_use/llm/google/__init__.py +3 -0
- browser_use/llm/google/chat.py +542 -0
- browser_use/llm/google/serializer.py +120 -0
- browser_use/llm/groq/chat.py +229 -0
- browser_use/llm/groq/parser.py +158 -0
- browser_use/llm/groq/serializer.py +159 -0
- browser_use/llm/messages.py +238 -0
- browser_use/llm/models.py +271 -0
- browser_use/llm/oci_raw/__init__.py +10 -0
- browser_use/llm/oci_raw/chat.py +443 -0
- browser_use/llm/oci_raw/serializer.py +229 -0
- browser_use/llm/ollama/chat.py +97 -0
- browser_use/llm/ollama/serializer.py +143 -0
- browser_use/llm/openai/chat.py +264 -0
- browser_use/llm/openai/like.py +15 -0
- browser_use/llm/openai/serializer.py +165 -0
- browser_use/llm/openrouter/chat.py +211 -0
- browser_use/llm/openrouter/serializer.py +26 -0
- browser_use/llm/schema.py +176 -0
- browser_use/llm/views.py +48 -0
- browser_use/logging_config.py +330 -0
- browser_use/mcp/__init__.py +18 -0
- browser_use/mcp/__main__.py +12 -0
- browser_use/mcp/client.py +544 -0
- browser_use/mcp/controller.py +264 -0
- browser_use/mcp/server.py +1114 -0
- browser_use/observability.py +204 -0
- browser_use/py.typed +0 -0
- browser_use/sandbox/__init__.py +41 -0
- browser_use/sandbox/sandbox.py +637 -0
- browser_use/sandbox/views.py +132 -0
- browser_use/screenshots/__init__.py +1 -0
- browser_use/screenshots/service.py +52 -0
- browser_use/sync/__init__.py +6 -0
- browser_use/sync/auth.py +357 -0
- browser_use/sync/service.py +161 -0
- browser_use/telemetry/__init__.py +51 -0
- browser_use/telemetry/service.py +112 -0
- browser_use/telemetry/views.py +101 -0
- browser_use/tokens/__init__.py +0 -0
- browser_use/tokens/custom_pricing.py +24 -0
- browser_use/tokens/mappings.py +4 -0
- browser_use/tokens/service.py +580 -0
- browser_use/tokens/views.py +108 -0
- browser_use/tools/registry/service.py +572 -0
- browser_use/tools/registry/views.py +174 -0
- browser_use/tools/service.py +1675 -0
- browser_use/tools/utils.py +82 -0
- browser_use/tools/views.py +100 -0
- browser_use/utils.py +670 -0
- optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
- optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
- optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
- optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
- optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1675 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import enum
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from typing import Generic, TypeVar
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from lmnr import Laminar # type: ignore
|
|
10
|
+
except ImportError:
|
|
11
|
+
Laminar = None # type: ignore
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from browser_use.agent.views import ActionModel, ActionResult
|
|
15
|
+
from browser_use.browser import BrowserSession
|
|
16
|
+
from browser_use.browser.events import (
|
|
17
|
+
ClickElementEvent,
|
|
18
|
+
CloseTabEvent,
|
|
19
|
+
GetDropdownOptionsEvent,
|
|
20
|
+
GoBackEvent,
|
|
21
|
+
NavigateToUrlEvent,
|
|
22
|
+
ScrollEvent,
|
|
23
|
+
ScrollToTextEvent,
|
|
24
|
+
SendKeysEvent,
|
|
25
|
+
SwitchTabEvent,
|
|
26
|
+
TypeTextEvent,
|
|
27
|
+
UploadFileEvent,
|
|
28
|
+
)
|
|
29
|
+
from browser_use.browser.views import BrowserError
|
|
30
|
+
from browser_use.dom.service import EnhancedDOMTreeNode
|
|
31
|
+
from browser_use.filesystem.file_system import FileSystem
|
|
32
|
+
from browser_use.llm.base import BaseChatModel
|
|
33
|
+
from browser_use.llm.messages import SystemMessage, UserMessage
|
|
34
|
+
from browser_use.observability import observe_debug
|
|
35
|
+
from browser_use.tools.registry.service import Registry
|
|
36
|
+
from browser_use.tools.utils import get_click_description
|
|
37
|
+
from browser_use.tools.views import (
|
|
38
|
+
ClickElementAction,
|
|
39
|
+
CloseTabAction,
|
|
40
|
+
DoneAction,
|
|
41
|
+
ExtractAction,
|
|
42
|
+
GetDropdownOptionsAction,
|
|
43
|
+
InputTextAction,
|
|
44
|
+
NavigateAction,
|
|
45
|
+
NoParamsAction,
|
|
46
|
+
ScrollAction,
|
|
47
|
+
SearchAction,
|
|
48
|
+
SelectDropdownOptionAction,
|
|
49
|
+
SendKeysAction,
|
|
50
|
+
StructuredOutputAction,
|
|
51
|
+
SwitchTabAction,
|
|
52
|
+
UploadFileAction,
|
|
53
|
+
)
|
|
54
|
+
from browser_use.utils import time_execution_sync
|
|
55
|
+
|
|
56
|
+
logger = logging.getLogger(__name__)

# These event models are declared with forward references to EnhancedDOMTreeNode,
# so they are rebuilt here, once every import above has completed.
for _event_model in (ClickElementEvent, TypeTextEvent, ScrollEvent, UploadFileEvent):
    _event_model.model_rebuild()
del _event_model  # keep the loop variable out of the module namespace

# Type of the user-supplied context object the tool registry is parameterised with.
Context = TypeVar('Context')

# Any pydantic model type; used for the optional structured-output model.
T = TypeVar('T', bound=BaseModel)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _detect_sensitive_key_name(text: str, sensitive_data: dict[str, str | dict[str, str]] | None) -> str | None:
|
|
71
|
+
"""Detect which sensitive key name corresponds to the given text value."""
|
|
72
|
+
if not sensitive_data or not text:
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
# Collect all sensitive values and their keys
|
|
76
|
+
for domain_or_key, content in sensitive_data.items():
|
|
77
|
+
if isinstance(content, dict):
|
|
78
|
+
# New format: {domain: {key: value}}
|
|
79
|
+
for key, value in content.items():
|
|
80
|
+
if value and value == text:
|
|
81
|
+
return key
|
|
82
|
+
elif content: # Old format: {key: value}
|
|
83
|
+
if content == text:
|
|
84
|
+
return domain_or_key
|
|
85
|
+
|
|
86
|
+
return None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def handle_browser_error(e: BrowserError) -> ActionResult:
    """Convert a ``BrowserError`` into an ``ActionResult``, or re-raise it.

    * ``long_term_memory`` unset: a warning is logged and ``e`` is re-raised.
    * only ``long_term_memory`` set: it becomes the result's ``error``.
    * both memories set: ``short_term_memory`` is additionally returned as
      ``extracted_content`` with ``include_extracted_content_only_once=True``.

    Raises:
        BrowserError: the same ``e`` when it carries no ``long_term_memory``.
    """
    if e.long_term_memory is None:
        # Nothing structured to report, so fall back to propagating the error.
        logger.warning(
            '⚠️ A BrowserError was raised without long_term_memory - always set long_term_memory when raising BrowserError to propagate right messages to LLM.'
        )
        raise e

    if e.short_term_memory is None:
        return ActionResult(error=e.long_term_memory)

    return ActionResult(
        extracted_content=e.short_term_memory,
        error=e.long_term_memory,
        include_extracted_content_only_once=True,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class Tools(Generic[Context]):
|
|
105
|
+
def __init__(
|
|
106
|
+
self,
|
|
107
|
+
exclude_actions: list[str] = [],
|
|
108
|
+
output_model: type[T] | None = None,
|
|
109
|
+
display_files_in_done_text: bool = True,
|
|
110
|
+
):
|
|
111
|
+
self.registry = Registry[Context](exclude_actions)
|
|
112
|
+
self.display_files_in_done_text = display_files_in_done_text
|
|
113
|
+
|
|
114
|
+
"""Register all default browser actions"""
|
|
115
|
+
|
|
116
|
+
self._register_done_action(output_model)
|
|
117
|
+
|
|
118
|
+
# Basic Navigation Actions
|
|
119
|
+
@self.registry.action(
    '',
    param_model=SearchAction,
)
async def search(params: SearchAction, browser_session: BrowserSession):
    # Run a web search by navigating the current tab to the results page of
    # the requested engine (duckduckgo, google or bing; matched case-insensitively).
    import urllib.parse

    # URL-encode the query once; it is substituted into every engine's URL below.
    query_param = urllib.parse.quote_plus(params.query)
    url_by_engine = {
        'duckduckgo': f'https://duckduckgo.com/?q={query_param}',
        'google': f'https://www.google.com/search?q={query_param}&udm=14',
        'bing': f'https://www.bing.com/search?q={query_param}',
    }

    target_url = url_by_engine.get(params.engine.lower())
    if target_url is None:
        return ActionResult(error=f'Unsupported search engine: {params.engine}. Options: duckduckgo, google, bing')

    try:
        # Searches always reuse the current tab (new_tab=False).
        nav_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=target_url, new_tab=False))
        await nav_event
        # Surface any handler exception; a missing result is not an error.
        await nav_event.event_result(raise_if_any=True, raise_if_none=False)

        memory = f"Searched {params.engine.title()} for '{params.query}'"
        logger.info(f'🔍 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        logger.error(f'Failed to search {params.engine}: {e}')
        return ActionResult(error=f'Failed to search {params.engine} for "{params.query}": {str(e)}')
|
|
161
|
+
|
|
162
|
+
@self.registry.action(
    '',
    param_model=NavigateAction,
)
async def navigate(params: NavigateAction, browser_session: BrowserSession):
    # Navigate to ``params.url``, in a new tab when ``params.new_tab`` is set.
    # Failures are never re-raised: they are classified and returned as an
    # ActionResult error.

    # Substrings of the error text that are treated as "site unavailable".
    network_error_markers = (
        'ERR_NAME_NOT_RESOLVED',
        'ERR_INTERNET_DISCONNECTED',
        'ERR_CONNECTION_REFUSED',
        'ERR_TIMED_OUT',
        'net::',
    )

    try:
        nav_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=params.url, new_tab=params.new_tab))
        await nav_event
        await nav_event.event_result(raise_if_any=True, raise_if_none=False)

        if params.new_tab:
            memory = f'Opened new tab with URL {params.url}'
            msg = f'🔗 Opened new tab with url {params.url}'
        else:
            memory = f'Navigated to {params.url}'
            msg = f'🔗 {memory}'

        logger.info(msg)
        return ActionResult(extracted_content=msg, long_term_memory=memory)
    except Exception as e:
        failure_text = str(e)
        # Log the raw error first so it is available for debugging regardless
        # of which branch below handles it.
        browser_session.logger.error(f'❌ Navigation failed: {failure_text}')

        # Browser connection problem: CDP client was never initialised.
        if isinstance(e, RuntimeError) and 'CDP client not initialized' in failure_text:
            browser_session.logger.error('❌ Browser connection failed - CDP client not properly initialized')
            return ActionResult(error=f'Browser connection error: {failure_text}')

        # Network-level failure: report the site as unavailable.
        if any(marker in failure_text for marker in network_error_markers):
            unavailable = f'Navigation failed - site unavailable: {params.url}'
            browser_session.logger.warning(f'⚠️ {unavailable} - {failure_text}')
            return ActionResult(error=unavailable)

        # Anything else: generic navigation failure.
        return ActionResult(error=f'Navigation failed: {str(e)}')
|
|
208
|
+
|
|
209
|
+
@self.registry.action('', param_model=NoParamsAction)
async def go_back(_: NoParamsAction, browser_session: BrowserSession):
    # Dispatch a GoBackEvent and wait for it; any exception is reported as an
    # ActionResult error instead of propagating.
    try:
        back_event = browser_session.event_bus.dispatch(GoBackEvent())
        await back_event
    except Exception as e:
        logger.error(f'Failed to dispatch GoBackEvent: {type(e).__name__}: {e}')
        return ActionResult(error=f'Failed to go back: {str(e)}')
    else:
        memory = 'Navigated back'
        logger.info(f'🔙 {memory}')
        return ActionResult(extracted_content=memory)
|
|
222
|
+
|
|
223
|
+
@self.registry.action('')
async def wait(seconds: int = 3):
    # Pause for the requested number of seconds, clamped to the range 0-30.
    #
    # History: an earlier revision subtracted 3 seconds on the theory that the
    # following LLM call already takes that long. That reasoning was rejected
    # (the wait may be followed by another action rather than an LLM call) and
    # the adjustment was meant to be reverted, but the subtraction had been left
    # in place - so the default wait(3) slept for 0 seconds while still
    # reporting a 3 second wait. The clamped value is now both slept and reported.
    actual_seconds = min(max(seconds, 0), 30)
    memory = f'Waited for {actual_seconds} seconds'
    logger.info(f'🕒 waited for {actual_seconds} second{"" if actual_seconds == 1 else "s"}')
    await asyncio.sleep(actual_seconds)
    return ActionResult(extracted_content=memory, long_term_memory=memory)
|
|
235
|
+
|
|
236
|
+
# Element Interaction Actions
|
|
237
|
+
|
|
238
|
+
@self.registry.action(
    '',
    param_model=ClickElementAction,
)
async def click(params: ClickElementAction, browser_session: BrowserSession):
    # Click the element at ``params.index``.
    #
    # Returns an ActionResult describing the click (with the handler's metadata
    # dict attached when one is returned). A missing element is reported as
    # extracted_content; validation problems and all other failures are
    # reported as an error result rather than raised.
    try:
        # Explicit check instead of ``assert`` so the guard still runs under
        # ``python -O``. The generic handler below turns it into the same
        # 'Failed to click element 0: ...' error result as before.
        if params.index == 0:
            raise ValueError(
                'Cannot click on element with index 0. If there are no interactive elements use wait(), refresh(), etc. to troubleshoot'
            )

        # Look up the node from the selector map
        node = await browser_session.get_element_by_index(params.index)
        if node is None:
            msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
            logger.warning(f'⚠️ {msg}')
            return ActionResult(extracted_content=msg)

        # Describe the element before clicking it.
        element_desc = get_click_description(node)

        # Fire-and-forget highlight; the click does not wait for it.
        asyncio.create_task(browser_session.highlight_interaction_element(node))

        event = browser_session.event_bus.dispatch(ClickElementEvent(node=node))
        await event
        # Wait for the handler to finish; re-raises its exception if it failed.
        click_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)

        # The handler can report a validation error in its result dict
        # (e.g. trying to click a <select> or a file input).
        if isinstance(click_metadata, dict) and 'validation_error' in click_metadata:
            error_msg = click_metadata['validation_error']
            # For <select> elements, try returning the dropdown options instead
            # as a helpful shortcut; fall back to the validation error if that fails.
            if 'Cannot click on <select> elements.' in error_msg:
                try:
                    return await dropdown_options(
                        params=GetDropdownOptionsAction(index=params.index), browser_session=browser_session
                    )
                except Exception as dropdown_error:
                    logger.debug(
                        f'Failed to get dropdown options as shortcut during click on dropdown: {type(dropdown_error).__name__}: {dropdown_error}'
                    )
            return ActionResult(error=error_msg)

        memory = f'Clicked {element_desc}'
        logger.info(f'🖱️ {memory}')

        # Pass the handler's metadata through when it is a dict.
        return ActionResult(
            extracted_content=memory,
            metadata=click_metadata if isinstance(click_metadata, dict) else None,
        )
    except BrowserError as e:
        return handle_browser_error(e)
    except Exception as e:
        error_msg = f'Failed to click element {params.index}: {str(e)}'
        return ActionResult(error=error_msg)
|
|
296
|
+
|
|
297
|
+
@self.registry.action(
    '',
    param_model=InputTextAction,
)
async def input(
    params: InputTextAction,
    browser_session: BrowserSession,
    has_sensitive_data: bool = False,
    sensitive_data: dict[str, str | dict[str, str]] | None = None,
):
    # Type ``params.text`` into the element at ``params.index``.
    #
    # When ``has_sensitive_data`` is set, the typed text is kept out of the
    # returned message and the debug log; the name of the matching
    # sensitive-data key is used instead when one can be found.
    node = await browser_session.get_element_by_index(params.index)
    if node is None:
        msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {msg}')
        return ActionResult(extracted_content=msg)

    # Fire-and-forget highlight; typing does not wait for it.
    asyncio.create_task(browser_session.highlight_interaction_element(node))

    try:
        # Work out which sensitive key (if any) the text corresponds to.
        key_name = (
            _detect_sensitive_key_name(params.text, sensitive_data) if has_sensitive_data and sensitive_data else None
        )

        type_event = browser_session.event_bus.dispatch(
            TypeTextEvent(
                node=node,
                text=params.text,
                clear=params.clear,
                is_sensitive=has_sensitive_data,
                sensitive_key_name=key_name,
            )
        )
        await type_event
        input_metadata = await type_event.event_result(raise_if_any=True, raise_if_none=False)

        # Pick the reported message and the log line, masking sensitive text.
        if not has_sensitive_data:
            summary = f"Typed '{params.text}'"
            log_line = f"Typed '{params.text}'"
        elif key_name:
            summary = f'Typed {key_name}'
            log_line = f'Typed <{key_name}>'
        else:
            summary = 'Typed sensitive data'
            log_line = 'Typed <sensitive>'

        logger.debug(log_line)

        # Pass the handler's metadata through when it is a dict.
        return ActionResult(
            extracted_content=summary,
            long_term_memory=summary,
            metadata=input_metadata if isinstance(input_metadata, dict) else None,
        )
    except BrowserError as e:
        return handle_browser_error(e)
    except Exception as e:
        # Log the full error for debugging, then report it as a result.
        logger.error(f'Failed to dispatch TypeTextEvent: {type(e).__name__}: {e}')
        return ActionResult(error=f'Failed to type text into element {params.index}: {e}')
|
|
363
|
+
|
|
364
|
+
@self.registry.action(
    '',
    param_model=UploadFileAction,
)
async def upload_file(
    params: UploadFileAction, browser_session: BrowserSession, available_file_paths: list[str], file_system: FileSystem
):
    # Upload ``params.path`` through a file input at or near element ``params.index``.
    #
    # Steps:
    #   1. Check the path is allowed: listed in ``available_file_paths``, a
    #      downloaded file, or a file managed by ``file_system`` (in which case
    #      the path is rewritten to the full path inside its directory).
    #      Remote browsers (``is_local`` false) skip this check.
    #   2. For local browsers, check the file exists on disk.
    #   3. Find a file input near the indexed element, else fall back to the
    #      file input closest to the current scroll position.
    #   4. Dispatch an UploadFileEvent for that input.
    #
    # Raises BrowserError when no file input exists on the page, when the upload
    # event fails, or (local browser, no FileSystem directory) when the path is
    # not available.
    if params.path not in available_file_paths and params.path not in browser_session.downloaded_files:
        has_fs_dir = bool(file_system and file_system.get_dir())
        if has_fs_dir and file_system.get_file(params.path):
            # Managed by the FileSystem service: the given path is just the
            # file name, so build the full path from the service directory.
            params = UploadFileAction(index=params.index, path=str(file_system.get_dir() / params.path))
        elif browser_session.is_local:
            msg = f'File path {params.path} is not available. To fix: The user must add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
            # NOTE(review): the two failure modes below differ (error result vs.
            # raised BrowserError) depending on whether a FileSystem directory
            # exists. Kept as-is for compatibility - confirm whether that is intended.
            if has_fs_dir:
                logger.error(f'❌ {msg}')
                return ActionResult(error=msg)
            raise BrowserError(message=msg, long_term_memory=msg)
        # Remote browser: allow a remote-accessible path that is not tracked locally.

    # For local browsers, ensure the file exists on the local filesystem
    if browser_session.is_local and not os.path.exists(params.path):
        return ActionResult(error=f'File {params.path} does not exist')

    # Get the selector map to find the node
    selector_map = await browser_session.get_selector_map()
    if params.index not in selector_map:
        return ActionResult(error=f'Element with index {params.index} does not exist.')

    node = selector_map[params.index]

    def find_file_input_near_element(
        node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
    ) -> EnhancedDOMTreeNode | None:
        """Find the closest file input to the selected element.

        Starting at ``node`` and walking up at most ``max_height`` ancestors,
        check at each level the node itself, its descendants and its siblings
        with their descendants, down to ``max_descendant_depth`` levels below each.
        """

        def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
            # Depth-limited, depth-first search that also tests ``n`` itself.
            if depth < 0:
                return None
            if browser_session.is_file_input(n):
                return n
            for child in n.children_nodes or []:
                result = find_file_input_in_descendants(child, depth - 1)
                if result:
                    return result
            return None

        current = node
        for _ in range(max_height + 1):
            # Check the current node itself
            if browser_session.is_file_input(current):
                return current
            # Check all descendants of the current node
            result = find_file_input_in_descendants(current, max_descendant_depth)
            if result:
                return result
            # Check all siblings and their descendants
            if current.parent_node:
                for sibling in current.parent_node.children_nodes or []:
                    if sibling is current:
                        continue
                    if browser_session.is_file_input(sibling):
                        return sibling
                    result = find_file_input_in_descendants(sibling, max_descendant_depth)
                    if result:
                        return result
            current = current.parent_node
            if not current:
                break
        return None

    # Try to find a file input element near the selected element
    file_input_node = find_file_input_near_element(node)

    # Fire-and-forget highlight of the input that was found.
    if file_input_node:
        asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))

    # Fallback: pick the file input closest to the current scroll position.
    if file_input_node is None:
        logger.info(
            f'No file upload element found near index {params.index}, searching for closest file input to scroll position'
        )

        # Read the current vertical scroll offset; treat any failure as 0.
        cdp_session = await browser_session.get_or_create_cdp_session()
        try:
            scroll_info = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={'expression': 'window.scrollY || window.pageYOffset || 0'}, session_id=cdp_session.session_id
            )
            current_scroll_y = scroll_info.get('result', {}).get('value', 0)
        except Exception:
            current_scroll_y = 0

        # Among file inputs that have a known position, keep the one whose
        # Y coordinate is nearest to the scroll offset.
        closest_file_input = None
        min_distance = float('inf')

        for element in selector_map.values():
            if browser_session.is_file_input(element) and element.absolute_position:
                distance = abs(element.absolute_position.y - current_scroll_y)
                if distance < min_distance:
                    min_distance = distance
                    closest_file_input = element

        if closest_file_input:
            file_input_node = closest_file_input
            logger.info(f'Found file input closest to scroll position (distance: {min_distance}px)')
            # Fire-and-forget highlight of the fallback input.
            asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))
        else:
            msg = 'No file upload element found on the page'
            logger.error(msg)
            raise BrowserError(msg)
            # TODO: figure out why this fails sometimes + add fallback hail mary, just look for any file input on page

    # Dispatch upload file event with the file input node
    try:
        event = browser_session.event_bus.dispatch(UploadFileEvent(node=file_input_node, file_path=params.path))
        await event
        await event.event_result(raise_if_any=True, raise_if_none=False)
        msg = f'Successfully uploaded file to index {params.index}'
        logger.info(f'📁 {msg}')
        return ActionResult(
            extracted_content=msg,
            long_term_memory=f'Uploaded file {params.path} to element {params.index}',
        )
    except Exception as e:
        logger.error(f'Failed to upload file: {e}')
        # Chain the original exception so its traceback is kept.
        raise BrowserError(f'Failed to upload file: {e}') from e
|
|
519
|
+
|
|
520
|
+
# Tab Management Actions
|
|
521
|
+
|
|
522
|
+
@self.registry.action(
    'Switch to another open tab by tab_id. Tab IDs are shown in browser state tabs list (last 4 chars of target_id). Use when you need to work with content in a different tab.',
    param_model=SwitchTabAction,
)
async def switch(params: SwitchTabAction, browser_session: BrowserSession):
    # Switch to the tab identified by ``params.tab_id``. This is best-effort:
    # handler errors are not raised, and any exception is reported as an
    # "attempted" switch rather than as an error result.
    try:
        target_id = await browser_session.get_target_id_from_tab_id(params.tab_id)

        switch_event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=target_id))
        await switch_event
        # raise_if_any=False: handler errors are deliberately not raised here.
        new_target_id = await switch_event.event_result(raise_if_any=False, raise_if_none=False)

        # Report the last 4 chars of the returned target id when there is one,
        # otherwise the tab id that was requested.
        shown_id = new_target_id[-4:] if new_target_id else params.tab_id
        memory = f'Switched to tab #{shown_id}'

        logger.info(f'🔄 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        logger.warning(f'Tab switch may have failed: {e}')
        memory = f'Attempted to switch to tab #{params.tab_id}'
        return ActionResult(extracted_content=memory, long_term_memory=memory)
|
|
546
|
+
|
|
547
|
+
@self.registry.action(
    'Close a tab by tab_id. Tab IDs are shown in browser state tabs list (last 4 chars of target_id). Use to clean up tabs you no longer need.',
    param_model=CloseTabAction,
)
async def close(params: CloseTabAction, browser_session: BrowserSession):
    # Resolve the short tab id to a target id and dispatch a CloseTabEvent.
    # Stale or invalid ids are tolerated: any exception is logged as a warning
    # and the tab is reported as already closed.
    try:
        resolved_target = await browser_session.get_target_id_from_tab_id(params.tab_id)

        close_event = browser_session.event_bus.dispatch(CloseTabEvent(target_id=resolved_target))
        await close_event
        # Handler errors are intentionally not raised here.
        await close_event.event_result(raise_if_any=False, raise_if_none=False)

        memory = f'Closed tab #{params.tab_id}'
        logger.info(f'🗑️ {memory}')
    except Exception as e:
        logger.warning(f'Tab {params.tab_id} may already be closed: {e}')
        memory = f'Tab #{params.tab_id} closed (was already closed or invalid)'

    return ActionResult(
        extracted_content=memory,
        long_term_memory=memory,
    )
|
|
575
|
+
|
|
576
|
+
# Content Actions
|
|
577
|
+
|
|
578
|
+
# TODO: Refactor to use events instead of direct page access
|
|
579
|
+
# This action is temporarily disabled as it needs refactoring to use events
|
|
580
|
+
|
|
581
|
+
@self.registry.action(
    """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if truncated. If fails, use find_text instead.""",
)
async def extract(
    params: ExtractAction,
    browser_session: BrowserSession,
    page_extraction_llm: BaseChatModel,
    file_system: FileSystem,
):
    # Convert the current page to filtered markdown, send at most
    # MAX_CHAR_LIMIT characters of it (starting at start_from_char) to the
    # extraction LLM together with the query, and return the LLM answer.
    #
    # params may be an ExtractAction or a plain dict with the same keys.
    # Returns an ActionResult; results of 1000+ characters are also saved via
    # file_system.save_extracted_content and only referenced in memory.
    # Raises RuntimeError (chained to the underlying exception) when markdown
    # extraction or the LLM call fails.
    MAX_CHAR_LIMIT = 30000
    query = params['query'] if isinstance(params, dict) else params.query
    extract_links = params['extract_links'] if isinstance(params, dict) else params.extract_links
    start_from_char = params['start_from_char'] if isinstance(params, dict) else params.start_from_char

    # Extract clean markdown using the unified method
    try:
        from browser_use.dom.markdown_extractor import extract_clean_markdown

        content, content_stats = await extract_clean_markdown(
            browser_session=browser_session, extract_links=extract_links
        )
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f'Could not extract clean markdown: {type(e).__name__}') from e

    # Length of the filtered markdown before any offset/truncation is applied
    final_filtered_length = content_stats['final_filtered_chars']

    if start_from_char > 0:
        if start_from_char >= len(content):
            return ActionResult(
                error=f'start_from_char ({start_from_char}) exceeds content length {final_filtered_length} characters.'
            )
        content = content[start_from_char:]
        content_stats['started_from_char'] = start_from_char

    # Truncate at a natural break point where possible
    truncated = False
    if len(content) > MAX_CHAR_LIMIT:
        truncate_at = MAX_CHAR_LIMIT

        # Prefer a paragraph break within the last 500 chars of the limit ...
        paragraph_break = content.rfind('\n\n', MAX_CHAR_LIMIT - 500, MAX_CHAR_LIMIT)
        if paragraph_break > 0:
            truncate_at = paragraph_break
        else:
            # ... otherwise a sentence end within the last 200 chars.
            sentence_break = content.rfind('.', MAX_CHAR_LIMIT - 200, MAX_CHAR_LIMIT)
            if sentence_break > 0:
                truncate_at = sentence_break + 1

        content = content[:truncate_at]
        truncated = True
        # Absolute offset the caller should pass as start_from_char to continue.
        next_start = (start_from_char or 0) + truncate_at
        content_stats['truncated_at_char'] = truncate_at
        content_stats['next_start_char'] = next_start

    # Summarise the processing statistics for the LLM
    original_html_length = content_stats['original_html_chars']
    initial_markdown_length = content_stats['initial_markdown_chars']
    chars_filtered = content_stats['filtered_chars_removed']

    stats_summary = f"""Content processed: {original_html_length:,} HTML chars → {initial_markdown_length:,} initial markdown → {final_filtered_length:,} filtered markdown"""
    if start_from_char > 0:
        stats_summary += f' (started from char {start_from_char:,})'
    if truncated:
        stats_summary += f' → {len(content):,} final chars (truncated, use start_from_char={content_stats["next_start_char"]} to continue)'
    elif chars_filtered > 0:
        stats_summary += f' (filtered {chars_filtered:,} chars of noise)'

    system_prompt = """
You are an expert at extracting data from the markdown of a webpage.

<input>
You will be given a query and the markdown of a webpage that has been filtered to remove noise and advertising content.
</input>

<instructions>
- You are tasked to extract information from the webpage that is relevant to the query.
- You should ONLY use the information available in the webpage to answer the query. Do not make up information or provide guess from your own knowledge.
- If the information relevant to the query is not available in the page, your response should mention that.
- If the query asks for all items, products, etc., make sure to directly list all of them.
- If the content was truncated and you need more information, note that the user can use start_from_char parameter to continue from where truncation occurred.
</instructions>

<output>
- Your output should present ALL the information relevant to the query in a concise way.
- Do not answer in conversational format - directly output the relevant information or that the information is unavailable.
</output>
""".strip()

    prompt = f'<query>\n{query}\n</query>\n\n<content_stats>\n{stats_summary}\n</content_stats>\n\n<webpage_content>\n{content}\n</webpage_content>'

    try:
        response = await asyncio.wait_for(
            page_extraction_llm.ainvoke([SystemMessage(content=system_prompt), UserMessage(content=prompt)]),
            timeout=120.0,
        )

        current_url = await browser_session.get_current_page_url()
        extracted_content = (
            f'<url>\n{current_url}\n</url>\n<query>\n{query}\n</query>\n<result>\n{response.completion}\n</result>'
        )

        # Short results go straight into memory; long ones are saved to a file
        # and shown to the model only once.
        MAX_MEMORY_LENGTH = 1000
        if len(extracted_content) < MAX_MEMORY_LENGTH:
            memory = extracted_content
            include_extracted_content_only_once = False
        else:
            file_name = await file_system.save_extracted_content(extracted_content)
            memory = f'Query: {query}\nContent in {file_name} and once in <read_state>.'
            include_extracted_content_only_once = True

        logger.info(f'📄 {memory}')
        return ActionResult(
            extracted_content=extracted_content,
            include_extracted_content_only_once=include_extracted_content_only_once,
            long_term_memory=memory,
        )
    except Exception as e:
        logger.debug(f'Error extracting content: {e}')
        # A wait_for timeout has an empty str(); fall back to the exception
        # type name so the error is never blank, and keep the cause chained.
        raise RuntimeError(str(e) or type(e).__name__) from e
|
|
705
|
+
|
|
706
|
+
@self.registry.action(
    """Scroll by pages (down=True/False, pages=0.5-10.0, default 1.0). Use index for scroll containers (dropdowns/custom UI). High pages (10) reaches bottom. Multi-page scrolls sequentially. Viewport-based height, fallback 1000px/page.""",
    param_model=ScrollAction,
)
async def scroll(params: ScrollAction, browser_session: BrowserSession):
    # Scroll the page (or the element at params.index) by params.pages
    # viewport heights in the direction given by params.down.
    #
    # Returns an ActionResult describing the scroll, or an ActionResult with
    # `error` set when the element index is unknown or the scroll fails.
    try:
        # Look up the node from the selector map if index is provided.
        # Special case: index 0 (or None) means scroll the whole page.
        node = None
        if params.index is not None and params.index != 0:
            node = await browser_session.get_element_by_index(params.index)
            if node is None:
                # Element does not exist
                msg = f'Element index {params.index} not found in browser state'
                return ActionResult(error=msg)

        direction = 'down' if params.down else 'up'
        # node is only set when a non-zero index was resolved above.
        target = f'element {params.index}' if node is not None else ''

        # Get actual viewport height for more accurate scrolling
        try:
            cdp_session = await browser_session.get_or_create_cdp_session()
            metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)

            # Prefer cssVisualViewport, then cssLayoutViewport, then 1000px
            css_viewport = metrics.get('cssVisualViewport', {})
            css_layout_viewport = metrics.get('cssLayoutViewport', {})
            viewport_height = int(css_viewport.get('clientHeight') or css_layout_viewport.get('clientHeight', 1000))

            logger.debug(f'Detected viewport height: {viewport_height}px')
        except Exception as e:
            viewport_height = 1000  # Fallback to 1000px
            logger.debug(f'Failed to get viewport height, using fallback 1000px: {e}')

        async def _dispatch_scroll(amount: int) -> None:
            # Dispatch one ScrollEvent and wait for it, raising on handler errors.
            # The direction is carried by `direction`; the amount is a magnitude.
            event = browser_session.event_bus.dispatch(ScrollEvent(direction=direction, amount=amount, node=node))
            await event
            await event.event_result(raise_if_any=True, raise_if_none=False)

        if params.pages >= 1.0:
            # For multiple pages, scroll one page at a time so each scroll
            # completes before the next one starts.
            import asyncio

            num_full_pages = int(params.pages)
            remaining_fraction = params.pages - num_full_pages
            completed_scrolls = 0

            for i in range(num_full_pages):
                try:
                    await _dispatch_scroll(viewport_height)
                    completed_scrolls += 1
                    # Small delay to ensure scroll completes before next one
                    await asyncio.sleep(0.3)
                except Exception as e:
                    # Continue with remaining scrolls even if one fails
                    logger.warning(f'Scroll {i + 1}/{num_full_pages} failed: {e}')

            # Handle fractional page if present
            if remaining_fraction > 0:
                try:
                    await _dispatch_scroll(int(remaining_fraction * viewport_height))
                    completed_scrolls += remaining_fraction
                except Exception as e:
                    logger.warning(f'Fractional scroll failed: {e}')

            if params.pages == 1.0:
                amount_text = f'{viewport_height}px'
            else:
                amount_text = f'{completed_scrolls:.1f} pages'
        else:
            # For fractional pages <1.0, do single scroll
            await _dispatch_scroll(int(params.pages * viewport_height))
            amount_text = f'{params.pages} pages'

        # Join only the non-empty parts so a page-level scroll (empty target)
        # does not leave a double space in the message.
        long_term_memory = ' '.join(part for part in ('Scrolled', direction, target, amount_text) if part)

        msg = f'🔍 {long_term_memory}'
        logger.info(msg)
        return ActionResult(extracted_content=msg, long_term_memory=long_term_memory)
    except Exception as e:
        logger.error(f'Failed to dispatch ScrollEvent: {type(e).__name__}: {e}')
        error_msg = 'Failed to execute scroll action.'
        return ActionResult(error=error_msg)
|
|
810
|
+
|
|
811
|
+
@self.registry.action(
    '',
    param_model=SendKeysAction,
)
async def send_keys(params: SendKeysAction, browser_session: BrowserSession):
    # Dispatch a SendKeysEvent carrying params.keys and wait for the handler.
    # Handler failures are reported through ActionResult.error, not raised.
    try:
        key_event = browser_session.event_bus.dispatch(SendKeysEvent(keys=params.keys))
        await key_event
        await key_event.event_result(raise_if_any=True, raise_if_none=False)

        memory = f'Sent keys: {params.keys}'
        logger.info(f'⌨️ {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        logger.error(f'Failed to dispatch SendKeysEvent: {type(e).__name__}: {e}')
        return ActionResult(error=f'Failed to send keys: {str(e)}')
|
|
829
|
+
|
|
830
|
+
@self.registry.action('Scroll to text.')
async def find_text(text: str, browser_session: BrowserSession):  # type: ignore
    # Dispatch a ScrollToTextEvent for `text`. Any exception from the handler
    # is reported as "not found" in a normal (non-error) ActionResult.
    scroll_event = browser_session.event_bus.dispatch(ScrollToTextEvent(text=text))

    try:
        # The handler returns None on success or raises an exception if text not found
        await scroll_event.event_result(raise_if_any=True, raise_if_none=False)
        memory = f'Scrolled to text: {text}'
        logger.info(f'🔍 {memory}')
        return ActionResult(extracted_content=memory, long_term_memory=memory)
    except Exception as e:
        # Text not found
        not_found_msg = f"Text '{text}' not found or not visible on page"
        logger.info(not_found_msg)
        return ActionResult(
            extracted_content=not_found_msg,
            long_term_memory=f"Tried scrolling to text '{text}' but it was not found",
        )
|
|
850
|
+
|
|
851
|
+
@self.registry.action(
    'Get a screenshot of the current viewport. Use when: visual inspection needed, layout unclear, element positions uncertain, debugging UI issues, or verifying page state. Screenshot is included in the next browser_state No parameters are needed.',
    param_model=NoParamsAction,
)
async def screenshot(_: NoParamsAction):
    """Request that a screenshot be included in the next observation"""
    # No screenshot is captured here; the result only carries a metadata flag
    # asking for one to be included.
    memory = 'Requested screenshot for next observation'
    logger.info(f'📸 {memory}')

    return ActionResult(
        extracted_content=memory,
        metadata={'include_screenshot': True},
    )
|
|
866
|
+
|
|
867
|
+
# Dropdown Actions
|
|
868
|
+
|
|
869
|
+
@self.registry.action(
    '',
    param_model=GetDropdownOptionsAction,
)
async def dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession):
    """Get all options from a native dropdown or ARIA menu"""
    # Resolve the element from the selector map; a missing element is reported
    # as regular content (not as an error).
    element = await browser_session.get_element_by_index(params.index)
    if element is None:
        msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {msg}')
        return ActionResult(extracted_content=msg)

    # Ask the event handler for the options; handler errors and a missing
    # result are raised, with a 3 second timeout.
    options_event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=element))
    dropdown_data = await options_event.event_result(timeout=3.0, raise_if_none=True, raise_if_any=True)

    if not dropdown_data:
        raise ValueError('Failed to get dropdown options - no data returned')

    # The handler provides ready-made short- and long-term memory strings.
    return ActionResult(
        extracted_content=dropdown_data['short_term_memory'],
        long_term_memory=dropdown_data['long_term_memory'],
        include_extracted_content_only_once=True,
    )
|
|
896
|
+
|
|
897
|
+
@self.registry.action(
    'Set the option of a <select> element.',
    param_model=SelectDropdownOptionAction,
)
async def select_dropdown(params: SelectDropdownOptionAction, browser_session: BrowserSession):
    """Select dropdown option by the text of the option you want to select"""
    # Resolve the element from the selector map; a missing element is reported
    # as regular content (not as an error).
    element = await browser_session.get_element_by_index(params.index)
    if element is None:
        msg = f'Element index {params.index} not available - page may have changed. Try refreshing browser state.'
        logger.warning(f'⚠️ {msg}')
        return ActionResult(extracted_content=msg)

    from browser_use.browser.events import SelectDropdownOptionEvent

    select_event = browser_session.event_bus.dispatch(SelectDropdownOptionEvent(node=element, text=params.text))
    selection_data = await select_event.event_result()

    if not selection_data:
        raise ValueError('Failed to select dropdown option - no data returned')

    # Success is signalled by the string 'true' under the 'success' key.
    if selection_data.get('success') == 'true':
        return ActionResult(
            extracted_content=selection_data.get('message', f'Selected option: {params.text}'),
            include_in_memory=True,
            long_term_memory=f"Selected dropdown option '{params.text}' at index {params.index}",
        )

    # Structured failure response supplied by the handler.
    # TODO: raise BrowserError instead of returning ActionResult
    if 'short_term_memory' in selection_data and 'long_term_memory' in selection_data:
        return ActionResult(
            extracted_content=selection_data['short_term_memory'],
            long_term_memory=selection_data['long_term_memory'],
            include_extracted_content_only_once=True,
        )

    # Fallback to a plain error message.
    return ActionResult(error=selection_data.get('error', f'Failed to select option: {params.text}'))
|
|
941
|
+
|
|
942
|
+
# File System Actions
|
|
943
|
+
|
|
944
|
+
@self.registry.action(
    'Write content to a file in the local file system. Use this to create new files or overwrite entire file contents. For targeted edits within existing files, use replace_file instead. Supports alphanumeric filename and file extension formats: .txt, .md, .json, .jsonl, .csv, .pdf. For PDF files, write content in markdown format and it will be automatically converted to a properly formatted PDF document.'
)
async def write_file(
    file_name: str,
    content: str,
    file_system: FileSystem,
    append: bool = False,
    trailing_newline: bool = True,
    leading_newline: bool = False,
):
    # Write (or append, when `append` is true) `content` to `file_name` through
    # the agent file system, optionally padding it with a trailing and/or
    # leading newline. The status string returned by the file system becomes
    # both the extracted content and the long-term memory.
    if trailing_newline:
        content = f'{content}\n'
    if leading_newline:
        content = f'\n{content}'

    store = file_system.append_file if append else file_system.write_file
    result = await store(file_name, content)

    # Log the full path where the file is stored
    logger.info(f'💾 {result} File location: {file_system.get_dir() / file_name}')

    return ActionResult(extracted_content=result, long_term_memory=result)
|
|
969
|
+
|
|
970
|
+
@self.registry.action(
    'Replace specific text within a file by searching for old_str and replacing with new_str. Use this for targeted edits like updating todo checkboxes or modifying specific lines without rewriting the entire file.'
)
async def replace_file(file_name: str, old_str: str, new_str: str, file_system: FileSystem):
    # Delegate the replacement to the file system and report its status
    # message as both extracted content and long-term memory.
    outcome = await file_system.replace_file_str(file_name, old_str, new_str)
    logger.info(f'💾 {outcome}')
    return ActionResult(extracted_content=outcome, long_term_memory=outcome)
|
|
977
|
+
|
|
978
|
+
@self.registry.action(
    'Read the complete content of a file. Use this to view file contents before editing or to retrieve data from files. Supports text files (txt, md, json, csv, jsonl), documents (pdf, docx), and images (jpg, png).'
)
async def read_file(file_name: str, available_file_paths: list[str], file_system: FileSystem):
    # Read `file_name` through the file system. Names listed in
    # available_file_paths are read with external_file=True. The full message
    # is returned once as extracted content; long-term memory holds a
    # line-based preview kept under MAX_MEMORY_SIZE characters.
    is_external = bool(available_file_paths) and file_name in available_file_paths
    if is_external:
        structured_result = await file_system.read_file_structured(file_name, external_file=True)
    else:
        structured_result = await file_system.read_file_structured(file_name)

    result = structured_result['message']
    images = structured_result.get('images')

    MAX_MEMORY_SIZE = 1000
    if images:
        # For images, create a shorter memory message
        memory = f'Read image file {file_name}'
    elif len(result) <= MAX_MEMORY_SIZE:
        memory = result
    else:
        # Keep whole lines while the preview stays below the size limit.
        all_lines = result.splitlines()
        kept_lines = []
        used_chars = 0
        for text_line in all_lines:
            if used_chars + len(text_line) >= MAX_MEMORY_SIZE:
                break
            kept_lines.append(text_line)
            used_chars += len(text_line) + 1  # +1 for the newline added below
        preview = ''.join(f'{text_line}\n' for text_line in kept_lines)
        omitted = len(all_lines) - len(kept_lines)
        memory = f'{preview}{omitted} more lines...' if omitted > 0 else preview

    logger.info(f'💾 {memory}')
    return ActionResult(
        extracted_content=result,
        long_term_memory=memory,
        images=images,
        include_extracted_content_only_once=True,
    )
|
|
1015
|
+
|
|
1016
|
+
@self.registry.action(
    """Execute browser JavaScript. Best practice: wrap in IIFE (function(){...})() with try-catch for safety. Use ONLY browser APIs (document, window, DOM). NO Node.js APIs (fs, require, process). Example: (function(){try{const el=document.querySelector('#id');return el?el.value:'not found'}catch(e){return 'Error: '+e.message}})() Avoid comments. Use for hover, drag, zoom, custom selectors, extract/filter links, shadow DOM, or analysing page structure. Limit output size.""",
)
async def evaluate(code: str, browser_session: BrowserSession):
    # Run `code` in the page through CDP Runtime.evaluate and return its value
    # as text. JavaScript exceptions and CDP failures are reported through
    # ActionResult.error. Embedded base64 data-URL images are moved into the
    # result metadata and replaced by a placeholder in the text.
    cdp_session = await browser_session.get_or_create_cdp_session()

    try:
        # Validate and potentially fix JavaScript code before execution
        validated_code = self._validate_and_fix_javascript(code)

        # Always use awaitPromise=True - it's ignored for non-promises
        eval_response = await cdp_session.cdp_client.send.Runtime.evaluate(
            params={'expression': validated_code, 'returnByValue': True, 'awaitPromise': True},
            session_id=cdp_session.session_id,
        )

        # Check for JavaScript execution errors
        exception = eval_response.get('exceptionDetails')
        if exception:
            error_msg = f'JavaScript execution error: {exception.get("text", "Unknown error")}'

            # Enhanced error message with the first 500 chars of the executed code
            code_preview = validated_code[:500] + ('...' if len(validated_code) > 500 else '')
            enhanced_msg = (
                'JavaScript Execution Failed:\n'
                f'{error_msg}\n'
                '\n'
                'Validated Code (after quote fixing):\n'
                f'{code_preview}\n'
            )

            logger.debug(enhanced_msg)
            return ActionResult(error=enhanced_msg)

        result_data = eval_response.get('result', {})

        # Check for wasThrown flag (backup error detection)
        if result_data.get('wasThrown'):
            msg = f'JavaScript code: {code} execution failed (wasThrown=true)'
            logger.debug(msg)
            return ActionResult(error=msg)

        value = result_data.get('value')

        # Turn the returned value into text
        if isinstance(value, (dict, list)):
            # Complex objects - should be serialized by returnByValue
            try:
                result_text = json.dumps(value, ensure_ascii=False)
            except (TypeError, ValueError):
                # Fallback for non-serializable objects
                result_text = str(value)
        elif value is not None:
            # Primitive values (string, number, boolean)
            result_text = str(value)
        elif 'value' in result_data:
            # An explicit null value
            result_text = str(value)
        else:
            result_text = 'undefined'

        import re

        found_images = re.findall(r'(data:image/[^;]+;base64,[A-Za-z0-9+/=]+)', result_text)

        metadata = None
        if found_images:
            # Store images in metadata so they can be added as ContentPartImageParam
            metadata = {'images': found_images}

            # Replace image data in result text with shorter placeholder
            for image_data in found_images:
                result_text = result_text.replace(image_data, '[Image]')

        # Apply length limit (after image extraction)
        if len(result_text) > 20000:
            result_text = result_text[:19950] + '\n... [Truncated after 20000 characters]'

        # Only the result length is logged, not the code
        logger.debug(f'JavaScript executed successfully, result length: {len(result_text)}')

        # Return only the result, not the code
        return ActionResult(extracted_content=result_text, metadata=metadata)

    except Exception as e:
        # CDP communication or other system errors
        error_msg = f'Failed to execute JavaScript: {type(e).__name__}: {e}'
        logger.debug(f'JavaScript code that failed: {code[:200]}...')
        return ActionResult(error=error_msg)
|
|
1109
|
+
|
|
1110
|
+
def _validate_and_fix_javascript(self, code: str) -> str:
    """Validate and fix common JavaScript issues before execution.

    Applies, in order:

    1. Unescape backslash-escaped double quotes (``\\"`` becomes ``"``).
    2. Collapse a doubled backslash in front of common regex escapes and
       regex metacharacters.
    3. For ``document.evaluate``, ``querySelector``/``querySelectorAll``,
       ``.closest`` and ``.matches``: when the first argument is a
       double-quoted string that contains a single quote, turn that string
       into a template literal.

    Only the string literal is rewritten in step 3; the rest of the call
    (closing parenthesis, comma, further arguments) is left untouched so the
    output never gains a stray ``)`` or ``,``.

    Args:
        code: JavaScript source to check.

    Returns:
        The possibly rewritten JavaScript source.
    """

    import re

    # Pattern 1: unescape backslash-escaped double quotes
    fixed_code = re.sub(r'\\"', '"', code)

    # Pattern 2: fix over-escaped regex patterns (regex gets double-escaped during parsing)
    fixed_code = re.sub(r'\\\\([dDsSwWbBnrtfv])', r'\\\1', fixed_code)
    fixed_code = re.sub(r'\\\\([.*+?^${}()|[\]])', r'\\\1', fixed_code)

    # Patterns 3-6: calls whose first argument is a double-quoted string
    # containing a single quote. Applied one callee at a time, in this order.
    # Note: getAttribute is deliberately not handled - attribute names rarely
    # contain mixed quotes.
    mixed_quote_callees = (
        r'document\.evaluate',
        r'querySelector(?:All)?',
        r'\.closest',
        r'\.matches',
    )

    def to_template_literal(match):
        # Group 1 is the callee, group 2 the string body. The match ends at the
        # closing double quote, so nothing after the literal may be emitted here.
        return f'{match.group(1)}(`{match.group(2)}`'

    for callee in mixed_quote_callees:
        call_pattern = rf'({callee})\s*\(\s*"([^"]*\'[^"]*)"'
        fixed_code = re.sub(call_pattern, to_template_literal, fixed_code)

    # Log changes made
    changes_made = []
    if r'\"' in code and r'\"' not in fixed_code:
        changes_made.append('fixed escaped quotes')
    if '`' in fixed_code and '`' not in code:
        changes_made.append('converted mixed quotes to template literals')

    if changes_made:
        logger.debug(f'JavaScript fixes applied: {", ".join(changes_made)}')

    return fixed_code
|
|
1174
|
+
|
|
1175
|
+
def _register_done_action(self, output_model: type[T] | None, display_files_in_done_text: bool = True):
    """Register the ``done`` action that ends a task.

    One of two variants is registered on ``self.registry``, chosen by ``output_model``:

    * ``output_model`` is given: ``done`` takes ``StructuredOutputAction[output_model]`` and returns
      ``params.data`` serialized as JSON in ``extracted_content``.
    * ``output_model`` is ``None``: ``done`` takes ``DoneAction`` and returns ``params.text``, optionally
      followed by the contents of the files named in ``params.files_to_display``.

    Args:
        output_model: Model class for the structured variant, or ``None`` for the plain-text variant.
        display_files_in_done_text: Stored on ``self.display_files_in_done_text`` -- but only when
            ``output_model`` is given.
    """
    # NOTE(review): the attribute is written only in the structured branch, yet it is read only by the
    # plain-text ``done`` below. Presumably ``__init__`` also sets it -- confirm against the constructor.
    if output_model is not None:
        self.display_files_in_done_text = display_files_in_done_text

        def _enums_to_values(value):
            """Recursively replace enum members (at any depth) by their ``.value``."""
            if isinstance(value, enum.Enum):
                return _enums_to_values(value.value)
            if isinstance(value, dict):
                return {
                    (key.value if isinstance(key, enum.Enum) else key): _enums_to_values(item)
                    for key, item in value.items()
                }
            if isinstance(value, (list, tuple)):
                # json.dumps renders tuples and lists identically, so a list is a safe stand-in.
                return [_enums_to_values(item) for item in value]
            return value

        @self.registry.action(
            'Complete task with structured output.',
            param_model=StructuredOutputAction[output_model],
        )
        async def done(params: StructuredOutputAction):
            # Only ``params.data`` is dumped; ``params.success`` is reported separately below and
            # therefore never appears in the JSON payload.
            # model_dump() leaves enum members as objects, which json.dumps rejects. Converting only the
            # top-level values would still fail for enums inside lists, dicts or nested models.
            output_dict = _enums_to_values(params.data.model_dump())

            return ActionResult(
                is_done=True,
                success=params.success,
                extracted_content=json.dumps(output_dict, ensure_ascii=False),
                long_term_memory=f'Task completed. Success Status: {params.success}',
            )

    else:

        @self.registry.action(
            'Complete task.',
            param_model=DoneAction,
        )
        async def done(params: DoneAction, file_system: FileSystem):
            user_message = params.text

            # Long-term memory keeps only the first 100 characters of the answer plus a count of the rest.
            len_text = len(params.text)
            len_max_memory = 100
            memory = f'Task completed: {params.success} - {params.text[:len_max_memory]}'
            if len_text > len_max_memory:
                memory += f' - {len_text - len_max_memory} more characters'

            attachments = []
            if params.files_to_display:
                show_contents = self.display_files_in_done_text
                file_msg = ''
                for file_name in params.files_to_display:
                    file_content = file_system.display_file(file_name)
                    if not file_content:
                        # Files for which display_file() returns nothing are neither shown nor attached.
                        continue
                    attachments.append(file_name)
                    if show_contents:
                        file_msg += f'\n\n{file_name}:\n{file_content}'

                if show_contents:
                    if file_msg:
                        user_message += '\n\nAttachments:'
                        user_message += file_msg
                    else:
                        logger.warning('Agent wanted to display files but none were found')

            # Attachments are reported as paths inside the FileSystem directory.
            attachments = [str(file_system.get_dir() / file_name) for file_name in attachments]

            return ActionResult(
                is_done=True,
                success=params.success,
                extracted_content=user_message,
                long_term_memory=memory,
                attachments=attachments,
            )
|
|
1243
|
+
|
|
1244
|
+
def use_structured_output_action(self, output_model: type[T]):
    """Register the structured-output ``done`` action for ``output_model``.

    Delegates to ``_register_done_action`` with only the model supplied, so that method's own
    default for ``display_files_in_done_text`` applies.
    """
    register_done = self._register_done_action
    register_done(output_model)
|
|
1246
|
+
|
|
1247
|
+
# Register ---------------------------------------------------------------
|
|
1248
|
+
|
|
1249
|
+
def action(self, description: str, **kwargs):
    """Return the decorator used to register a custom action on this instance's registry.

    @param description: Tells the LLM what the function does (better description == better function calling)
    @param kwargs: Extra keyword options, handed unchanged to ``self.registry.action``
    """
    register = self.registry.action
    decorator = register(description, **kwargs)
    return decorator
|
|
1255
|
+
|
|
1256
|
+
# Act --------------------------------------------------------------------
|
|
1257
|
+
# NOTE(review): ``time_execution_sync`` decorates an ``async def`` here; whether that helper times the
# awaited work or only the creation of the coroutine cannot be told from this block -- confirm.
@observe_debug(ignore_input=True, ignore_output=True, name='act')
@time_execution_sync('--act')
async def act(
    self,
    action: ActionModel,
    browser_session: BrowserSession,
    page_extraction_llm: BaseChatModel | None = None,
    sensitive_data: dict[str, str | dict[str, str]] | None = None,
    available_file_paths: list[str] | None = None,
    file_system: FileSystem | None = None,
) -> ActionResult:
    """Execute an action and normalize whatever it returns into an ``ActionResult``.

    The first field of ``action`` that was explicitly set to a non-``None`` value is executed through
    ``self.registry.execute_action``; the method returns right after it, so later fields are not run.

    Args:
        action: Model whose set fields map an action name to that action's parameters.
        browser_session: Passed through to the executed action.
        page_extraction_llm: Passed through to the executed action.
        sensitive_data: Passed through to the executed action.
        available_file_paths: Passed through to the executed action.
        file_system: Passed through to the executed action.

    Returns:
        The action's ``ActionResult``; a ``str`` result wrapped as ``extracted_content``; an empty
        ``ActionResult`` when the action returned ``None`` or when no field was set. Exceptions raised
        by the action are converted into an ``ActionResult`` carrying the error instead of propagating.

    Raises:
        ValueError: If the action returned something other than ``str``, ``ActionResult`` or ``None``.
    """
    from contextlib import nullcontext

    for action_name, params in action.model_dump(exclude_unset=True).items():
        if params is None:
            continue

        # Use Laminar span if available, otherwise use no-op context manager
        if Laminar is not None:
            span_context = Laminar.start_as_current_span(
                name=action_name,
                input={
                    'action': action_name,
                    'params': params,
                },
                span_type='TOOL',
            )
        else:
            span_context = nullcontext()

        with span_context:
            try:
                result = await self.registry.execute_action(
                    action_name=action_name,
                    params=params,
                    browser_session=browser_session,
                    page_extraction_llm=page_extraction_llm,
                    file_system=file_system,
                    sensitive_data=sensitive_data,
                    available_file_paths=available_file_paths,
                )
            except BrowserError as e:
                logger.error(f'❌ Action {action_name} failed with BrowserError: {str(e)}')
                result = handle_browser_error(e)
            except TimeoutError as e:
                logger.error(f'❌ Action {action_name} failed with TimeoutError: {str(e)}')
                result = ActionResult(error=f'{action_name} was not executed due to timeout.')
            except Exception as e:
                # The exception is turned into an ActionResult below, so this is the only place the
                # traceback can still be recorded. Keep the one-line error as before and attach the
                # traceback at debug level to avoid flooding normal logs.
                logger.error(f"Action '{action_name}' failed with error: {str(e)}")
                logger.debug('Traceback for failed action %r', action_name, exc_info=True)
                result = ActionResult(error=str(e))

            if Laminar is not None:
                Laminar.set_span_output(result)

        # Normalize the loosely-typed return value of the action.
        if isinstance(result, str):
            return ActionResult(extracted_content=result)
        if isinstance(result, ActionResult):
            return result
        if result is None:
            return ActionResult()
        raise ValueError(f'Invalid action result type: {type(result)} of {result}')

    # No field of the action model carried parameters.
    return ActionResult()
|
|
1322
|
+
|
|
1323
|
+
def __getattr__(self, name: str):
    """
    Enable direct action calls like tools.navigate(url=..., browser_session=...).
    This provides a simpler API for tests and direct usage while maintaining backward compatibility.

    Python calls this hook only after normal attribute lookup has failed. If ``name`` is a registered
    action, an async wrapper is returned that builds the action's parameter model from the keyword
    arguments and runs it through ``self.act``. Any other name raises ``AttributeError``.
    """
    # Fetch ``registry`` without going through this fallback. Plain ``self.registry`` would re-enter
    # __getattr__ whenever ``registry`` itself is missing (for example on an instance created without
    # __init__, as happens while copy/pickle probe a fresh object for ``__setstate__``) and recurse
    # until RecursionError instead of reporting a clean AttributeError.
    try:
        registry = object.__getattribute__(self, 'registry')
    except AttributeError:
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") from None

    # Check if this is a registered action
    if name in registry.registry.actions:
        from typing import Union

        from pydantic import create_model

        action = registry.registry.actions[name]

        # Create a wrapper that calls act() to ensure consistent error handling and result normalization
        async def action_wrapper(**kwargs):
            # Extract browser_session (passed to act() explicitly below)
            browser_session = kwargs.get('browser_session')

            # Separate action params from special params (injected dependencies)
            special_param_names = {
                'browser_session',
                'page_extraction_llm',
                'file_system',
                'available_file_paths',
                'sensitive_data',
            }

            # Extract action params (params for the action itself)
            action_params = {k: v for k, v in kwargs.items() if k not in special_param_names}

            # Extract special params (injected dependencies) - browser_session is handed over separately
            special_kwargs = {k: v for k, v in kwargs.items() if k in special_param_names and k != 'browser_session'}

            # Create the param instance
            params_instance = action.param_model(**action_params)

            # Dynamically create an ActionModel with this action as its single optional field
            # Use Union for type compatibility with create_model
            DynamicActionModel = create_model(
                'DynamicActionModel',
                __base__=ActionModel,
                **{name: (Union[action.param_model, None], None)},  # type: ignore
            )

            # Create the action model instance
            action_model = DynamicActionModel(**{name: params_instance})

            # Call act() which has all the error handling, result normalization, and observability
            return await self.act(action=action_model, browser_session=browser_session, **special_kwargs)  # type: ignore

        return action_wrapper

    # If not an action, raise AttributeError for normal Python behavior
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
|
|
1378
|
+
|
|
1379
|
+
|
|
1380
|
+
# Alias for backwards compatibility: `Controller` is bound to the very same class object as `Tools`.
Controller = Tools
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
class CodeAgentTools(Tools[Context]):
|
|
1385
|
+
"""Specialized Tools for CodeAgent agent optimized for Python-based browser automation.
|
|
1386
|
+
|
|
1387
|
+
Includes:
|
|
1388
|
+
- All browser interaction tools (click, input, scroll, navigate, etc.)
|
|
1389
|
+
- JavaScript evaluation
|
|
1390
|
+
- Tab management (switch, close)
|
|
1391
|
+
- Navigation actions (go_back)
|
|
1392
|
+
- Upload file support
|
|
1393
|
+
- Dropdown interactions
|
|
1394
|
+
|
|
1395
|
+
Excludes (optimized for code-use mode):
|
|
1396
|
+
- extract: Use Python + evaluate() instead
|
|
1397
|
+
- find_text: Use Python string operations
|
|
1398
|
+
- screenshot: Not needed in code-use mode
|
|
1399
|
+
- search: Use navigate() directly
|
|
1400
|
+
- File system actions (write_file, read_file, replace_file): Use Python file operations instead
|
|
1401
|
+
"""
|
|
1402
|
+
|
|
1403
|
+
def __init__(
    self,
    exclude_actions: list[str] | None = None,
    output_model: type[T] | None = None,
    display_files_in_done_text: bool = True,
):
    """Set up the tool set, falling back to CodeAgent's default exclusions.

    When ``exclude_actions`` is ``None`` a built-in exclusion list is used; any list the caller passes
    (including an empty one) is forwarded to the parent constructor unchanged. After the parent is
    initialized, ``_register_code_use_done_action`` is called with the same ``output_model`` and
    ``display_files_in_done_text``.
    """
    if exclude_actions is None:
        # Everything not named here (scroll, click, input, switch, send_keys, close, go_back,
        # upload_file, select_dropdown, dropdown_options, ...) is kept for code-use.
        skipped_in_code_mode = [
            'extract',  # use Python + evaluate()
            'find_text',  # use Python string ops
            'screenshot',  # not needed
            'search',  # use navigate() directly
        ]
        # CodeAgent should use Python file operations instead of the file system actions.
        file_system_actions = ['write_file', 'read_file', 'replace_file']
        exclude_actions = skipped_in_code_mode + file_system_actions

    super().__init__(
        exclude_actions=exclude_actions,
        output_model=output_model,
        display_files_in_done_text=display_files_in_done_text,
    )

    # Override done action for CodeAgent with enhanced file handling
    self._register_code_use_done_action(output_model, display_files_in_done_text)
|
|
1440
|
+
|
|
1441
|
+
def _register_code_use_done_action(self, output_model: type[T] | None, display_files_in_done_text: bool = True):
    """Register enhanced done action for CodeAgent that can read files from disk.

    When ``output_model`` is ``None`` this registers two actions on ``self.registry``:

    * ``done`` -- like the plain-text done action, but files that ``file_system.display_file`` does not
      return content for are still attached when they exist on disk.
    * ``upload_file`` -- upload with the path validation described in the comments inside it.

    When ``output_model`` is given the method returns immediately and registers nothing.

    Args:
        output_model: Structured output model, or ``None``.
        display_files_in_done_text: Not read anywhere in this method; ``done`` consults
            ``self.display_files_in_done_text`` instead.
    """
    if output_model is not None:
        # Structured output done - use parent's implementation
        # NOTE(review): this early return also skips the upload_file override registered further down.
        # Confirm that structured-output CodeAgents are meant to keep the parent's upload_file.
        return

    # Override the done action with enhanced version
    @self.registry.action(
        'Complete task.',
        param_model=DoneAction,
    )
    async def done(params: DoneAction, file_system: FileSystem):
        user_message = params.text

        # Long-term memory keeps the first 100 characters of the answer plus a count of the remainder.
        len_text = len(params.text)
        len_max_memory = 100
        memory = f'Task completed: {params.success} - {params.text[:len_max_memory]}'
        if len_text > len_max_memory:
            memory += f' - {len_text - len_max_memory} more characters'

        attachments = []
        if params.files_to_display:
            if self.display_files_in_done_text:
                file_msg = ''
                for file_name in params.files_to_display:
                    file_content = file_system.display_file(file_name)
                    if file_content:
                        file_msg += f'\n\n{file_name}:\n{file_content}'
                        attachments.append(file_name)
                    elif os.path.exists(file_name):
                        # File exists on disk but not in FileSystem - just add to attachments
                        attachments.append(file_name)
                if file_msg:
                    user_message += '\n\nAttachments:'
                    user_message += file_msg
                else:
                    # Fires whenever no file content was collected, even if disk-only files were attached above.
                    logger.warning('Agent wanted to display files but none were found')
            else:
                # Contents are not inlined in this mode; files are only collected as attachments.
                for file_name in params.files_to_display:
                    file_content = file_system.display_file(file_name)
                    if file_content:
                        attachments.append(file_name)
                    elif os.path.exists(file_name):
                        attachments.append(file_name)

        # Convert relative paths to absolute paths - handle both FileSystem-managed and regular files
        resolved_attachments = []
        for file_name in attachments:
            if os.path.isabs(file_name):
                # Already absolute
                resolved_attachments.append(file_name)
            elif file_system.get_file(file_name):
                # Managed by FileSystem
                resolved_attachments.append(str(file_system.get_dir() / file_name))
            elif os.path.exists(file_name):
                # Regular file, resolved against the current working directory
                resolved_attachments.append(os.path.abspath(file_name))
            else:
                # File doesn't exist, but include the path anyway for error visibility
                resolved_attachments.append(str(file_system.get_dir() / file_name))
        attachments = resolved_attachments

        return ActionResult(
            is_done=True,
            success=params.success,
            extracted_content=user_message,
            long_term_memory=memory,
            attachments=attachments,
        )

    # Override upload_file for code agent with relaxed path validation
    @self.registry.action(
        'Upload a file to a file input element. For code-use mode, any file accessible from the current directory can be uploaded.',
        param_model=UploadFileAction,
    )
    async def upload_file(
        params: UploadFileAction,
        browser_session: BrowserSession,
        available_file_paths: list[str],
        file_system: FileSystem,
    ):
        # Path validation logic for code-use mode:
        # 1. If available_file_paths provided (security mode), enforce it as a whitelist
        # 2. If no whitelist, for local browsers just check file exists
        # 3. For remote browsers, allow any path (assume it exists remotely)

        # If whitelist provided, validate path is in it
        # (an empty or None available_file_paths is falsy and skips the whole whitelist check)
        if available_file_paths:
            if params.path not in available_file_paths:
                # Also check if it's a recently downloaded file
                downloaded_files = browser_session.downloaded_files
                if params.path not in downloaded_files:
                    # Finally, check if it's a file in the FileSystem service (if provided)
                    if file_system is not None and file_system.get_dir():
                        # Check if the file is actually managed by the FileSystem service
                        # The path should be just the filename for FileSystem files
                        file_obj = file_system.get_file(params.path)
                        if file_obj:
                            # File is managed by FileSystem, construct the full path
                            file_system_path = str(file_system.get_dir() / params.path)
                            params = UploadFileAction(index=params.index, path=file_system_path)
                        else:
                            # If browser is remote, allow passing a remote-accessible absolute path
                            if not browser_session.is_local:
                                pass
                            else:
                                msg = f'File path {params.path} is not available. To fix: add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
                                logger.error(f'❌ {msg}')
                                return ActionResult(error=msg)
                    else:
                        # If browser is remote, allow passing a remote-accessible absolute path
                        if not browser_session.is_local:
                            pass
                        else:
                            msg = f'File path {params.path} is not available. To fix: add this file path to the available_file_paths parameter when creating the Agent. Example: Agent(task="...", llm=llm, browser=browser, available_file_paths=["{params.path}"])'
                            logger.error(f'❌ {msg}')
                            return ActionResult(error=msg)

        # For local browsers, ensure the file exists on the local filesystem
        if browser_session.is_local:
            if not os.path.exists(params.path):
                msg = f'File {params.path} does not exist'
                return ActionResult(error=msg)

        # Get the selector map to find the node
        selector_map = await browser_session.get_selector_map()
        if params.index not in selector_map:
            msg = f'Element with index {params.index} does not exist.'
            return ActionResult(error=msg)

        node = selector_map[params.index]

        # Helper function to find file input near the selected element
        def find_file_input_near_element(
            node: EnhancedDOMTreeNode, max_height: int = 3, max_descendant_depth: int = 3
        ) -> EnhancedDOMTreeNode | None:
            """Find the closest file input to the selected element.

            Starting at ``node`` and climbing at most ``max_height`` ancestors, each level checks the
            current node, its descendants and its siblings (with their descendants), descending at most
            ``max_descendant_depth`` levels. Returns the first file input found, else ``None``.
            """

            def find_file_input_in_descendants(n: EnhancedDOMTreeNode, depth: int) -> EnhancedDOMTreeNode | None:
                # Depth-first search; ``depth`` is the number of levels still allowed below ``n``.
                if depth < 0:
                    return None
                if browser_session.is_file_input(n):
                    return n
                for child in n.children_nodes or []:
                    result = find_file_input_in_descendants(child, depth - 1)
                    if result:
                        return result
                return None

            current = node
            for _ in range(max_height + 1):
                # Check the current node itself
                if browser_session.is_file_input(current):
                    return current
                # Check all descendants of the current node
                result = find_file_input_in_descendants(current, max_descendant_depth)
                if result:
                    return result
                # Check all siblings and their descendants
                if current.parent_node:
                    for sibling in current.parent_node.children_nodes or []:
                        if sibling is current:
                            continue
                        if browser_session.is_file_input(sibling):
                            return sibling
                        result = find_file_input_in_descendants(sibling, max_descendant_depth)
                        if result:
                            return result
                # Climb one level; stop once the top of the tree is reached.
                current = current.parent_node
                if not current:
                    break
            return None

        # Try to find a file input element near the selected element
        file_input_node = find_file_input_near_element(node)

        # Highlight the file input element if found (truly non-blocking)
        # NOTE(review): the task returned by create_task is not stored or awaited -- confirm that
        # fire-and-forget is intended here.
        if file_input_node:
            asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))

        # If not found near the selected element, fallback to finding the closest file input to current scroll position
        if file_input_node is None:
            logger.info(
                f'No file upload element found near index {params.index}, searching for closest file input to scroll position'
            )

            # Get current scroll position
            cdp_session = await browser_session.get_or_create_cdp_session()
            try:
                scroll_info = await cdp_session.cdp_client.send.Runtime.evaluate(
                    params={'expression': 'window.scrollY || window.pageYOffset || 0'}, session_id=cdp_session.session_id
                )
                current_scroll_y = scroll_info.get('result', {}).get('value', 0)
            except Exception:
                # Best effort: if the scroll position cannot be read, measure distances from the page top.
                current_scroll_y = 0

            # Find all file inputs in the selector map and pick the closest one to scroll position
            closest_file_input = None
            min_distance = float('inf')

            for idx, element in selector_map.items():
                if browser_session.is_file_input(element):
                    # Get element's Y position (file inputs without absolute_position are not considered)
                    if element.absolute_position:
                        element_y = element.absolute_position.y
                        distance = abs(element_y - current_scroll_y)
                        if distance < min_distance:
                            min_distance = distance
                            closest_file_input = element

            if closest_file_input:
                file_input_node = closest_file_input
                logger.info(f'Found file input closest to scroll position (distance: {min_distance}px)')
                # Highlight the fallback file input element (truly non-blocking)
                asyncio.create_task(browser_session.highlight_interaction_element(file_input_node))
            else:
                msg = 'No file upload element found on the page'
                logger.error(msg)
                raise BrowserError(msg)
                # TODO: figure out why this fails sometimes + add fallback hail mary, just look for any file input on page

        # Dispatch upload file event with the file input node
        try:
            event = browser_session.event_bus.dispatch(UploadFileEvent(node=file_input_node, file_path=params.path))
            await event
            await event.event_result(raise_if_any=True, raise_if_none=False)
            msg = f'Successfully uploaded file to index {params.index}'
            logger.info(f'📁 {msg}')
            return ActionResult(
                extracted_content=msg,
                long_term_memory=f'Uploaded file {params.path} to element {params.index}',
            )
        except Exception as e:
            # Any failure while dispatching or awaiting the event is re-raised as BrowserError.
            logger.error(f'Failed to upload file: {e}')
            raise BrowserError(f'Failed to upload file: {e}')
|