camel-ai 0.2.72a10__py3-none-any.whl → 0.2.73a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +113 -338
- camel/memories/agent_memories.py +18 -17
- camel/societies/workforce/prompts.py +10 -4
- camel/societies/workforce/single_agent_worker.py +7 -5
- camel/toolkits/__init__.py +6 -1
- camel/toolkits/base.py +57 -1
- camel/toolkits/hybrid_browser_toolkit/config_loader.py +136 -413
- camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +796 -1631
- camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +4356 -0
- camel/toolkits/hybrid_browser_toolkit/ts/package.json +33 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/browser-scripts.js +125 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +945 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +226 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +522 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/index.ts +7 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +110 -0
- camel/toolkits/hybrid_browser_toolkit/ts/tsconfig.json +26 -0
- camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +210 -0
- camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +533 -0
- camel/toolkits/message_integration.py +592 -0
- camel/toolkits/notion_mcp_toolkit.py +234 -0
- camel/toolkits/screenshot_toolkit.py +116 -31
- camel/toolkits/search_toolkit.py +20 -2
- camel/toolkits/terminal_toolkit.py +16 -2
- camel/toolkits/video_analysis_toolkit.py +13 -13
- camel/toolkits/video_download_toolkit.py +11 -11
- {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/METADATA +12 -6
- {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/RECORD +31 -24
- camel/toolkits/hybrid_browser_toolkit/actions.py +0 -417
- camel/toolkits/hybrid_browser_toolkit/agent.py +0 -311
- camel/toolkits/hybrid_browser_toolkit/browser_session.py +0 -740
- camel/toolkits/hybrid_browser_toolkit/snapshot.py +0 -227
- camel/toolkits/hybrid_browser_toolkit/stealth_script.js +0 -0
- camel/toolkits/hybrid_browser_toolkit/unified_analyzer.js +0 -1002
- {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,75 +11,64 @@
|
|
|
11
11
|
# See the License for the specific language governing permissions and
|
|
12
12
|
# limitations under the License.
|
|
13
13
|
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
|
|
14
|
+
# =========
|
|
14
15
|
|
|
15
|
-
import base64
|
|
16
|
-
import datetime
|
|
17
|
-
import io
|
|
18
|
-
import json
|
|
19
|
-
import os
|
|
20
16
|
import time
|
|
21
|
-
import urllib.parse
|
|
22
|
-
from functools import wraps
|
|
23
17
|
from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
|
|
24
18
|
|
|
25
19
|
from camel.logger import get_logger
|
|
20
|
+
from camel.messages import BaseMessage
|
|
26
21
|
from camel.models import BaseModelBackend
|
|
27
|
-
from camel.toolkits.base import BaseToolkit
|
|
22
|
+
from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
|
|
28
23
|
from camel.toolkits.function_tool import FunctionTool
|
|
29
|
-
from camel.utils import sanitize_filename
|
|
30
24
|
from camel.utils.commons import dependencies_required
|
|
31
|
-
from camel.utils.tool_result import ToolResult
|
|
32
25
|
|
|
33
|
-
from .agent import PlaywrightLLMAgent
|
|
34
|
-
from .browser_session import HybridBrowserSession
|
|
35
26
|
from .config_loader import ConfigLoader
|
|
27
|
+
from .ws_wrapper import WebSocketBrowserWrapper
|
|
36
28
|
|
|
37
29
|
logger = get_logger(__name__)
|
|
38
30
|
|
|
39
31
|
|
|
40
|
-
class HybridBrowserToolkit(BaseToolkit):
|
|
32
|
+
class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
|
|
41
33
|
r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
|
|
42
34
|
automation with visual, screenshot-based capabilities.
|
|
43
35
|
|
|
44
|
-
This toolkit
|
|
45
|
-
|
|
46
|
-
both programmatic control of browser actions (like clicking and typing)
|
|
47
|
-
and visual analysis of the page layout through screenshots with marked
|
|
48
|
-
interactive elements.
|
|
36
|
+
This toolkit now uses TypeScript implementation with Playwright's
|
|
37
|
+
_snapshotForAI functionality for enhanced AI integration.
|
|
49
38
|
"""
|
|
50
39
|
|
|
51
40
|
# Default tool list - core browser functionality
|
|
52
41
|
DEFAULT_TOOLS: ClassVar[List[str]] = [
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
42
|
+
"browser_open",
|
|
43
|
+
"browser_close",
|
|
44
|
+
"browser_visit_page",
|
|
45
|
+
"browser_back",
|
|
46
|
+
"browser_forward",
|
|
47
|
+
"browser_click",
|
|
48
|
+
"browser_type",
|
|
49
|
+
"browser_switch_tab",
|
|
61
50
|
]
|
|
62
51
|
|
|
63
52
|
# All available tools
|
|
64
53
|
ALL_TOOLS: ClassVar[List[str]] = [
|
|
65
|
-
"
|
|
66
|
-
"
|
|
67
|
-
"
|
|
68
|
-
"
|
|
69
|
-
"
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
"
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
80
|
-
"
|
|
81
|
-
"
|
|
82
|
-
"
|
|
54
|
+
"browser_open",
|
|
55
|
+
"browser_close",
|
|
56
|
+
"browser_visit_page",
|
|
57
|
+
"browser_back",
|
|
58
|
+
"browser_forward",
|
|
59
|
+
"browser_get_page_snapshot",
|
|
60
|
+
"browser_get_som_screenshot",
|
|
61
|
+
"browser_get_page_links",
|
|
62
|
+
"browser_click",
|
|
63
|
+
"browser_type",
|
|
64
|
+
"browser_select",
|
|
65
|
+
"browser_scroll",
|
|
66
|
+
"browser_enter",
|
|
67
|
+
"browser_wait_user",
|
|
68
|
+
"browser_solve_task",
|
|
69
|
+
"browser_switch_tab",
|
|
70
|
+
"browser_close_tab",
|
|
71
|
+
"browser_get_tab_info",
|
|
83
72
|
]
|
|
84
73
|
|
|
85
74
|
def __init__(
|
|
@@ -101,137 +90,104 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
101
90
|
screenshot_timeout: Optional[int] = None,
|
|
102
91
|
page_stability_timeout: Optional[int] = None,
|
|
103
92
|
dom_content_loaded_timeout: Optional[int] = None,
|
|
93
|
+
viewport_limit: bool = False,
|
|
94
|
+
connect_over_cdp: bool = False,
|
|
95
|
+
cdp_url: Optional[str] = None,
|
|
104
96
|
) -> None:
|
|
105
97
|
r"""Initialize the HybridBrowserToolkit.
|
|
106
98
|
|
|
107
99
|
Args:
|
|
108
|
-
headless (bool): Whether to run
|
|
109
|
-
|
|
110
|
-
user_data_dir (Optional[str]):
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
default
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
Defaults to `None`.
|
|
150
|
-
short_timeout (Optional[int]): Short timeout in milliseconds
|
|
151
|
-
for quick browser actions. If None, uses environment variable
|
|
152
|
-
HYBRID_BROWSER_SHORT_TIMEOUT or defaults to 1000ms.
|
|
153
|
-
Defaults to `None`.
|
|
154
|
-
navigation_timeout (Optional[int]): Custom navigation timeout in
|
|
155
|
-
milliseconds.
|
|
156
|
-
If None, uses environment variable
|
|
157
|
-
HYBRID_BROWSER_NAVIGATION_TIMEOUT or defaults to 10000ms.
|
|
158
|
-
Defaults to `None`.
|
|
159
|
-
network_idle_timeout (Optional[int]): Custom network idle
|
|
160
|
-
timeout in milliseconds.
|
|
161
|
-
If None, uses environment variable
|
|
162
|
-
HYBRID_BROWSER_NETWORK_IDLE_TIMEOUT or defaults to 5000ms.
|
|
163
|
-
Defaults to `None`.
|
|
164
|
-
screenshot_timeout (Optional[int]): Custom screenshot timeout in
|
|
165
|
-
milliseconds.
|
|
166
|
-
If None, uses environment variable
|
|
167
|
-
HYBRID_BROWSER_SCREENSHOT_TIMEOUT or defaults to 15000ms.
|
|
168
|
-
Defaults to `None`.
|
|
169
|
-
page_stability_timeout (Optional[int]): Custom page stability
|
|
170
|
-
timeout in milliseconds.
|
|
171
|
-
If None, uses environment variable
|
|
172
|
-
HYBRID_BROWSER_PAGE_STABILITY_TIMEOUT or defaults to 1500ms.
|
|
173
|
-
Defaults to `None`.
|
|
174
|
-
dom_content_loaded_timeout (Optional[int]): Custom DOM content
|
|
175
|
-
loaded timeout in milliseconds.
|
|
176
|
-
If None, uses environment variable
|
|
177
|
-
HYBRID_BROWSER_DOM_CONTENT_LOADED_TIMEOUT or defaults to
|
|
178
|
-
5000ms.
|
|
179
|
-
Defaults to `None`.
|
|
100
|
+
headless (bool): Whether to run browser in headless mode.
|
|
101
|
+
Defaults to True.
|
|
102
|
+
user_data_dir (Optional[str]): Directory for user data
|
|
103
|
+
persistence. Defaults to None.
|
|
104
|
+
stealth (bool): Whether to enable stealth mode. Defaults to
|
|
105
|
+
False.
|
|
106
|
+
web_agent_model (Optional[BaseModelBackend]): Model for web
|
|
107
|
+
agent operations. Defaults to None.
|
|
108
|
+
cache_dir (str): Directory for caching. Defaults to "tmp/".
|
|
109
|
+
enabled_tools (Optional[List[str]]): List of enabled tools.
|
|
110
|
+
Defaults to None.
|
|
111
|
+
browser_log_to_file (bool): Whether to log browser actions to
|
|
112
|
+
file. Defaults to False.
|
|
113
|
+
session_id (Optional[str]): Session identifier. Defaults to None.
|
|
114
|
+
default_start_url (str): Default URL to start with. Defaults
|
|
115
|
+
to "https://google.com/".
|
|
116
|
+
default_timeout (Optional[int]): Default timeout in
|
|
117
|
+
milliseconds. Defaults to None.
|
|
118
|
+
short_timeout (Optional[int]): Short timeout in milliseconds.
|
|
119
|
+
Defaults to None.
|
|
120
|
+
navigation_timeout (Optional[int]): Navigation timeout in
|
|
121
|
+
milliseconds. Defaults to None.
|
|
122
|
+
network_idle_timeout (Optional[int]): Network idle timeout in
|
|
123
|
+
milliseconds. Defaults to None.
|
|
124
|
+
screenshot_timeout (Optional[int]): Screenshot timeout in
|
|
125
|
+
milliseconds. Defaults to None.
|
|
126
|
+
page_stability_timeout (Optional[int]): Page stability timeout
|
|
127
|
+
in milliseconds. Defaults to None.
|
|
128
|
+
dom_content_loaded_timeout (Optional[int]): DOM content loaded
|
|
129
|
+
timeout in milliseconds. Defaults to None.
|
|
130
|
+
viewport_limit (bool): Whether to filter page snapshot
|
|
131
|
+
elements to only those visible in the current viewport.
|
|
132
|
+
When True, only elements within the current viewport
|
|
133
|
+
bounds will be included in snapshots.
|
|
134
|
+
When False (default), all elements on the page are
|
|
135
|
+
included. Defaults to False.
|
|
136
|
+
connect_over_cdp (bool): Whether to connect to an existing
|
|
137
|
+
browser via Chrome DevTools Protocol. Defaults to False.
|
|
138
|
+
cdp_url (Optional[str]): WebSocket endpoint URL for CDP
|
|
139
|
+
connection (e.g., 'ws://localhost:9222/devtools/browser/...').
|
|
140
|
+
Required when connect_over_cdp is True. Defaults to None.
|
|
180
141
|
"""
|
|
181
142
|
super().__init__()
|
|
182
|
-
self
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
self.
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
page_stability_timeout
|
|
205
|
-
)
|
|
206
|
-
self._dom_content_loaded_timeout = (
|
|
207
|
-
ConfigLoader.get_dom_content_loaded_timeout(
|
|
208
|
-
dom_content_loaded_timeout
|
|
209
|
-
)
|
|
143
|
+
RegisteredAgentToolkit.__init__(self)
|
|
144
|
+
|
|
145
|
+
# Initialize configuration loader
|
|
146
|
+
self.config_loader = ConfigLoader.from_kwargs(
|
|
147
|
+
headless=headless,
|
|
148
|
+
user_data_dir=user_data_dir,
|
|
149
|
+
stealth=stealth,
|
|
150
|
+
default_start_url=default_start_url,
|
|
151
|
+
default_timeout=default_timeout,
|
|
152
|
+
short_timeout=short_timeout,
|
|
153
|
+
navigation_timeout=navigation_timeout,
|
|
154
|
+
network_idle_timeout=network_idle_timeout,
|
|
155
|
+
screenshot_timeout=screenshot_timeout,
|
|
156
|
+
page_stability_timeout=page_stability_timeout,
|
|
157
|
+
dom_content_loaded_timeout=dom_content_loaded_timeout,
|
|
158
|
+
viewport_limit=viewport_limit,
|
|
159
|
+
cache_dir=cache_dir,
|
|
160
|
+
browser_log_to_file=browser_log_to_file,
|
|
161
|
+
session_id=session_id,
|
|
162
|
+
enabled_tools=enabled_tools,
|
|
163
|
+
connect_over_cdp=connect_over_cdp,
|
|
164
|
+
cdp_url=cdp_url,
|
|
210
165
|
)
|
|
211
166
|
|
|
212
|
-
#
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
self.enable_page_loading_logging = True
|
|
216
|
-
self.log_to_console = False # Always disabled for cleaner output
|
|
217
|
-
self.log_to_file = browser_log_to_file
|
|
218
|
-
self.max_log_length = None # No truncation for file logs
|
|
219
|
-
|
|
220
|
-
# Set up log file if needed
|
|
221
|
-
if self.log_to_file:
|
|
222
|
-
# Create log directory if it doesn't exist
|
|
223
|
-
log_dir = "browser_log"
|
|
224
|
-
os.makedirs(log_dir, exist_ok=True)
|
|
225
|
-
|
|
226
|
-
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
227
|
-
self.log_file_path: Optional[str] = os.path.join(
|
|
228
|
-
log_dir, f"hybrid_browser_toolkit_{timestamp}_{session_id}.log"
|
|
229
|
-
)
|
|
230
|
-
else:
|
|
231
|
-
self.log_file_path = None
|
|
167
|
+
# Legacy attribute access for backward compatibility
|
|
168
|
+
browser_config = self.config_loader.get_browser_config()
|
|
169
|
+
toolkit_config = self.config_loader.get_toolkit_config()
|
|
232
170
|
|
|
233
|
-
|
|
234
|
-
self.
|
|
171
|
+
self._headless = browser_config.headless
|
|
172
|
+
self._user_data_dir = browser_config.user_data_dir
|
|
173
|
+
self._stealth = browser_config.stealth
|
|
174
|
+
self._web_agent_model = web_agent_model
|
|
175
|
+
self._cache_dir = toolkit_config.cache_dir
|
|
176
|
+
self._browser_log_to_file = toolkit_config.browser_log_to_file
|
|
177
|
+
self._default_start_url = browser_config.default_start_url
|
|
178
|
+
self._session_id = toolkit_config.session_id or "default"
|
|
179
|
+
self._viewport_limit = browser_config.viewport_limit
|
|
180
|
+
|
|
181
|
+
# Store timeout configuration for backward compatibility
|
|
182
|
+
self._default_timeout = browser_config.default_timeout
|
|
183
|
+
self._short_timeout = browser_config.short_timeout
|
|
184
|
+
self._navigation_timeout = browser_config.navigation_timeout
|
|
185
|
+
self._network_idle_timeout = browser_config.network_idle_timeout
|
|
186
|
+
self._screenshot_timeout = browser_config.screenshot_timeout
|
|
187
|
+
self._page_stability_timeout = browser_config.page_stability_timeout
|
|
188
|
+
self._dom_content_loaded_timeout = (
|
|
189
|
+
browser_config.dom_content_loaded_timeout
|
|
190
|
+
)
|
|
235
191
|
|
|
236
192
|
# Configure enabled tools
|
|
237
193
|
if enabled_tools is None:
|
|
@@ -250,42 +206,22 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
250
206
|
|
|
251
207
|
logger.info(f"Enabled tools: {self.enabled_tools}")
|
|
252
208
|
|
|
253
|
-
#
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
"HybridBrowserToolkit initialized with file logging enabled"
|
|
257
|
-
)
|
|
258
|
-
logger.info(f"Log file path: {self.log_file_path}")
|
|
259
|
-
|
|
260
|
-
# Core components
|
|
261
|
-
temp_session = HybridBrowserSession(
|
|
262
|
-
headless=headless,
|
|
263
|
-
user_data_dir=user_data_dir,
|
|
264
|
-
stealth=stealth,
|
|
265
|
-
session_id=session_id,
|
|
266
|
-
default_timeout=default_timeout,
|
|
267
|
-
short_timeout=short_timeout,
|
|
268
|
-
)
|
|
269
|
-
# Use the session directly - singleton logic is handled in
|
|
270
|
-
# ensure_browser
|
|
271
|
-
self._session = temp_session
|
|
272
|
-
self._agent: Optional[PlaywrightLLMAgent] = None
|
|
273
|
-
self._unified_script = self._load_unified_analyzer()
|
|
274
|
-
|
|
275
|
-
@property
|
|
276
|
-
def web_agent_model(self) -> Optional[BaseModelBackend]:
|
|
277
|
-
"""Get the web agent model."""
|
|
278
|
-
return self._web_agent_model
|
|
209
|
+
# Initialize WebSocket wrapper
|
|
210
|
+
self._ws_wrapper: Optional[WebSocketBrowserWrapper] = None
|
|
211
|
+
self._ws_config = self.config_loader.to_ws_config()
|
|
279
212
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
213
|
+
async def _ensure_ws_wrapper(self):
|
|
214
|
+
"""Ensure WebSocket wrapper is initialized."""
|
|
215
|
+
if self._ws_wrapper is None:
|
|
216
|
+
self._ws_wrapper = WebSocketBrowserWrapper(self._ws_config)
|
|
217
|
+
await self._ws_wrapper.start()
|
|
284
218
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
219
|
+
async def _get_ws_wrapper(self) -> WebSocketBrowserWrapper:
|
|
220
|
+
"""Get the WebSocket wrapper, initializing if needed."""
|
|
221
|
+
await self._ensure_ws_wrapper()
|
|
222
|
+
if self._ws_wrapper is None:
|
|
223
|
+
raise RuntimeError("Failed to initialize WebSocket wrapper")
|
|
224
|
+
return self._ws_wrapper
|
|
289
225
|
|
|
290
226
|
def __del__(self):
|
|
291
227
|
r"""Cleanup browser resources on garbage collection."""
|
|
@@ -300,800 +236,35 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
300
236
|
try:
|
|
301
237
|
loop = asyncio.get_event_loop()
|
|
302
238
|
if not loop.is_closed() and not loop.is_running():
|
|
303
|
-
# Try to close browser with a timeout to prevent hanging
|
|
304
239
|
try:
|
|
305
240
|
loop.run_until_complete(
|
|
306
|
-
asyncio.wait_for(self.
|
|
241
|
+
asyncio.wait_for(self.browser_close(), timeout=2.0)
|
|
307
242
|
)
|
|
308
243
|
except asyncio.TimeoutError:
|
|
309
|
-
pass
|
|
244
|
+
pass
|
|
310
245
|
except (RuntimeError, ImportError):
|
|
311
|
-
pass
|
|
312
|
-
except Exception:
|
|
313
|
-
pass # Suppress all errors during garbage collection
|
|
314
|
-
|
|
315
|
-
def _load_unified_analyzer(self) -> str:
|
|
316
|
-
r"""Load the unified analyzer JavaScript script."""
|
|
317
|
-
script_path = os.path.join(
|
|
318
|
-
os.path.dirname(os.path.abspath(__file__)), "unified_analyzer.js"
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
try:
|
|
322
|
-
with open(
|
|
323
|
-
script_path, "r", encoding='utf-8', errors='replace'
|
|
324
|
-
) as f:
|
|
325
|
-
script_content = f.read()
|
|
326
|
-
|
|
327
|
-
if not script_content.strip():
|
|
328
|
-
raise ValueError(f"Script is empty: {script_path}")
|
|
329
|
-
|
|
330
|
-
logger.debug(
|
|
331
|
-
f"Loaded unified analyzer ({len(script_content)} chars)"
|
|
332
|
-
)
|
|
333
|
-
return script_content
|
|
334
|
-
except FileNotFoundError:
|
|
335
|
-
raise FileNotFoundError(f"Script not found: {script_path}")
|
|
336
|
-
|
|
337
|
-
def _validate_ref(self, ref: str, method_name: str) -> None:
|
|
338
|
-
r"""Validate ref parameter."""
|
|
339
|
-
if not ref or not isinstance(ref, str):
|
|
340
|
-
raise ValueError(
|
|
341
|
-
f"{method_name}: 'ref' must be a non-empty string"
|
|
342
|
-
)
|
|
343
|
-
|
|
344
|
-
def _truncate_if_needed(self, content: Any) -> str:
|
|
345
|
-
r"""Truncate content if max_log_length is set."""
|
|
346
|
-
content_str = str(content)
|
|
347
|
-
if (
|
|
348
|
-
self.max_log_length is not None
|
|
349
|
-
and len(content_str) > self.max_log_length
|
|
350
|
-
):
|
|
351
|
-
return content_str[: self.max_log_length] + "... [TRUNCATED]"
|
|
352
|
-
return content_str
|
|
353
|
-
|
|
354
|
-
async def _get_current_url(self) -> Optional[str]:
|
|
355
|
-
r"""Safely get the current URL of the active page."""
|
|
356
|
-
try:
|
|
357
|
-
page = await self._session.get_page()
|
|
358
|
-
if page and not page.is_closed():
|
|
359
|
-
return page.url
|
|
360
|
-
return None # Return None if page is closed
|
|
246
|
+
pass
|
|
361
247
|
except Exception:
|
|
362
|
-
|
|
363
|
-
return None
|
|
248
|
+
pass
|
|
364
249
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
outputs: Any,
|
|
370
|
-
execution_time: float,
|
|
371
|
-
page_load_time: Optional[float] = None,
|
|
372
|
-
error: Optional[str] = None,
|
|
373
|
-
) -> None:
|
|
374
|
-
r"""Log action details with comprehensive information."""
|
|
375
|
-
if not (self.enable_action_logging or self.enable_timing_logging):
|
|
376
|
-
return
|
|
377
|
-
|
|
378
|
-
current_url = await self._get_current_url()
|
|
379
|
-
|
|
380
|
-
log_entry: Dict[str, Any] = {
|
|
381
|
-
"timestamp": datetime.datetime.now().isoformat(),
|
|
382
|
-
"action": action_name,
|
|
383
|
-
"url": current_url,
|
|
384
|
-
"execution_time_ms": round(execution_time * 1000, 2),
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
if self.enable_action_logging:
|
|
388
|
-
log_entry["inputs"] = inputs
|
|
389
|
-
if error:
|
|
390
|
-
log_entry["error"] = str(error)
|
|
391
|
-
elif isinstance(outputs, dict):
|
|
392
|
-
# Unpack dictionary items into the log entry
|
|
393
|
-
log_entry.update(outputs)
|
|
394
|
-
elif isinstance(outputs, ToolResult):
|
|
395
|
-
log_entry["outputs"] = {
|
|
396
|
-
"text": outputs.text,
|
|
397
|
-
"images": outputs.images,
|
|
398
|
-
}
|
|
399
|
-
else:
|
|
400
|
-
# For non-dict outputs, assign to 'outputs' key
|
|
401
|
-
log_entry["outputs"] = outputs
|
|
402
|
-
|
|
403
|
-
if page_load_time is not None and self.enable_page_loading_logging:
|
|
404
|
-
log_entry["page_load_time_ms"] = round(page_load_time * 1000, 2)
|
|
405
|
-
|
|
406
|
-
# Add to buffer
|
|
407
|
-
self.log_buffer.append(log_entry)
|
|
408
|
-
|
|
409
|
-
# Console logging
|
|
410
|
-
if self.log_to_console:
|
|
411
|
-
log_msg = f"[BROWSER ACTION] {action_name}"
|
|
412
|
-
if self.enable_timing_logging:
|
|
413
|
-
log_msg += f" | Execution: {log_entry['execution_time_ms']}ms"
|
|
414
|
-
if page_load_time is not None and self.enable_page_loading_logging:
|
|
415
|
-
log_msg += f" | Page Load: {log_entry['page_load_time_ms']}ms"
|
|
416
|
-
if error:
|
|
417
|
-
log_msg += f" | ERROR: {error}"
|
|
418
|
-
|
|
419
|
-
logger.info(log_msg)
|
|
420
|
-
|
|
421
|
-
if self.enable_action_logging:
|
|
422
|
-
logger.info(f" Inputs: {self._truncate_if_needed(inputs)}")
|
|
423
|
-
if not error:
|
|
424
|
-
if isinstance(outputs, dict):
|
|
425
|
-
for key, value in outputs.items():
|
|
426
|
-
logger.info(
|
|
427
|
-
f" - {key}: "
|
|
428
|
-
f"{self._truncate_if_needed(value)}"
|
|
429
|
-
)
|
|
430
|
-
else:
|
|
431
|
-
logger.info(
|
|
432
|
-
f" Outputs: {self._truncate_if_needed(outputs)}"
|
|
433
|
-
)
|
|
434
|
-
|
|
435
|
-
# File logging
|
|
436
|
-
if self.log_to_file and self.log_file_path:
|
|
437
|
-
try:
|
|
438
|
-
with open(self.log_file_path, 'a', encoding='utf-8') as f:
|
|
439
|
-
# Write full log entry to file without truncation
|
|
440
|
-
f.write(
|
|
441
|
-
json.dumps(log_entry, ensure_ascii=False, indent=2)
|
|
442
|
-
+ '\n'
|
|
443
|
-
)
|
|
444
|
-
except Exception as e:
|
|
445
|
-
logger.error(f"Failed to write to log file: {e}")
|
|
446
|
-
|
|
447
|
-
@staticmethod
|
|
448
|
-
def action_logger(func: Callable[..., Any]) -> Callable[..., Any]:
|
|
449
|
-
r"""Decorator to add logging to action methods."""
|
|
450
|
-
|
|
451
|
-
@wraps(func)
|
|
452
|
-
async def wrapper(self, *args, **kwargs):
|
|
453
|
-
action_name = func.__name__
|
|
454
|
-
start_time = time.time()
|
|
455
|
-
|
|
456
|
-
# Log inputs
|
|
457
|
-
inputs = {
|
|
458
|
-
"args": args, # Don't skip self since it's already handled
|
|
459
|
-
"kwargs": kwargs,
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
try:
|
|
463
|
-
# Execute the original function
|
|
464
|
-
result = await func(self, *args, **kwargs)
|
|
465
|
-
execution_time = time.time() - start_time
|
|
466
|
-
|
|
467
|
-
# Log success
|
|
468
|
-
await self._log_action(
|
|
469
|
-
action_name=action_name,
|
|
470
|
-
inputs=inputs,
|
|
471
|
-
outputs=result,
|
|
472
|
-
execution_time=execution_time,
|
|
473
|
-
)
|
|
474
|
-
|
|
475
|
-
return result
|
|
476
|
-
|
|
477
|
-
except Exception as e:
|
|
478
|
-
execution_time = time.time() - start_time
|
|
479
|
-
error_msg = f"{type(e).__name__}: {e!s}"
|
|
480
|
-
|
|
481
|
-
# Log error
|
|
482
|
-
await self._log_action(
|
|
483
|
-
action_name=action_name,
|
|
484
|
-
inputs=inputs,
|
|
485
|
-
outputs=None,
|
|
486
|
-
execution_time=execution_time,
|
|
487
|
-
error=error_msg,
|
|
488
|
-
)
|
|
489
|
-
|
|
490
|
-
raise
|
|
491
|
-
|
|
492
|
-
return wrapper
|
|
493
|
-
|
|
494
|
-
async def _get_session(self) -> "HybridBrowserSession":
|
|
495
|
-
"""Get the correct singleton session instance."""
|
|
496
|
-
singleton = await HybridBrowserSession._get_or_create_instance(
|
|
497
|
-
self._session
|
|
498
|
-
)
|
|
499
|
-
if singleton is not self._session:
|
|
500
|
-
logger.debug("Updating to singleton session instance")
|
|
501
|
-
self._session = singleton
|
|
502
|
-
return self._session
|
|
503
|
-
|
|
504
|
-
async def _ensure_browser(self):
|
|
505
|
-
# Get singleton instance and update self._session if needed
|
|
506
|
-
session = await self._get_session()
|
|
507
|
-
await session.ensure_browser()
|
|
508
|
-
|
|
509
|
-
async def _require_page(self):
|
|
510
|
-
# Get singleton instance and update self._session if needed
|
|
511
|
-
session = await self._get_session()
|
|
512
|
-
await session.ensure_browser()
|
|
513
|
-
return await session.get_page()
|
|
514
|
-
|
|
515
|
-
async def _wait_for_page_stability(self):
|
|
516
|
-
r"""Wait for page to become stable after actions that might trigger
|
|
517
|
-
updates. Optimized with shorter timeouts.
|
|
518
|
-
"""
|
|
519
|
-
page = await self._require_page()
|
|
520
|
-
import asyncio
|
|
521
|
-
|
|
522
|
-
try:
|
|
523
|
-
# Wait for DOM content to be loaded (reduced timeout)
|
|
524
|
-
await page.wait_for_load_state(
|
|
525
|
-
'domcontentloaded', timeout=self._page_stability_timeout
|
|
526
|
-
)
|
|
527
|
-
logger.debug("DOM content loaded")
|
|
528
|
-
|
|
529
|
-
# Try to wait for network idle with shorter timeout
|
|
530
|
-
try:
|
|
531
|
-
await page.wait_for_load_state(
|
|
532
|
-
'networkidle', timeout=self._network_idle_timeout
|
|
533
|
-
)
|
|
534
|
-
logger.debug("Network idle achieved")
|
|
535
|
-
except Exception:
|
|
536
|
-
logger.debug("Network idle timeout - continuing anyway")
|
|
537
|
-
|
|
538
|
-
# Reduced delay for JavaScript execution
|
|
539
|
-
await asyncio.sleep(0.2) # Reduced from 0.5s
|
|
540
|
-
logger.debug("Page stability wait completed")
|
|
541
|
-
|
|
542
|
-
except Exception as e:
|
|
543
|
-
logger.debug(
|
|
544
|
-
f"Page stability wait failed: {e} - continuing anyway"
|
|
545
|
-
)
|
|
546
|
-
|
|
547
|
-
async def _get_unified_analysis(
|
|
548
|
-
self, max_retries: int = 3
|
|
549
|
-
) -> Dict[str, Any]:
|
|
550
|
-
r"""Get unified analysis data from the page with retry mechanism for
|
|
551
|
-
navigation issues."""
|
|
552
|
-
page = await self._require_page()
|
|
553
|
-
|
|
554
|
-
for attempt in range(max_retries):
|
|
555
|
-
try:
|
|
556
|
-
if not self._unified_script:
|
|
557
|
-
logger.error("Unified analyzer script not loaded")
|
|
558
|
-
return {"elements": {}, "metadata": {"elementCount": 0}}
|
|
559
|
-
|
|
560
|
-
# Wait for DOM stability before each attempt (with optimized
|
|
561
|
-
# timeout)
|
|
562
|
-
try:
|
|
563
|
-
await page.wait_for_load_state(
|
|
564
|
-
'domcontentloaded',
|
|
565
|
-
timeout=self._dom_content_loaded_timeout,
|
|
566
|
-
)
|
|
567
|
-
except Exception:
|
|
568
|
-
# Don't fail if DOM wait times out
|
|
569
|
-
pass
|
|
570
|
-
|
|
571
|
-
result = await page.evaluate(self._unified_script)
|
|
572
|
-
|
|
573
|
-
if not isinstance(result, dict):
|
|
574
|
-
logger.warning(f"Invalid result type: {type(result)}")
|
|
575
|
-
return {"elements": {}, "metadata": {"elementCount": 0}}
|
|
576
|
-
|
|
577
|
-
# Success - return result
|
|
578
|
-
if attempt > 0:
|
|
579
|
-
logger.debug(
|
|
580
|
-
f"Unified analysis succeeded on attempt {attempt + 1}"
|
|
581
|
-
)
|
|
582
|
-
return result
|
|
583
|
-
|
|
584
|
-
except Exception as e:
|
|
585
|
-
error_msg = str(e)
|
|
586
|
-
|
|
587
|
-
# Check if this is a navigation-related error
|
|
588
|
-
is_navigation_error = (
|
|
589
|
-
"Execution context was destroyed" in error_msg
|
|
590
|
-
or "Most likely because of a navigation" in error_msg
|
|
591
|
-
or "Target page, context or browser has been closed"
|
|
592
|
-
in error_msg
|
|
593
|
-
)
|
|
594
|
-
|
|
595
|
-
if is_navigation_error and attempt < max_retries - 1:
|
|
596
|
-
logger.debug(
|
|
597
|
-
f"Navigation error in unified analysis (attempt "
|
|
598
|
-
f"{attempt + 1}/{max_retries}): {e}. Retrying..."
|
|
599
|
-
)
|
|
600
|
-
|
|
601
|
-
# Wait a bit for page stability before retrying (optimized)
|
|
602
|
-
try:
|
|
603
|
-
await page.wait_for_load_state(
|
|
604
|
-
'domcontentloaded',
|
|
605
|
-
timeout=self._page_stability_timeout,
|
|
606
|
-
)
|
|
607
|
-
# Reduced delay for JS context to stabilize
|
|
608
|
-
import asyncio
|
|
609
|
-
|
|
610
|
-
await asyncio.sleep(0.1) # Reduced from 0.2s
|
|
611
|
-
except Exception:
|
|
612
|
-
# Continue even if wait fails
|
|
613
|
-
pass
|
|
614
|
-
|
|
615
|
-
continue
|
|
616
|
-
|
|
617
|
-
# Non-navigation error or final attempt - log and return
|
|
618
|
-
# empty result
|
|
619
|
-
if attempt == max_retries - 1:
|
|
620
|
-
logger.warning(
|
|
621
|
-
f"Error in unified analysis after {max_retries} "
|
|
622
|
-
f"attempts: {e}"
|
|
623
|
-
)
|
|
624
|
-
else:
|
|
625
|
-
logger.warning(
|
|
626
|
-
f"Non-retryable error in unified analysis: {e}"
|
|
627
|
-
)
|
|
628
|
-
|
|
629
|
-
return {"elements": {}, "metadata": {"elementCount": 0}}
|
|
630
|
-
|
|
631
|
-
# Should not reach here, but just in case
|
|
632
|
-
return {"elements": {}, "metadata": {"elementCount": 0}}
|
|
633
|
-
|
|
634
|
-
def _convert_analysis_to_rects(
|
|
635
|
-
self, analysis_data: Dict[str, Any]
|
|
636
|
-
) -> Dict[str, Any]:
|
|
637
|
-
r"""Convert analysis data to rect format for visual marking."""
|
|
638
|
-
rects = {}
|
|
639
|
-
elements = analysis_data.get("elements", {})
|
|
640
|
-
|
|
641
|
-
for ref, element_data in elements.items():
|
|
642
|
-
coordinates = element_data.get("coordinates", [])
|
|
643
|
-
if coordinates:
|
|
644
|
-
rects[ref] = {
|
|
645
|
-
"role": element_data.get("role", "generic"),
|
|
646
|
-
"aria-name": element_data.get("name", ""),
|
|
647
|
-
"rects": [coordinates[0]],
|
|
648
|
-
}
|
|
649
|
-
return rects
|
|
650
|
-
|
|
651
|
-
def _add_set_of_mark(self, image, rects):
|
|
652
|
-
r"""Add visual marks to the image."""
|
|
653
|
-
try:
|
|
654
|
-
from PIL import ImageDraw, ImageFont
|
|
655
|
-
except ImportError:
|
|
656
|
-
logger.warning("PIL not available, returning original image")
|
|
657
|
-
return image
|
|
658
|
-
|
|
659
|
-
marked_image = image.copy()
|
|
660
|
-
draw = ImageDraw.Draw(marked_image)
|
|
661
|
-
|
|
662
|
-
# Try to get font
|
|
663
|
-
try:
|
|
664
|
-
font = ImageFont.truetype("arial.ttf", 16)
|
|
665
|
-
except (OSError, IOError):
|
|
666
|
-
try:
|
|
667
|
-
font = ImageFont.load_default()
|
|
668
|
-
except (OSError, IOError):
|
|
669
|
-
font = None
|
|
670
|
-
|
|
671
|
-
# Color scheme
|
|
672
|
-
colors = {
|
|
673
|
-
"button": "#FF6B6B",
|
|
674
|
-
"link": "#4ECDC4",
|
|
675
|
-
"textbox": "#45B7D1",
|
|
676
|
-
"select": "#96CEB4",
|
|
677
|
-
"checkbox": "#FECA57",
|
|
678
|
-
"radio": "#FF9FF3",
|
|
679
|
-
"default": "#DDA0DD",
|
|
680
|
-
}
|
|
681
|
-
|
|
682
|
-
for ref, rect_data in rects.items():
|
|
683
|
-
rects_list = rect_data.get("rects", [])
|
|
684
|
-
role = rect_data.get("role", "generic")
|
|
685
|
-
color = colors.get(role, colors["default"])
|
|
686
|
-
|
|
687
|
-
for rect in rects_list:
|
|
688
|
-
x, y = rect.get("x", 0), rect.get("y", 0)
|
|
689
|
-
width, height = rect.get("width", 0), rect.get("height", 0)
|
|
690
|
-
|
|
691
|
-
# Draw rectangle outline
|
|
692
|
-
draw.rectangle(
|
|
693
|
-
[x, y, x + width, y + height], outline=color, width=2
|
|
694
|
-
)
|
|
695
|
-
|
|
696
|
-
# Draw reference label
|
|
697
|
-
label_text = ref
|
|
698
|
-
if font:
|
|
699
|
-
bbox = draw.textbbox((0, 0), label_text, font=font)
|
|
700
|
-
text_width, text_height = (
|
|
701
|
-
bbox[2] - bbox[0],
|
|
702
|
-
bbox[3] - bbox[1],
|
|
703
|
-
)
|
|
704
|
-
else:
|
|
705
|
-
text_width, text_height = len(label_text) * 8, 16
|
|
706
|
-
|
|
707
|
-
label_x, label_y = max(0, x - 2), max(0, y - text_height - 2)
|
|
708
|
-
|
|
709
|
-
# Background and text
|
|
710
|
-
draw.rectangle(
|
|
711
|
-
[
|
|
712
|
-
label_x,
|
|
713
|
-
label_y,
|
|
714
|
-
label_x + text_width + 4,
|
|
715
|
-
label_y + text_height + 2,
|
|
716
|
-
],
|
|
717
|
-
fill=color,
|
|
718
|
-
)
|
|
719
|
-
draw.text(
|
|
720
|
-
(label_x + 2, label_y + 1),
|
|
721
|
-
label_text,
|
|
722
|
-
fill="white",
|
|
723
|
-
font=font,
|
|
724
|
-
)
|
|
725
|
-
|
|
726
|
-
return marked_image
|
|
727
|
-
|
|
728
|
-
def _format_snapshot_from_analysis(
|
|
729
|
-
self, analysis_data: Dict[str, Any]
|
|
730
|
-
) -> str:
|
|
731
|
-
r"""Format analysis data into snapshot string."""
|
|
732
|
-
lines = []
|
|
733
|
-
elements = analysis_data.get("elements", {})
|
|
734
|
-
|
|
735
|
-
for ref, element_data in elements.items():
|
|
736
|
-
role = element_data.get("role", "generic")
|
|
737
|
-
name = element_data.get("name", "")
|
|
738
|
-
|
|
739
|
-
line = f"- {role}"
|
|
740
|
-
if name:
|
|
741
|
-
line += f' "{name}"'
|
|
742
|
-
|
|
743
|
-
# Add properties
|
|
744
|
-
props = []
|
|
745
|
-
for prop in ["disabled", "checked", "expanded"]:
|
|
746
|
-
value = element_data.get(prop)
|
|
747
|
-
if value is True:
|
|
748
|
-
props.append(prop)
|
|
749
|
-
elif value is not None and prop in ["checked", "expanded"]:
|
|
750
|
-
props.append(f"{prop}={value}")
|
|
751
|
-
|
|
752
|
-
if props:
|
|
753
|
-
line += f" {' '.join(props)}"
|
|
754
|
-
|
|
755
|
-
line += f" [ref={ref}]"
|
|
756
|
-
lines.append(line)
|
|
757
|
-
|
|
758
|
-
return "\n".join(lines)
|
|
759
|
-
|
|
760
|
-
async def _get_tab_info_for_output(self) -> Dict[str, Any]:
|
|
761
|
-
r"""Get tab information to include in action outputs."""
|
|
762
|
-
try:
|
|
763
|
-
# Ensure we have the correct singleton session instance first
|
|
764
|
-
session = await self._get_session()
|
|
765
|
-
|
|
766
|
-
# Add debug info for tab info retrieval
|
|
767
|
-
logger.debug("Attempting to get tab info from session...")
|
|
768
|
-
tab_info = await session.get_tab_info()
|
|
769
|
-
current_tab_index = await session.get_current_tab_id()
|
|
770
|
-
|
|
771
|
-
# Debug log the successful retrieval
|
|
772
|
-
logger.debug(
|
|
773
|
-
f"Successfully retrieved {len(tab_info)} tabs, current: "
|
|
774
|
-
f"{current_tab_index}"
|
|
775
|
-
)
|
|
776
|
-
|
|
777
|
-
return {
|
|
778
|
-
"tabs": tab_info,
|
|
779
|
-
"current_tab": current_tab_index,
|
|
780
|
-
"total_tabs": len(tab_info),
|
|
781
|
-
}
|
|
782
|
-
except Exception as e:
|
|
783
|
-
logger.warning(
|
|
784
|
-
f"Failed to get tab info from session: {type(e).__name__}: {e}"
|
|
785
|
-
)
|
|
786
|
-
|
|
787
|
-
# Try to get actual tab count from session pages directly
|
|
788
|
-
try:
|
|
789
|
-
# Get the correct session instance for fallback
|
|
790
|
-
fallback_session = await self._get_session()
|
|
791
|
-
|
|
792
|
-
# Check browser session state
|
|
793
|
-
session_state = {
|
|
794
|
-
"has_session": fallback_session is not None,
|
|
795
|
-
"has_pages_attr": hasattr(fallback_session, '_pages'),
|
|
796
|
-
"pages_count": len(fallback_session._pages)
|
|
797
|
-
if hasattr(fallback_session, '_pages')
|
|
798
|
-
else "unknown",
|
|
799
|
-
"has_page": hasattr(fallback_session, '_page')
|
|
800
|
-
and fallback_session._page is not None,
|
|
801
|
-
"session_id": getattr(
|
|
802
|
-
fallback_session, '_session_id', 'unknown'
|
|
803
|
-
),
|
|
804
|
-
}
|
|
805
|
-
logger.debug(f"Browser session state: {session_state}")
|
|
806
|
-
|
|
807
|
-
actual_tab_count = 0
|
|
808
|
-
if (
|
|
809
|
-
hasattr(fallback_session, '_pages')
|
|
810
|
-
and fallback_session._pages
|
|
811
|
-
):
|
|
812
|
-
actual_tab_count = len(fallback_session._pages)
|
|
813
|
-
# Also try to filter out closed pages
|
|
814
|
-
try:
|
|
815
|
-
open_pages = [
|
|
816
|
-
p
|
|
817
|
-
for p in fallback_session._pages.values()
|
|
818
|
-
if not p.is_closed()
|
|
819
|
-
]
|
|
820
|
-
actual_tab_count = len(open_pages)
|
|
821
|
-
logger.debug(
|
|
822
|
-
f"Found {actual_tab_count} open tabs out of "
|
|
823
|
-
f"{len(fallback_session._pages)} total"
|
|
824
|
-
)
|
|
825
|
-
except Exception:
|
|
826
|
-
# Keep the original count if we can't check page status
|
|
827
|
-
pass
|
|
828
|
-
|
|
829
|
-
if actual_tab_count == 0:
|
|
830
|
-
# If no pages, check if browser is even initialized
|
|
831
|
-
if (
|
|
832
|
-
hasattr(fallback_session, '_page')
|
|
833
|
-
and fallback_session._page is not None
|
|
834
|
-
):
|
|
835
|
-
actual_tab_count = 1
|
|
836
|
-
logger.debug(
|
|
837
|
-
"No pages in list but main page exists, assuming "
|
|
838
|
-
"1 tab"
|
|
839
|
-
)
|
|
840
|
-
else:
|
|
841
|
-
actual_tab_count = 1
|
|
842
|
-
logger.debug("No pages found, defaulting to 1 tab")
|
|
843
|
-
|
|
844
|
-
logger.debug(f"Using fallback tab count: {actual_tab_count}")
|
|
845
|
-
return {
|
|
846
|
-
"tabs": [],
|
|
847
|
-
"current_tab": 0,
|
|
848
|
-
"total_tabs": actual_tab_count,
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
except Exception as fallback_error:
|
|
852
|
-
logger.warning(
|
|
853
|
-
f"Fallback tab count also failed: "
|
|
854
|
-
f"{type(fallback_error).__name__}: {fallback_error}"
|
|
855
|
-
)
|
|
856
|
-
return {"tabs": [], "current_tab": 0, "total_tabs": 1}
|
|
857
|
-
|
|
858
|
-
async def _exec_with_snapshot(
|
|
859
|
-
self,
|
|
860
|
-
action: Dict[str, Any],
|
|
861
|
-
element_details: Optional[Dict[str, Any]] = None,
|
|
862
|
-
) -> Dict[str, str]:
|
|
863
|
-
r"""Execute action and return result with snapshot comparison."""
|
|
864
|
-
|
|
865
|
-
# Log action execution start
|
|
866
|
-
action_type = action.get("type", "unknown")
|
|
867
|
-
logger.info(f"Executing action: {action_type}")
|
|
868
|
-
|
|
869
|
-
action_start_time = time.time()
|
|
870
|
-
inputs: Dict[str, Any] = {"action": action}
|
|
871
|
-
page_load_time = None
|
|
872
|
-
|
|
873
|
-
try:
|
|
874
|
-
# Get before snapshot
|
|
875
|
-
logger.info("Capturing pre-action snapshot...")
|
|
876
|
-
snapshot_start_before = time.time()
|
|
877
|
-
before_snapshot = await self._session.get_snapshot(
|
|
878
|
-
force_refresh=True, diff_only=False
|
|
879
|
-
)
|
|
880
|
-
before_snapshot_time = time.time() - snapshot_start_before
|
|
881
|
-
logger.info(
|
|
882
|
-
f"Pre-action snapshot captured in {before_snapshot_time:.2f}s"
|
|
883
|
-
)
|
|
884
|
-
|
|
885
|
-
# Execute action
|
|
886
|
-
logger.info(f"Executing {action_type} action...")
|
|
887
|
-
exec_start = time.time()
|
|
888
|
-
exec_result = await self._session.exec_action(action)
|
|
889
|
-
exec_time = time.time() - exec_start
|
|
890
|
-
logger.info(f"Action {action_type} completed in {exec_time:.2f}s")
|
|
891
|
-
|
|
892
|
-
# Parse the detailed result from ActionExecutor
|
|
893
|
-
if isinstance(exec_result, dict):
|
|
894
|
-
result_message = exec_result.get("message", str(exec_result))
|
|
895
|
-
action_details = exec_result.get("details", {})
|
|
896
|
-
success = exec_result.get("success", True)
|
|
897
|
-
else:
|
|
898
|
-
result_message = str(exec_result)
|
|
899
|
-
action_details = {}
|
|
900
|
-
success = True
|
|
901
|
-
|
|
902
|
-
# Wait for page stability after action (especially important for
|
|
903
|
-
# click)
|
|
904
|
-
stability_time: float = 0.0
|
|
905
|
-
if action_type in ["click", "type", "select", "enter"]:
|
|
906
|
-
logger.info(
|
|
907
|
-
f"Waiting for page stability " f"after {action_type}..."
|
|
908
|
-
)
|
|
909
|
-
stability_start = time.time()
|
|
910
|
-
await self._wait_for_page_stability()
|
|
911
|
-
stability_time = time.time() - stability_start
|
|
912
|
-
logger.info(
|
|
913
|
-
f"Page stability wait "
|
|
914
|
-
f"completed in "
|
|
915
|
-
f"{stability_time:.2f}s"
|
|
916
|
-
)
|
|
917
|
-
page_load_time = stability_time
|
|
918
|
-
|
|
919
|
-
# Enhanced logging for page loading times
|
|
920
|
-
if self.enable_page_loading_logging and self.log_to_console:
|
|
921
|
-
logger.info(
|
|
922
|
-
f"[PAGE LOADING] Page stability for {action_type}: "
|
|
923
|
-
f"{round(stability_time * 1000, 2)}ms"
|
|
924
|
-
)
|
|
925
|
-
|
|
926
|
-
# Get after snapshot
|
|
927
|
-
logger.info("Capturing post-action snapshot...")
|
|
928
|
-
snapshot_start_after = time.time()
|
|
929
|
-
after_snapshot = await self._session.get_snapshot(
|
|
930
|
-
force_refresh=True, diff_only=False
|
|
931
|
-
)
|
|
932
|
-
after_snapshot_time = time.time() - snapshot_start_after
|
|
933
|
-
logger.info(
|
|
934
|
-
f"Post-action snapshot "
|
|
935
|
-
f"captured in {after_snapshot_time:.2f}s"
|
|
936
|
-
)
|
|
937
|
-
|
|
938
|
-
# Check for snapshot quality and log warnings
|
|
939
|
-
if before_snapshot == after_snapshot:
|
|
940
|
-
snapshot = "snapshot not changed"
|
|
941
|
-
logger.debug("Page snapshot unchanged after action")
|
|
942
|
-
else:
|
|
943
|
-
snapshot = after_snapshot
|
|
944
|
-
# Check if snapshot is empty or problematic
|
|
945
|
-
if "<empty>" in after_snapshot:
|
|
946
|
-
logger.warning(
|
|
947
|
-
f"Action {action_type} resulted "
|
|
948
|
-
f"in empty snapshot - "
|
|
949
|
-
f"page may still be loading"
|
|
950
|
-
)
|
|
951
|
-
elif len(after_snapshot.strip()) < 50:
|
|
952
|
-
logger.warning(
|
|
953
|
-
f"Action {action_type} resulted "
|
|
954
|
-
f"in very short snapshot:"
|
|
955
|
-
f" {len(after_snapshot)} chars"
|
|
956
|
-
)
|
|
957
|
-
else:
|
|
958
|
-
logger.debug(
|
|
959
|
-
f"Action {action_type} resulted "
|
|
960
|
-
f"in updated snapshot: "
|
|
961
|
-
f"{len(after_snapshot)} chars"
|
|
962
|
-
)
|
|
963
|
-
|
|
964
|
-
# Get tab information for output
|
|
965
|
-
tab_info = await self._get_tab_info_for_output()
|
|
966
|
-
|
|
967
|
-
# Create comprehensive output for logging
|
|
968
|
-
execution_time = time.time() - action_start_time
|
|
969
|
-
total_snapshot_time = before_snapshot_time + after_snapshot_time
|
|
970
|
-
outputs = {
|
|
971
|
-
"result": result_message,
|
|
972
|
-
"snapshot": snapshot,
|
|
973
|
-
"success": success,
|
|
974
|
-
"action_details": action_details,
|
|
975
|
-
"execution_stats": {
|
|
976
|
-
"exec_time_ms": round(exec_time * 1000, 2),
|
|
977
|
-
"stability_time_ms": round(stability_time * 1000, 2)
|
|
978
|
-
if stability_time > 0
|
|
979
|
-
else None,
|
|
980
|
-
"snapshot_time_ms": round(total_snapshot_time * 1000, 2),
|
|
981
|
-
"total_time_ms": round(execution_time * 1000, 2),
|
|
982
|
-
},
|
|
983
|
-
**tab_info, # Include tab information
|
|
984
|
-
}
|
|
985
|
-
|
|
986
|
-
# If snapshot is unchanged after click, add element details to log
|
|
987
|
-
if (
|
|
988
|
-
snapshot == "snapshot not changed"
|
|
989
|
-
and action_type == "click"
|
|
990
|
-
and element_details
|
|
991
|
-
):
|
|
992
|
-
logger.debug(
|
|
993
|
-
"Snapshot unchanged after click. "
|
|
994
|
-
"Adding element details to log."
|
|
995
|
-
)
|
|
996
|
-
outputs["clicked_element_tag"] = element_details.get(
|
|
997
|
-
"tagName", "N/A"
|
|
998
|
-
)
|
|
999
|
-
outputs["clicked_element_content"] = element_details.get(
|
|
1000
|
-
"name", ""
|
|
1001
|
-
)
|
|
1002
|
-
outputs["clicked_element_type"] = element_details.get(
|
|
1003
|
-
"role", "generic"
|
|
1004
|
-
)
|
|
1005
|
-
|
|
1006
|
-
# Log the action with all details
|
|
1007
|
-
await self._log_action(
|
|
1008
|
-
action_name=f"_exec_with_snapshot_{action_type}",
|
|
1009
|
-
inputs=inputs,
|
|
1010
|
-
outputs=outputs,
|
|
1011
|
-
execution_time=execution_time,
|
|
1012
|
-
page_load_time=page_load_time,
|
|
1013
|
-
)
|
|
1014
|
-
|
|
1015
|
-
return {"result": result_message, "snapshot": snapshot}
|
|
1016
|
-
|
|
1017
|
-
except Exception as e:
|
|
1018
|
-
execution_time = time.time() - action_start_time
|
|
1019
|
-
error_msg = f"{type(e).__name__}: {e!s}"
|
|
1020
|
-
|
|
1021
|
-
# Log error
|
|
1022
|
-
await self._log_action(
|
|
1023
|
-
action_name=f"_exec_with_snapshot_{action_type}",
|
|
1024
|
-
inputs=inputs,
|
|
1025
|
-
outputs=None,
|
|
1026
|
-
execution_time=execution_time,
|
|
1027
|
-
page_load_time=page_load_time,
|
|
1028
|
-
error=error_msg,
|
|
1029
|
-
)
|
|
1030
|
-
|
|
1031
|
-
raise
|
|
1032
|
-
|
|
1033
|
-
async def _extract_links_by_refs(
|
|
1034
|
-
self, snapshot: str, page, refs: List[str]
|
|
1035
|
-
) -> List[Dict[str, str]]:
|
|
1036
|
-
r"""Extract multiple links by their reference IDs."""
|
|
1037
|
-
import re
|
|
1038
|
-
|
|
1039
|
-
found_links = []
|
|
1040
|
-
ref_set = set(refs)
|
|
1041
|
-
lines = snapshot.split('\n')
|
|
1042
|
-
|
|
1043
|
-
for line in lines:
|
|
1044
|
-
link_match = re.search(
|
|
1045
|
-
r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]', line
|
|
1046
|
-
)
|
|
1047
|
-
if link_match and link_match.group(2) in ref_set:
|
|
1048
|
-
text, found_ref = link_match.groups()
|
|
1049
|
-
try:
|
|
1050
|
-
url = await self._get_link_url_by_ref(page, found_ref)
|
|
1051
|
-
found_links.append(
|
|
1052
|
-
{"text": text, "ref": found_ref, "url": url or ""}
|
|
1053
|
-
)
|
|
1054
|
-
except Exception as e:
|
|
1055
|
-
logger.warning(
|
|
1056
|
-
f"Failed to get URL for ref {found_ref}: {e}"
|
|
1057
|
-
)
|
|
1058
|
-
found_links.append(
|
|
1059
|
-
{"text": text, "ref": found_ref, "url": ""}
|
|
1060
|
-
)
|
|
1061
|
-
|
|
1062
|
-
return found_links
|
|
250
|
+
@property
|
|
251
|
+
def web_agent_model(self) -> Optional[BaseModelBackend]:
|
|
252
|
+
"""Get the web agent model."""
|
|
253
|
+
return self._web_agent_model
|
|
1063
254
|
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
if element:
|
|
1069
|
-
href = await element.get_attribute('href')
|
|
1070
|
-
if href:
|
|
1071
|
-
from urllib.parse import urljoin
|
|
1072
|
-
|
|
1073
|
-
return urljoin(page.url, href)
|
|
1074
|
-
return ""
|
|
1075
|
-
except Exception as e:
|
|
1076
|
-
logger.warning(f"Failed to get URL for ref {ref}: {e}")
|
|
1077
|
-
return ""
|
|
1078
|
-
|
|
1079
|
-
def _ensure_agent(self) -> PlaywrightLLMAgent:
|
|
1080
|
-
r"""Create PlaywrightLLMAgent on first use."""
|
|
1081
|
-
if self._web_agent_model is None:
|
|
1082
|
-
raise RuntimeError(
|
|
1083
|
-
"web_agent_model required for high-level task planning"
|
|
1084
|
-
)
|
|
255
|
+
@web_agent_model.setter
|
|
256
|
+
def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
|
|
257
|
+
"""Set the web agent model."""
|
|
258
|
+
self._web_agent_model = value
|
|
1085
259
|
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
model_backend=self._web_agent_model,
|
|
1091
|
-
)
|
|
1092
|
-
return self._agent
|
|
260
|
+
@property
|
|
261
|
+
def cache_dir(self) -> str:
|
|
262
|
+
"""Get the cache directory."""
|
|
263
|
+
return self._cache_dir
|
|
1093
264
|
|
|
1094
265
|
# Public API Methods
|
|
1095
266
|
|
|
1096
|
-
async def
|
|
267
|
+
async def browser_open(self) -> Dict[str, Any]:
|
|
1097
268
|
r"""Starts a new browser session. This must be the first browser
|
|
1098
269
|
action.
|
|
1099
270
|
|
|
@@ -1103,60 +274,45 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1103
274
|
Returns:
|
|
1104
275
|
Dict[str, Any]: A dictionary with the result of the action:
|
|
1105
276
|
- "result" (str): Confirmation of the action.
|
|
1106
|
-
- "snapshot" (str): A textual snapshot of interactive
|
|
277
|
+
- "snapshot" (str): A textual snapshot of interactive
|
|
278
|
+
elements.
|
|
1107
279
|
- "tabs" (List[Dict]): Information about all open tabs.
|
|
1108
280
|
- "current_tab" (int): Index of the active tab.
|
|
1109
281
|
- "total_tabs" (int): Total number of open tabs.
|
|
1110
282
|
"""
|
|
1111
|
-
# Add logging if enabled
|
|
1112
|
-
action_start = time.time()
|
|
1113
|
-
inputs: Dict[str, Any] = {} # No input parameters for agents
|
|
1114
|
-
|
|
1115
|
-
logger.info("Starting browser session...")
|
|
1116
|
-
|
|
1117
|
-
browser_start = time.time()
|
|
1118
|
-
await self._session.ensure_browser()
|
|
1119
|
-
browser_time = time.time() - browser_start
|
|
1120
|
-
logger.info(f"Browser session started in {browser_time:.2f}s")
|
|
1121
|
-
|
|
1122
283
|
try:
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
result
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
284
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
285
|
+
result = await ws_wrapper.open_browser(self._default_start_url)
|
|
286
|
+
|
|
287
|
+
# Add tab information
|
|
288
|
+
tab_info = await ws_wrapper.get_tab_info()
|
|
289
|
+
result.update(
|
|
290
|
+
{
|
|
291
|
+
"tabs": tab_info,
|
|
292
|
+
"current_tab": next(
|
|
293
|
+
(
|
|
294
|
+
i
|
|
295
|
+
for i, tab in enumerate(tab_info)
|
|
296
|
+
if tab.get("is_current")
|
|
297
|
+
),
|
|
298
|
+
0,
|
|
299
|
+
),
|
|
300
|
+
"total_tabs": len(tab_info),
|
|
301
|
+
}
|
|
302
|
+
)
|
|
1142
303
|
|
|
1143
304
|
return result
|
|
1144
|
-
|
|
1145
305
|
except Exception as e:
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
error=f"{type(e).__name__}: {e!s}",
|
|
1155
|
-
)
|
|
1156
|
-
raise
|
|
306
|
+
logger.error(f"Failed to open browser: {e}")
|
|
307
|
+
return {
|
|
308
|
+
"result": f"Error opening browser: {e}",
|
|
309
|
+
"snapshot": "",
|
|
310
|
+
"tabs": [],
|
|
311
|
+
"current_tab": 0,
|
|
312
|
+
"total_tabs": 0,
|
|
313
|
+
}
|
|
1157
314
|
|
|
1158
|
-
|
|
1159
|
-
async def close_browser(self) -> str:
|
|
315
|
+
async def browser_close(self) -> str:
|
|
1160
316
|
r"""Closes the browser session, releasing all resources.
|
|
1161
317
|
|
|
1162
318
|
This should be called at the end of a task for cleanup.
|
|
@@ -1164,18 +320,16 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1164
320
|
Returns:
|
|
1165
321
|
str: A confirmation message.
|
|
1166
322
|
"""
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
await self.
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
return "Browser session closed."
|
|
323
|
+
try:
|
|
324
|
+
if self._ws_wrapper:
|
|
325
|
+
await self._ws_wrapper.stop()
|
|
326
|
+
self._ws_wrapper = None
|
|
327
|
+
return "Browser session closed."
|
|
328
|
+
except Exception as e:
|
|
329
|
+
logger.error(f"Failed to close browser: {e}")
|
|
330
|
+
return f"Error closing browser: {e}"
|
|
1176
331
|
|
|
1177
|
-
|
|
1178
|
-
async def visit_page(self, url: str) -> Dict[str, Any]:
|
|
332
|
+
async def browser_visit_page(self, url: str) -> Dict[str, Any]:
|
|
1179
333
|
r"""Opens a URL in a new browser tab and switches to it.
|
|
1180
334
|
|
|
1181
335
|
Args:
|
|
@@ -1190,70 +344,39 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1190
344
|
- "current_tab" (int): Index of the new active tab.
|
|
1191
345
|
- "total_tabs" (int): Total number of open tabs.
|
|
1192
346
|
"""
|
|
1193
|
-
|
|
347
|
+
try:
|
|
348
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
349
|
+
result = await ws_wrapper.visit_page(url)
|
|
350
|
+
|
|
351
|
+
# Add tab information
|
|
352
|
+
tab_info = await ws_wrapper.get_tab_info()
|
|
353
|
+
result.update(
|
|
354
|
+
{
|
|
355
|
+
"tabs": tab_info,
|
|
356
|
+
"current_tab": next(
|
|
357
|
+
(
|
|
358
|
+
i
|
|
359
|
+
for i, tab in enumerate(tab_info)
|
|
360
|
+
if tab.get("is_current")
|
|
361
|
+
),
|
|
362
|
+
0,
|
|
363
|
+
),
|
|
364
|
+
"total_tabs": len(tab_info),
|
|
365
|
+
}
|
|
366
|
+
)
|
|
367
|
+
|
|
368
|
+
return result
|
|
369
|
+
except Exception as e:
|
|
370
|
+
logger.error(f"Failed to visit page: {e}")
|
|
1194
371
|
return {
|
|
1195
|
-
"result": "Error
|
|
372
|
+
"result": f"Error visiting page: {e}",
|
|
1196
373
|
"snapshot": "",
|
|
1197
374
|
"tabs": [],
|
|
1198
375
|
"current_tab": 0,
|
|
1199
|
-
"total_tabs":
|
|
376
|
+
"total_tabs": 0,
|
|
1200
377
|
}
|
|
1201
378
|
|
|
1202
|
-
|
|
1203
|
-
url = f'https://{url}'
|
|
1204
|
-
|
|
1205
|
-
await self._ensure_browser()
|
|
1206
|
-
session = await self._get_session()
|
|
1207
|
-
nav_result = ""
|
|
1208
|
-
|
|
1209
|
-
# By default, we want to create a new tab.
|
|
1210
|
-
should_create_new_tab = True
|
|
1211
|
-
try:
|
|
1212
|
-
# If the browser has just started with a single "about:blank" tab,
|
|
1213
|
-
# use that tab instead of creating a new one.
|
|
1214
|
-
tab_info_data = await self._get_tab_info_for_output()
|
|
1215
|
-
tabs = tab_info_data.get("tabs", [])
|
|
1216
|
-
if len(tabs) == 1 and tabs[0].get("url") == "about:blank":
|
|
1217
|
-
logger.info(
|
|
1218
|
-
"Found single blank tab, navigating in current tab "
|
|
1219
|
-
"instead of creating a new one."
|
|
1220
|
-
)
|
|
1221
|
-
should_create_new_tab = False
|
|
1222
|
-
except Exception as e:
|
|
1223
|
-
logger.warning(
|
|
1224
|
-
"Could not get tab info to check for blank tab, "
|
|
1225
|
-
f"proceeding with default behavior (new tab). Error: {e}"
|
|
1226
|
-
)
|
|
1227
|
-
|
|
1228
|
-
if should_create_new_tab:
|
|
1229
|
-
logger.info(f"Creating new tab and navigating to URL: {url}")
|
|
1230
|
-
try:
|
|
1231
|
-
new_tab_id = await session.create_new_tab(url)
|
|
1232
|
-
await session.switch_to_tab(new_tab_id)
|
|
1233
|
-
nav_result = f"Visited {url} in new tab {new_tab_id}"
|
|
1234
|
-
except Exception as e:
|
|
1235
|
-
logger.error(f"Failed to create new tab and navigate: {e}")
|
|
1236
|
-
nav_result = f"Error creating new tab: {e}"
|
|
1237
|
-
else:
|
|
1238
|
-
logger.info(f"Navigating to URL in current tab: {url}")
|
|
1239
|
-
nav_result = await session.visit(url)
|
|
1240
|
-
|
|
1241
|
-
# Get snapshot
|
|
1242
|
-
snapshot = ""
|
|
1243
|
-
try:
|
|
1244
|
-
snapshot = await session.get_snapshot(
|
|
1245
|
-
force_refresh=True, diff_only=False
|
|
1246
|
-
)
|
|
1247
|
-
except Exception as e:
|
|
1248
|
-
logger.warning(f"Failed to capture snapshot: {e}")
|
|
1249
|
-
|
|
1250
|
-
# Get tab information
|
|
1251
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1252
|
-
|
|
1253
|
-
return {"result": nav_result, "snapshot": snapshot, **tab_info}
|
|
1254
|
-
|
|
1255
|
-
@action_logger
|
|
1256
|
-
async def back(self) -> Dict[str, Any]:
|
|
379
|
+
async def browser_back(self) -> Dict[str, Any]:
|
|
1257
380
|
r"""Goes back to the previous page in the browser history.
|
|
1258
381
|
|
|
1259
382
|
This action simulates using the browser's "back" button in the
|
|
@@ -1267,57 +390,39 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1267
390
|
- "current_tab" (int): Index of the active tab.
|
|
1268
391
|
- "total_tabs" (int): Total number of open tabs.
|
|
1269
392
|
"""
|
|
1270
|
-
page = await self._require_page()
|
|
1271
|
-
|
|
1272
393
|
try:
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
)
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
)
|
|
1292
|
-
snapshot_time = time.time() - snapshot_start
|
|
1293
|
-
logger.info(
|
|
1294
|
-
f"Back navigation snapshot captured in {snapshot_time:.2f}s"
|
|
394
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
395
|
+
result = await ws_wrapper.back()
|
|
396
|
+
|
|
397
|
+
# Add tab information
|
|
398
|
+
tab_info = await ws_wrapper.get_tab_info()
|
|
399
|
+
result.update(
|
|
400
|
+
{
|
|
401
|
+
"tabs": tab_info,
|
|
402
|
+
"current_tab": next(
|
|
403
|
+
(
|
|
404
|
+
i
|
|
405
|
+
for i, tab in enumerate(tab_info)
|
|
406
|
+
if tab.get("is_current")
|
|
407
|
+
),
|
|
408
|
+
0,
|
|
409
|
+
),
|
|
410
|
+
"total_tabs": len(tab_info),
|
|
411
|
+
}
|
|
1295
412
|
)
|
|
1296
413
|
|
|
1297
|
-
|
|
1298
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1299
|
-
|
|
1300
|
-
return {
|
|
1301
|
-
"result": "Back navigation successful.",
|
|
1302
|
-
"snapshot": snapshot,
|
|
1303
|
-
**tab_info,
|
|
1304
|
-
}
|
|
1305
|
-
|
|
414
|
+
return result
|
|
1306
415
|
except Exception as e:
|
|
1307
|
-
logger.
|
|
1308
|
-
# Get current snapshot even if navigation failed
|
|
1309
|
-
snapshot = await self._session.get_snapshot(
|
|
1310
|
-
force_refresh=True, diff_only=False
|
|
1311
|
-
)
|
|
1312
|
-
tab_info = await self._get_tab_info_for_output()
|
|
416
|
+
logger.error(f"Failed to navigate back: {e}")
|
|
1313
417
|
return {
|
|
1314
|
-
"result": f"
|
|
1315
|
-
"snapshot":
|
|
1316
|
-
|
|
418
|
+
"result": f"Error navigating back: {e}",
|
|
419
|
+
"snapshot": "",
|
|
420
|
+
"tabs": [],
|
|
421
|
+
"current_tab": 0,
|
|
422
|
+
"total_tabs": 0,
|
|
1317
423
|
}
|
|
1318
424
|
|
|
1319
|
-
|
|
1320
|
-
async def forward(self) -> Dict[str, Any]:
|
|
425
|
+
async def browser_forward(self) -> Dict[str, Any]:
|
|
1321
426
|
r"""Goes forward to the next page in the browser history.
|
|
1322
427
|
|
|
1323
428
|
This action simulates using the browser's "forward" button in the
|
|
@@ -1331,164 +436,191 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1331
436
|
- "current_tab" (int): Index of the active tab.
|
|
1332
437
|
- "total_tabs" (int): Total number of open tabs.
|
|
1333
438
|
"""
|
|
1334
|
-
page = await self._require_page()
|
|
1335
|
-
|
|
1336
439
|
try:
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
)
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
force_refresh=True, diff_only=False
|
|
1356
|
-
)
|
|
1357
|
-
snapshot_time = time.time() - snapshot_start
|
|
1358
|
-
logger.info(
|
|
1359
|
-
f"Forward navigation snapshot captured in {snapshot_time:.2f}s"
|
|
440
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
441
|
+
result = await ws_wrapper.forward()
|
|
442
|
+
|
|
443
|
+
# Add tab information
|
|
444
|
+
tab_info = await ws_wrapper.get_tab_info()
|
|
445
|
+
result.update(
|
|
446
|
+
{
|
|
447
|
+
"tabs": tab_info,
|
|
448
|
+
"current_tab": next(
|
|
449
|
+
(
|
|
450
|
+
i
|
|
451
|
+
for i, tab in enumerate(tab_info)
|
|
452
|
+
if tab.get("is_current")
|
|
453
|
+
),
|
|
454
|
+
0,
|
|
455
|
+
),
|
|
456
|
+
"total_tabs": len(tab_info),
|
|
457
|
+
}
|
|
1360
458
|
)
|
|
1361
459
|
|
|
1362
|
-
|
|
1363
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1364
|
-
|
|
1365
|
-
return {
|
|
1366
|
-
"result": "Forward navigation successful.",
|
|
1367
|
-
"snapshot": snapshot,
|
|
1368
|
-
**tab_info,
|
|
1369
|
-
}
|
|
1370
|
-
|
|
460
|
+
return result
|
|
1371
461
|
except Exception as e:
|
|
1372
|
-
logger.
|
|
1373
|
-
# Get current snapshot even if navigation failed
|
|
1374
|
-
snapshot = await self._session.get_snapshot(
|
|
1375
|
-
force_refresh=True, diff_only=False
|
|
1376
|
-
)
|
|
1377
|
-
tab_info = await self._get_tab_info_for_output()
|
|
462
|
+
logger.error(f"Failed to navigate forward: {e}")
|
|
1378
463
|
return {
|
|
1379
|
-
"result": f"
|
|
1380
|
-
"snapshot":
|
|
1381
|
-
|
|
464
|
+
"result": f"Error navigating forward: {e}",
|
|
465
|
+
"snapshot": "",
|
|
466
|
+
"tabs": [],
|
|
467
|
+
"current_tab": 0,
|
|
468
|
+
"total_tabs": 0,
|
|
1382
469
|
}
|
|
1383
470
|
|
|
1384
|
-
|
|
1385
|
-
async def get_page_snapshot(self) -> str:
|
|
471
|
+
async def browser_get_page_snapshot(self) -> str:
|
|
1386
472
|
r"""Gets a textual snapshot of the page's interactive elements.
|
|
1387
473
|
|
|
1388
|
-
The snapshot lists elements like buttons, links, and inputs,
|
|
474
|
+
The snapshot lists elements like buttons, links, and inputs,
|
|
475
|
+
each with
|
|
1389
476
|
a unique `ref` ID. This ID is used by other tools (e.g., `click`,
|
|
1390
477
|
`type`) to interact with a specific element. This tool provides no
|
|
1391
478
|
visual information.
|
|
1392
479
|
|
|
480
|
+
If viewport_limit is enabled, only elements within the current
|
|
481
|
+
viewport
|
|
482
|
+
will be included in the snapshot.
|
|
483
|
+
|
|
1393
484
|
Returns:
|
|
1394
485
|
str: A formatted string representing the interactive elements and
|
|
1395
486
|
their `ref` IDs. For example:
|
|
1396
487
|
'- link "Sign In" [ref=1]'
|
|
1397
488
|
'- textbox "Username" [ref=2]'
|
|
1398
489
|
"""
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
f"Page snapshot analysis " f"completed in {analysis_time:.2f}s"
|
|
1406
|
-
)
|
|
1407
|
-
|
|
1408
|
-
snapshot_text = analysis_data.get("snapshotText", "")
|
|
1409
|
-
return (
|
|
1410
|
-
snapshot_text
|
|
1411
|
-
if snapshot_text
|
|
1412
|
-
else self._format_snapshot_from_analysis(analysis_data)
|
|
1413
|
-
)
|
|
490
|
+
try:
|
|
491
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
492
|
+
return await ws_wrapper.get_page_snapshot(self._viewport_limit)
|
|
493
|
+
except Exception as e:
|
|
494
|
+
logger.error(f"Failed to get page snapshot: {e}")
|
|
495
|
+
return f"Error capturing snapshot: {e}"
|
|
1414
496
|
|
|
1415
497
|
@dependencies_required('PIL')
|
|
1416
|
-
|
|
1417
|
-
|
|
498
|
+
async def browser_get_som_screenshot(
|
|
499
|
+
self,
|
|
500
|
+
read_image: bool = True,
|
|
501
|
+
instruction: Optional[str] = None,
|
|
502
|
+
) -> str:
|
|
1418
503
|
r"""Captures a screenshot with interactive elements highlighted.
|
|
1419
504
|
|
|
1420
|
-
"SoM" stands for "Set of Marks". This tool takes a screenshot and
|
|
505
|
+
"SoM" stands for "Set of Marks". This tool takes a screenshot and
|
|
506
|
+
draws
|
|
1421
507
|
boxes around clickable elements, overlaying a `ref` ID on each. Use
|
|
1422
508
|
this for a visual understanding of the page, especially when the
|
|
1423
509
|
textual snapshot is not enough.
|
|
1424
510
|
|
|
511
|
+
Args:
|
|
512
|
+
read_image (bool, optional): If `True`, the agent will analyze
|
|
513
|
+
the screenshot. Requires agent to be registered.
|
|
514
|
+
(default: :obj:`True`)
|
|
515
|
+
instruction (Optional[str], optional): A specific question or
|
|
516
|
+
command for the agent regarding the screenshot, used only if
|
|
517
|
+
`read_image` is `True`. For example: "Find the login button."
|
|
518
|
+
|
|
1425
519
|
Returns:
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
- `images` (List[str]): A list containing one base64-encoded
|
|
1430
|
-
PNG image data URL.
|
|
520
|
+
str: A confirmation message indicating the screenshot was
|
|
521
|
+
captured, the file path where it was saved, and optionally the
|
|
522
|
+
agent's analysis if `read_image` is `True`.
|
|
1431
523
|
"""
|
|
1432
|
-
|
|
524
|
+
import base64
|
|
525
|
+
import datetime
|
|
526
|
+
import os
|
|
527
|
+
import urllib.parse
|
|
1433
528
|
|
|
1434
|
-
from camel.utils
|
|
529
|
+
from camel.utils import sanitize_filename
|
|
1435
530
|
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
531
|
+
try:
|
|
532
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
533
|
+
result = await ws_wrapper.get_som_screenshot()
|
|
1439
534
|
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
f"with timeout: {self._screenshot_timeout}ms"
|
|
1444
|
-
)
|
|
535
|
+
# Initialize result text
|
|
536
|
+
result_text = result.text
|
|
537
|
+
file_path = None
|
|
1445
538
|
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
539
|
+
# Save screenshot to cache directory if images are available
|
|
540
|
+
if result.images:
|
|
541
|
+
# Ensure cache directory exists (use absolute path)
|
|
542
|
+
cache_dir = os.path.abspath(self._cache_dir)
|
|
543
|
+
os.makedirs(cache_dir, exist_ok=True)
|
|
544
|
+
|
|
545
|
+
# Get current page URL for filename
|
|
546
|
+
try:
|
|
547
|
+
# Try to get the current page URL from the wrapper
|
|
548
|
+
page_info = await ws_wrapper.get_tab_info()
|
|
549
|
+
current_tab = next(
|
|
550
|
+
(tab for tab in page_info if tab.get('is_current')),
|
|
551
|
+
None,
|
|
552
|
+
)
|
|
553
|
+
url = current_tab['url'] if current_tab else 'unknown'
|
|
554
|
+
except Exception:
|
|
555
|
+
url = 'unknown'
|
|
556
|
+
|
|
557
|
+
# Generate filename
|
|
558
|
+
parsed_url = urllib.parse.urlparse(url)
|
|
559
|
+
url_name = sanitize_filename(
|
|
560
|
+
str(parsed_url.path) or 'homepage', max_length=241
|
|
561
|
+
)
|
|
562
|
+
timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
|
|
563
|
+
file_path = os.path.join(
|
|
564
|
+
cache_dir, f"{url_name}_{timestamp}_som.png"
|
|
565
|
+
)
|
|
566
|
+
|
|
567
|
+
# Extract base64 data and save to file
|
|
568
|
+
for _, image_data in enumerate(result.images):
|
|
569
|
+
if image_data.startswith('data:image/png;base64,'):
|
|
570
|
+
# Remove data URL prefix
|
|
571
|
+
base64_data = image_data.split(',', 1)[1]
|
|
572
|
+
|
|
573
|
+
# Decode and save
|
|
574
|
+
image_bytes = base64.b64decode(base64_data)
|
|
575
|
+
with open(file_path, 'wb') as f:
|
|
576
|
+
f.write(image_bytes)
|
|
577
|
+
|
|
578
|
+
logger.info(f"Screenshot saved to: {file_path}")
|
|
579
|
+
|
|
580
|
+
# Update result text to include file path
|
|
581
|
+
result_text += f" (saved to: {file_path})"
|
|
582
|
+
break
|
|
583
|
+
|
|
584
|
+
# Analyze image if requested and agent is registered
|
|
585
|
+
if read_image and file_path:
|
|
586
|
+
if self.agent is None:
|
|
587
|
+
logger.error(
|
|
588
|
+
"Cannot analyze screenshot: No agent registered. "
|
|
589
|
+
"Please pass this toolkit to ChatAgent via "
|
|
590
|
+
"toolkits_to_register_agent parameter."
|
|
591
|
+
)
|
|
592
|
+
result_text += (
|
|
593
|
+
" Error: No agent registered for image analysis. "
|
|
594
|
+
"Please pass this toolkit to ChatAgent via "
|
|
595
|
+
"toolkits_to_register_agent parameter."
|
|
596
|
+
)
|
|
597
|
+
else:
|
|
598
|
+
try:
|
|
599
|
+
# Load the image and create a message
|
|
600
|
+
from PIL import Image
|
|
601
|
+
|
|
602
|
+
img = Image.open(file_path)
|
|
603
|
+
inst = instruction if instruction is not None else ""
|
|
604
|
+
message = BaseMessage.make_user_message(
|
|
605
|
+
role_name="User",
|
|
606
|
+
content=inst,
|
|
607
|
+
image_list=[img],
|
|
608
|
+
)
|
|
1488
609
|
|
|
1489
|
-
|
|
610
|
+
# Get agent's analysis
|
|
611
|
+
response = await self.agent.astep(message)
|
|
612
|
+
agent_response = response.msgs[0].content
|
|
613
|
+
result_text += f". Agent analysis: {agent_response}"
|
|
614
|
+
except Exception as e:
|
|
615
|
+
logger.error(f"Error analyzing screenshot: {e}")
|
|
616
|
+
result_text += f". Error analyzing screenshot: {e}"
|
|
1490
617
|
|
|
1491
|
-
|
|
618
|
+
return result_text
|
|
619
|
+
except Exception as e:
|
|
620
|
+
logger.error(f"Failed to get screenshot: {e}")
|
|
621
|
+
return f"Error capturing screenshot: {e}"
|
|
622
|
+
|
|
623
|
+
async def browser_click(self, *, ref: str) -> Dict[str, Any]:
|
|
1492
624
|
r"""Performs a click on an element on the page.
|
|
1493
625
|
|
|
1494
626
|
Args:
|
|
@@ -1505,155 +637,379 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1505
637
|
- "current_tab" (int): Index of the active tab.
|
|
1506
638
|
- "total_tabs" (int): Total number of open tabs.
|
|
1507
639
|
"""
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
640
|
+
try:
|
|
641
|
+
ws_wrapper = await self._get_ws_wrapper()
|
|
642
|
+
result = await ws_wrapper.click(ref)
|
|
643
|
+
|
|
644
|
+
# Add tab information
|
|
645
|
+
tab_info = await ws_wrapper.get_tab_info()
|
|
646
|
+
result.update(
|
|
647
|
+
{
|
|
648
|
+
"tabs": tab_info,
|
|
649
|
+
"current_tab": next(
|
|
650
|
+
(
|
|
651
|
+
i
|
|
652
|
+
for i, tab in enumerate(tab_info)
|
|
653
|
+
if tab.get("is_current")
|
|
654
|
+
),
|
|
655
|
+
0,
|
|
656
|
+
),
|
|
657
|
+
"total_tabs": len(tab_info),
|
|
658
|
+
}
|
|
659
|
+
)
|
|
660
|
+
|
|
661
|
+
return result
|
|
662
|
+
except Exception as e:
|
|
663
|
+
logger.error(f"Failed to click element: {e}")
|
|
1517
664
|
return {
|
|
1518
|
-
"result": f"Error
|
|
1519
|
-
"snapshot":
|
|
1520
|
-
|
|
665
|
+
"result": f"Error clicking element: {e}",
|
|
666
|
+
"snapshot": "",
|
|
667
|
+
"tabs": [],
|
|
668
|
+
"current_tab": 0,
|
|
669
|
+
"total_tabs": 0,
|
|
1521
670
|
}
|
|
1522
671
|
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
672
|
+
async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
    r"""Types text into an input element on the page.

    Args:
        ref (str): The `ref` ID of the input element, from a snapshot.
        text (str): The text to type into the element.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if typing failed.
            - "snapshot" (str): A textual snapshot of the page after
              typing.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

        If the text was typed but the tab list could not be fetched
        afterwards, whatever the typing step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.type(ref, text)

        # The text has already been typed here. Collecting tab info is
        # best-effort: reporting its failure as a typing failure would
        # invite a retry that types the same text twice.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after typing: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to type text: {e}")
        return {
            "result": f"Error typing text: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
719
|
+
|
|
720
|
+
async def browser_select(self, *, ref: str, value: str) -> Dict[str, Any]:
    r"""Selects an option in a dropdown (`<select>`) element.

    Args:
        ref (str): The `ref` ID of the `<select>` element.
        value (str): The `value` attribute of the `<option>` to select,
            not its visible text.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if the selection failed.
            - "snapshot" (str): A snapshot of the page after the
              selection.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

        If the option was selected but the tab list could not be fetched
        afterwards, whatever the selection step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.select(ref, value)

        # The selection has already happened here. Collecting tab info is
        # best-effort so its failure is not reported as a failed select.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after selecting: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to select option: {e}")
        return {
            "result": f"Error selecting option: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
768
|
+
|
|
769
|
+
async def browser_scroll(
    self, *, direction: str, amount: int = 500
) -> Dict[str, Any]:
    r"""Scrolls the current page window.

    Args:
        direction (str): The direction to scroll: 'up' or 'down'.
        amount (int): The number of pixels to scroll, default is 500.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if scrolling failed.
            - "snapshot" (str): A snapshot of the page after scrolling.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

        If the page was scrolled but the tab list could not be fetched
        afterwards, whatever the scroll step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.scroll(direction, amount)

        # The page has already scrolled here. Collecting tab info is
        # best-effort: reporting its failure as a scroll failure would
        # invite a retry that scrolls a second time.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after scrolling: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to scroll: {e}")
        return {
            "result": f"Error scrolling: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
1534
817
|
|
|
1535
|
-
async def
|
|
1536
|
-
r"""
|
|
818
|
+
async def browser_enter(self) -> Dict[str, Any]:
    r"""Simulates pressing the Enter key on the currently focused
    element.

    This is useful for submitting forms or search queries after using the
    `browser_type` tool.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if the key press failed.
            - "snapshot" (str): A new page snapshot, as this action often
              triggers navigation.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

        If Enter was pressed but the tab list could not be fetched
        afterwards, whatever the key-press step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.enter()

        # Enter has already been pressed here. Collecting tab info is
        # best-effort: reporting its failure as a failed key press would
        # invite a retry that submits the form a second time.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after pressing enter: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to press enter: {e}")
        return {
            "result": f"Error pressing enter: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
1560
865
|
|
|
1561
|
-
|
|
866
|
+
async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Switches to a different browser tab using its ID.

    After switching, all actions will apply to the new tab. Use
    `browser_get_tab_info` to find the ID of the tab you want to switch
    to.

    Args:
        tab_id (str): The ID of the tab to activate.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if the switch failed.
            - "snapshot" (str): A snapshot of the newly active tab.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of open tabs.

        If the switch was performed but the tab list could not be fetched
        afterwards, whatever the switch step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.switch_tab(tab_id)

        # The switch has already been issued here. Collecting tab info is
        # best-effort so its failure is not reported as a failed switch.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after switching tab: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to switch tab: {e}")
        return {
            "result": f"Error switching tab: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
1589
914
|
|
|
1590
|
-
|
|
915
|
+
async def browser_close_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Closes a browser tab using its ID.

    Use `browser_get_tab_info` to find the ID of the tab to close. After
    closing, the browser will switch to another tab if available.

    Args:
        tab_id (str): The ID of the tab to close.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message if closing failed.
            - "snapshot" (str): A snapshot of the active tab after
              closure.
            - "tabs" (List[Dict]): Information about remaining tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of remaining tabs.

        If the tab was closed but the tab list could not be fetched
        afterwards, whatever the close step returned is kept and the
        three tab fields fall back to `[]`, `0` and `0`.
    """
    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.close_tab(tab_id)

        # The close has already been issued here. Collecting tab info is
        # best-effort: reporting its failure as a failed close would
        # invite a retry against a tab that no longer exists.
        try:
            tab_info = await ws_wrapper.get_tab_info()
            tab_summary = {
                "tabs": tab_info,
                # First tab flagged as current; 0 when none is flagged.
                "current_tab": next(
                    (
                        i
                        for i, tab in enumerate(tab_info)
                        if tab.get("is_current")
                    ),
                    0,
                ),
                "total_tabs": len(tab_info),
            }
        except Exception as tab_err:
            logger.warning(
                f"Failed to get tab info after closing tab: {tab_err}"
            )
            tab_summary = {"tabs": [], "current_tab": 0, "total_tabs": 0}

        result.update(tab_summary)
        return result
    except Exception as e:
        logger.error(f"Failed to close tab: {e}")
        return {
            "result": f"Error closing tab: {e}",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
1614
964
|
|
|
1615
|
-
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
# Add tab information to the result
|
|
1619
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1620
|
-
result.update(tab_info)
|
|
1621
|
-
|
|
1622
|
-
return result
|
|
1623
|
-
|
|
1624
|
-
async def enter(self) -> Dict[str, Any]:
|
|
1625
|
-
r"""Simulates pressing the Enter key on the currently focused element.
|
|
965
|
+
async def browser_get_tab_info(self) -> Dict[str, Any]:
    r"""Lists every open browser tab and identifies the active one.

    Each tab entry carries its index, title and URL plus a flag marking
    the active tab. Use this to manage multiple tabs.

    Returns:
        Dict[str, Any]: A dictionary with tab information:
            - "tabs" (List[Dict]): A list of open tabs, each with:
                - "index" (int): The tab's zero-based index.
                - "title" (str): The page title.
                - "url" (str): The current URL.
                - "is_current" (bool): True if the tab is active.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

        On any failure the error is logged and an empty listing
        (`[]`, `0`, `0`) is returned instead.
    """
    try:
        wrapper = await self._get_ws_wrapper()
        open_tabs = await wrapper.get_tab_info()

        # Position of the first tab flagged as current; stays 0 when no
        # tab carries the flag.
        active_index = 0
        for position, entry in enumerate(open_tabs):
            if entry.get("is_current"):
                active_index = position
                break

        return {
            "tabs": open_tabs,
            "current_tab": active_index,
            "total_tabs": len(open_tabs),
        }
    except Exception as exc:
        logger.error(f"Failed to get tab info: {exc}")
        return {
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 0,
        }
|
|
1649
1004
|
|
|
1650
|
-
|
|
1651
|
-
async def
|
|
1005
|
+
# Additional methods for backward compatibility
|
|
1006
|
+
async def browser_wait_user(
|
|
1652
1007
|
self, timeout_sec: Optional[float] = None
|
|
1653
1008
|
) -> Dict[str, Any]:
|
|
1654
1009
|
r"""Pauses execution and waits for human input from the console.
|
|
1655
1010
|
|
|
1656
|
-
Use this for tasks requiring manual steps, like solving a CAPTCHA.
|
|
1011
|
+
Use this for tasks requiring manual steps, like solving a CAPTCHA.
|
|
1012
|
+
The
|
|
1657
1013
|
agent will resume after the user presses Enter in the console.
|
|
1658
1014
|
|
|
1659
1015
|
Args:
|
|
@@ -1677,7 +1033,13 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1677
1033
|
logger.info(f"\n{prompt}\n")
|
|
1678
1034
|
|
|
1679
1035
|
async def _await_enter():
|
|
1680
|
-
|
|
1036
|
+
try:
|
|
1037
|
+
await asyncio.to_thread(
|
|
1038
|
+
input, ">>> Press Enter to resume <<<\n"
|
|
1039
|
+
)
|
|
1040
|
+
except (asyncio.CancelledError, Exception):
|
|
1041
|
+
# Handle cancellation gracefully
|
|
1042
|
+
pass
|
|
1681
1043
|
|
|
1682
1044
|
try:
|
|
1683
1045
|
if timeout_sec is not None:
|
|
@@ -1685,178 +1047,48 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1685
1047
|
f"Waiting for user input with timeout: {timeout_sec}s"
|
|
1686
1048
|
)
|
|
1687
1049
|
start_time = time.time()
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1050
|
+
task = asyncio.create_task(_await_enter())
|
|
1051
|
+
try:
|
|
1052
|
+
await asyncio.wait_for(task, timeout=timeout_sec)
|
|
1053
|
+
wait_time = time.time() - start_time
|
|
1054
|
+
logger.info(f"User input received after {wait_time:.2f}s")
|
|
1055
|
+
result_msg = "User resumed."
|
|
1056
|
+
except asyncio.TimeoutError:
|
|
1057
|
+
task.cancel()
|
|
1058
|
+
# Wait for task to be cancelled properly
|
|
1059
|
+
try:
|
|
1060
|
+
await task
|
|
1061
|
+
except asyncio.CancelledError:
|
|
1062
|
+
pass
|
|
1063
|
+
raise
|
|
1692
1064
|
else:
|
|
1693
|
-
logger.info("Waiting for user
|
|
1065
|
+
logger.info("Waiting for user input (no timeout)")
|
|
1694
1066
|
start_time = time.time()
|
|
1695
1067
|
await _await_enter()
|
|
1696
1068
|
wait_time = time.time() - start_time
|
|
1697
|
-
logger.info(f"User input received
|
|
1069
|
+
logger.info(f"User input received after {wait_time:.2f}s")
|
|
1698
1070
|
result_msg = "User resumed."
|
|
1699
1071
|
except asyncio.TimeoutError:
|
|
1700
1072
|
wait_time = timeout_sec or 0.0
|
|
1701
1073
|
logger.info(
|
|
1702
|
-
f"User input timeout reached "
|
|
1703
|
-
f"
|
|
1074
|
+
f"User input timeout reached after {wait_time}s, "
|
|
1075
|
+
f"auto-resuming"
|
|
1704
1076
|
)
|
|
1705
1077
|
result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."
|
|
1706
1078
|
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
Args:
|
|
1721
|
-
ref (List[str]): A list of `ref` IDs for link elements, obtained
|
|
1722
|
-
from a page snapshot.
|
|
1723
|
-
|
|
1724
|
-
Returns:
|
|
1725
|
-
Dict[str, Any]: A dictionary containing:
|
|
1726
|
-
- "links" (List[Dict]): A list of found links, where each
|
|
1727
|
-
link has "text", "ref", and "url" keys.
|
|
1728
|
-
"""
|
|
1729
|
-
if not ref or not isinstance(ref, list):
|
|
1730
|
-
return {"links": []}
|
|
1731
|
-
|
|
1732
|
-
for r in ref:
|
|
1733
|
-
if not r or not isinstance(r, str):
|
|
1734
|
-
return {"links": []}
|
|
1735
|
-
|
|
1736
|
-
page = await self._require_page()
|
|
1737
|
-
snapshot = await self._session.get_snapshot(
|
|
1738
|
-
force_refresh=True, diff_only=False
|
|
1739
|
-
)
|
|
1740
|
-
links = await self._extract_links_by_refs(snapshot, page, ref)
|
|
1741
|
-
|
|
1742
|
-
return {"links": links}
|
|
1743
|
-
|
|
1744
|
-
@action_logger
|
|
1745
|
-
async def solve_task(
|
|
1746
|
-
self, task_prompt: str, start_url: str, max_steps: int = 15
|
|
1747
|
-
) -> str:
|
|
1748
|
-
r"""Delegates a complex, high-level task to a specialized web agent.
|
|
1749
|
-
|
|
1750
|
-
Use this for multi-step tasks that can be described in a single prompt
|
|
1751
|
-
(e.g., "log into my account and check for new messages"). The agent
|
|
1752
|
-
will autonomously perform the necessary browser actions.
|
|
1753
|
-
|
|
1754
|
-
NOTE: This is a high-level action; for simple interactions, use tools
|
|
1755
|
-
like `click` and `type`. `web_agent_model` must be provided during
|
|
1756
|
-
toolkit initialization.
|
|
1757
|
-
|
|
1758
|
-
Args:
|
|
1759
|
-
task_prompt (str): A natural language description of the task.
|
|
1760
|
-
start_url (str): The URL to start the task from. This should be a
|
|
1761
|
-
valid and existing URL, as agents may generate non-existent
|
|
1762
|
-
ones.
|
|
1763
|
-
max_steps (int): The maximum number of steps the agent can take.
|
|
1764
|
-
|
|
1765
|
-
Returns:
|
|
1766
|
-
str: A summary message indicating the task has finished.
|
|
1767
|
-
"""
|
|
1768
|
-
agent = self._ensure_agent()
|
|
1769
|
-
await agent.navigate(start_url)
|
|
1770
|
-
await agent.process_command(task_prompt, max_steps=max_steps)
|
|
1771
|
-
return "Task processing finished - see stdout for detailed trace."
|
|
1772
|
-
|
|
1773
|
-
def get_log_summary(self) -> Dict[str, Any]:
|
|
1774
|
-
r"""Get a summary of logged actions."""
|
|
1775
|
-
if not self.log_buffer:
|
|
1776
|
-
return {"total_actions": 0, "summary": "No actions logged"}
|
|
1777
|
-
|
|
1778
|
-
total_actions = len(self.log_buffer)
|
|
1779
|
-
total_execution_time = sum(
|
|
1780
|
-
entry.get("execution_time_ms", 0) for entry in self.log_buffer
|
|
1781
|
-
)
|
|
1782
|
-
total_page_load_time = sum(
|
|
1783
|
-
entry.get("page_load_time_ms", 0)
|
|
1784
|
-
for entry in self.log_buffer
|
|
1785
|
-
if "page_load_time_ms" in entry
|
|
1786
|
-
)
|
|
1787
|
-
|
|
1788
|
-
action_counts: Dict[str, int] = {}
|
|
1789
|
-
error_count = 0
|
|
1790
|
-
|
|
1791
|
-
for entry in self.log_buffer:
|
|
1792
|
-
action = entry["action"]
|
|
1793
|
-
action_counts[action] = action_counts.get(action, 0) + 1
|
|
1794
|
-
if "error" in entry:
|
|
1795
|
-
error_count += 1
|
|
1796
|
-
|
|
1797
|
-
return {
|
|
1798
|
-
"total_actions": total_actions,
|
|
1799
|
-
"total_execution_time_ms": round(total_execution_time, 2),
|
|
1800
|
-
"total_page_load_time_ms": round(total_page_load_time, 2),
|
|
1801
|
-
"action_counts": action_counts,
|
|
1802
|
-
"error_count": error_count,
|
|
1803
|
-
"success_rate": round(
|
|
1804
|
-
(total_actions - error_count) / total_actions * 100, 2
|
|
1805
|
-
)
|
|
1806
|
-
if total_actions > 0
|
|
1807
|
-
else 0,
|
|
1808
|
-
}
|
|
1809
|
-
|
|
1810
|
-
def clear_logs(self) -> None:
|
|
1811
|
-
r"""Clear the log buffer."""
|
|
1812
|
-
self.log_buffer.clear()
|
|
1813
|
-
logger.info("Log buffer cleared")
|
|
1814
|
-
|
|
1815
|
-
def get_tools(self) -> List[FunctionTool]:
|
|
1816
|
-
r"""Get available function tools
|
|
1817
|
-
based on enabled_tools configuration."""
|
|
1818
|
-
# Map tool names to their corresponding methods
|
|
1819
|
-
tool_map = {
|
|
1820
|
-
"open_browser": self.open_browser,
|
|
1821
|
-
"close_browser": self.close_browser,
|
|
1822
|
-
"visit_page": self.visit_page,
|
|
1823
|
-
"back": self.back,
|
|
1824
|
-
"forward": self.forward,
|
|
1825
|
-
"get_page_snapshot": self.get_page_snapshot,
|
|
1826
|
-
"get_som_screenshot": self.get_som_screenshot,
|
|
1827
|
-
"get_page_links": self.get_page_links,
|
|
1828
|
-
"click": self.click,
|
|
1829
|
-
"type": self.type,
|
|
1830
|
-
"select": self.select,
|
|
1831
|
-
"scroll": self.scroll,
|
|
1832
|
-
"enter": self.enter,
|
|
1833
|
-
"wait_user": self.wait_user,
|
|
1834
|
-
"solve_task": self.solve_task,
|
|
1835
|
-
"switch_tab": self.switch_tab,
|
|
1836
|
-
"close_tab": self.close_tab,
|
|
1837
|
-
"get_tab_info": self.get_tab_info,
|
|
1838
|
-
}
|
|
1839
|
-
|
|
1840
|
-
enabled_tools = []
|
|
1841
|
-
|
|
1842
|
-
for tool_name in self.enabled_tools:
|
|
1843
|
-
if tool_name == "solve_task" and self._web_agent_model is None:
|
|
1844
|
-
logger.warning(
|
|
1845
|
-
f"Tool '{tool_name}' is enabled but web_agent_model "
|
|
1846
|
-
f"is not provided. Skipping this tool."
|
|
1847
|
-
)
|
|
1848
|
-
continue
|
|
1849
|
-
|
|
1850
|
-
if tool_name in tool_map:
|
|
1851
|
-
tool = FunctionTool(
|
|
1852
|
-
cast(Callable[..., Any], tool_map[tool_name])
|
|
1853
|
-
)
|
|
1854
|
-
enabled_tools.append(tool)
|
|
1855
|
-
else:
|
|
1856
|
-
logger.warning(f"Unknown tool name: {tool_name}")
|
|
1857
|
-
|
|
1858
|
-
logger.info(f"Returning {len(enabled_tools)} enabled tools")
|
|
1859
|
-
return enabled_tools
|
|
1079
|
+
try:
|
|
1080
|
+
snapshot = await self.browser_get_page_snapshot()
|
|
1081
|
+
tab_info = await self.browser_get_tab_info()
|
|
1082
|
+
return {"result": result_msg, "snapshot": snapshot, **tab_info}
|
|
1083
|
+
except Exception as e:
|
|
1084
|
+
logger.warning(f"Failed to get snapshot after wait: {e}")
|
|
1085
|
+
return {
|
|
1086
|
+
"result": result_msg,
|
|
1087
|
+
"snapshot": "",
|
|
1088
|
+
"tabs": [],
|
|
1089
|
+
"current_tab": 0,
|
|
1090
|
+
"total_tabs": 0,
|
|
1091
|
+
}
|
|
1860
1092
|
|
|
1861
1093
|
def clone_for_new_session(
|
|
1862
1094
|
self, new_session_id: Optional[str] = None
|
|
@@ -1882,7 +1114,8 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1882
1114
|
user_data_dir=self._user_data_dir,
|
|
1883
1115
|
stealth=self._stealth,
|
|
1884
1116
|
web_agent_model=self._web_agent_model,
|
|
1885
|
-
cache_dir=f"{self._cache_dir.rstrip('/')}_clone_
|
|
1117
|
+
cache_dir=f"{self._cache_dir.rstrip('/')}_clone_"
|
|
1118
|
+
f"{new_session_id}/",
|
|
1886
1119
|
enabled_tools=self.enabled_tools.copy(),
|
|
1887
1120
|
browser_log_to_file=self._browser_log_to_file,
|
|
1888
1121
|
session_id=new_session_id,
|
|
@@ -1896,117 +1129,49 @@ class HybridBrowserToolkit(BaseToolkit):
|
|
|
1896
1129
|
dom_content_loaded_timeout=self._dom_content_loaded_timeout,
|
|
1897
1130
|
)
|
|
1898
1131
|
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
if success:
|
|
1923
|
-
snapshot = await session.get_snapshot(
|
|
1924
|
-
force_refresh=True, diff_only=False
|
|
1925
|
-
)
|
|
1926
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1927
|
-
|
|
1928
|
-
result = {
|
|
1929
|
-
"result": f"Successfully switched to tab {tab_id}",
|
|
1930
|
-
"snapshot": snapshot,
|
|
1931
|
-
**tab_info,
|
|
1932
|
-
}
|
|
1933
|
-
else:
|
|
1934
|
-
tab_info = await self._get_tab_info_for_output()
|
|
1935
|
-
result = {
|
|
1936
|
-
"result": f"Failed to switch to tab {tab_id}. Tab may not "
|
|
1937
|
-
f"exist.",
|
|
1938
|
-
"snapshot": "",
|
|
1939
|
-
**tab_info,
|
|
1940
|
-
}
|
|
1941
|
-
|
|
1942
|
-
return result
|
|
1943
|
-
|
|
1944
|
-
@action_logger
|
|
1945
|
-
async def close_tab(self, *, tab_id: str) -> Dict[str, Any]:
|
|
1946
|
-
r"""Closes a browser tab using its ID.
|
|
1947
|
-
|
|
1948
|
-
Use `get_tab_info` to find the ID of the tab to close. After
|
|
1949
|
-
closing, the browser will switch to another tab if available.
|
|
1950
|
-
|
|
1951
|
-
Args:
|
|
1952
|
-
tab_id (str): The ID of the tab to close.
|
|
1953
|
-
|
|
1954
|
-
Returns:
|
|
1955
|
-
Dict[str, Any]: A dictionary with the result of the action:
|
|
1956
|
-
- "result" (str): Confirmation of the action.
|
|
1957
|
-
- "snapshot" (str): A snapshot of the active tab after closure.
|
|
1958
|
-
- "tabs" (List[Dict]): Information about remaining tabs.
|
|
1959
|
-
- "current_tab" (int): Index of the new active tab.
|
|
1960
|
-
- "total_tabs" (int): Total number of remaining tabs.
|
|
1961
|
-
"""
|
|
1962
|
-
await self._ensure_browser()
|
|
1963
|
-
session = await self._get_session()
|
|
1132
|
+
def get_tools(self) -> List[FunctionTool]:
    r"""Builds the function tools selected by the `enabled_tools`
    configuration.

    Returns:
        List[FunctionTool]: One tool per recognised name in
            `self.enabled_tools`, in that order. Names that are not
            registered are logged with a warning and skipped.
    """
    # Registry of every tool name this toolkit can expose.
    tool_map = {
        "browser_open": self.browser_open,
        "browser_close": self.browser_close,
        "browser_visit_page": self.browser_visit_page,
        "browser_back": self.browser_back,
        "browser_forward": self.browser_forward,
        "browser_get_page_snapshot": self.browser_get_page_snapshot,
        "browser_get_som_screenshot": self.browser_get_som_screenshot,
        "browser_click": self.browser_click,
        "browser_type": self.browser_type,
        "browser_select": self.browser_select,
        "browser_scroll": self.browser_scroll,
        "browser_enter": self.browser_enter,
        "browser_wait_user": self.browser_wait_user,
        "browser_switch_tab": self.browser_switch_tab,
        "browser_close_tab": self.browser_close_tab,
        "browser_get_tab_info": self.browser_get_tab_info,
    }

    selected = []
    for name in self.enabled_tools:
        # The solve-task tool is skipped when no web agent model is set.
        wants_solve_task = name == "browser_solve_task"
        if wants_solve_task and self._web_agent_model is None:
            logger.warning(
                f"Tool '{name}' is enabled but web_agent_model "
                f"is not provided. Skipping this tool."
            )
            continue

        if name not in tool_map:
            logger.warning(f"Unknown tool name: {name}")
            continue

        bound_method = cast(Callable[..., Any], tool_map[name])
        selected.append(FunctionTool(bound_method))

    logger.info(f"Returning {len(selected)} enabled tools")
    return selected
|