camel-ai: camel_ai-0.2.72a10-py3-none-any.whl → camel_ai-0.2.73a1-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (37)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +113 -338
  3. camel/memories/agent_memories.py +18 -17
  4. camel/societies/workforce/prompts.py +10 -4
  5. camel/societies/workforce/single_agent_worker.py +7 -5
  6. camel/toolkits/__init__.py +6 -1
  7. camel/toolkits/base.py +57 -1
  8. camel/toolkits/hybrid_browser_toolkit/config_loader.py +136 -413
  9. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +796 -1631
  10. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +4356 -0
  11. camel/toolkits/hybrid_browser_toolkit/ts/package.json +33 -0
  12. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-scripts.js +125 -0
  13. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +945 -0
  14. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +226 -0
  15. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +522 -0
  16. camel/toolkits/hybrid_browser_toolkit/ts/src/index.ts +7 -0
  17. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +110 -0
  18. camel/toolkits/hybrid_browser_toolkit/ts/tsconfig.json +26 -0
  19. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +210 -0
  20. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +533 -0
  21. camel/toolkits/message_integration.py +592 -0
  22. camel/toolkits/notion_mcp_toolkit.py +234 -0
  23. camel/toolkits/screenshot_toolkit.py +116 -31
  24. camel/toolkits/search_toolkit.py +20 -2
  25. camel/toolkits/terminal_toolkit.py +16 -2
  26. camel/toolkits/video_analysis_toolkit.py +13 -13
  27. camel/toolkits/video_download_toolkit.py +11 -11
  28. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/METADATA +12 -6
  29. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/RECORD +31 -24
  30. camel/toolkits/hybrid_browser_toolkit/actions.py +0 -417
  31. camel/toolkits/hybrid_browser_toolkit/agent.py +0 -311
  32. camel/toolkits/hybrid_browser_toolkit/browser_session.py +0 -740
  33. camel/toolkits/hybrid_browser_toolkit/snapshot.py +0 -227
  34. camel/toolkits/hybrid_browser_toolkit/stealth_script.js +0 -0
  35. camel/toolkits/hybrid_browser_toolkit/unified_analyzer.js +0 -1002
  36. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/WHEEL +0 -0
  37. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73a1.dist-info}/licenses/LICENSE +0 -0
@@ -11,75 +11,64 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+ # =========
14
15
 
15
- import base64
16
- import datetime
17
- import io
18
- import json
19
- import os
20
16
  import time
21
- import urllib.parse
22
- from functools import wraps
23
17
  from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
24
18
 
25
19
  from camel.logger import get_logger
20
+ from camel.messages import BaseMessage
26
21
  from camel.models import BaseModelBackend
27
- from camel.toolkits.base import BaseToolkit
22
+ from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
28
23
  from camel.toolkits.function_tool import FunctionTool
29
- from camel.utils import sanitize_filename
30
24
  from camel.utils.commons import dependencies_required
31
- from camel.utils.tool_result import ToolResult
32
25
 
33
- from .agent import PlaywrightLLMAgent
34
- from .browser_session import HybridBrowserSession
35
26
  from .config_loader import ConfigLoader
27
+ from .ws_wrapper import WebSocketBrowserWrapper
36
28
 
37
29
  logger = get_logger(__name__)
38
30
 
39
31
 
40
- class HybridBrowserToolkit(BaseToolkit):
32
+ class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
41
33
  r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
42
34
  automation with visual, screenshot-based capabilities.
43
35
 
44
- This toolkit exposes a set of actions as CAMEL FunctionTools for agents
45
- to interact with web pages. It can operate in headless mode and supports
46
- both programmatic control of browser actions (like clicking and typing)
47
- and visual analysis of the page layout through screenshots with marked
48
- interactive elements.
36
+ This toolkit now uses TypeScript implementation with Playwright's
37
+ _snapshotForAI functionality for enhanced AI integration.
49
38
  """
50
39
 
51
40
  # Default tool list - core browser functionality
52
41
  DEFAULT_TOOLS: ClassVar[List[str]] = [
53
- "open_browser",
54
- "close_browser",
55
- "visit_page",
56
- "back",
57
- "forward",
58
- "click",
59
- "type",
60
- "switch_tab",
42
+ "browser_open",
43
+ "browser_close",
44
+ "browser_visit_page",
45
+ "browser_back",
46
+ "browser_forward",
47
+ "browser_click",
48
+ "browser_type",
49
+ "browser_switch_tab",
61
50
  ]
62
51
 
63
52
  # All available tools
64
53
  ALL_TOOLS: ClassVar[List[str]] = [
65
- "open_browser",
66
- "close_browser",
67
- "visit_page",
68
- "back",
69
- "forward",
70
- "get_page_snapshot",
71
- "get_som_screenshot",
72
- "get_page_links",
73
- "click",
74
- "type",
75
- "select",
76
- "scroll",
77
- "enter",
78
- "wait_user",
79
- "solve_task",
80
- "switch_tab",
81
- "close_tab",
82
- "get_tab_info",
54
+ "browser_open",
55
+ "browser_close",
56
+ "browser_visit_page",
57
+ "browser_back",
58
+ "browser_forward",
59
+ "browser_get_page_snapshot",
60
+ "browser_get_som_screenshot",
61
+ "browser_get_page_links",
62
+ "browser_click",
63
+ "browser_type",
64
+ "browser_select",
65
+ "browser_scroll",
66
+ "browser_enter",
67
+ "browser_wait_user",
68
+ "browser_solve_task",
69
+ "browser_switch_tab",
70
+ "browser_close_tab",
71
+ "browser_get_tab_info",
83
72
  ]
84
73
 
85
74
  def __init__(
@@ -101,137 +90,104 @@ class HybridBrowserToolkit(BaseToolkit):
101
90
  screenshot_timeout: Optional[int] = None,
102
91
  page_stability_timeout: Optional[int] = None,
103
92
  dom_content_loaded_timeout: Optional[int] = None,
93
+ viewport_limit: bool = False,
94
+ connect_over_cdp: bool = False,
95
+ cdp_url: Optional[str] = None,
104
96
  ) -> None:
105
97
  r"""Initialize the HybridBrowserToolkit.
106
98
 
107
99
  Args:
108
- headless (bool): Whether to run the browser in headless mode.
109
- Defaults to `True`.
110
- user_data_dir (Optional[str]): Path to a directory for storing
111
- browser data like cookies and local storage. Useful for
112
- maintaining sessions across runs. Defaults to `None` (a
113
- temporary directory is used).
114
- stealth (bool): Whether to run the browser in stealth mode to avoid
115
- bot detection. When enabled, hides WebDriver characteristics,
116
- spoofs navigator properties, and implements various
117
- anti-detection
118
- measures. Highly recommended for production use and when
119
- accessing sites with bot detection. Defaults to `False`.
120
- web_agent_model (Optional[BaseModelBackend]): The language model
121
- backend to use for the high-level `solve_task` agent. This is
122
- required only if you plan to use `solve_task`.
123
- Defaults to `None`.
124
- cache_dir (str): The directory to store cached files, such as
125
- screenshots. Defaults to `"tmp/"`.
126
- enabled_tools (Optional[List[str]]): List of tool names to enable.
127
- If None, uses DEFAULT_TOOLS. Available tools: open_browser,
128
- close_browser, visit_page, back, forward, get_page_snapshot,
129
- get_som_screenshot, get_page_links, click, type, select,
130
- scroll, enter, wait_user, solve_task.
131
- Defaults to `None`.
132
- browser_log_to_file (bool): Whether to save detailed browser
133
- action logs to file.
134
- When enabled, logs action inputs/outputs, execution times,
135
- and page loading times.
136
- Logs are saved to an auto-generated timestamped file.
137
- Defaults to `False`.
138
- session_id (Optional[str]): A unique identifier for this browser
139
- session. When multiple HybridBrowserToolkit instances are used
140
- concurrently, different session IDs prevent them from sharing
141
- the same browser session and causing conflicts. If None, a
142
- default session will be used. Defaults to `None`.
143
- default_start_url (str): The default URL to navigate to when
144
- open_browser() is called without a start_url parameter or with
145
- None. Defaults to `"https://google.com/"`.
146
- default_timeout (Optional[int]): Default timeout in milliseconds
147
- for browser actions. If None, uses environment variable
148
- HYBRID_BROWSER_DEFAULT_TIMEOUT or defaults to 3000ms.
149
- Defaults to `None`.
150
- short_timeout (Optional[int]): Short timeout in milliseconds
151
- for quick browser actions. If None, uses environment variable
152
- HYBRID_BROWSER_SHORT_TIMEOUT or defaults to 1000ms.
153
- Defaults to `None`.
154
- navigation_timeout (Optional[int]): Custom navigation timeout in
155
- milliseconds.
156
- If None, uses environment variable
157
- HYBRID_BROWSER_NAVIGATION_TIMEOUT or defaults to 10000ms.
158
- Defaults to `None`.
159
- network_idle_timeout (Optional[int]): Custom network idle
160
- timeout in milliseconds.
161
- If None, uses environment variable
162
- HYBRID_BROWSER_NETWORK_IDLE_TIMEOUT or defaults to 5000ms.
163
- Defaults to `None`.
164
- screenshot_timeout (Optional[int]): Custom screenshot timeout in
165
- milliseconds.
166
- If None, uses environment variable
167
- HYBRID_BROWSER_SCREENSHOT_TIMEOUT or defaults to 15000ms.
168
- Defaults to `None`.
169
- page_stability_timeout (Optional[int]): Custom page stability
170
- timeout in milliseconds.
171
- If None, uses environment variable
172
- HYBRID_BROWSER_PAGE_STABILITY_TIMEOUT or defaults to 1500ms.
173
- Defaults to `None`.
174
- dom_content_loaded_timeout (Optional[int]): Custom DOM content
175
- loaded timeout in milliseconds.
176
- If None, uses environment variable
177
- HYBRID_BROWSER_DOM_CONTENT_LOADED_TIMEOUT or defaults to
178
- 5000ms.
179
- Defaults to `None`.
100
+ headless (bool): Whether to run browser in headless mode.
101
+ Defaults to True.
102
+ user_data_dir (Optional[str]): Directory for user data
103
+ persistence. Defaults to None.
104
+ stealth (bool): Whether to enable stealth mode. Defaults to
105
+ False.
106
+ web_agent_model (Optional[BaseModelBackend]): Model for web
107
+ agent operations. Defaults to None.
108
+ cache_dir (str): Directory for caching. Defaults to "tmp/".
109
+ enabled_tools (Optional[List[str]]): List of enabled tools.
110
+ Defaults to None.
111
+ browser_log_to_file (bool): Whether to log browser actions to
112
+ file. Defaults to False.
113
+ session_id (Optional[str]): Session identifier. Defaults to None.
114
+ default_start_url (str): Default URL to start with. Defaults
115
+ to "https://google.com/".
116
+ default_timeout (Optional[int]): Default timeout in
117
+ milliseconds. Defaults to None.
118
+ short_timeout (Optional[int]): Short timeout in milliseconds.
119
+ Defaults to None.
120
+ navigation_timeout (Optional[int]): Navigation timeout in
121
+ milliseconds. Defaults to None.
122
+ network_idle_timeout (Optional[int]): Network idle timeout in
123
+ milliseconds. Defaults to None.
124
+ screenshot_timeout (Optional[int]): Screenshot timeout in
125
+ milliseconds. Defaults to None.
126
+ page_stability_timeout (Optional[int]): Page stability timeout
127
+ in milliseconds. Defaults to None.
128
+ dom_content_loaded_timeout (Optional[int]): DOM content loaded
129
+ timeout in milliseconds. Defaults to None.
130
+ viewport_limit (bool): Whether to filter page snapshot
131
+ elements to only those visible in the current viewport.
132
+ When True, only elements within the current viewport
133
+ bounds will be included in snapshots.
134
+ When False (default), all elements on the page are
135
+ included. Defaults to False.
136
+ connect_over_cdp (bool): Whether to connect to an existing
137
+ browser via Chrome DevTools Protocol. Defaults to False.
138
+ cdp_url (Optional[str]): WebSocket endpoint URL for CDP
139
+ connection (e.g., 'ws://localhost:9222/devtools/browser/...').
140
+ Required when connect_over_cdp is True. Defaults to None.
180
141
  """
181
142
  super().__init__()
182
- self._headless = headless
183
- self._user_data_dir = user_data_dir
184
- self._stealth = stealth
185
- self._web_agent_model = web_agent_model
186
- self._cache_dir = cache_dir
187
- self._browser_log_to_file = browser_log_to_file
188
- self._default_start_url = default_start_url
189
- self._session_id = session_id or "default"
190
-
191
- # Store timeout configuration
192
- self._default_timeout = default_timeout
193
- self._short_timeout = short_timeout
194
- self._navigation_timeout = ConfigLoader.get_navigation_timeout(
195
- navigation_timeout
196
- )
197
- self._network_idle_timeout = ConfigLoader.get_network_idle_timeout(
198
- network_idle_timeout
199
- )
200
- self._screenshot_timeout = ConfigLoader.get_screenshot_timeout(
201
- screenshot_timeout
202
- )
203
- self._page_stability_timeout = ConfigLoader.get_page_stability_timeout(
204
- page_stability_timeout
205
- )
206
- self._dom_content_loaded_timeout = (
207
- ConfigLoader.get_dom_content_loaded_timeout(
208
- dom_content_loaded_timeout
209
- )
143
+ RegisteredAgentToolkit.__init__(self)
144
+
145
+ # Initialize configuration loader
146
+ self.config_loader = ConfigLoader.from_kwargs(
147
+ headless=headless,
148
+ user_data_dir=user_data_dir,
149
+ stealth=stealth,
150
+ default_start_url=default_start_url,
151
+ default_timeout=default_timeout,
152
+ short_timeout=short_timeout,
153
+ navigation_timeout=navigation_timeout,
154
+ network_idle_timeout=network_idle_timeout,
155
+ screenshot_timeout=screenshot_timeout,
156
+ page_stability_timeout=page_stability_timeout,
157
+ dom_content_loaded_timeout=dom_content_loaded_timeout,
158
+ viewport_limit=viewport_limit,
159
+ cache_dir=cache_dir,
160
+ browser_log_to_file=browser_log_to_file,
161
+ session_id=session_id,
162
+ enabled_tools=enabled_tools,
163
+ connect_over_cdp=connect_over_cdp,
164
+ cdp_url=cdp_url,
210
165
  )
211
166
 
212
- # Logging configuration - fixed values for simplicity
213
- self.enable_action_logging = True
214
- self.enable_timing_logging = True
215
- self.enable_page_loading_logging = True
216
- self.log_to_console = False # Always disabled for cleaner output
217
- self.log_to_file = browser_log_to_file
218
- self.max_log_length = None # No truncation for file logs
219
-
220
- # Set up log file if needed
221
- if self.log_to_file:
222
- # Create log directory if it doesn't exist
223
- log_dir = "browser_log"
224
- os.makedirs(log_dir, exist_ok=True)
225
-
226
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
227
- self.log_file_path: Optional[str] = os.path.join(
228
- log_dir, f"hybrid_browser_toolkit_{timestamp}_{session_id}.log"
229
- )
230
- else:
231
- self.log_file_path = None
167
+ # Legacy attribute access for backward compatibility
168
+ browser_config = self.config_loader.get_browser_config()
169
+ toolkit_config = self.config_loader.get_toolkit_config()
232
170
 
233
- # Initialize log buffer for in-memory storage
234
- self.log_buffer: List[Dict[str, Any]] = []
171
+ self._headless = browser_config.headless
172
+ self._user_data_dir = browser_config.user_data_dir
173
+ self._stealth = browser_config.stealth
174
+ self._web_agent_model = web_agent_model
175
+ self._cache_dir = toolkit_config.cache_dir
176
+ self._browser_log_to_file = toolkit_config.browser_log_to_file
177
+ self._default_start_url = browser_config.default_start_url
178
+ self._session_id = toolkit_config.session_id or "default"
179
+ self._viewport_limit = browser_config.viewport_limit
180
+
181
+ # Store timeout configuration for backward compatibility
182
+ self._default_timeout = browser_config.default_timeout
183
+ self._short_timeout = browser_config.short_timeout
184
+ self._navigation_timeout = browser_config.navigation_timeout
185
+ self._network_idle_timeout = browser_config.network_idle_timeout
186
+ self._screenshot_timeout = browser_config.screenshot_timeout
187
+ self._page_stability_timeout = browser_config.page_stability_timeout
188
+ self._dom_content_loaded_timeout = (
189
+ browser_config.dom_content_loaded_timeout
190
+ )
235
191
 
236
192
  # Configure enabled tools
237
193
  if enabled_tools is None:
@@ -250,42 +206,22 @@ class HybridBrowserToolkit(BaseToolkit):
250
206
 
251
207
  logger.info(f"Enabled tools: {self.enabled_tools}")
252
208
 
253
- # Log initialization if file logging is enabled
254
- if self.log_to_file:
255
- logger.info(
256
- "HybridBrowserToolkit initialized with file logging enabled"
257
- )
258
- logger.info(f"Log file path: {self.log_file_path}")
259
-
260
- # Core components
261
- temp_session = HybridBrowserSession(
262
- headless=headless,
263
- user_data_dir=user_data_dir,
264
- stealth=stealth,
265
- session_id=session_id,
266
- default_timeout=default_timeout,
267
- short_timeout=short_timeout,
268
- )
269
- # Use the session directly - singleton logic is handled in
270
- # ensure_browser
271
- self._session = temp_session
272
- self._agent: Optional[PlaywrightLLMAgent] = None
273
- self._unified_script = self._load_unified_analyzer()
274
-
275
- @property
276
- def web_agent_model(self) -> Optional[BaseModelBackend]:
277
- """Get the web agent model."""
278
- return self._web_agent_model
209
+ # Initialize WebSocket wrapper
210
+ self._ws_wrapper: Optional[WebSocketBrowserWrapper] = None
211
+ self._ws_config = self.config_loader.to_ws_config()
279
212
 
280
- @web_agent_model.setter
281
- def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
282
- """Set the web agent model."""
283
- self._web_agent_model = value
213
+ async def _ensure_ws_wrapper(self):
214
+ """Ensure WebSocket wrapper is initialized."""
215
+ if self._ws_wrapper is None:
216
+ self._ws_wrapper = WebSocketBrowserWrapper(self._ws_config)
217
+ await self._ws_wrapper.start()
284
218
 
285
- @property
286
- def cache_dir(self) -> str:
287
- """Get the cache directory."""
288
- return self._cache_dir
219
+ async def _get_ws_wrapper(self) -> WebSocketBrowserWrapper:
220
+ """Get the WebSocket wrapper, initializing if needed."""
221
+ await self._ensure_ws_wrapper()
222
+ if self._ws_wrapper is None:
223
+ raise RuntimeError("Failed to initialize WebSocket wrapper")
224
+ return self._ws_wrapper
289
225
 
290
226
  def __del__(self):
291
227
  r"""Cleanup browser resources on garbage collection."""
@@ -300,800 +236,35 @@ class HybridBrowserToolkit(BaseToolkit):
300
236
  try:
301
237
  loop = asyncio.get_event_loop()
302
238
  if not loop.is_closed() and not loop.is_running():
303
- # Try to close browser with a timeout to prevent hanging
304
239
  try:
305
240
  loop.run_until_complete(
306
- asyncio.wait_for(self.close_browser(), timeout=2.0)
241
+ asyncio.wait_for(self.browser_close(), timeout=2.0)
307
242
  )
308
243
  except asyncio.TimeoutError:
309
- pass # Skip cleanup if it takes too long
244
+ pass
310
245
  except (RuntimeError, ImportError):
311
- pass # Event loop unavailable, skip cleanup
312
- except Exception:
313
- pass # Suppress all errors during garbage collection
314
-
315
- def _load_unified_analyzer(self) -> str:
316
- r"""Load the unified analyzer JavaScript script."""
317
- script_path = os.path.join(
318
- os.path.dirname(os.path.abspath(__file__)), "unified_analyzer.js"
319
- )
320
-
321
- try:
322
- with open(
323
- script_path, "r", encoding='utf-8', errors='replace'
324
- ) as f:
325
- script_content = f.read()
326
-
327
- if not script_content.strip():
328
- raise ValueError(f"Script is empty: {script_path}")
329
-
330
- logger.debug(
331
- f"Loaded unified analyzer ({len(script_content)} chars)"
332
- )
333
- return script_content
334
- except FileNotFoundError:
335
- raise FileNotFoundError(f"Script not found: {script_path}")
336
-
337
- def _validate_ref(self, ref: str, method_name: str) -> None:
338
- r"""Validate ref parameter."""
339
- if not ref or not isinstance(ref, str):
340
- raise ValueError(
341
- f"{method_name}: 'ref' must be a non-empty string"
342
- )
343
-
344
- def _truncate_if_needed(self, content: Any) -> str:
345
- r"""Truncate content if max_log_length is set."""
346
- content_str = str(content)
347
- if (
348
- self.max_log_length is not None
349
- and len(content_str) > self.max_log_length
350
- ):
351
- return content_str[: self.max_log_length] + "... [TRUNCATED]"
352
- return content_str
353
-
354
- async def _get_current_url(self) -> Optional[str]:
355
- r"""Safely get the current URL of the active page."""
356
- try:
357
- page = await self._session.get_page()
358
- if page and not page.is_closed():
359
- return page.url
360
- return None # Return None if page is closed
246
+ pass
361
247
  except Exception:
362
- # This can happen if browser is not open.
363
- return None
248
+ pass
364
249
 
365
- async def _log_action(
366
- self,
367
- action_name: str,
368
- inputs: Dict[str, Any],
369
- outputs: Any,
370
- execution_time: float,
371
- page_load_time: Optional[float] = None,
372
- error: Optional[str] = None,
373
- ) -> None:
374
- r"""Log action details with comprehensive information."""
375
- if not (self.enable_action_logging or self.enable_timing_logging):
376
- return
377
-
378
- current_url = await self._get_current_url()
379
-
380
- log_entry: Dict[str, Any] = {
381
- "timestamp": datetime.datetime.now().isoformat(),
382
- "action": action_name,
383
- "url": current_url,
384
- "execution_time_ms": round(execution_time * 1000, 2),
385
- }
386
-
387
- if self.enable_action_logging:
388
- log_entry["inputs"] = inputs
389
- if error:
390
- log_entry["error"] = str(error)
391
- elif isinstance(outputs, dict):
392
- # Unpack dictionary items into the log entry
393
- log_entry.update(outputs)
394
- elif isinstance(outputs, ToolResult):
395
- log_entry["outputs"] = {
396
- "text": outputs.text,
397
- "images": outputs.images,
398
- }
399
- else:
400
- # For non-dict outputs, assign to 'outputs' key
401
- log_entry["outputs"] = outputs
402
-
403
- if page_load_time is not None and self.enable_page_loading_logging:
404
- log_entry["page_load_time_ms"] = round(page_load_time * 1000, 2)
405
-
406
- # Add to buffer
407
- self.log_buffer.append(log_entry)
408
-
409
- # Console logging
410
- if self.log_to_console:
411
- log_msg = f"[BROWSER ACTION] {action_name}"
412
- if self.enable_timing_logging:
413
- log_msg += f" | Execution: {log_entry['execution_time_ms']}ms"
414
- if page_load_time is not None and self.enable_page_loading_logging:
415
- log_msg += f" | Page Load: {log_entry['page_load_time_ms']}ms"
416
- if error:
417
- log_msg += f" | ERROR: {error}"
418
-
419
- logger.info(log_msg)
420
-
421
- if self.enable_action_logging:
422
- logger.info(f" Inputs: {self._truncate_if_needed(inputs)}")
423
- if not error:
424
- if isinstance(outputs, dict):
425
- for key, value in outputs.items():
426
- logger.info(
427
- f" - {key}: "
428
- f"{self._truncate_if_needed(value)}"
429
- )
430
- else:
431
- logger.info(
432
- f" Outputs: {self._truncate_if_needed(outputs)}"
433
- )
434
-
435
- # File logging
436
- if self.log_to_file and self.log_file_path:
437
- try:
438
- with open(self.log_file_path, 'a', encoding='utf-8') as f:
439
- # Write full log entry to file without truncation
440
- f.write(
441
- json.dumps(log_entry, ensure_ascii=False, indent=2)
442
- + '\n'
443
- )
444
- except Exception as e:
445
- logger.error(f"Failed to write to log file: {e}")
446
-
447
- @staticmethod
448
- def action_logger(func: Callable[..., Any]) -> Callable[..., Any]:
449
- r"""Decorator to add logging to action methods."""
450
-
451
- @wraps(func)
452
- async def wrapper(self, *args, **kwargs):
453
- action_name = func.__name__
454
- start_time = time.time()
455
-
456
- # Log inputs
457
- inputs = {
458
- "args": args, # Don't skip self since it's already handled
459
- "kwargs": kwargs,
460
- }
461
-
462
- try:
463
- # Execute the original function
464
- result = await func(self, *args, **kwargs)
465
- execution_time = time.time() - start_time
466
-
467
- # Log success
468
- await self._log_action(
469
- action_name=action_name,
470
- inputs=inputs,
471
- outputs=result,
472
- execution_time=execution_time,
473
- )
474
-
475
- return result
476
-
477
- except Exception as e:
478
- execution_time = time.time() - start_time
479
- error_msg = f"{type(e).__name__}: {e!s}"
480
-
481
- # Log error
482
- await self._log_action(
483
- action_name=action_name,
484
- inputs=inputs,
485
- outputs=None,
486
- execution_time=execution_time,
487
- error=error_msg,
488
- )
489
-
490
- raise
491
-
492
- return wrapper
493
-
494
- async def _get_session(self) -> "HybridBrowserSession":
495
- """Get the correct singleton session instance."""
496
- singleton = await HybridBrowserSession._get_or_create_instance(
497
- self._session
498
- )
499
- if singleton is not self._session:
500
- logger.debug("Updating to singleton session instance")
501
- self._session = singleton
502
- return self._session
503
-
504
- async def _ensure_browser(self):
505
- # Get singleton instance and update self._session if needed
506
- session = await self._get_session()
507
- await session.ensure_browser()
508
-
509
- async def _require_page(self):
510
- # Get singleton instance and update self._session if needed
511
- session = await self._get_session()
512
- await session.ensure_browser()
513
- return await session.get_page()
514
-
515
- async def _wait_for_page_stability(self):
516
- r"""Wait for page to become stable after actions that might trigger
517
- updates. Optimized with shorter timeouts.
518
- """
519
- page = await self._require_page()
520
- import asyncio
521
-
522
- try:
523
- # Wait for DOM content to be loaded (reduced timeout)
524
- await page.wait_for_load_state(
525
- 'domcontentloaded', timeout=self._page_stability_timeout
526
- )
527
- logger.debug("DOM content loaded")
528
-
529
- # Try to wait for network idle with shorter timeout
530
- try:
531
- await page.wait_for_load_state(
532
- 'networkidle', timeout=self._network_idle_timeout
533
- )
534
- logger.debug("Network idle achieved")
535
- except Exception:
536
- logger.debug("Network idle timeout - continuing anyway")
537
-
538
- # Reduced delay for JavaScript execution
539
- await asyncio.sleep(0.2) # Reduced from 0.5s
540
- logger.debug("Page stability wait completed")
541
-
542
- except Exception as e:
543
- logger.debug(
544
- f"Page stability wait failed: {e} - continuing anyway"
545
- )
546
-
547
- async def _get_unified_analysis(
548
- self, max_retries: int = 3
549
- ) -> Dict[str, Any]:
550
- r"""Get unified analysis data from the page with retry mechanism for
551
- navigation issues."""
552
- page = await self._require_page()
553
-
554
- for attempt in range(max_retries):
555
- try:
556
- if not self._unified_script:
557
- logger.error("Unified analyzer script not loaded")
558
- return {"elements": {}, "metadata": {"elementCount": 0}}
559
-
560
- # Wait for DOM stability before each attempt (with optimized
561
- # timeout)
562
- try:
563
- await page.wait_for_load_state(
564
- 'domcontentloaded',
565
- timeout=self._dom_content_loaded_timeout,
566
- )
567
- except Exception:
568
- # Don't fail if DOM wait times out
569
- pass
570
-
571
- result = await page.evaluate(self._unified_script)
572
-
573
- if not isinstance(result, dict):
574
- logger.warning(f"Invalid result type: {type(result)}")
575
- return {"elements": {}, "metadata": {"elementCount": 0}}
576
-
577
- # Success - return result
578
- if attempt > 0:
579
- logger.debug(
580
- f"Unified analysis succeeded on attempt {attempt + 1}"
581
- )
582
- return result
583
-
584
- except Exception as e:
585
- error_msg = str(e)
586
-
587
- # Check if this is a navigation-related error
588
- is_navigation_error = (
589
- "Execution context was destroyed" in error_msg
590
- or "Most likely because of a navigation" in error_msg
591
- or "Target page, context or browser has been closed"
592
- in error_msg
593
- )
594
-
595
- if is_navigation_error and attempt < max_retries - 1:
596
- logger.debug(
597
- f"Navigation error in unified analysis (attempt "
598
- f"{attempt + 1}/{max_retries}): {e}. Retrying..."
599
- )
600
-
601
- # Wait a bit for page stability before retrying (optimized)
602
- try:
603
- await page.wait_for_load_state(
604
- 'domcontentloaded',
605
- timeout=self._page_stability_timeout,
606
- )
607
- # Reduced delay for JS context to stabilize
608
- import asyncio
609
-
610
- await asyncio.sleep(0.1) # Reduced from 0.2s
611
- except Exception:
612
- # Continue even if wait fails
613
- pass
614
-
615
- continue
616
-
617
- # Non-navigation error or final attempt - log and return
618
- # empty result
619
- if attempt == max_retries - 1:
620
- logger.warning(
621
- f"Error in unified analysis after {max_retries} "
622
- f"attempts: {e}"
623
- )
624
- else:
625
- logger.warning(
626
- f"Non-retryable error in unified analysis: {e}"
627
- )
628
-
629
- return {"elements": {}, "metadata": {"elementCount": 0}}
630
-
631
- # Should not reach here, but just in case
632
- return {"elements": {}, "metadata": {"elementCount": 0}}
633
-
634
- def _convert_analysis_to_rects(
635
- self, analysis_data: Dict[str, Any]
636
- ) -> Dict[str, Any]:
637
- r"""Convert analysis data to rect format for visual marking."""
638
- rects = {}
639
- elements = analysis_data.get("elements", {})
640
-
641
- for ref, element_data in elements.items():
642
- coordinates = element_data.get("coordinates", [])
643
- if coordinates:
644
- rects[ref] = {
645
- "role": element_data.get("role", "generic"),
646
- "aria-name": element_data.get("name", ""),
647
- "rects": [coordinates[0]],
648
- }
649
- return rects
650
-
651
def _add_set_of_mark(self, image, rects):
    r"""Return a copy of ``image`` with each element outlined and labelled.

    Args:
        image: The screenshot to annotate. It is duplicated with its
            ``copy()`` method before drawing, so the object passed in is
            not drawn on.
        rects: Mapping of element ``ref`` to a dict that may carry
            ``"role"`` (selects the outline colour) and ``"rects"`` (a
            list of dicts read for ``x``, ``y``, ``width``, ``height``).

    Returns:
        The annotated copy, or ``image`` itself when PIL cannot be
        imported.
    """
    try:
        from PIL import ImageDraw, ImageFont
    except ImportError:
        logger.warning("PIL not available, returning original image")
        return image

    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated)

    # Font preference: Arial 16, then PIL's built-in font, then none.
    label_font = None
    try:
        label_font = ImageFont.truetype("arial.ttf", 16)
    except (OSError, IOError):
        try:
            label_font = ImageFont.load_default()
        except (OSError, IOError):
            label_font = None

    # One outline colour per role; unknown roles use the "default" entry.
    fallback_color = "#DDA0DD"
    role_colors = {
        "button": "#FF6B6B",
        "link": "#4ECDC4",
        "textbox": "#45B7D1",
        "select": "#96CEB4",
        "checkbox": "#FECA57",
        "radio": "#FF9FF3",
        "default": fallback_color,
    }

    for ref, entry in rects.items():
        color = role_colors.get(entry.get("role", "generic"), fallback_color)

        for box in entry.get("rects", []):
            left = box.get("x", 0)
            top = box.get("y", 0)
            right = left + box.get("width", 0)
            bottom = top + box.get("height", 0)

            # Element outline.
            canvas.rectangle(
                [left, top, right, bottom], outline=color, width=2
            )

            # Measure the label; without a font, estimate 8px per
            # character and a 16px line height.
            if label_font:
                x0, y0, x1, y1 = canvas.textbbox(
                    (0, 0), ref, font=label_font
                )
                text_w, text_h = x1 - x0, y1 - y0
            else:
                text_w, text_h = len(ref) * 8, 16

            # The tag sits just above the outline, clamped to the image
            # origin so it never gets negative coordinates.
            tag_left = max(0, left - 2)
            tag_top = max(0, top - text_h - 2)

            canvas.rectangle(
                [
                    tag_left,
                    tag_top,
                    tag_left + text_w + 4,
                    tag_top + text_h + 2,
                ],
                fill=color,
            )
            canvas.text(
                (tag_left + 2, tag_top + 1),
                ref,
                fill="white",
                font=label_font,
            )

    return annotated
727
-
728
def _format_snapshot_from_analysis(
    self, analysis_data: Dict[str, Any]
) -> str:
    r"""Render the analysed elements as one snapshot line per element.

    Each line has the shape ``- <role> "<name>" <props> [ref=<ref>]``,
    where the quoted name and the property list are left out when empty.

    Args:
        analysis_data (Dict[str, Any]): Analysis result whose
            ``"elements"`` entry maps each ``ref`` to a dict of element
            attributes (``role``, ``name``, ``disabled``, ``checked``,
            ``expanded`` are read).

    Returns:
        str: The lines joined with newlines; an empty string when there
            are no elements.
    """
    rendered = []

    for ref, element in analysis_data.get("elements", {}).items():
        parts = [f"- {element.get('role', 'generic')}"]

        label = element.get("name", "")
        if label:
            parts.append(f'"{label}"')

        for key in ("disabled", "checked", "expanded"):
            state = element.get(key)
            if state is True:
                # A plain ``True`` is shown as the bare property name.
                parts.append(key)
            elif state is not None and key != "disabled":
                # Any other non-None value is spelled out, but only for
                # ``checked`` and ``expanded``.
                parts.append(f"{key}={state}")

        parts.append(f"[ref={ref}]")
        rendered.append(" ".join(parts))

    return "\n".join(rendered)
759
-
760
async def _get_tab_info_for_output(self) -> Dict[str, Any]:
    r"""Collect the tab fields that are merged into action outputs.

    The session is asked for its tab list and current tab id first. If
    that fails, the number of open tabs is estimated from the session's
    ``_pages`` / ``_page`` attributes; if the estimate fails as well, a
    single tab is reported.

    Returns:
        Dict[str, Any]: A dict with ``"tabs"``, ``"current_tab"`` and
            ``"total_tabs"``. On either fallback path ``"tabs"`` is empty
            and ``"current_tab"`` is ``0``.
    """
    try:
        session = await self._get_session()

        logger.debug("Attempting to get tab info from session...")
        tabs = await session.get_tab_info()
        active_tab = await session.get_current_tab_id()

        logger.debug(
            f"Successfully retrieved {len(tabs)} tabs, current: {active_tab}"
        )
        return {
            "tabs": tabs,
            "current_tab": active_tab,
            "total_tabs": len(tabs),
        }
    except Exception as primary_error:
        logger.warning(
            f"Failed to get tab info from session: "
            f"{type(primary_error).__name__}: {primary_error}"
        )

    # Fallback: estimate the tab count from the session's own attributes.
    try:
        fallback = await self._get_session()

        has_pages_attr = hasattr(fallback, '_pages')
        has_main_page = (
            hasattr(fallback, '_page') and fallback._page is not None
        )
        session_state = {
            "has_session": fallback is not None,
            "has_pages_attr": has_pages_attr,
            "pages_count": len(fallback._pages)
            if has_pages_attr
            else "unknown",
            "has_page": has_main_page,
            "session_id": getattr(fallback, '_session_id', 'unknown'),
        }
        logger.debug(f"Browser session state: {session_state}")

        tab_count = 0
        if has_pages_attr and fallback._pages:
            tab_count = len(fallback._pages)
            try:
                # Prefer counting only pages that are still open.
                tab_count = len(
                    [
                        page
                        for page in fallback._pages.values()
                        if not page.is_closed()
                    ]
                )
                logger.debug(
                    f"Found {tab_count} open tabs out of "
                    f"{len(fallback._pages)} total"
                )
            except Exception:
                # Page status could not be checked; keep the raw count.
                pass

        if tab_count == 0:
            # Both cases report one tab; only the debug message differs.
            tab_count = 1
            if has_main_page:
                logger.debug(
                    "No pages in list but main page exists, assuming 1 tab"
                )
            else:
                logger.debug("No pages found, defaulting to 1 tab")

        logger.debug(f"Using fallback tab count: {tab_count}")
        return {"tabs": [], "current_tab": 0, "total_tabs": tab_count}

    except Exception as fallback_error:
        logger.warning(
            f"Fallback tab count also failed: "
            f"{type(fallback_error).__name__}: {fallback_error}"
        )
        return {"tabs": [], "current_tab": 0, "total_tabs": 1}
857
-
858
async def _exec_with_snapshot(
    self,
    action: Dict[str, Any],
    element_details: Optional[Dict[str, Any]] = None,
) -> Dict[str, str]:
    r"""Run one browser action and report how the page snapshot changed.

    A snapshot is captured before and after the action. For ``click``,
    ``type``, ``select`` and ``enter`` the method also waits for page
    stability before taking the second snapshot. Timings, the outcome and
    tab information are passed to ``_log_action``.

    Args:
        action (Dict[str, Any]): Action description handed to the
            session's ``exec_action``. Its ``"type"`` entry is used in
            log messages and to decide whether to wait for stability.
        element_details (Optional[Dict[str, Any]]): Details of the target
            element. Only consulted when a ``click`` leaves the snapshot
            unchanged; its ``tagName``, ``name`` and ``role`` entries are
            then added to the logged output. (default: :obj:`None`)

    Returns:
        Dict[str, str]: ``"result"`` is the action's message and
            ``"snapshot"`` is the post-action snapshot, or the literal
            ``"snapshot not changed"`` when both snapshots are equal.

    Raises:
        Exception: Any error raised inside the ``try`` block is logged
            through ``_log_action`` and then re-raised.
    """
    action_type = action.get("type", "unknown")
    log_name = f"_exec_with_snapshot_{action_type}"
    logger.info(f"Executing action: {action_type}")

    started_at = time.time()
    inputs: Dict[str, Any] = {"action": action}
    page_load_time = None

    try:
        # Snapshot before the action.
        logger.info("Capturing pre-action snapshot...")
        mark = time.time()
        snapshot_before = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        before_elapsed = time.time() - mark
        logger.info(f"Pre-action snapshot captured in {before_elapsed:.2f}s")

        # The action itself.
        logger.info(f"Executing {action_type} action...")
        mark = time.time()
        raw_result = await self._session.exec_action(action)
        exec_time = time.time() - mark
        logger.info(f"Action {action_type} completed in {exec_time:.2f}s")

        # Dict results carry message/details/success; anything else is
        # stringified and treated as a success.
        if isinstance(raw_result, dict):
            result_message = raw_result.get("message", str(raw_result))
            action_details = raw_result.get("details", {})
            success = raw_result.get("success", True)
        else:
            result_message, action_details, success = str(raw_result), {}, True

        # Interactions that can change the page get a stability wait.
        stability_time: float = 0.0
        if action_type in ("click", "type", "select", "enter"):
            logger.info(f"Waiting for page stability after {action_type}...")
            mark = time.time()
            await self._wait_for_page_stability()
            stability_time = time.time() - mark
            logger.info(
                f"Page stability wait completed in {stability_time:.2f}s"
            )
            page_load_time = stability_time

            if self.enable_page_loading_logging and self.log_to_console:
                logger.info(
                    f"[PAGE LOADING] Page stability for {action_type}: "
                    f"{round(stability_time * 1000, 2)}ms"
                )

        # Snapshot after the action.
        logger.info("Capturing post-action snapshot...")
        mark = time.time()
        snapshot_after = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        after_elapsed = time.time() - mark
        logger.info(f"Post-action snapshot captured in {after_elapsed:.2f}s")

        # Decide what to report and flag suspicious snapshots.
        if snapshot_before == snapshot_after:
            snapshot = "snapshot not changed"
            logger.debug("Page snapshot unchanged after action")
        else:
            snapshot = snapshot_after
            if "<empty>" in snapshot_after:
                logger.warning(
                    f"Action {action_type} resulted in empty snapshot - "
                    f"page may still be loading"
                )
            elif len(snapshot_after.strip()) < 50:
                logger.warning(
                    f"Action {action_type} resulted in very short snapshot: "
                    f"{len(snapshot_after)} chars"
                )
            else:
                logger.debug(
                    f"Action {action_type} resulted in updated snapshot: "
                    f"{len(snapshot_after)} chars"
                )

        tab_info = await self._get_tab_info_for_output()

        # Assemble the record handed to the action log.
        execution_time = time.time() - started_at
        snapshot_total = before_elapsed + after_elapsed
        stability_ms = (
            round(stability_time * 1000, 2) if stability_time > 0 else None
        )
        outputs = {
            "result": result_message,
            "snapshot": snapshot,
            "success": success,
            "action_details": action_details,
            "execution_stats": {
                "exec_time_ms": round(exec_time * 1000, 2),
                "stability_time_ms": stability_ms,
                "snapshot_time_ms": round(snapshot_total * 1000, 2),
                "total_time_ms": round(execution_time * 1000, 2),
            },
            **tab_info,  # tab fields are merged in last
        }

        # A click that changed nothing is logged with the element details.
        if (
            snapshot == "snapshot not changed"
            and action_type == "click"
            and element_details
        ):
            logger.debug(
                "Snapshot unchanged after click. Adding element details to log."
            )
            outputs["clicked_element_tag"] = element_details.get(
                "tagName", "N/A"
            )
            outputs["clicked_element_content"] = element_details.get(
                "name", ""
            )
            outputs["clicked_element_type"] = element_details.get(
                "role", "generic"
            )

        await self._log_action(
            action_name=log_name,
            inputs=inputs,
            outputs=outputs,
            execution_time=execution_time,
            page_load_time=page_load_time,
        )

        return {"result": result_message, "snapshot": snapshot}

    except Exception as e:
        execution_time = time.time() - started_at

        # Record the failure, then let the original error propagate.
        await self._log_action(
            action_name=log_name,
            inputs=inputs,
            outputs=None,
            execution_time=execution_time,
            page_load_time=page_load_time,
            error=f"{type(e).__name__}: {e!s}",
        )

        raise
1032
-
1033
async def _extract_links_by_refs(
    self, snapshot: str, page, refs: List[str]
) -> List[Dict[str, str]]:
    r"""Collect text, ref and URL for the requested links in a snapshot.

    Args:
        snapshot (str): Snapshot text, scanned line by line for entries of
            the form ``- link "<text>" [ref=<ref>]``.
        page: Page object forwarded to ``_get_link_url_by_ref``.
        refs (List[str]): The reference IDs to look for.

    Returns:
        List[Dict[str, str]]: One dict per matching line, in snapshot
            order, with keys ``"text"``, ``"ref"`` and ``"url"``. The URL
            is an empty string when it is missing or its lookup raised.
    """
    import re

    link_pattern = re.compile(r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]')
    wanted = set(refs)
    links = []

    for row in snapshot.split('\n'):
        hit = link_pattern.search(row)
        if hit is None or hit.group(2) not in wanted:
            continue

        text, found_ref = hit.groups()
        try:
            url = await self._get_link_url_by_ref(page, found_ref)
        except Exception as e:
            logger.warning(f"Failed to get URL for ref {found_ref}: {e}")
            url = ""

        links.append({"text": text, "ref": found_ref, "url": url or ""})

    return links
250
@property
def web_agent_model(self) -> Optional[BaseModelBackend]:
    r"""Return the model backend currently stored for the web agent.

    Returns:
        Optional[BaseModelBackend]: The value held in
            ``_web_agent_model``, which may be :obj:`None`.
    """
    return self._web_agent_model
1063
254
 
1064
async def _get_link_url_by_ref(self, page, ref: str) -> str:
    r"""Resolve the absolute URL of the element tagged with ``ref``.

    Args:
        page: Page object providing an awaitable ``query_selector`` and a
            ``url`` attribute.
        ref (str): Reference ID, matched against the element's
            ``aria-ref`` attribute.

    Returns:
        str: The element's ``href`` joined onto ``page.url``. An empty
            string is returned when the element or its ``href`` is
            missing, or when the lookup raises.
    """
    try:
        element = await page.query_selector(f'[aria-ref="{ref}"]')
        if not element:
            return ""

        href = await element.get_attribute('href')
        if not href:
            return ""

        from urllib.parse import urljoin

        # Relative hrefs are resolved against the page's own URL.
        return urljoin(page.url, href)
    except Exception as e:
        logger.warning(f"Failed to get URL for ref {ref}: {e}")
        return ""
1078
-
1079
- def _ensure_agent(self) -> PlaywrightLLMAgent:
1080
- r"""Create PlaywrightLLMAgent on first use."""
1081
- if self._web_agent_model is None:
1082
- raise RuntimeError(
1083
- "web_agent_model required for high-level task planning"
1084
- )
255
@web_agent_model.setter
def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
    r"""Replace the model backend stored for the web agent.

    Args:
        value (Optional[BaseModelBackend]): The new backend; it is stored
            as-is in ``_web_agent_model`` and may be :obj:`None`.
    """
    self._web_agent_model = value
1085
259
 
1086
- if self._agent is None:
1087
- self._agent = PlaywrightLLMAgent(
1088
- headless=self._headless,
1089
- user_data_dir=self._user_data_dir,
1090
- model_backend=self._web_agent_model,
1091
- )
1092
- return self._agent
260
@property
def cache_dir(self) -> str:
    r"""Return the cache directory stored in ``_cache_dir``.

    Returns:
        str: The cache directory path exactly as stored. No setter for
            this property is visible in this part of the file.
    """
    return self._cache_dir
1093
264
 
1094
265
  # Public API Methods
1095
266
 
1096
- async def open_browser(self) -> Dict[str, Any]:
267
+ async def browser_open(self) -> Dict[str, Any]:
1097
268
  r"""Starts a new browser session. This must be the first browser
1098
269
  action.
1099
270
 
@@ -1103,60 +274,45 @@ class HybridBrowserToolkit(BaseToolkit):
1103
274
  Returns:
1104
275
  Dict[str, Any]: A dictionary with the result of the action:
1105
276
  - "result" (str): Confirmation of the action.
1106
- - "snapshot" (str): A textual snapshot of interactive elements.
277
+ - "snapshot" (str): A textual snapshot of interactive
278
+ elements.
1107
279
  - "tabs" (List[Dict]): Information about all open tabs.
1108
280
  - "current_tab" (int): Index of the active tab.
1109
281
  - "total_tabs" (int): Total number of open tabs.
1110
282
  """
1111
- # Add logging if enabled
1112
- action_start = time.time()
1113
- inputs: Dict[str, Any] = {} # No input parameters for agents
1114
-
1115
- logger.info("Starting browser session...")
1116
-
1117
- browser_start = time.time()
1118
- await self._session.ensure_browser()
1119
- browser_time = time.time() - browser_start
1120
- logger.info(f"Browser session started in {browser_time:.2f}s")
1121
-
1122
283
  try:
1123
- # Always use the configured default start URL
1124
- start_url = self._default_start_url
1125
- logger.info(f"Navigating to configured default page: {start_url}")
1126
-
1127
- # Use visit_page without creating a new tab
1128
- result = await self.visit_page(start_url)
1129
-
1130
- # Log success
1131
- if self.enable_action_logging or self.enable_timing_logging:
1132
- execution_time = time.time() - action_start
1133
- await self._log_action(
1134
- action_name="open_browser",
1135
- inputs=inputs,
1136
- outputs={
1137
- "result": "Browser opened and navigated to "
1138
- "default page."
1139
- },
1140
- execution_time=execution_time,
1141
- )
284
+ ws_wrapper = await self._get_ws_wrapper()
285
+ result = await ws_wrapper.open_browser(self._default_start_url)
286
+
287
+ # Add tab information
288
+ tab_info = await ws_wrapper.get_tab_info()
289
+ result.update(
290
+ {
291
+ "tabs": tab_info,
292
+ "current_tab": next(
293
+ (
294
+ i
295
+ for i, tab in enumerate(tab_info)
296
+ if tab.get("is_current")
297
+ ),
298
+ 0,
299
+ ),
300
+ "total_tabs": len(tab_info),
301
+ }
302
+ )
1142
303
 
1143
304
  return result
1144
-
1145
305
  except Exception as e:
1146
- # Log error
1147
- if self.enable_action_logging or self.enable_timing_logging:
1148
- execution_time = time.time() - action_start
1149
- await self._log_action(
1150
- action_name="open_browser",
1151
- inputs=inputs,
1152
- outputs=None,
1153
- execution_time=execution_time,
1154
- error=f"{type(e).__name__}: {e!s}",
1155
- )
1156
- raise
306
+ logger.error(f"Failed to open browser: {e}")
307
+ return {
308
+ "result": f"Error opening browser: {e}",
309
+ "snapshot": "",
310
+ "tabs": [],
311
+ "current_tab": 0,
312
+ "total_tabs": 0,
313
+ }
1157
314
 
1158
- @action_logger
1159
- async def close_browser(self) -> str:
315
+ async def browser_close(self) -> str:
1160
316
  r"""Closes the browser session, releasing all resources.
1161
317
 
1162
318
  This should be called at the end of a task for cleanup.
@@ -1164,18 +320,16 @@ class HybridBrowserToolkit(BaseToolkit):
1164
320
  Returns:
1165
321
  str: A confirmation message.
1166
322
  """
1167
- if self._agent is not None:
1168
- try:
1169
- await self._agent.close()
1170
- except Exception:
1171
- pass
1172
- self._agent = None
1173
-
1174
- await self._session.close()
1175
- return "Browser session closed."
323
+ try:
324
+ if self._ws_wrapper:
325
+ await self._ws_wrapper.stop()
326
+ self._ws_wrapper = None
327
+ return "Browser session closed."
328
+ except Exception as e:
329
+ logger.error(f"Failed to close browser: {e}")
330
+ return f"Error closing browser: {e}"
1176
331
 
1177
- @action_logger
1178
- async def visit_page(self, url: str) -> Dict[str, Any]:
332
+ async def browser_visit_page(self, url: str) -> Dict[str, Any]:
1179
333
  r"""Opens a URL in a new browser tab and switches to it.
1180
334
 
1181
335
  Args:
@@ -1190,70 +344,39 @@ class HybridBrowserToolkit(BaseToolkit):
1190
344
  - "current_tab" (int): Index of the new active tab.
1191
345
  - "total_tabs" (int): Total number of open tabs.
1192
346
  """
1193
- if not url or not isinstance(url, str):
347
+ try:
348
+ ws_wrapper = await self._get_ws_wrapper()
349
+ result = await ws_wrapper.visit_page(url)
350
+
351
+ # Add tab information
352
+ tab_info = await ws_wrapper.get_tab_info()
353
+ result.update(
354
+ {
355
+ "tabs": tab_info,
356
+ "current_tab": next(
357
+ (
358
+ i
359
+ for i, tab in enumerate(tab_info)
360
+ if tab.get("is_current")
361
+ ),
362
+ 0,
363
+ ),
364
+ "total_tabs": len(tab_info),
365
+ }
366
+ )
367
+
368
+ return result
369
+ except Exception as e:
370
+ logger.error(f"Failed to visit page: {e}")
1194
371
  return {
1195
- "result": "Error: 'url' must be a non-empty string",
372
+ "result": f"Error visiting page: {e}",
1196
373
  "snapshot": "",
1197
374
  "tabs": [],
1198
375
  "current_tab": 0,
1199
- "total_tabs": 1,
376
+ "total_tabs": 0,
1200
377
  }
1201
378
 
1202
- if '://' not in url:
1203
- url = f'https://{url}'
1204
-
1205
- await self._ensure_browser()
1206
- session = await self._get_session()
1207
- nav_result = ""
1208
-
1209
- # By default, we want to create a new tab.
1210
- should_create_new_tab = True
1211
- try:
1212
- # If the browser has just started with a single "about:blank" tab,
1213
- # use that tab instead of creating a new one.
1214
- tab_info_data = await self._get_tab_info_for_output()
1215
- tabs = tab_info_data.get("tabs", [])
1216
- if len(tabs) == 1 and tabs[0].get("url") == "about:blank":
1217
- logger.info(
1218
- "Found single blank tab, navigating in current tab "
1219
- "instead of creating a new one."
1220
- )
1221
- should_create_new_tab = False
1222
- except Exception as e:
1223
- logger.warning(
1224
- "Could not get tab info to check for blank tab, "
1225
- f"proceeding with default behavior (new tab). Error: {e}"
1226
- )
1227
-
1228
- if should_create_new_tab:
1229
- logger.info(f"Creating new tab and navigating to URL: {url}")
1230
- try:
1231
- new_tab_id = await session.create_new_tab(url)
1232
- await session.switch_to_tab(new_tab_id)
1233
- nav_result = f"Visited {url} in new tab {new_tab_id}"
1234
- except Exception as e:
1235
- logger.error(f"Failed to create new tab and navigate: {e}")
1236
- nav_result = f"Error creating new tab: {e}"
1237
- else:
1238
- logger.info(f"Navigating to URL in current tab: {url}")
1239
- nav_result = await session.visit(url)
1240
-
1241
- # Get snapshot
1242
- snapshot = ""
1243
- try:
1244
- snapshot = await session.get_snapshot(
1245
- force_refresh=True, diff_only=False
1246
- )
1247
- except Exception as e:
1248
- logger.warning(f"Failed to capture snapshot: {e}")
1249
-
1250
- # Get tab information
1251
- tab_info = await self._get_tab_info_for_output()
1252
-
1253
- return {"result": nav_result, "snapshot": snapshot, **tab_info}
1254
-
1255
- @action_logger
1256
- async def back(self) -> Dict[str, Any]:
379
+ async def browser_back(self) -> Dict[str, Any]:
1257
380
  r"""Goes back to the previous page in the browser history.
1258
381
 
1259
382
  This action simulates using the browser's "back" button in the
@@ -1267,57 +390,39 @@ class HybridBrowserToolkit(BaseToolkit):
1267
390
  - "current_tab" (int): Index of the active tab.
1268
391
  - "total_tabs" (int): Total number of open tabs.
1269
392
  """
1270
- page = await self._require_page()
1271
-
1272
393
  try:
1273
- logger.info("Navigating back in browser history...")
1274
- nav_start = time.time()
1275
- await page.go_back(
1276
- wait_until="domcontentloaded", timeout=self._navigation_timeout
1277
- )
1278
- nav_time = time.time() - nav_start
1279
- logger.info(f"Back navigation completed in {nav_time:.2f}s")
1280
-
1281
- # Minimal wait for page stability (back navigation is usually fast)
1282
- import asyncio
1283
-
1284
- await asyncio.sleep(0.2)
1285
-
1286
- # Get snapshot
1287
- logger.info("Capturing page snapshot after back navigation...")
1288
- snapshot_start = time.time()
1289
- snapshot = await self._session.get_snapshot(
1290
- force_refresh=True, diff_only=False
1291
- )
1292
- snapshot_time = time.time() - snapshot_start
1293
- logger.info(
1294
- f"Back navigation snapshot captured in {snapshot_time:.2f}s"
394
+ ws_wrapper = await self._get_ws_wrapper()
395
+ result = await ws_wrapper.back()
396
+
397
+ # Add tab information
398
+ tab_info = await ws_wrapper.get_tab_info()
399
+ result.update(
400
+ {
401
+ "tabs": tab_info,
402
+ "current_tab": next(
403
+ (
404
+ i
405
+ for i, tab in enumerate(tab_info)
406
+ if tab.get("is_current")
407
+ ),
408
+ 0,
409
+ ),
410
+ "total_tabs": len(tab_info),
411
+ }
1295
412
  )
1296
413
 
1297
- # Get tab information
1298
- tab_info = await self._get_tab_info_for_output()
1299
-
1300
- return {
1301
- "result": "Back navigation successful.",
1302
- "snapshot": snapshot,
1303
- **tab_info,
1304
- }
1305
-
414
+ return result
1306
415
  except Exception as e:
1307
- logger.warning(f"Back navigation failed: {e}")
1308
- # Get current snapshot even if navigation failed
1309
- snapshot = await self._session.get_snapshot(
1310
- force_refresh=True, diff_only=False
1311
- )
1312
- tab_info = await self._get_tab_info_for_output()
416
+ logger.error(f"Failed to navigate back: {e}")
1313
417
  return {
1314
- "result": f"Back navigation failed: {e!s}",
1315
- "snapshot": snapshot,
1316
- **tab_info,
418
+ "result": f"Error navigating back: {e}",
419
+ "snapshot": "",
420
+ "tabs": [],
421
+ "current_tab": 0,
422
+ "total_tabs": 0,
1317
423
  }
1318
424
 
1319
- @action_logger
1320
- async def forward(self) -> Dict[str, Any]:
425
+ async def browser_forward(self) -> Dict[str, Any]:
1321
426
  r"""Goes forward to the next page in the browser history.
1322
427
 
1323
428
  This action simulates using the browser's "forward" button in the
@@ -1331,164 +436,191 @@ class HybridBrowserToolkit(BaseToolkit):
1331
436
  - "current_tab" (int): Index of the active tab.
1332
437
  - "total_tabs" (int): Total number of open tabs.
1333
438
  """
1334
- page = await self._require_page()
1335
-
1336
439
  try:
1337
- logger.info("Navigating forward in browser history...")
1338
- nav_start = time.time()
1339
- await page.go_forward(
1340
- wait_until="domcontentloaded", timeout=self._navigation_timeout
1341
- )
1342
- nav_time = time.time() - nav_start
1343
- logger.info(f"Forward navigation completed in {nav_time:.2f}s")
1344
-
1345
- # Minimal wait for page stability (forward navigation is usually
1346
- # fast)
1347
- import asyncio
1348
-
1349
- await asyncio.sleep(0.2)
1350
-
1351
- # Get snapshot
1352
- logger.info("Capturing page snapshot after forward navigation...")
1353
- snapshot_start = time.time()
1354
- snapshot = await self._session.get_snapshot(
1355
- force_refresh=True, diff_only=False
1356
- )
1357
- snapshot_time = time.time() - snapshot_start
1358
- logger.info(
1359
- f"Forward navigation snapshot captured in {snapshot_time:.2f}s"
440
+ ws_wrapper = await self._get_ws_wrapper()
441
+ result = await ws_wrapper.forward()
442
+
443
+ # Add tab information
444
+ tab_info = await ws_wrapper.get_tab_info()
445
+ result.update(
446
+ {
447
+ "tabs": tab_info,
448
+ "current_tab": next(
449
+ (
450
+ i
451
+ for i, tab in enumerate(tab_info)
452
+ if tab.get("is_current")
453
+ ),
454
+ 0,
455
+ ),
456
+ "total_tabs": len(tab_info),
457
+ }
1360
458
  )
1361
459
 
1362
- # Get tab information
1363
- tab_info = await self._get_tab_info_for_output()
1364
-
1365
- return {
1366
- "result": "Forward navigation successful.",
1367
- "snapshot": snapshot,
1368
- **tab_info,
1369
- }
1370
-
460
+ return result
1371
461
  except Exception as e:
1372
- logger.warning(f"Forward navigation failed: {e}")
1373
- # Get current snapshot even if navigation failed
1374
- snapshot = await self._session.get_snapshot(
1375
- force_refresh=True, diff_only=False
1376
- )
1377
- tab_info = await self._get_tab_info_for_output()
462
+ logger.error(f"Failed to navigate forward: {e}")
1378
463
  return {
1379
- "result": f"Forward navigation failed: {e!s}",
1380
- "snapshot": snapshot,
1381
- **tab_info,
464
+ "result": f"Error navigating forward: {e}",
465
+ "snapshot": "",
466
+ "tabs": [],
467
+ "current_tab": 0,
468
+ "total_tabs": 0,
1382
469
  }
1383
470
 
1384
async def browser_get_page_snapshot(self) -> str:
    r"""Gets a textual snapshot of the page's interactive elements.

    The snapshot lists elements like buttons, links, and inputs, each
    with a unique `ref` ID. This ID is used by other tools (e.g.,
    `browser_click`, `browser_type`) to interact with a specific element.
    This tool provides no visual information.

    If viewport_limit is enabled, only elements within the current
    viewport will be included in the snapshot.

    Returns:
        str: A formatted string representing the interactive elements and
            their `ref` IDs. For example:
            '- link "Sign In" [ref=1]'
            '- textbox "Username" [ref=2]'
            If capturing fails, the string describes the error instead.
    """
    try:
        wrapper = await self._get_ws_wrapper()
        snapshot_text = await wrapper.get_page_snapshot(self._viewport_limit)
        return snapshot_text
    except Exception as e:
        # Failures are reported in the return value rather than raised.
        logger.error(f"Failed to get page snapshot: {e}")
        return f"Error capturing snapshot: {e}"
1414
496
 
1415
497
@dependencies_required('PIL')
async def browser_get_som_screenshot(
    self,
    read_image: bool = True,
    instruction: Optional[str] = None,
) -> str:
    r"""Captures a screenshot with interactive elements highlighted.

    "SoM" stands for "Set of Marks". This tool takes a screenshot and
    draws boxes around clickable elements, overlaying a `ref` ID on each.
    Use this for a visual understanding of the page, especially when the
    textual snapshot is not enough.

    Args:
        read_image (bool, optional): If `True`, the agent will analyze
            the screenshot. Requires agent to be registered.
            (default: :obj:`True`)
        instruction (Optional[str], optional): A specific question or
            command for the agent regarding the screenshot, used only if
            `read_image` is `True`. For example: "Find the login button."

    Returns:
        str: A confirmation message indicating the screenshot was
            captured, the file path where it was saved, and optionally
            the agent's analysis if `read_image` is `True`. If capturing
            fails, the string describes the error instead.
    """
    import base64
    import datetime
    import os
    import urllib.parse

    from camel.utils import sanitize_filename

    png_prefix = 'data:image/png;base64,'

    try:
        ws_wrapper = await self._get_ws_wrapper()
        result = await ws_wrapper.get_som_screenshot()

        result_text = result.text
        # Stays None until a PNG has actually been written to disk, so the
        # analysis step below never tries to open a file that was not
        # saved.
        file_path = None

        # Only the first PNG data URL is saved; other entries are ignored.
        png_data_url = next(
            (
                image_data
                for image_data in result.images or []
                if image_data.startswith(png_prefix)
            ),
            None,
        )

        if png_data_url is not None:
            # Ensure cache directory exists (use absolute path)
            cache_dir = os.path.abspath(self._cache_dir)
            os.makedirs(cache_dir, exist_ok=True)

            # The current tab's URL path names the file; any failure while
            # looking it up falls back to 'unknown'.
            try:
                page_info = await ws_wrapper.get_tab_info()
                current_tab = next(
                    (tab for tab in page_info if tab.get('is_current')),
                    None,
                )
                url = current_tab['url'] if current_tab else 'unknown'
            except Exception:
                url = 'unknown'

            parsed_url = urllib.parse.urlparse(url)
            url_name = sanitize_filename(
                str(parsed_url.path) or 'homepage', max_length=241
            )
            timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
            target_path = os.path.join(
                cache_dir, f"{url_name}_{timestamp}_som.png"
            )

            # Strip the data URL prefix, decode and write the image.
            image_bytes = base64.b64decode(png_data_url.split(',', 1)[1])
            with open(target_path, 'wb') as f:
                f.write(image_bytes)

            file_path = target_path
            logger.info(f"Screenshot saved to: {file_path}")
            result_text += f" (saved to: {file_path})"

        # Analyze image if requested and a screenshot was saved
        if read_image and file_path:
            if self.agent is None:
                logger.error(
                    "Cannot analyze screenshot: No agent registered. "
                    "Please pass this toolkit to ChatAgent via "
                    "toolkits_to_register_agent parameter."
                )
                result_text += (
                    " Error: No agent registered for image analysis. "
                    "Please pass this toolkit to ChatAgent via "
                    "toolkits_to_register_agent parameter."
                )
            else:
                try:
                    from PIL import Image

                    img = Image.open(file_path)
                    # Read the pixel data now; for a single-frame image
                    # Pillow then closes the file it opened, so no handle
                    # stays open if the agent call below fails.
                    img.load()

                    inst = instruction if instruction is not None else ""
                    message = BaseMessage.make_user_message(
                        role_name="User",
                        content=inst,
                        image_list=[img],
                    )

                    # Get agent's analysis
                    response = await self.agent.astep(message)
                    agent_response = response.msgs[0].content
                    result_text += f". Agent analysis: {agent_response}"
                except Exception as e:
                    logger.error(f"Error analyzing screenshot: {e}")
                    result_text += f". Error analyzing screenshot: {e}"

        return result_text
    except Exception as e:
        logger.error(f"Failed to get screenshot: {e}")
        return f"Error capturing screenshot: {e}"
622
+
623
+ async def browser_click(self, *, ref: str) -> Dict[str, Any]:
1492
624
  r"""Performs a click on an element on the page.
1493
625
 
1494
626
  Args:
@@ -1505,155 +637,379 @@ class HybridBrowserToolkit(BaseToolkit):
1505
637
  - "current_tab" (int): Index of the active tab.
1506
638
  - "total_tabs" (int): Total number of open tabs.
1507
639
  """
1508
- self._validate_ref(ref, "click")
1509
-
1510
- analysis = await self._get_unified_analysis()
1511
- elements = analysis.get("elements", {})
1512
- if ref not in elements:
1513
- logger.error(f"Error: Element reference '{ref}' not found. ")
1514
- # Added snapshot to give more context on failure
1515
- snapshot = self._format_snapshot_from_analysis(analysis)
1516
- tab_info = await self._get_tab_info_for_output()
640
+ try:
641
+ ws_wrapper = await self._get_ws_wrapper()
642
+ result = await ws_wrapper.click(ref)
643
+
644
+ # Add tab information
645
+ tab_info = await ws_wrapper.get_tab_info()
646
+ result.update(
647
+ {
648
+ "tabs": tab_info,
649
+ "current_tab": next(
650
+ (
651
+ i
652
+ for i, tab in enumerate(tab_info)
653
+ if tab.get("is_current")
654
+ ),
655
+ 0,
656
+ ),
657
+ "total_tabs": len(tab_info),
658
+ }
659
+ )
660
+
661
+ return result
662
+ except Exception as e:
663
+ logger.error(f"Failed to click element: {e}")
1517
664
  return {
1518
- "result": f"Error: Element reference '{ref}' not found. ",
1519
- "snapshot": snapshot,
1520
- **tab_info,
665
+ "result": f"Error clicking element: {e}",
666
+ "snapshot": "",
667
+ "tabs": [],
668
+ "current_tab": 0,
669
+ "total_tabs": 0,
1521
670
  }
1522
671
 
1523
- element_details = elements.get(ref)
1524
- action = {"type": "click", "ref": ref}
1525
- result = await self._exec_with_snapshot(
1526
- action, element_details=element_details
1527
- )
672
+ async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
673
+ r"""Types text into an input element on the page.
674
+
675
+ Args:
676
+ ref (str): The `ref` ID of the input element, from a snapshot.
677
+ text (str): The text to type into the element.
678
+
679
+ Returns:
680
+ Dict[str, Any]: A dictionary with the result of the action:
681
+ - "result" (str): Confirmation of the action.
682
+ - "snapshot" (str): A textual snapshot of the page after
683
+ typing.
684
+ - "tabs" (List[Dict]): Information about all open tabs.
685
+ - "current_tab" (int): Index of the active tab.
686
+ - "total_tabs" (int): Total number of open tabs.
687
+ """
688
+ try:
689
+ ws_wrapper = await self._get_ws_wrapper()
690
+ result = await ws_wrapper.type(ref, text)
691
+
692
+ # Add tab information
693
+ tab_info = await ws_wrapper.get_tab_info()
694
+ result.update(
695
+ {
696
+ "tabs": tab_info,
697
+ "current_tab": next(
698
+ (
699
+ i
700
+ for i, tab in enumerate(tab_info)
701
+ if tab.get("is_current")
702
+ ),
703
+ 0,
704
+ ),
705
+ "total_tabs": len(tab_info),
706
+ }
707
+ )
708
+
709
+ return result
710
+ except Exception as e:
711
+ logger.error(f"Failed to type text: {e}")
712
+ return {
713
+ "result": f"Error typing text: {e}",
714
+ "snapshot": "",
715
+ "tabs": [],
716
+ "current_tab": 0,
717
+ "total_tabs": 0,
718
+ }
719
+
720
+ async def browser_select(self, *, ref: str, value: str) -> Dict[str, Any]:
721
+ r"""Selects an option in a dropdown (`<select>`) element.
722
+
723
+ Args:
724
+ ref (str): The `ref` ID of the `<select>` element.
725
+ value (str): The `value` attribute of the `<option>` to select,
726
+ not its visible text.
1528
727
 
1529
- # Add tab information to the result
1530
- tab_info = await self._get_tab_info_for_output()
1531
- result.update(tab_info)
728
+ Returns:
729
+ Dict[str, Any]: A dictionary with the result of the action:
730
+ - "result" (str): Confirmation of the action.
731
+ - "snapshot" (str): A snapshot of the page after the
732
+ selection.
733
+ - "tabs" (List[Dict]): Information about all open tabs.
734
+ - "current_tab" (int): Index of the active tab.
735
+ - "total_tabs" (int): Total number of open tabs.
736
+ """
737
+ try:
738
+ ws_wrapper = await self._get_ws_wrapper()
739
+ result = await ws_wrapper.select(ref, value)
740
+
741
+ # Add tab information
742
+ tab_info = await ws_wrapper.get_tab_info()
743
+ result.update(
744
+ {
745
+ "tabs": tab_info,
746
+ "current_tab": next(
747
+ (
748
+ i
749
+ for i, tab in enumerate(tab_info)
750
+ if tab.get("is_current")
751
+ ),
752
+ 0,
753
+ ),
754
+ "total_tabs": len(tab_info),
755
+ }
756
+ )
1532
757
 
1533
- return result
758
+ return result
759
+ except Exception as e:
760
+ logger.error(f"Failed to select option: {e}")
761
+ return {
762
+ "result": f"Error selecting option: {e}",
763
+ "snapshot": "",
764
+ "tabs": [],
765
+ "current_tab": 0,
766
+ "total_tabs": 0,
767
+ }
768
+
769
+ async def browser_scroll(
770
+ self, *, direction: str, amount: int = 500
771
+ ) -> Dict[str, Any]:
772
+ r"""Scrolls the current page window.
773
+
774
+ Args:
775
+ direction (str): The direction to scroll: 'up' or 'down'.
776
+ amount (int): The number of pixels to scroll, default is 500.
777
+
778
+ Returns:
779
+ Dict[str, Any]: A dictionary with the result of the action:
780
+ - "result" (str): Confirmation of the action.
781
+ - "snapshot" (str): A snapshot of the page after scrolling.
782
+ - "tabs" (List[Dict]): Information about all open tabs.
783
+ - "current_tab" (int): Index of the active tab.
784
+ - "total_tabs" (int): Total number of open tabs.
785
+ """
786
+ try:
787
+ ws_wrapper = await self._get_ws_wrapper()
788
+ result = await ws_wrapper.scroll(direction, amount)
789
+
790
+ # Add tab information
791
+ tab_info = await ws_wrapper.get_tab_info()
792
+ result.update(
793
+ {
794
+ "tabs": tab_info,
795
+ "current_tab": next(
796
+ (
797
+ i
798
+ for i, tab in enumerate(tab_info)
799
+ if tab.get("is_current")
800
+ ),
801
+ 0,
802
+ ),
803
+ "total_tabs": len(tab_info),
804
+ }
805
+ )
806
+
807
+ return result
808
+ except Exception as e:
809
+ logger.error(f"Failed to scroll: {e}")
810
+ return {
811
+ "result": f"Error scrolling: {e}",
812
+ "snapshot": "",
813
+ "tabs": [],
814
+ "current_tab": 0,
815
+ "total_tabs": 0,
816
+ }
1534
817
 
1535
- async def type(self, *, ref: str, text: str) -> Dict[str, Any]:
1536
- r"""Types text into an input element on the page.
818
+ async def browser_enter(self) -> Dict[str, Any]:
819
+ r"""Simulates pressing the Enter key on the currently focused
820
+ element.
1537
821
 
1538
- Args:
1539
- ref (str): The `ref` ID of the input element, from a snapshot.
1540
- text (str): The text to type into the element.
822
+ This is useful for submitting forms or search queries after using the
823
+ `type` tool.
1541
824
 
1542
825
  Returns:
1543
826
  Dict[str, Any]: A dictionary with the result of the action:
1544
827
  - "result" (str): Confirmation of the action.
1545
- - "snapshot" (str): A textual snapshot of the page after
1546
- typing.
828
+ - "snapshot" (str): A new page snapshot, as this action often
829
+ triggers navigation.
1547
830
  - "tabs" (List[Dict]): Information about all open tabs.
1548
831
  - "current_tab" (int): Index of the active tab.
1549
832
  - "total_tabs" (int): Total number of open tabs.
1550
833
  """
1551
- self._validate_ref(ref, "type")
1552
- await self._get_unified_analysis() # Ensure aria-ref attributes
1553
-
1554
- action = {"type": "type", "ref": ref, "text": text}
1555
- result = await self._exec_with_snapshot(action)
834
+ try:
835
+ ws_wrapper = await self._get_ws_wrapper()
836
+ result = await ws_wrapper.enter()
837
+
838
+ # Add tab information
839
+ tab_info = await ws_wrapper.get_tab_info()
840
+ result.update(
841
+ {
842
+ "tabs": tab_info,
843
+ "current_tab": next(
844
+ (
845
+ i
846
+ for i, tab in enumerate(tab_info)
847
+ if tab.get("is_current")
848
+ ),
849
+ 0,
850
+ ),
851
+ "total_tabs": len(tab_info),
852
+ }
853
+ )
1556
854
 
1557
- # Add tab information to the result
1558
- tab_info = await self._get_tab_info_for_output()
1559
- result.update(tab_info)
855
+ return result
856
+ except Exception as e:
857
+ logger.error(f"Failed to press enter: {e}")
858
+ return {
859
+ "result": f"Error pressing enter: {e}",
860
+ "snapshot": "",
861
+ "tabs": [],
862
+ "current_tab": 0,
863
+ "total_tabs": 0,
864
+ }
1560
865
 
1561
- return result
866
+ async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
867
+ r"""Switches to a different browser tab using its ID.
1562
868
 
1563
- async def select(self, *, ref: str, value: str) -> Dict[str, Any]:
1564
- r"""Selects an option in a dropdown (`<select>`) element.
869
+ After switching, all actions will apply to the new tab. Use
870
+ `get_tab_info` to find the ID of the tab you want to switch to.
1565
871
 
1566
872
  Args:
1567
- ref (str): The `ref` ID of the `<select>` element.
1568
- value (str): The `value` attribute of the `<option>` to select,
1569
- not its visible text.
873
+ tab_id (str): The ID of the tab to activate.
1570
874
 
1571
875
  Returns:
1572
876
  Dict[str, Any]: A dictionary with the result of the action:
1573
877
  - "result" (str): Confirmation of the action.
1574
- - "snapshot" (str): A snapshot of the page after the
1575
- selection.
878
+ - "snapshot" (str): A snapshot of the newly active tab.
1576
879
  - "tabs" (List[Dict]): Information about all open tabs.
1577
- - "current_tab" (int): Index of the active tab.
880
+ - "current_tab" (int): Index of the new active tab.
1578
881
  - "total_tabs" (int): Total number of open tabs.
1579
882
  """
1580
- self._validate_ref(ref, "select")
1581
- await self._get_unified_analysis()
1582
-
1583
- action = {"type": "select", "ref": ref, "value": value}
1584
- result = await self._exec_with_snapshot(action)
883
+ try:
884
+ ws_wrapper = await self._get_ws_wrapper()
885
+ result = await ws_wrapper.switch_tab(tab_id)
886
+
887
+ # Add tab information
888
+ tab_info = await ws_wrapper.get_tab_info()
889
+ result.update(
890
+ {
891
+ "tabs": tab_info,
892
+ "current_tab": next(
893
+ (
894
+ i
895
+ for i, tab in enumerate(tab_info)
896
+ if tab.get("is_current")
897
+ ),
898
+ 0,
899
+ ),
900
+ "total_tabs": len(tab_info),
901
+ }
902
+ )
1585
903
 
1586
- # Add tab information to the result
1587
- tab_info = await self._get_tab_info_for_output()
1588
- result.update(tab_info)
904
+ return result
905
+ except Exception as e:
906
+ logger.error(f"Failed to switch tab: {e}")
907
+ return {
908
+ "result": f"Error switching tab: {e}",
909
+ "snapshot": "",
910
+ "tabs": [],
911
+ "current_tab": 0,
912
+ "total_tabs": 0,
913
+ }
1589
914
 
1590
- return result
915
+ async def browser_close_tab(self, *, tab_id: str) -> Dict[str, Any]:
916
+ r"""Closes a browser tab using its ID.
1591
917
 
1592
- async def scroll(self, *, direction: str, amount: int) -> Dict[str, Any]:
1593
- r"""Scrolls the current page window.
918
+ Use `get_tab_info` to find the ID of the tab to close. After
919
+ closing, the browser will switch to another tab if available.
1594
920
 
1595
921
  Args:
1596
- direction (str): The direction to scroll: 'up' or 'down'.
1597
- amount (int): The number of pixels to scroll.
922
+ tab_id (str): The ID of the tab to close.
1598
923
 
1599
924
  Returns:
1600
925
  Dict[str, Any]: A dictionary with the result of the action:
1601
926
  - "result" (str): Confirmation of the action.
1602
- - "snapshot" (str): A snapshot of the page after scrolling.
1603
- - "tabs" (List[Dict]): Information about all open tabs.
1604
- - "current_tab" (int): Index of the active tab.
1605
- - "total_tabs" (int): Total number of open tabs.
927
+ - "snapshot" (str): A snapshot of the active tab after
928
+ closure.
929
+ - "tabs" (List[Dict]): Information about remaining tabs.
930
+ - "current_tab" (int): Index of the new active tab.
931
+ - "total_tabs" (int): Total number of remaining tabs.
1606
932
  """
1607
- if direction not in ("up", "down"):
1608
- tab_info = await self._get_tab_info_for_output()
933
+ try:
934
+ ws_wrapper = await self._get_ws_wrapper()
935
+ result = await ws_wrapper.close_tab(tab_id)
936
+
937
+ # Add tab information
938
+ tab_info = await ws_wrapper.get_tab_info()
939
+ result.update(
940
+ {
941
+ "tabs": tab_info,
942
+ "current_tab": next(
943
+ (
944
+ i
945
+ for i, tab in enumerate(tab_info)
946
+ if tab.get("is_current")
947
+ ),
948
+ 0,
949
+ ),
950
+ "total_tabs": len(tab_info),
951
+ }
952
+ )
953
+
954
+ return result
955
+ except Exception as e:
956
+ logger.error(f"Failed to close tab: {e}")
1609
957
  return {
1610
- "result": "Error: direction must be 'up' or 'down'",
958
+ "result": f"Error closing tab: {e}",
1611
959
  "snapshot": "",
1612
- **tab_info,
960
+ "tabs": [],
961
+ "current_tab": 0,
962
+ "total_tabs": 0,
1613
963
  }
1614
964
 
1615
- action = {"type": "scroll", "direction": direction, "amount": amount}
1616
- result = await self._exec_with_snapshot(action)
1617
-
1618
- # Add tab information to the result
1619
- tab_info = await self._get_tab_info_for_output()
1620
- result.update(tab_info)
1621
-
1622
- return result
1623
-
1624
- async def enter(self) -> Dict[str, Any]:
1625
- r"""Simulates pressing the Enter key on the currently focused element.
965
+ async def browser_get_tab_info(self) -> Dict[str, Any]:
966
+ r"""Gets a list of all open browser tabs and their information.
1626
967
 
1627
- This is useful for submitting forms or search queries after using the
1628
- `type` tool.
968
+ This includes each tab's index, title, and URL, and indicates which
969
+ tab is currently active. Use this to manage multiple tabs.
1629
970
 
1630
971
  Returns:
1631
- Dict[str, Any]: A dictionary with the result of the action:
1632
- - "result" (str): Confirmation of the action.
1633
- - "snapshot" (str): A new page snapshot, as this action often
1634
- triggers navigation.
1635
- - "tabs" (List[Dict]): Information about all open tabs.
972
+ Dict[str, Any]: A dictionary with tab information:
973
+ - "tabs" (List[Dict]): A list of open tabs, each with:
974
+ - "index" (int): The tab's zero-based index.
975
+ - "title" (str): The page title.
976
+ - "url" (str): The current URL.
977
+ - "is_current" (bool): True if the tab is active.
1636
978
  - "current_tab" (int): Index of the active tab.
1637
979
  - "total_tabs" (int): Total number of open tabs.
1638
980
  """
1639
- # Always press Enter on the currently focused element
1640
- action = {"type": "enter"}
1641
-
1642
- result = await self._exec_with_snapshot(action)
1643
-
1644
- # Add tab information to the result
1645
- tab_info = await self._get_tab_info_for_output()
1646
- result.update(tab_info)
981
+ try:
982
+ ws_wrapper = await self._get_ws_wrapper()
983
+ tab_info = await ws_wrapper.get_tab_info()
1647
984
 
1648
- return result
985
+ return {
986
+ "tabs": tab_info,
987
+ "current_tab": next(
988
+ (
989
+ i
990
+ for i, tab in enumerate(tab_info)
991
+ if tab.get("is_current")
992
+ ),
993
+ 0,
994
+ ),
995
+ "total_tabs": len(tab_info),
996
+ }
997
+ except Exception as e:
998
+ logger.error(f"Failed to get tab info: {e}")
999
+ return {
1000
+ "tabs": [],
1001
+ "current_tab": 0,
1002
+ "total_tabs": 0,
1003
+ }
1649
1004
 
1650
- @action_logger
1651
- async def wait_user(
1005
+ # Additional methods for backward compatibility
1006
+ async def browser_wait_user(
1652
1007
  self, timeout_sec: Optional[float] = None
1653
1008
  ) -> Dict[str, Any]:
1654
1009
  r"""Pauses execution and waits for human input from the console.
1655
1010
 
1656
- Use this for tasks requiring manual steps, like solving a CAPTCHA. The
1011
+ Use this for tasks requiring manual steps, like solving a CAPTCHA.
1012
+ The
1657
1013
  agent will resume after the user presses Enter in the console.
1658
1014
 
1659
1015
  Args:
@@ -1677,7 +1033,13 @@ class HybridBrowserToolkit(BaseToolkit):
1677
1033
  logger.info(f"\n{prompt}\n")
1678
1034
 
1679
1035
  async def _await_enter():
1680
- await asyncio.to_thread(input, ">>> Press Enter to resume <<<\n")
1036
+ try:
1037
+ await asyncio.to_thread(
1038
+ input, ">>> Press Enter to resume <<<\n"
1039
+ )
1040
+ except (asyncio.CancelledError, Exception):
1041
+ # Handle cancellation gracefully
1042
+ pass
1681
1043
 
1682
1044
  try:
1683
1045
  if timeout_sec is not None:
@@ -1685,178 +1047,48 @@ class HybridBrowserToolkit(BaseToolkit):
1685
1047
  f"Waiting for user input with timeout: {timeout_sec}s"
1686
1048
  )
1687
1049
  start_time = time.time()
1688
- await asyncio.wait_for(_await_enter(), timeout=timeout_sec)
1689
- wait_time = time.time() - start_time
1690
- logger.info(f"User input received after {wait_time:.2f}s")
1691
- result_msg = "User resumed."
1050
+ task = asyncio.create_task(_await_enter())
1051
+ try:
1052
+ await asyncio.wait_for(task, timeout=timeout_sec)
1053
+ wait_time = time.time() - start_time
1054
+ logger.info(f"User input received after {wait_time:.2f}s")
1055
+ result_msg = "User resumed."
1056
+ except asyncio.TimeoutError:
1057
+ task.cancel()
1058
+ # Wait for task to be cancelled properly
1059
+ try:
1060
+ await task
1061
+ except asyncio.CancelledError:
1062
+ pass
1063
+ raise
1692
1064
  else:
1693
- logger.info("Waiting for user " "input (no timeout)")
1065
+ logger.info("Waiting for user input (no timeout)")
1694
1066
  start_time = time.time()
1695
1067
  await _await_enter()
1696
1068
  wait_time = time.time() - start_time
1697
- logger.info(f"User input received " f"after {wait_time:.2f}s")
1069
+ logger.info(f"User input received after {wait_time:.2f}s")
1698
1070
  result_msg = "User resumed."
1699
1071
  except asyncio.TimeoutError:
1700
1072
  wait_time = timeout_sec or 0.0
1701
1073
  logger.info(
1702
- f"User input timeout reached "
1703
- f"after {wait_time}s, auto-resuming"
1074
+ f"User input timeout reached after {wait_time}s, "
1075
+ f"auto-resuming"
1704
1076
  )
1705
1077
  result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."
1706
1078
 
1707
- snapshot = await self._session.get_snapshot(
1708
- force_refresh=True, diff_only=False
1709
- )
1710
- tab_info = await self._get_tab_info_for_output()
1711
-
1712
- return {"result": result_msg, "snapshot": snapshot, **tab_info}
1713
-
1714
- @action_logger
1715
- async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
1716
- r"""Gets the destination URLs for a list of link elements.
1717
-
1718
- This is useful to know where a link goes before clicking it.
1719
-
1720
- Args:
1721
- ref (List[str]): A list of `ref` IDs for link elements, obtained
1722
- from a page snapshot.
1723
-
1724
- Returns:
1725
- Dict[str, Any]: A dictionary containing:
1726
- - "links" (List[Dict]): A list of found links, where each
1727
- link has "text", "ref", and "url" keys.
1728
- """
1729
- if not ref or not isinstance(ref, list):
1730
- return {"links": []}
1731
-
1732
- for r in ref:
1733
- if not r or not isinstance(r, str):
1734
- return {"links": []}
1735
-
1736
- page = await self._require_page()
1737
- snapshot = await self._session.get_snapshot(
1738
- force_refresh=True, diff_only=False
1739
- )
1740
- links = await self._extract_links_by_refs(snapshot, page, ref)
1741
-
1742
- return {"links": links}
1743
-
1744
- @action_logger
1745
- async def solve_task(
1746
- self, task_prompt: str, start_url: str, max_steps: int = 15
1747
- ) -> str:
1748
- r"""Delegates a complex, high-level task to a specialized web agent.
1749
-
1750
- Use this for multi-step tasks that can be described in a single prompt
1751
- (e.g., "log into my account and check for new messages"). The agent
1752
- will autonomously perform the necessary browser actions.
1753
-
1754
- NOTE: This is a high-level action; for simple interactions, use tools
1755
- like `click` and `type`. `web_agent_model` must be provided during
1756
- toolkit initialization.
1757
-
1758
- Args:
1759
- task_prompt (str): A natural language description of the task.
1760
- start_url (str): The URL to start the task from. This should be a
1761
- valid and existing URL, as agents may generate non-existent
1762
- ones.
1763
- max_steps (int): The maximum number of steps the agent can take.
1764
-
1765
- Returns:
1766
- str: A summary message indicating the task has finished.
1767
- """
1768
- agent = self._ensure_agent()
1769
- await agent.navigate(start_url)
1770
- await agent.process_command(task_prompt, max_steps=max_steps)
1771
- return "Task processing finished - see stdout for detailed trace."
1772
-
1773
- def get_log_summary(self) -> Dict[str, Any]:
1774
- r"""Get a summary of logged actions."""
1775
- if not self.log_buffer:
1776
- return {"total_actions": 0, "summary": "No actions logged"}
1777
-
1778
- total_actions = len(self.log_buffer)
1779
- total_execution_time = sum(
1780
- entry.get("execution_time_ms", 0) for entry in self.log_buffer
1781
- )
1782
- total_page_load_time = sum(
1783
- entry.get("page_load_time_ms", 0)
1784
- for entry in self.log_buffer
1785
- if "page_load_time_ms" in entry
1786
- )
1787
-
1788
- action_counts: Dict[str, int] = {}
1789
- error_count = 0
1790
-
1791
- for entry in self.log_buffer:
1792
- action = entry["action"]
1793
- action_counts[action] = action_counts.get(action, 0) + 1
1794
- if "error" in entry:
1795
- error_count += 1
1796
-
1797
- return {
1798
- "total_actions": total_actions,
1799
- "total_execution_time_ms": round(total_execution_time, 2),
1800
- "total_page_load_time_ms": round(total_page_load_time, 2),
1801
- "action_counts": action_counts,
1802
- "error_count": error_count,
1803
- "success_rate": round(
1804
- (total_actions - error_count) / total_actions * 100, 2
1805
- )
1806
- if total_actions > 0
1807
- else 0,
1808
- }
1809
-
1810
- def clear_logs(self) -> None:
1811
- r"""Clear the log buffer."""
1812
- self.log_buffer.clear()
1813
- logger.info("Log buffer cleared")
1814
-
1815
- def get_tools(self) -> List[FunctionTool]:
1816
- r"""Get available function tools
1817
- based on enabled_tools configuration."""
1818
- # Map tool names to their corresponding methods
1819
- tool_map = {
1820
- "open_browser": self.open_browser,
1821
- "close_browser": self.close_browser,
1822
- "visit_page": self.visit_page,
1823
- "back": self.back,
1824
- "forward": self.forward,
1825
- "get_page_snapshot": self.get_page_snapshot,
1826
- "get_som_screenshot": self.get_som_screenshot,
1827
- "get_page_links": self.get_page_links,
1828
- "click": self.click,
1829
- "type": self.type,
1830
- "select": self.select,
1831
- "scroll": self.scroll,
1832
- "enter": self.enter,
1833
- "wait_user": self.wait_user,
1834
- "solve_task": self.solve_task,
1835
- "switch_tab": self.switch_tab,
1836
- "close_tab": self.close_tab,
1837
- "get_tab_info": self.get_tab_info,
1838
- }
1839
-
1840
- enabled_tools = []
1841
-
1842
- for tool_name in self.enabled_tools:
1843
- if tool_name == "solve_task" and self._web_agent_model is None:
1844
- logger.warning(
1845
- f"Tool '{tool_name}' is enabled but web_agent_model "
1846
- f"is not provided. Skipping this tool."
1847
- )
1848
- continue
1849
-
1850
- if tool_name in tool_map:
1851
- tool = FunctionTool(
1852
- cast(Callable[..., Any], tool_map[tool_name])
1853
- )
1854
- enabled_tools.append(tool)
1855
- else:
1856
- logger.warning(f"Unknown tool name: {tool_name}")
1857
-
1858
- logger.info(f"Returning {len(enabled_tools)} enabled tools")
1859
- return enabled_tools
1079
+ try:
1080
+ snapshot = await self.browser_get_page_snapshot()
1081
+ tab_info = await self.browser_get_tab_info()
1082
+ return {"result": result_msg, "snapshot": snapshot, **tab_info}
1083
+ except Exception as e:
1084
+ logger.warning(f"Failed to get snapshot after wait: {e}")
1085
+ return {
1086
+ "result": result_msg,
1087
+ "snapshot": "",
1088
+ "tabs": [],
1089
+ "current_tab": 0,
1090
+ "total_tabs": 0,
1091
+ }
1860
1092
 
1861
1093
  def clone_for_new_session(
1862
1094
  self, new_session_id: Optional[str] = None
@@ -1882,7 +1114,8 @@ class HybridBrowserToolkit(BaseToolkit):
1882
1114
  user_data_dir=self._user_data_dir,
1883
1115
  stealth=self._stealth,
1884
1116
  web_agent_model=self._web_agent_model,
1885
- cache_dir=f"{self._cache_dir.rstrip('/')}_clone_{new_session_id}/",
1117
+ cache_dir=f"{self._cache_dir.rstrip('/')}_clone_"
1118
+ f"{new_session_id}/",
1886
1119
  enabled_tools=self.enabled_tools.copy(),
1887
1120
  browser_log_to_file=self._browser_log_to_file,
1888
1121
  session_id=new_session_id,
@@ -1896,117 +1129,49 @@ class HybridBrowserToolkit(BaseToolkit):
1896
1129
  dom_content_loaded_timeout=self._dom_content_loaded_timeout,
1897
1130
  )
1898
1131
 
1899
- @action_logger
1900
- async def switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
1901
- r"""Switches to a different browser tab using its ID.
1902
-
1903
- After switching, all actions will apply to the new tab. Use
1904
- `get_tab_info` to find the ID of the tab you want to switch to.
1905
-
1906
- Args:
1907
- tab_id (str): The ID of the tab to activate.
1908
-
1909
- Returns:
1910
- Dict[str, Any]: A dictionary with the result of the action:
1911
- - "result" (str): Confirmation of the action.
1912
- - "snapshot" (str): A snapshot of the newly active tab.
1913
- - "tabs" (List[Dict]): Information about all open tabs.
1914
- - "current_tab" (int): Index of the new active tab.
1915
- - "total_tabs" (int): Total number of open tabs.
1916
- """
1917
- await self._ensure_browser()
1918
- session = await self._get_session()
1919
-
1920
- success = await session.switch_to_tab(tab_id)
1921
-
1922
- if success:
1923
- snapshot = await session.get_snapshot(
1924
- force_refresh=True, diff_only=False
1925
- )
1926
- tab_info = await self._get_tab_info_for_output()
1927
-
1928
- result = {
1929
- "result": f"Successfully switched to tab {tab_id}",
1930
- "snapshot": snapshot,
1931
- **tab_info,
1932
- }
1933
- else:
1934
- tab_info = await self._get_tab_info_for_output()
1935
- result = {
1936
- "result": f"Failed to switch to tab {tab_id}. Tab may not "
1937
- f"exist.",
1938
- "snapshot": "",
1939
- **tab_info,
1940
- }
1941
-
1942
- return result
1943
-
1944
- @action_logger
1945
- async def close_tab(self, *, tab_id: str) -> Dict[str, Any]:
1946
- r"""Closes a browser tab using its ID.
1947
-
1948
- Use `get_tab_info` to find the ID of the tab to close. After
1949
- closing, the browser will switch to another tab if available.
1950
-
1951
- Args:
1952
- tab_id (str): The ID of the tab to close.
1953
-
1954
- Returns:
1955
- Dict[str, Any]: A dictionary with the result of the action:
1956
- - "result" (str): Confirmation of the action.
1957
- - "snapshot" (str): A snapshot of the active tab after closure.
1958
- - "tabs" (List[Dict]): Information about remaining tabs.
1959
- - "current_tab" (int): Index of the new active tab.
1960
- - "total_tabs" (int): Total number of remaining tabs.
1961
- """
1962
- await self._ensure_browser()
1963
- session = await self._get_session()
1132
+ def get_tools(self) -> List[FunctionTool]:
1133
+ r"""Get available function tools based
1134
+ on enabled_tools configuration."""
1135
+ # Map tool names to their corresponding methods
1136
+ tool_map = {
1137
+ "browser_open": self.browser_open,
1138
+ "browser_close": self.browser_close,
1139
+ "browser_visit_page": self.browser_visit_page,
1140
+ "browser_back": self.browser_back,
1141
+ "browser_forward": self.browser_forward,
1142
+ "browser_get_page_snapshot": self.browser_get_page_snapshot,
1143
+ "browser_get_som_screenshot": self.browser_get_som_screenshot,
1144
+ "browser_click": self.browser_click,
1145
+ "browser_type": self.browser_type,
1146
+ "browser_select": self.browser_select,
1147
+ "browser_scroll": self.browser_scroll,
1148
+ "browser_enter": self.browser_enter,
1149
+ "browser_wait_user": self.browser_wait_user,
1150
+ "browser_switch_tab": self.browser_switch_tab,
1151
+ "browser_close_tab": self.browser_close_tab,
1152
+ "browser_get_tab_info": self.browser_get_tab_info,
1153
+ }
1964
1154
 
1965
- success = await session.close_tab(tab_id)
1155
+ enabled_tools = []
1966
1156
 
1967
- if success:
1968
- # Get current state after closing the tab
1969
- try:
1970
- snapshot = await session.get_snapshot(
1971
- force_refresh=True, diff_only=False
1157
+ for tool_name in self.enabled_tools:
1158
+ if (
1159
+ tool_name == "browser_solve_task"
1160
+ and self._web_agent_model is None
1161
+ ):
1162
+ logger.warning(
1163
+ f"Tool '{tool_name}' is enabled but web_agent_model "
1164
+ f"is not provided. Skipping this tool."
1972
1165
  )
1973
- except Exception:
1974
- snapshot = "" # No active tab
1975
-
1976
- tab_info = await self._get_tab_info_for_output()
1977
-
1978
- result = {
1979
- "result": f"Successfully closed tab {tab_id}",
1980
- "snapshot": snapshot,
1981
- **tab_info,
1982
- }
1983
- else:
1984
- tab_info = await self._get_tab_info_for_output()
1985
- result = {
1986
- "result": f"Failed to close tab {tab_id}. Tab may not "
1987
- f"exist.",
1988
- "snapshot": "",
1989
- **tab_info,
1990
- }
1991
-
1992
- return result
1993
-
1994
- @action_logger
1995
- async def get_tab_info(self) -> Dict[str, Any]:
1996
- r"""Gets a list of all open browser tabs and their information.
1166
+ continue
1997
1167
 
1998
- This includes each tab's index, title, and URL, and indicates which
1999
- tab is currently active. Use this to manage multiple tabs.
1168
+ if tool_name in tool_map:
1169
+ tool = FunctionTool(
1170
+ cast(Callable[..., Any], tool_map[tool_name])
1171
+ )
1172
+ enabled_tools.append(tool)
1173
+ else:
1174
+ logger.warning(f"Unknown tool name: {tool_name}")
2000
1175
 
2001
- Returns:
2002
- Dict[str, Any]: A dictionary with tab information:
2003
- - "tabs" (List[Dict]): A list of open tabs, each with:
2004
- - "index" (int): The tab's zero-based index.
2005
- - "title" (str): The page title.
2006
- - "url" (str): The current URL.
2007
- - "is_current" (bool): True if the tab is active.
2008
- - "current_tab" (int): Index of the active tab.
2009
- - "total_tabs" (int): Total number of open tabs.
2010
- """
2011
- await self._ensure_browser()
2012
- return await self._get_tab_info_for_output()
1176
+ logger.info(f"Returning {len(enabled_tools)} enabled tools")
1177
+ return enabled_tools