camel-ai 0.2.72a10__py3-none-any.whl → 0.2.73__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (52) hide show
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +140 -345
  3. camel/memories/agent_memories.py +18 -17
  4. camel/societies/__init__.py +2 -0
  5. camel/societies/workforce/prompts.py +36 -10
  6. camel/societies/workforce/single_agent_worker.py +7 -5
  7. camel/societies/workforce/workforce.py +6 -4
  8. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  9. camel/storages/vectordb_storages/__init__.py +1 -0
  10. camel/storages/vectordb_storages/surreal.py +100 -150
  11. camel/toolkits/__init__.py +6 -1
  12. camel/toolkits/base.py +60 -2
  13. camel/toolkits/excel_toolkit.py +153 -64
  14. camel/toolkits/file_write_toolkit.py +67 -0
  15. camel/toolkits/hybrid_browser_toolkit/config_loader.py +136 -413
  16. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +131 -1966
  17. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +1177 -0
  18. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +4356 -0
  19. camel/toolkits/hybrid_browser_toolkit/ts/package.json +33 -0
  20. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-scripts.js +125 -0
  21. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +945 -0
  22. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +226 -0
  23. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +522 -0
  24. camel/toolkits/hybrid_browser_toolkit/ts/src/index.ts +7 -0
  25. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +110 -0
  26. camel/toolkits/hybrid_browser_toolkit/ts/tsconfig.json +26 -0
  27. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +254 -0
  28. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -0
  29. camel/toolkits/hybrid_browser_toolkit_py/__init__.py +17 -0
  30. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +447 -0
  31. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +2077 -0
  32. camel/toolkits/mcp_toolkit.py +341 -46
  33. camel/toolkits/message_integration.py +719 -0
  34. camel/toolkits/notion_mcp_toolkit.py +234 -0
  35. camel/toolkits/screenshot_toolkit.py +116 -31
  36. camel/toolkits/search_toolkit.py +20 -2
  37. camel/toolkits/slack_toolkit.py +43 -48
  38. camel/toolkits/terminal_toolkit.py +288 -46
  39. camel/toolkits/video_analysis_toolkit.py +13 -13
  40. camel/toolkits/video_download_toolkit.py +11 -11
  41. camel/toolkits/web_deploy_toolkit.py +207 -12
  42. camel/types/enums.py +6 -0
  43. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73.dist-info}/METADATA +49 -9
  44. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73.dist-info}/RECORD +52 -35
  45. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/actions.py +0 -0
  46. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/agent.py +0 -0
  47. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/browser_session.py +0 -0
  48. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/snapshot.py +0 -0
  49. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/stealth_script.js +0 -0
  50. /camel/toolkits/{hybrid_browser_toolkit → hybrid_browser_toolkit_py}/unified_analyzer.js +0 -0
  51. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73.dist-info}/WHEEL +0 -0
  52. {camel_ai-0.2.72a10.dist-info → camel_ai-0.2.73.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,2077 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ import datetime
16
+ import io
17
+ import json
18
+ import os
19
+ import time
20
+ import urllib.parse
21
+ from functools import wraps
22
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
23
+
24
+ from camel.logger import get_logger
25
+ from camel.models import BaseModelBackend
26
+ from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
27
+ from camel.toolkits.function_tool import FunctionTool
28
+ from camel.utils import sanitize_filename
29
+ from camel.utils.commons import dependencies_required
30
+
31
+ from .agent import PlaywrightLLMAgent
32
+ from .browser_session import HybridBrowserSession
33
+ from .config_loader import ConfigLoader
34
+
35
+ logger = get_logger(__name__)
36
+
37
+
38
class HybridBrowserToolkit(BaseToolkit, RegisteredAgentToolkit):
    r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
    automation with visual, screenshot-based capabilities.

    This toolkit exposes a set of actions as CAMEL FunctionTools for agents
    to interact with web pages. It can operate in headless mode and supports
    both programmatic control of browser actions (like clicking and typing)
    and visual analysis of the page layout through screenshots with marked
    interactive elements.

    Attributes:
        DEFAULT_TOOLS (List[str]): Tool names enabled when the constructor
            receives ``enabled_tools=None``.
        ALL_TOOLS (List[str]): Every tool name the constructor accepts in
            ``enabled_tools``; any other name is rejected with a
            ``ValueError``.
    """

    # Default tool list - core browser functionality. The constructor copies
    # this list, so instances never mutate the class-level value.
    DEFAULT_TOOLS: ClassVar[List[str]] = [
        "browser_open",
        "browser_close",
        "browser_visit_page",
        "browser_back",
        "browser_forward",
        "browser_click",
        "browser_type",
        "browser_switch_tab",
    ]

    # All available tools. This is the allow-list used to validate the
    # ``enabled_tools`` constructor argument.
    ALL_TOOLS: ClassVar[List[str]] = [
        "browser_open",
        "browser_close",
        "browser_visit_page",
        "browser_back",
        "browser_forward",
        "browser_get_page_snapshot",
        "browser_get_som_screenshot",
        "browser_get_page_links",
        "browser_click",
        "browser_type",
        "browser_select",
        "browser_scroll",
        "browser_enter",
        "browser_wait_user",
        "browser_solve_task",
        "browser_switch_tab",
        "browser_close_tab",
        "browser_get_tab_info",
    ]
82
+
83
def __init__(
    self,
    *,
    headless: bool = True,
    user_data_dir: Optional[str] = None,
    stealth: bool = False,
    web_agent_model: Optional[BaseModelBackend] = None,
    cache_dir: str = "tmp/",
    enabled_tools: Optional[List[str]] = None,
    browser_log_to_file: bool = False,
    session_id: Optional[str] = None,
    default_start_url: str = "https://google.com/",
    default_timeout: Optional[int] = None,
    short_timeout: Optional[int] = None,
    navigation_timeout: Optional[int] = None,
    network_idle_timeout: Optional[int] = None,
    screenshot_timeout: Optional[int] = None,
    page_stability_timeout: Optional[int] = None,
    dom_content_loaded_timeout: Optional[int] = None,
) -> None:
    r"""Initialize the HybridBrowserToolkit.

    Args:
        headless (bool): Whether to run the browser in headless mode.
            Defaults to `True`.
        user_data_dir (Optional[str]): Path to a directory for storing
            browser data like cookies and local storage. Useful for
            maintaining sessions across runs. Defaults to `None` (a
            temporary directory is used).
        stealth (bool): Whether to run the browser in stealth mode to
            avoid bot detection. When enabled, hides WebDriver
            characteristics, spoofs navigator properties, and implements
            various anti-detection measures. Highly recommended for
            production use and when accessing sites with bot detection.
            Defaults to `False`.
        web_agent_model (Optional[BaseModelBackend]): The language model
            backend to use for the high-level `solve_task` agent. This is
            required only if you plan to use `solve_task`.
            Defaults to `None`.
        cache_dir (str): The directory to store cached files, such as
            screenshots. Defaults to `"tmp/"`.
        enabled_tools (Optional[List[str]]): List of tool names to
            enable. If None, uses DEFAULT_TOOLS. Available tools:
            browser_open, browser_close, browser_visit_page,
            browser_back, browser_forward, browser_get_page_snapshot,
            browser_get_som_screenshot, browser_get_page_links,
            browser_click, browser_type, browser_select,
            browser_scroll, browser_enter, browser_wait_user,
            browser_solve_task, browser_switch_tab, browser_close_tab,
            browser_get_tab_info. Defaults to `None`.
        browser_log_to_file (bool): Whether to save detailed browser
            action logs to file. When enabled, logs action
            inputs/outputs, execution times, and page loading times.
            Logs are saved to an auto-generated timestamped file under
            the `browser_log` directory. Defaults to `False`.
        session_id (Optional[str]): A unique identifier for this browser
            session. When multiple HybridBrowserToolkit instances are
            used concurrently, different session IDs prevent them from
            sharing the same browser session and causing conflicts. If
            None, a default session will be used. Defaults to `None`.
        default_start_url (str): The default URL to navigate to when
            open_browser() is called without a start_url parameter or
            with None. Defaults to `"https://google.com/"`.
        default_timeout (Optional[int]): Default timeout in milliseconds
            for browser actions. If None, uses environment variable
            HYBRID_BROWSER_DEFAULT_TIMEOUT or defaults to 3000ms.
            Defaults to `None`.
        short_timeout (Optional[int]): Short timeout in milliseconds
            for quick browser actions. If None, uses environment variable
            HYBRID_BROWSER_SHORT_TIMEOUT or defaults to 1000ms.
            Defaults to `None`.
        navigation_timeout (Optional[int]): Custom navigation timeout in
            milliseconds. If None, uses environment variable
            HYBRID_BROWSER_NAVIGATION_TIMEOUT or defaults to 10000ms.
            Defaults to `None`.
        network_idle_timeout (Optional[int]): Custom network idle
            timeout in milliseconds. If None, uses environment variable
            HYBRID_BROWSER_NETWORK_IDLE_TIMEOUT or defaults to 5000ms.
            Defaults to `None`.
        screenshot_timeout (Optional[int]): Custom screenshot timeout in
            milliseconds. If None, uses environment variable
            HYBRID_BROWSER_SCREENSHOT_TIMEOUT or defaults to 15000ms.
            Defaults to `None`.
        page_stability_timeout (Optional[int]): Custom page stability
            timeout in milliseconds. If None, uses environment variable
            HYBRID_BROWSER_PAGE_STABILITY_TIMEOUT or defaults to 1500ms.
            Defaults to `None`.
        dom_content_loaded_timeout (Optional[int]): Custom DOM content
            loaded timeout in milliseconds. If None, uses environment
            variable HYBRID_BROWSER_DOM_CONTENT_LOADED_TIMEOUT or
            defaults to 5000ms. Defaults to `None`.

    Raises:
        ValueError: If `enabled_tools` contains a name that is not listed
            in `ALL_TOOLS`.
        FileNotFoundError: If the bundled `unified_analyzer.js` script
            cannot be found next to this module.
    """
    super().__init__()
    RegisteredAgentToolkit.__init__(self)
    self._headless = headless
    self._user_data_dir = user_data_dir
    self._stealth = stealth
    self._web_agent_model = web_agent_model
    self._cache_dir = cache_dir
    self._browser_log_to_file = browser_log_to_file
    self._default_start_url = default_start_url
    self._session_id = session_id or "default"

    # Store timeout configuration. The first two are kept as passed and
    # handed to the session; the rest are resolved through ConfigLoader.
    self._default_timeout = default_timeout
    self._short_timeout = short_timeout
    self._navigation_timeout = ConfigLoader.get_navigation_timeout(
        navigation_timeout
    )
    self._network_idle_timeout = ConfigLoader.get_network_idle_timeout(
        network_idle_timeout
    )
    self._screenshot_timeout = ConfigLoader.get_screenshot_timeout(
        screenshot_timeout
    )
    self._page_stability_timeout = ConfigLoader.get_page_stability_timeout(
        page_stability_timeout
    )
    self._dom_content_loaded_timeout = (
        ConfigLoader.get_dom_content_loaded_timeout(
            dom_content_loaded_timeout
        )
    )

    # Logging configuration - fixed values for simplicity
    self.enable_action_logging = True
    self.enable_timing_logging = True
    self.enable_page_loading_logging = True
    self.log_to_console = False  # Always disabled for cleaner output
    self.log_to_file = browser_log_to_file
    self.max_log_length: Optional[int] = None  # No truncation

    # Set up log file if needed
    if self.log_to_file:
        # Create log directory if it doesn't exist
        log_dir = "browser_log"
        os.makedirs(log_dir, exist_ok=True)

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Use the resolved session id so that an omitted `session_id`
        # yields "..._default.log" instead of "..._None.log".
        self.log_file_path: Optional[str] = os.path.join(
            log_dir,
            f"hybrid_browser_toolkit_{timestamp}_{self._session_id}.log",
        )
    else:
        self.log_file_path = None

    # Initialize log buffer for in-memory storage
    self.log_buffer: List[Dict[str, Any]] = []

    # Configure enabled tools
    if enabled_tools is None:
        self.enabled_tools = self.DEFAULT_TOOLS.copy()
    else:
        # Validate enabled tools
        invalid_tools = [
            tool for tool in enabled_tools if tool not in self.ALL_TOOLS
        ]
        if invalid_tools:
            raise ValueError(
                f"Invalid tools specified: {invalid_tools}. "
                f"Available tools: {self.ALL_TOOLS}"
            )
        self.enabled_tools = enabled_tools.copy()

    logger.info(f"Enabled tools: {self.enabled_tools}")

    # Log initialization if file logging is enabled
    if self.log_to_file:
        logger.info(
            "HybridBrowserToolkit initialized with file logging enabled"
        )
        logger.info(f"Log file path: {self.log_file_path}")

    # Core components
    temp_session = HybridBrowserSession(
        headless=headless,
        user_data_dir=user_data_dir,
        stealth=stealth,
        session_id=session_id,
        default_timeout=default_timeout,
        short_timeout=short_timeout,
    )
    # Use the session directly - singleton logic is handled in
    # ensure_browser
    self._session = temp_session
    self._playwright_agent: Optional[PlaywrightLLMAgent] = None
    self._unified_script = self._load_unified_analyzer()
280
+
281
@property
def web_agent_model(self) -> Optional[BaseModelBackend]:
    r"""Return the model backend stored for the web agent.

    Returns:
        Optional[BaseModelBackend]: The value held in
            ``self._web_agent_model``; ``None`` when no model is set.
    """
    return self._web_agent_model

@web_agent_model.setter
def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
    r"""Replace the model backend stored for the web agent.

    Args:
        value (Optional[BaseModelBackend]): The new backend, or ``None``
            to clear it. Only ``self._web_agent_model`` is updated here.
    """
    self._web_agent_model = value
290
+
291
@property
def cache_dir(self) -> str:
    r"""Return the cache directory given to the constructor.

    Returns:
        str: The value held in ``self._cache_dir``. This property is
            read-only; no setter is defined alongside it.
    """
    return self._cache_dir
295
+
296
+ def __del__(self):
297
+ r"""Cleanup browser resources on garbage collection."""
298
+ try:
299
+ import sys
300
+
301
+ if getattr(sys, "is_finalizing", lambda: False)():
302
+ return
303
+
304
+ import asyncio
305
+
306
+ try:
307
+ loop = asyncio.get_event_loop()
308
+ if not loop.is_closed() and not loop.is_running():
309
+ # Try to close browser with a timeout to prevent hanging
310
+ try:
311
+ loop.run_until_complete(
312
+ asyncio.wait_for(self.close_browser(), timeout=2.0)
313
+ )
314
+ except asyncio.TimeoutError:
315
+ pass # Skip cleanup if it takes too long
316
+ except (RuntimeError, ImportError):
317
+ pass # Event loop unavailable, skip cleanup
318
+ except Exception:
319
+ pass # Suppress all errors during garbage collection
320
+
321
+ def _load_unified_analyzer(self) -> str:
322
+ r"""Load the unified analyzer JavaScript script."""
323
+ script_path = os.path.join(
324
+ os.path.dirname(os.path.abspath(__file__)), "unified_analyzer.js"
325
+ )
326
+
327
+ try:
328
+ with open(
329
+ script_path, "r", encoding='utf-8', errors='replace'
330
+ ) as f:
331
+ script_content = f.read()
332
+
333
+ if not script_content.strip():
334
+ raise ValueError(f"Script is empty: {script_path}")
335
+
336
+ logger.debug(
337
+ f"Loaded unified analyzer ({len(script_content)} chars)"
338
+ )
339
+ return script_content
340
+ except FileNotFoundError:
341
+ raise FileNotFoundError(f"Script not found: {script_path}")
342
+
343
+ def _validate_ref(self, ref: str, method_name: str) -> None:
344
+ r"""Validate ref parameter."""
345
+ if not ref or not isinstance(ref, str):
346
+ raise ValueError(
347
+ f"{method_name}: 'ref' must be a non-empty string"
348
+ )
349
+
350
+ def _truncate_if_needed(self, content: Any) -> str:
351
+ r"""Truncate content if max_log_length is set."""
352
+ content_str = str(content)
353
+ if (
354
+ self.max_log_length is not None
355
+ and len(content_str) > self.max_log_length
356
+ ):
357
+ return content_str[: self.max_log_length] + "... [TRUNCATED]"
358
+ return content_str
359
+
360
+ async def _get_current_url(self) -> Optional[str]:
361
+ r"""Safely get the current URL of the active page."""
362
+ try:
363
+ page = await self._session.get_page()
364
+ if page and not page.is_closed():
365
+ return page.url
366
+ return None # Return None if page is closed
367
+ except Exception:
368
+ # This can happen if browser is not open.
369
+ return None
370
+
371
+ async def _log_action(
372
+ self,
373
+ action_name: str,
374
+ inputs: Dict[str, Any],
375
+ outputs: Any,
376
+ execution_time: float,
377
+ page_load_time: Optional[float] = None,
378
+ error: Optional[str] = None,
379
+ ) -> None:
380
+ r"""Log action details with comprehensive information."""
381
+ if not (self.enable_action_logging or self.enable_timing_logging):
382
+ return
383
+
384
+ current_url = await self._get_current_url()
385
+
386
+ log_entry: Dict[str, Any] = {
387
+ "timestamp": datetime.datetime.now().isoformat(),
388
+ "action": action_name,
389
+ "url": current_url,
390
+ "execution_time_ms": round(execution_time * 1000, 2),
391
+ }
392
+
393
+ if self.enable_action_logging:
394
+ log_entry["inputs"] = inputs
395
+ if error:
396
+ log_entry["error"] = str(error)
397
+ elif isinstance(outputs, dict):
398
+ # Unpack dictionary items into the log entry
399
+ log_entry.update(outputs)
400
+ else:
401
+ # For non-dict outputs, assign to 'outputs' key
402
+ log_entry["outputs"] = outputs
403
+
404
+ if page_load_time is not None and self.enable_page_loading_logging:
405
+ log_entry["page_load_time_ms"] = round(page_load_time * 1000, 2)
406
+
407
+ # Add to buffer
408
+ self.log_buffer.append(log_entry)
409
+
410
+ # Console logging
411
+ if self.log_to_console:
412
+ log_msg = f"[BROWSER ACTION] {action_name}"
413
+ if self.enable_timing_logging:
414
+ log_msg += (
415
+ f" | Execution: " f"{log_entry['execution_time_ms']}ms"
416
+ )
417
+ if page_load_time is not None and self.enable_page_loading_logging:
418
+ log_msg += (
419
+ f" | Page Load: " f"{log_entry['page_load_time_ms']}ms"
420
+ )
421
+ if error:
422
+ log_msg += f" | ERROR: {error}"
423
+
424
+ logger.info(log_msg)
425
+
426
+ if self.enable_action_logging:
427
+ logger.info(f" Inputs: {self._truncate_if_needed(inputs)}")
428
+ if not error:
429
+ if isinstance(outputs, dict):
430
+ for key, value in outputs.items():
431
+ logger.info(
432
+ f" - {key}: "
433
+ f"{self._truncate_if_needed(value)}"
434
+ )
435
+ else:
436
+ logger.info(
437
+ f" Outputs: {self._truncate_if_needed(outputs)}"
438
+ )
439
+
440
+ # File logging
441
+ if self.log_to_file and self.log_file_path:
442
+ try:
443
+ with open(self.log_file_path, 'a', encoding='utf-8') as f:
444
+ # Write full log entry to file without truncation
445
+ f.write(
446
+ json.dumps(log_entry, ensure_ascii=False, indent=2)
447
+ + '\n'
448
+ )
449
+ except Exception as e:
450
+ logger.error(f"Failed to write to log file: {e}")
451
+
452
+ @staticmethod
453
+ def action_logger(func: Callable[..., Any]) -> Callable[..., Any]:
454
+ r"""Decorator to add logging to action methods."""
455
+
456
+ @wraps(func)
457
+ async def wrapper(self, *args, **kwargs):
458
+ action_name = func.__name__
459
+ start_time = time.time()
460
+
461
+ # Log inputs
462
+ inputs = {
463
+ "args": args, # Don't skip self since it's already handled
464
+ "kwargs": kwargs,
465
+ }
466
+
467
+ try:
468
+ # Execute the original function
469
+ result = await func(self, *args, **kwargs)
470
+ execution_time = time.time() - start_time
471
+
472
+ # Log success
473
+ await self._log_action(
474
+ action_name=action_name,
475
+ inputs=inputs,
476
+ outputs=result,
477
+ execution_time=execution_time,
478
+ )
479
+
480
+ return result
481
+
482
+ except Exception as e:
483
+ execution_time = time.time() - start_time
484
+ error_msg = f"{type(e).__name__}: {e!s}"
485
+
486
+ # Log error
487
+ await self._log_action(
488
+ action_name=action_name,
489
+ inputs=inputs,
490
+ outputs=None,
491
+ execution_time=execution_time,
492
+ error=error_msg,
493
+ )
494
+
495
+ raise
496
+
497
+ return wrapper
498
+
499
async def _get_session(self) -> "HybridBrowserSession":
    r"""Resolve and cache the session instance this toolkit should use.

    ``HybridBrowserSession._get_or_create_instance`` is asked for the
    instance corresponding to the currently stored session. When it hands
    back a different object, that object replaces ``self._session``.

    Returns:
        HybridBrowserSession: The session now stored on ``self._session``.
    """
    resolved = await HybridBrowserSession._get_or_create_instance(
        self._session
    )
    if resolved is self._session:
        return resolved

    logger.debug("Updating to singleton session instance")
    self._session = resolved
    return resolved
508
+
509
+ async def _ensure_browser(self):
510
+ # Get singleton instance and update self._session if needed
511
+ session = await self._get_session()
512
+ await session.ensure_browser()
513
+
514
+ async def _require_page(self):
515
+ # Get singleton instance and update self._session if needed
516
+ session = await self._get_session()
517
+ await session.ensure_browser()
518
+ return await session.get_page()
519
+
520
async def _wait_for_page_stability(self):
    r"""Give the page a bounded chance to settle after an action.

    Three steps run in order: wait for the ``domcontentloaded`` state
    (capped by the page-stability timeout), wait for ``networkidle``
    (capped by the network-idle timeout; a failure is tolerated), then
    sleep 0.2 seconds so pending JavaScript can run. Any failure is logged
    at debug level and otherwise ignored, so this never raises once the
    page has been obtained.
    """
    import asyncio

    page = await self._require_page()

    try:
        # Step 1: DOM content loaded.
        await page.wait_for_load_state(
            'domcontentloaded', timeout=self._page_stability_timeout
        )
        logger.debug("DOM content loaded")

        # Step 2: network idle, optional.
        try:
            await page.wait_for_load_state(
                'networkidle', timeout=self._network_idle_timeout
            )
            logger.debug("Network idle achieved")
        except Exception:
            logger.debug("Network idle timeout - continuing anyway")

        # Step 3: short pause for script execution.
        await asyncio.sleep(0.2)
        logger.debug("Page stability wait completed")
    except Exception as exc:
        logger.debug(f"Page stability wait failed: {exc} - continuing anyway")
551
+
552
+ async def _get_unified_analysis(
553
+ self, max_retries: int = 3
554
+ ) -> Dict[str, Any]:
555
+ r"""Get unified analysis data from the page with retry mechanism for
556
+ navigation issues."""
557
+ page = await self._require_page()
558
+
559
+ for attempt in range(max_retries):
560
+ try:
561
+ if not self._unified_script:
562
+ logger.error("Unified analyzer script not loaded")
563
+ return {"elements": {}, "metadata": {"elementCount": 0}}
564
+
565
+ # Wait for DOM stability before each attempt (with optimized
566
+ # timeout)
567
+ try:
568
+ await page.wait_for_load_state(
569
+ 'domcontentloaded',
570
+ timeout=self._dom_content_loaded_timeout,
571
+ )
572
+ except Exception:
573
+ # Don't fail if DOM wait times out
574
+ pass
575
+
576
+ result = await page.evaluate(self._unified_script)
577
+
578
+ if not isinstance(result, dict):
579
+ logger.warning(f"Invalid result type: {type(result)}")
580
+ return {"elements": {}, "metadata": {"elementCount": 0}}
581
+
582
+ # Success - return result
583
+ if attempt > 0:
584
+ logger.debug(
585
+ f"Unified analysis succeeded on attempt "
586
+ f"{attempt + 1}"
587
+ )
588
+ return result
589
+
590
+ except Exception as e:
591
+ error_msg = str(e)
592
+
593
+ # Check if this is a navigation-related error
594
+ is_navigation_error = (
595
+ "Execution context was destroyed" in error_msg
596
+ or "Most likely because of a navigation" in error_msg
597
+ or "Target page, context or browser has been closed"
598
+ in error_msg
599
+ )
600
+
601
+ if is_navigation_error and attempt < max_retries - 1:
602
+ logger.debug(
603
+ f"Navigation error in unified analysis (attempt "
604
+ f"{attempt + 1}/{max_retries}): {e}. Retrying..."
605
+ )
606
+
607
+ # Wait a bit for page stability before retrying (
608
+ # optimized)
609
+ try:
610
+ await page.wait_for_load_state(
611
+ 'domcontentloaded',
612
+ timeout=self._page_stability_timeout,
613
+ )
614
+ # Reduced delay for JS context to stabilize
615
+ import asyncio
616
+
617
+ await asyncio.sleep(0.1) # Reduced from 0.2s
618
+ except Exception:
619
+ # Continue even if wait fails
620
+ pass
621
+
622
+ continue
623
+
624
+ # Non-navigation error or final attempt - log and return
625
+ # empty result
626
+ if attempt == max_retries - 1:
627
+ logger.warning(
628
+ f"Error in unified analysis after {max_retries} "
629
+ f"attempts: {e}"
630
+ )
631
+ else:
632
+ logger.warning(
633
+ f"Non-retryable error in unified analysis: {e}"
634
+ )
635
+
636
+ return {"elements": {}, "metadata": {"elementCount": 0}}
637
+
638
+ # Should not reach here, but just in case
639
+ return {"elements": {}, "metadata": {"elementCount": 0}}
640
+
641
+ def _convert_analysis_to_rects(
642
+ self, analysis_data: Dict[str, Any]
643
+ ) -> Dict[str, Any]:
644
+ r"""Convert analysis data to rect format for visual marking."""
645
+ rects = {}
646
+ elements = analysis_data.get("elements", {})
647
+
648
+ for ref, element_data in elements.items():
649
+ coordinates = element_data.get("coordinates", [])
650
+ if coordinates:
651
+ rects[ref] = {
652
+ "role": element_data.get("role", "generic"),
653
+ "aria-name": element_data.get("name", ""),
654
+ "rects": [coordinates[0]],
655
+ }
656
+ return rects
657
+
658
def _add_set_of_mark(self, image, rects):
    r"""Return a copy of ``image`` with labelled boxes drawn on it.

    Args:
        image: Image object to annotate; it is copied via ``image.copy()``
            and the drawing happens on the copy.
        rects: Mapping of ref to a dict with optional ``"role"`` and
            ``"rects"`` keys, where each rect is a dict with ``x``, ``y``,
            ``width`` and ``height`` (missing values count as 0).

    Returns:
        The annotated copy, or the original ``image`` object unchanged
        when PIL cannot be imported.
    """
    try:
        from PIL import ImageDraw, ImageFont
    except ImportError:
        logger.warning("PIL not available, returning original image")
        return image

    # Outline/label colour per element role; "default" is the fallback.
    role_colors = {
        "button": "#FF6B6B",
        "link": "#4ECDC4",
        "textbox": "#45B7D1",
        "select": "#96CEB4",
        "checkbox": "#FECA57",
        "radio": "#FF9FF3",
        "default": "#DDA0DD",
    }

    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated)

    # Prefer Arial, then PIL's built-in font, then no font at all.
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except OSError:
        try:
            font = ImageFont.load_default()
        except OSError:
            font = None

    for ref, entry in rects.items():
        boxes = entry.get("rects", [])
        color = role_colors.get(
            entry.get("role", "generic"), role_colors["default"]
        )

        for box in boxes:
            left = box.get("x", 0)
            top = box.get("y", 0)
            box_width = box.get("width", 0)
            box_height = box.get("height", 0)

            # Element outline.
            canvas.rectangle(
                [left, top, left + box_width, top + box_height],
                outline=color,
                width=2,
            )

            # Measure the label; estimate its size when no font loaded.
            if font:
                x0, y0, x1, y1 = canvas.textbbox((0, 0), ref, font=font)
                text_width, text_height = x1 - x0, y1 - y0
            else:
                text_width, text_height = len(ref) * 8, 16

            # Place the label just above the box, clamped to the image.
            label_left = max(0, left - 2)
            label_top = max(0, top - text_height - 2)

            canvas.rectangle(
                [
                    label_left,
                    label_top,
                    label_left + text_width + 4,
                    label_top + text_height + 2,
                ],
                fill=color,
            )
            canvas.text(
                (label_left + 2, label_top + 1),
                ref,
                fill="white",
                font=font,
            )

    return annotated
734
+
735
+ def _format_snapshot_from_analysis(
736
+ self, analysis_data: Dict[str, Any]
737
+ ) -> str:
738
+ r"""Format analysis data into snapshot string."""
739
+ lines = []
740
+ elements = analysis_data.get("elements", {})
741
+
742
+ for ref, element_data in elements.items():
743
+ role = element_data.get("role", "generic")
744
+ name = element_data.get("name", "")
745
+
746
+ line = f"- {role}"
747
+ if name:
748
+ line += f' "{name}"'
749
+
750
+ # Add properties
751
+ props = []
752
+ for prop in ["disabled", "checked", "expanded"]:
753
+ value = element_data.get(prop)
754
+ if value is True:
755
+ props.append(prop)
756
+ elif value is not None and prop in ["checked", "expanded"]:
757
+ props.append(f"{prop}={value}")
758
+
759
+ if props:
760
+ line += f" {' '.join(props)}"
761
+
762
+ line += f" [ref={ref}]"
763
+ lines.append(line)
764
+
765
+ return "\n".join(lines)
766
+
767
async def _get_tab_info_for_output(self) -> Dict[str, Any]:
    r"""Collect tab information to attach to action outputs.

    The session is asked for its tab list and current tab id. If that
    fails, a fallback inspects the session's ``_pages`` / ``_page``
    attributes to estimate how many tabs are open. This method never
    raises for ordinary exceptions.

    Returns:
        Dict[str, Any]: A dict with keys ``"tabs"``, ``"current_tab"`` and
            ``"total_tabs"``. On the fallback path ``"tabs"`` is an empty
            list, ``"current_tab"`` is 0 and ``"total_tabs"`` is at
            least 1.
    """
    try:
        # Ensure we have the correct singleton session instance first
        session = await self._get_session()

        logger.debug("Attempting to get tab info from session...")
        tabs = await session.get_tab_info()
        active_tab = await session.get_current_tab_id()

        logger.debug(
            f"Successfully retrieved {len(tabs)} tabs, current: "
            f"{active_tab}"
        )
        return {
            "tabs": tabs,
            "current_tab": active_tab,
            "total_tabs": len(tabs),
        }
    except Exception as e:
        logger.warning(
            f"Failed to get tab info from session: {type(e).__name__}: "
            f"{e}"
        )

    # Fallback: estimate the tab count from the session's page registry.
    try:
        fallback_session = await self._get_session()
        has_pages_attr = hasattr(fallback_session, '_pages')
        has_main_page = (
            hasattr(fallback_session, '_page')
            and fallback_session._page is not None
        )

        # Snapshot of the session's state, for debugging only.
        session_state = {
            "has_session": fallback_session is not None,
            "has_pages_attr": has_pages_attr,
            "pages_count": len(fallback_session._pages)
            if has_pages_attr
            else "unknown",
            "has_page": has_main_page,
            "session_id": getattr(
                fallback_session, '_session_id', 'unknown'
            ),
        }
        logger.debug(f"Browser session state: {session_state}")

        actual_tab_count = 0
        if has_pages_attr and fallback_session._pages:
            actual_tab_count = len(fallback_session._pages)
            # Prefer counting only pages that are still open; keep the
            # raw count if the pages cannot be inspected.
            try:
                open_pages = [
                    candidate
                    for candidate in fallback_session._pages.values()
                    if not candidate.is_closed()
                ]
                actual_tab_count = len(open_pages)
                logger.debug(
                    f"Found {actual_tab_count} open tabs out of "
                    f"{len(fallback_session._pages)} total"
                )
            except Exception:
                pass

        if actual_tab_count == 0:
            # Both cases report a single tab; only the log line differs.
            if has_main_page:
                logger.debug(
                    "No pages in list but main page exists, assuming 1 tab"
                )
            else:
                logger.debug("No pages found, defaulting to 1 tab")
            actual_tab_count = 1

        logger.debug(f"Using fallback tab count: {actual_tab_count}")
        return {
            "tabs": [],
            "current_tab": 0,
            "total_tabs": actual_tab_count,
        }
    except Exception as fallback_error:
        logger.warning(
            f"Fallback tab count also failed: "
            f"{type(fallback_error).__name__}: {fallback_error}"
        )
        return {"tabs": [], "current_tab": 0, "total_tabs": 1}
867
+
868
async def _exec_with_snapshot(
    self,
    action: Dict[str, Any],
    element_details: Optional[Dict[str, Any]] = None,
) -> Dict[str, str]:
    r"""Execute action and return result with snapshot comparison.

    A full snapshot is captured before and after the action. When both
    snapshots are equal, the returned snapshot is the literal string
    "snapshot not changed" instead of the page content.

    Args:
        action (Dict[str, Any]): The action passed to
            `self._session.exec_action`. Its "type" entry (falling back
            to "unknown") selects the log labels and decides whether the
            page-stability wait runs.
        element_details (Optional[Dict[str, Any]]): Details of the target
            element. Only read (keys "tagName", "name", "role") when a
            click leaves the snapshot unchanged, to enrich the logged
            output. (default: :obj:`None`)

    Returns:
        Dict[str, str]: A dictionary with:
            - "result": The message reported by the executed action.
            - "snapshot": The post-action snapshot, or
              "snapshot not changed".

    Raises:
        Exception: Any exception raised while capturing snapshots or
            executing the action is logged through `_log_action` and
            re-raised unchanged.
    """

    # Log action execution start
    action_type = action.get("type", "unknown")
    logger.info(f"Executing action: {action_type}")

    action_start_time = time.time()
    inputs: Dict[str, Any] = {"action": action}
    # Stays None unless the stability wait below actually runs.
    page_load_time = None

    try:
        # Get before snapshot
        logger.info("Capturing pre-action snapshot...")
        snapshot_start_before = time.time()
        before_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        before_snapshot_time = time.time() - snapshot_start_before
        logger.info(
            f"Pre-action snapshot captured in "
            f"{before_snapshot_time:.2f}s"
        )

        # Execute action
        logger.info(f"Executing {action_type} action...")
        exec_start = time.time()
        exec_result = await self._session.exec_action(action)
        exec_time = time.time() - exec_start
        logger.info(f"Action {action_type} completed in {exec_time:.2f}s")

        # Parse the detailed result from ActionExecutor
        if isinstance(exec_result, dict):
            result_message = exec_result.get("message", str(exec_result))
            action_details = exec_result.get("details", {})
            success = exec_result.get("success", True)
        else:
            # Non-dict results are treated as a successful plain message.
            result_message = str(exec_result)
            action_details = {}
            success = True

        # Wait for page stability after action (especially important for
        # click)
        stability_time: float = 0.0
        if action_type in ["click", "type", "select", "enter"]:
            logger.info(
                f"Waiting for page stability " f"after {action_type}..."
            )
            stability_start = time.time()
            await self._wait_for_page_stability()
            stability_time = time.time() - stability_start
            logger.info(
                f"Page stability wait "
                f"completed in "
                f"{stability_time:.2f}s"
            )
            page_load_time = stability_time

            # Enhanced logging for page loading times
            if self.enable_page_loading_logging and self.log_to_console:
                logger.info(
                    f"[PAGE LOADING] Page stability for {action_type}: "
                    f"{round(stability_time * 1000, 2)}ms"
                )

        # Get after snapshot
        logger.info("Capturing post-action snapshot...")
        snapshot_start_after = time.time()
        after_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        after_snapshot_time = time.time() - snapshot_start_after
        logger.info(
            f"Post-action snapshot "
            f"captured in {after_snapshot_time:.2f}s"
        )

        # Check for snapshot quality and log warnings
        if before_snapshot == after_snapshot:
            # Sentinel string returned to the caller instead of repeating
            # an identical snapshot.
            snapshot = "snapshot not changed"
            logger.debug("Page snapshot unchanged after action")
        else:
            snapshot = after_snapshot
            # Check if snapshot is empty or problematic
            if "<empty>" in after_snapshot:
                logger.warning(
                    f"Action {action_type} resulted "
                    f"in empty snapshot - "
                    f"page may still be loading"
                )
            elif len(after_snapshot.strip()) < 50:
                logger.warning(
                    f"Action {action_type} resulted "
                    f"in very short snapshot:"
                    f" {len(after_snapshot)} chars"
                )
            else:
                logger.debug(
                    f"Action {action_type} resulted "
                    f"in updated snapshot: "
                    f"{len(after_snapshot)} chars"
                )

        # Get tab information for output
        tab_info = await self._get_tab_info_for_output()

        # Create comprehensive output for logging
        # NOTE: `outputs` is only handed to `_log_action`; the value
        # returned to the caller is the smaller dict built at the end.
        execution_time = time.time() - action_start_time
        total_snapshot_time = before_snapshot_time + after_snapshot_time
        outputs = {
            "result": result_message,
            "snapshot": snapshot,
            "success": success,
            "action_details": action_details,
            "execution_stats": {
                "exec_time_ms": round(exec_time * 1000, 2),
                "stability_time_ms": round(stability_time * 1000, 2)
                if stability_time > 0
                else None,
                "snapshot_time_ms": round(total_snapshot_time * 1000, 2),
                "total_time_ms": round(execution_time * 1000, 2),
            },
            **tab_info,  # Include tab information
        }

        # If snapshot is unchanged after click, add element details to
        # log
        if (
            snapshot == "snapshot not changed"
            and action_type == "click"
            and element_details
        ):
            logger.debug(
                "Snapshot unchanged after click. "
                "Adding element details to log."
            )
            outputs["clicked_element_tag"] = element_details.get(
                "tagName", "N/A"
            )
            outputs["clicked_element_content"] = element_details.get(
                "name", ""
            )
            outputs["clicked_element_type"] = element_details.get(
                "role", "generic"
            )

        # Log the action with all details
        await self._log_action(
            action_name=f"_exec_with_snapshot_{action_type}",
            inputs=inputs,
            outputs=outputs,
            execution_time=execution_time,
            page_load_time=page_load_time,
        )

        return {"result": result_message, "snapshot": snapshot}

    except Exception as e:
        execution_time = time.time() - action_start_time
        error_msg = f"{type(e).__name__}: {e!s}"

        # Log error
        await self._log_action(
            action_name=f"_exec_with_snapshot_{action_type}",
            inputs=inputs,
            outputs=None,
            execution_time=execution_time,
            page_load_time=page_load_time,
            error=error_msg,
        )

        # Propagate the original exception to the caller after logging.
        raise
1044
+
1045
+ async def _extract_links_by_refs(
1046
+ self, snapshot: str, page, refs: List[str]
1047
+ ) -> List[Dict[str, str]]:
1048
+ r"""Extract multiple links by their reference IDs."""
1049
+ import re
1050
+
1051
+ found_links = []
1052
+ ref_set = set(refs)
1053
+ lines = snapshot.split('\n')
1054
+
1055
+ for line in lines:
1056
+ link_match = re.search(
1057
+ r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]', line
1058
+ )
1059
+ if link_match and link_match.group(2) in ref_set:
1060
+ text, found_ref = link_match.groups()
1061
+ try:
1062
+ url = await self._get_link_url_by_ref(page, found_ref)
1063
+ found_links.append(
1064
+ {"text": text, "ref": found_ref, "url": url or ""}
1065
+ )
1066
+ except Exception as e:
1067
+ logger.warning(
1068
+ f"Failed to get URL for ref {found_ref}: {e}"
1069
+ )
1070
+ found_links.append(
1071
+ {"text": text, "ref": found_ref, "url": ""}
1072
+ )
1073
+
1074
+ return found_links
1075
+
1076
+ async def _get_link_url_by_ref(self, page, ref: str) -> str:
1077
+ r"""Get URL of a link element by reference ID."""
1078
+ try:
1079
+ element = await page.query_selector(f'[aria-ref="{ref}"]')
1080
+ if element:
1081
+ href = await element.get_attribute('href')
1082
+ if href:
1083
+ from urllib.parse import urljoin
1084
+
1085
+ return urljoin(page.url, href)
1086
+ return ""
1087
+ except Exception as e:
1088
+ logger.warning(f"Failed to get URL for ref {ref}: {e}")
1089
+ return ""
1090
+
1091
def _ensure_agent(self) -> PlaywrightLLMAgent:
    r"""Return the cached PlaywrightLLMAgent, building it on first use.

    Returns:
        PlaywrightLLMAgent: The agent stored on `self._playwright_agent`.

    Raises:
        RuntimeError: If `self._web_agent_model` is `None`.
    """
    if self._web_agent_model is None:
        raise RuntimeError(
            "web_agent_model required for high-level task planning"
        )

    agent = self._playwright_agent
    if agent is None:
        # Lazily construct the agent from the toolkit's own settings.
        agent = PlaywrightLLMAgent(
            headless=self._headless,
            user_data_dir=self._user_data_dir,
            model_backend=self._web_agent_model,
        )
        self._playwright_agent = agent
    return agent
1105
+
1106
+ # Public API Methods
1107
+
1108
async def browser_open(self) -> Dict[str, Any]:
    r"""Starts a new browser session. This must be the first browser
    action.

    This method initializes the browser and navigates to a default start
    page. To visit a specific URL, use `visit_page` after this.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of interactive
              elements.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

    Raises:
        Exception: Any failure while starting the browser or loading the
            start page is re-raised after being recorded through
            `_log_action` (when action or timing logging is enabled).
    """
    action_start = time.time()
    inputs: Dict[str, Any] = {}  # No input parameters for agents

    try:
        # Browser start-up lives inside the try block so that a launch
        # failure is recorded by the error logging below as well.
        logger.info("Starting browser session...")
        browser_start = time.time()
        await self._session.ensure_browser()
        browser_time = time.time() - browser_start
        logger.info(f"Browser session started in {browser_time:.2f}s")

        # Always use the configured default start URL
        start_url = self._default_start_url
        logger.info(f"Navigating to configured default page: {start_url}")

        # browser_visit_page decides itself whether to reuse the current
        # tab or to open a new one.
        result = await self.browser_visit_page(url=start_url)

        # Log success
        if self.enable_action_logging or self.enable_timing_logging:
            execution_time = time.time() - action_start
            await self._log_action(
                action_name="browser_open",
                inputs=inputs,
                outputs={
                    "result": "Browser opened and navigated to "
                    "default page."
                },
                execution_time=execution_time,
            )

        return result

    except Exception as e:
        # Log error
        if self.enable_action_logging or self.enable_timing_logging:
            execution_time = time.time() - action_start
            await self._log_action(
                action_name="browser_open",
                inputs=inputs,
                outputs=None,
                execution_time=execution_time,
                error=f"{type(e).__name__}: {e!s}",
            )
        raise
1170
+
1171
@action_logger
async def browser_close(self) -> str:
    r"""Closes the browser session, releasing all resources.

    This should be called at the end of a task for cleanup.

    Returns:
        str: A confirmation message.
    """
    if self._playwright_agent is not None:
        try:
            await self._playwright_agent.close()
        except Exception as e:
            # Best-effort cleanup: a failing agent shutdown must not stop
            # the session from being closed, but it should leave a trace.
            logger.warning(
                f"Failed to close web agent: {type(e).__name__}: {e}"
            )
        self._playwright_agent = None

    await self._session.close()
    return "Browser session closed."
1189
+
1190
@action_logger
async def browser_visit_page(self, url: str) -> Dict[str, Any]:
    r"""Loads a URL in a new browser tab and makes that tab active.

    When the browser only holds a single "about:blank" tab, that tab is
    reused instead of opening another one. A URL without a scheme is
    prefixed with `https://`.

    Args:
        url (str): The web address to load. This should be a valid and
            existing URL.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the new page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    if not (url and isinstance(url, str)):
        return {
            "result": "Error: 'url' must be a non-empty string",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 1,
        }

    if '://' not in url:
        url = f'https://{url}'

    await self._ensure_browser()
    session = await self._get_session()

    # A freshly started browser exposes one "about:blank" tab; navigate
    # in it rather than leaving an empty tab behind.
    reuse_current_tab = False
    try:
        open_tabs = (await self._get_tab_info_for_output()).get("tabs", [])
        if len(open_tabs) == 1 and open_tabs[0].get("url") == "about:blank":
            logger.info(
                "Found single blank tab, navigating in current tab "
                "instead of creating a new one."
            )
            reuse_current_tab = True
    except Exception as e:
        logger.warning(
            "Could not get tab info to check for blank tab, "
            f"proceeding with default behavior (new tab). Error: {e}"
        )

    if reuse_current_tab:
        logger.info(f"Navigating to URL in current tab: {url}")
        nav_result = await session.visit(url)
    else:
        logger.info(f"Creating new tab and navigating to URL: {url}")
        try:
            tab_id = await session.create_new_tab(url)
            await session.switch_to_tab(tab_id)
            nav_result = f"Visited {url} in new tab {tab_id}"
        except Exception as e:
            logger.error(f"Failed to create new tab and navigate: {e}")
            nav_result = f"Error creating new tab: {e}"

    # A snapshot failure is not fatal; the caller then gets an empty one.
    try:
        snapshot = await session.get_snapshot(
            force_refresh=True, diff_only=False
        )
    except Exception as e:
        logger.warning(f"Failed to capture snapshot: {e}")
        snapshot = ""

    tab_info = await self._get_tab_info_for_output()
    return {"result": nav_result, "snapshot": snapshot, **tab_info}
1268
+
1269
@action_logger
async def browser_back(self) -> Dict[str, Any]:
    r"""Navigates the active tab one step back in its history.

    This behaves like pressing the browser's "back" button.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or a failure
              message.
            - "snapshot" (str): A textual snapshot of the previous page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    import asyncio

    page = await self._require_page()

    try:
        logger.info("Navigating back in browser history...")
        started = time.time()
        await page.go_back(
            wait_until="domcontentloaded", timeout=self._navigation_timeout
        )
        elapsed = time.time() - started
        logger.info(f"Back navigation completed in {elapsed:.2f}s")

        # Short pause so the page can settle before it is inspected.
        await asyncio.sleep(0.2)

        logger.info("Capturing page snapshot after back navigation...")
        started = time.time()
        page_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        elapsed = time.time() - started
        logger.info(f"Back navigation snapshot captured in {elapsed:.2f}s")

        tabs = await self._get_tab_info_for_output()
        return {
            "result": "Back navigation successful.",
            "snapshot": page_snapshot,
            **tabs,
        }

    except Exception as e:
        logger.warning(f"Back navigation failed: {e}")
        # Report the page as it currently is, even though the navigation
        # did not succeed.
        page_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        tabs = await self._get_tab_info_for_output()
        return {
            "result": f"Back navigation failed: {e!s}",
            "snapshot": page_snapshot,
            **tabs,
        }
1333
+
1334
@action_logger
async def browser_forward(self) -> Dict[str, Any]:
    r"""Navigates the active tab one step forward in its history.

    This behaves like pressing the browser's "forward" button.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or a failure
              message.
            - "snapshot" (str): A textual snapshot of the next page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    import asyncio

    page = await self._require_page()

    try:
        logger.info("Navigating forward in browser history...")
        started = time.time()
        await page.go_forward(
            wait_until="domcontentloaded", timeout=self._navigation_timeout
        )
        elapsed = time.time() - started
        logger.info(f"Forward navigation completed in {elapsed:.2f}s")

        # Short pause so the page can settle before it is inspected.
        await asyncio.sleep(0.2)

        logger.info("Capturing page snapshot after forward navigation...")
        started = time.time()
        page_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        elapsed = time.time() - started
        logger.info(
            f"Forward navigation snapshot captured in {elapsed:.2f}s"
        )

        tabs = await self._get_tab_info_for_output()
        return {
            "result": "Forward navigation successful.",
            "snapshot": page_snapshot,
            **tabs,
        }

    except Exception as e:
        logger.warning(f"Forward navigation failed: {e}")
        # Report the page as it currently is, even though the navigation
        # did not succeed.
        page_snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        tabs = await self._get_tab_info_for_output()
        return {
            "result": f"Forward navigation failed: {e!s}",
            "snapshot": page_snapshot,
            **tabs,
        }
1399
+
1400
@action_logger
async def browser_get_page_snapshot(self) -> str:
    r"""Returns a textual snapshot of the page's interactive elements.

    The snapshot lists elements such as buttons, links, and inputs, each
    tagged with a unique `ref` ID. Other tools (e.g., `click`, `type`)
    take that ID to address a specific element. No visual information is
    provided by this tool.

    Returns:
        str: A formatted string representing the interactive elements and
            their `ref` IDs. For example:
            '- link "Sign In" [ref=1]'
            '- textbox "Username" [ref=2]'
    """
    logger.info("Capturing page snapshot")

    started = time.time()
    analysis = await self._get_unified_analysis()
    elapsed = time.time() - started
    logger.info(f"Page snapshot analysis completed in {elapsed:.2f}s")

    # Prefer the ready-made text; otherwise build it from the analysis.
    text = analysis.get("snapshotText", "")
    if text:
        return text
    return self._format_snapshot_from_analysis(analysis)
1431
+
1432
@dependencies_required('PIL')
@action_logger
async def browser_get_som_screenshot(
    self,
    read_image: bool = True,
    instruction: Optional[str] = None,
):
    r"""Captures a screenshot with interactive elements highlighted.

    "SoM" stands for "Set of Marks". This tool takes a screenshot and
    draws boxes around clickable elements, overlaying a `ref` ID on each.
    Use this for a visual understanding of the page, especially when the
    textual snapshot is not enough.

    The marked screenshot is written as a PNG file into the toolkit's
    cache directory.

    Args:
        read_image (bool, optional): If `True`, the saved screenshot is
            sent to the registered agent for analysis. Requires agent to
            be registered. (default: :obj:`True`)
        instruction (Optional[str], optional): A specific question or
            command for the agent regarding the screenshot, used only if
            `read_image` is `True`. For example: "Find the login button."

    Returns:
        str: A summary message, e.g., "Visual webpage screenshot captured
            with 42 interactive elements.". When `read_image` is `True`
            and no agent is registered, or the analysis fails, an error
            note is appended. The agent's reply itself is not part of
            the returned text.
    """
    from PIL import Image

    os.makedirs(self._cache_dir, exist_ok=True)
    # Get screenshot and analysis
    page = await self._require_page()

    # Log screenshot timeout start
    logger.info(
        f"Starting screenshot capture "
        f"with timeout: {self._screenshot_timeout}ms"
    )

    start_time = time.time()
    image_data = await page.screenshot(timeout=self._screenshot_timeout)
    screenshot_time = time.time() - start_time

    logger.info(f"Screenshot capture completed in {screenshot_time:.2f}s")
    image = Image.open(io.BytesIO(image_data))

    # Log unified analysis start
    logger.info("Starting unified page analysis...")
    analysis_start_time = time.time()
    analysis_data = await self._get_unified_analysis()
    analysis_time = time.time() - analysis_start_time
    logger.info(f"Unified page analysis completed in {analysis_time:.2f}s")

    # Log image processing
    logger.info("Processing visual marks on screenshot...")
    mark_start_time = time.time()
    rects = self._convert_analysis_to_rects(analysis_data)
    marked_image = self._add_set_of_mark(image, rects)
    mark_time = time.time() - mark_start_time
    logger.info(f"Visual marks processing completed in {mark_time:.2f}s")

    # Save screenshot to cache directory; the file name is derived from
    # the URL path plus a timestamp.
    parsed_url = urllib.parse.urlparse(page.url)
    url_name = sanitize_filename(str(parsed_url.path), max_length=241)
    timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
    file_path = os.path.join(
        self._cache_dir, f"{url_name}_{timestamp}_som.png"
    )
    marked_image.save(file_path, "PNG")

    text_result = (
        f"Visual webpage screenshot captured with {len(rects)} "
        f"interactive elements."
    )

    # Analyze image if requested and agent is registered
    if read_image and file_path:
        if self.agent is None:
            logger.error(
                "Cannot analyze screenshot: No agent registered. "
                "Please pass this toolkit to ChatAgent via "
                "toolkits_to_register_agent parameter."
            )
            text_result += (
                " Error: No agent registered for image analysis. "
                "Please pass this toolkit to ChatAgent via "
                "toolkits_to_register_agent parameter."
            )
        else:
            try:
                # Load the image and create a message
                from camel.messages import BaseMessage

                img = Image.open(file_path)
                inst = instruction if instruction is not None else ""
                message = BaseMessage.make_user_message(
                    role_name="User",
                    content=inst,
                    image_list=[img],
                )

                # Hand the screenshot to the agent; its response is not
                # added to the returned summary.
                await self.agent.astep(message)
            except Exception as e:
                logger.error(f"Error analyzing screenshot: {e}")
                # text_result already ends with a period, so only a
                # space is needed before the error note.
                text_result += f" Error analyzing screenshot: {e}"

    return text_result
1543
+
1544
async def browser_click(self, *, ref: str) -> Dict[str, Any]:
    r"""Clicks an element on the page.

    Args:
        ref (str): The `ref` ID of the element to click. This ID is
            obtained from a page snapshot (`get_page_snapshot` or
            `get_som_screenshot`).

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message when the `ref` is unknown.
            - "snapshot" (str): A textual snapshot of the page after the
              click.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "click")

    page_analysis = await self._get_unified_analysis()
    known_elements = page_analysis.get("elements", {})

    if ref in known_elements:
        outcome = await self._exec_with_snapshot(
            {"type": "click", "ref": ref},
            element_details=known_elements.get(ref),
        )
        # Merge the tab overview into the action result.
        outcome.update(await self._get_tab_info_for_output())
        return outcome

    # Unknown ref: report the error together with the current snapshot
    # so the caller has context for choosing a valid element.
    logger.error(f"Error: Element reference '{ref}' not found. ")
    return {
        "result": f"Error: Element reference '{ref}' not found. ",
        "snapshot": self._format_snapshot_from_analysis(page_analysis),
        **(await self._get_tab_info_for_output()),
    }
1587
+
1588
async def browser_type(self, *, ref: str, text: str) -> Dict[str, Any]:
    r"""Enters text into an input element on the page.

    Args:
        ref (str): The `ref` ID of the input element, from a snapshot.
        text (str): The text to type into the element.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the page after
              typing.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "type")
    # Run the page analysis before acting (ensures aria-ref attributes);
    # its return value is not needed here.
    await self._get_unified_analysis()

    outcome = await self._exec_with_snapshot(
        {"type": "type", "ref": ref, "text": text}
    )
    # Merge the tab overview into the action result.
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1615
+
1616
async def browser_select(self, *, ref: str, value: str) -> Dict[str, Any]:
    r"""Chooses an option in a dropdown (`<select>`) element.

    Args:
        ref (str): The `ref` ID of the `<select>` element.
        value (str): The `value` attribute of the `<option>` to select,
            not its visible text.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the page after the
              selection.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "select")
    # The analysis is run before acting; its return value is not used.
    await self._get_unified_analysis()

    outcome = await self._exec_with_snapshot(
        {"type": "select", "ref": ref, "value": value}
    )
    # Merge the tab overview into the action result.
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1644
+
1645
async def browser_scroll(
    self, *, direction: str, amount: int
) -> Dict[str, Any]:
    r"""Scrolls the window of the current page.

    Args:
        direction (str): The direction to scroll: 'up' or 'down'.
        amount (int): The number of pixels to scroll.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action, or an error
              message for an unsupported direction.
            - "snapshot" (str): A snapshot of the page after scrolling.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    if direction in ("up", "down"):
        outcome = await self._exec_with_snapshot(
            {"type": "scroll", "direction": direction, "amount": amount}
        )
        # Merge the tab overview into the action result.
        outcome.update(await self._get_tab_info_for_output())
        return outcome

    # Unsupported direction: nothing is executed and no snapshot is taken.
    return {
        "result": "Error: direction must be 'up' or 'down'",
        "snapshot": "",
        **(await self._get_tab_info_for_output()),
    }
1678
+
1679
async def browser_enter(self) -> Dict[str, Any]:
    r"""Presses the Enter key on whichever element currently has focus.

    This is useful for submitting forms or search queries after using the
    `type` tool.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A new page snapshot, as this action often
              triggers navigation.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # The action carries no ref: Enter always goes to the focused element.
    outcome = await self._exec_with_snapshot({"type": "enter"})
    # Merge the tab overview into the action result.
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1705
+
1706
@action_logger
async def browser_wait_user(
    self, timeout_sec: Optional[float] = None
) -> Dict[str, Any]:
    r"""Suspends execution until a human presses Enter in the console.

    Use this for tasks requiring manual steps, like solving a CAPTCHA.
    The agent resumes once the user presses Enter in the console, or
    when the optional timeout expires.

    Args:
        timeout_sec (Optional[float]): Max time to wait in seconds. If
            `None`, it will wait indefinitely.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): A message indicating how the wait ended.
            - "snapshot" (str): The page snapshot after the wait.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    import asyncio

    prompt = (
        "🕑 Agent waiting for human input. "
        "Complete action in browser, then press Enter..."
    )
    logger.info(f"\n{prompt}\n")

    async def _await_enter():
        # input() blocks, so it runs in a worker thread.
        await asyncio.to_thread(input, ">>> Press Enter to resume <<<\n")

    try:
        if timeout_sec is None:
            logger.info("Waiting for user input (no timeout)")
            began = time.time()
            await _await_enter()
        else:
            logger.info(
                f"Waiting for user input with timeout: {timeout_sec}s"
            )
            began = time.time()
            await asyncio.wait_for(_await_enter(), timeout=timeout_sec)
        waited = time.time() - began
        logger.info(f"User input received after {waited:.2f}s")
        result_msg = "User resumed."
    except asyncio.TimeoutError:
        waited = timeout_sec or 0.0
        logger.info(
            f"User input timeout reached after {waited}s, auto-resuming"
        )
        result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."

    page_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    tabs = await self._get_tab_info_for_output()

    return {"result": result_msg, "snapshot": page_snapshot, **tabs}
1770
+
1771
@action_logger
async def browser_get_page_links(
    self, *, ref: List[str]
) -> Dict[str, Any]:
    r"""Gets the destination URLs for a list of link elements.

    This is useful to know where a link goes before clicking it.

    Args:
        ref (List[str]): A list of `ref` IDs for link elements, obtained
            from a page snapshot.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - "links" (List[Dict]): A list of found links, where each
              link has "text", "ref", and "url" keys.
    """
    # Only a non-empty list made up entirely of non-empty strings is
    # accepted; anything else yields an empty result instead of an error.
    refs_are_valid = (
        bool(ref)
        and isinstance(ref, list)
        and all(item and isinstance(item, str) for item in ref)
    )
    if not refs_are_valid:
        return {"links": []}

    page = await self._require_page()
    # Take a full, fresh snapshot so the refs are resolved against the
    # current state of the page.
    fresh_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    found = await self._extract_links_by_refs(fresh_snapshot, page, ref)
    return {"links": found}
1802
+
1803
@action_logger
async def browser_solve_task(
    self, task_prompt: str, start_url: str, max_steps: int = 15
) -> str:
    r"""Delegates a complex, high-level task to a specialized web agent.

    Use this for multi-step tasks that can be described in a single prompt
    (e.g., "log into my account and check for new messages"). The agent
    will autonomously perform the necessary browser actions.

    NOTE: This is a high-level action; for simple interactions, use tools
    like `browser_click` and `browser_type`. `web_agent_model` must be
    provided during toolkit initialization.

    Args:
        task_prompt (str): A natural language description of the task.
        start_url (str): The URL to start the task from. This should be a
            valid and existing URL, as agents may generate non-existent
            ones.
        max_steps (int): The maximum number of steps the agent can take.

    Returns:
        str: A summary message indicating the task has finished.
    """
    web_agent = self._ensure_agent()
    # Open the starting page first, then let the agent work on the prompt
    # from there.
    await web_agent.navigate(start_url)
    await web_agent.process_command(task_prompt, max_steps=max_steps)
    return "Task processing finished - see stdout for detailed trace."
1832
+
1833
def get_log_summary(self) -> Dict[str, Any]:
    r"""Get a summary of logged actions.

    Returns:
        Dict[str, Any]: When the log buffer is empty, a dictionary with
            "total_actions" set to 0 and a "summary" message. Otherwise a
            dictionary containing:
            - "total_actions" (int): Number of entries in the buffer.
            - "total_execution_time_ms" (float): Sum of the
              "execution_time_ms" values, rounded to 2 decimals.
            - "total_page_load_time_ms" (float): Sum of the
              "page_load_time_ms" values, rounded to 2 decimals.
            - "action_counts" (Dict[str, int]): Number of entries per
              "action" value.
            - "error_count" (int): Number of entries with an "error" key.
            - "success_rate" (float): Percentage of entries without an
              "error" key, rounded to 2 decimals.
    """
    if not self.log_buffer:
        return {"total_actions": 0, "summary": "No actions logged"}

    total_actions = len(self.log_buffer)
    total_execution_time = 0
    total_page_load_time = 0
    action_counts: Dict[str, int] = {}
    error_count = 0

    # A single pass over the buffer gathers every statistic.
    for entry in self.log_buffer:
        # Entries without a timing field simply contribute nothing.
        total_execution_time += entry.get("execution_time_ms", 0)
        total_page_load_time += entry.get("page_load_time_ms", 0)
        action = entry["action"]
        action_counts[action] = action_counts.get(action, 0) + 1
        if "error" in entry:
            error_count += 1

    # total_actions is at least 1 here (the empty buffer returned above),
    # so the division needs no zero guard.
    success_rate = round(
        (total_actions - error_count) / total_actions * 100, 2
    )

    return {
        "total_actions": total_actions,
        "total_execution_time_ms": round(total_execution_time, 2),
        "total_page_load_time_ms": round(total_page_load_time, 2),
        "action_counts": action_counts,
        "error_count": error_count,
        "success_rate": success_rate,
    }
1869
+
1870
def clear_logs(self) -> None:
    r"""Empty the in-memory action log buffer and record that it was
    cleared."""
    buffer = self.log_buffer
    # Cleared in place, so other references to the buffer see it emptied.
    buffer.clear()
    logger.info("Log buffer cleared")
1874
+
1875
def clone_for_new_session(
    self, new_session_id: Optional[str] = None
) -> "HybridBrowserToolkit":
    r"""Build another HybridBrowserToolkit that reuses this toolkit's
    settings under a different session.

    Args:
        new_session_id (Optional[str]): Session ID for the clone. When
            `None`, the first 8 characters of a random UUID are used.

    Returns:
        HybridBrowserToolkit: A new instance created with this toolkit's
            settings, the new session ID, and a cache directory derived
            from this toolkit's cache directory and the new session ID.
    """
    import uuid

    session_id = new_session_id
    if session_id is None:
        # Same 8 leading hex digits as the string form of the UUID.
        session_id = uuid.uuid4().hex[:8]

    # The clone gets its own cache directory next to the original one.
    base_cache_dir = self._cache_dir.rstrip('/')
    clone_cache_dir = f"{base_cache_dir}_clone_{session_id}/"

    return HybridBrowserToolkit(
        headless=self._headless,
        user_data_dir=self._user_data_dir,
        stealth=self._stealth,
        web_agent_model=self._web_agent_model,
        cache_dir=clone_cache_dir,
        # Copied so the clone does not share this instance's list.
        enabled_tools=self.enabled_tools.copy(),
        browser_log_to_file=self._browser_log_to_file,
        session_id=session_id,
        default_start_url=self._default_start_url,
        default_timeout=self._default_timeout,
        short_timeout=self._short_timeout,
        navigation_timeout=self._navigation_timeout,
        network_idle_timeout=self._network_idle_timeout,
        screenshot_timeout=self._screenshot_timeout,
        page_stability_timeout=self._page_stability_timeout,
        dom_content_loaded_timeout=self._dom_content_loaded_timeout,
    )
1913
+
1914
@action_logger
async def browser_switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Switches to a different browser tab using its ID.

    After switching, all actions will apply to the new tab. Use
    `browser_get_tab_info` to find the ID of the tab you want to switch
    to.

    Args:
        tab_id (str): The ID of the tab to activate.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the newly active tab.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    await self._ensure_browser()
    session = await self._get_session()

    if await session.switch_to_tab(tab_id):
        message = f"Successfully switched to tab {tab_id}"
        snapshot = await session.get_snapshot(
            force_refresh=True, diff_only=False
        )
    else:
        # A failed switch reports an empty snapshot.
        message = f"Failed to switch to tab {tab_id}. Tab may not exist."
        snapshot = ""

    # Tab information is reported for both outcomes.
    tab_info = await self._get_tab_info_for_output()
    return {"result": message, "snapshot": snapshot, **tab_info}
1958
+
1959
@action_logger
async def browser_close_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Closes a browser tab using its ID.

    Use `browser_get_tab_info` to find the ID of the tab to close. After
    closing, the browser will switch to another tab if available.

    Args:
        tab_id (str): The ID of the tab to close.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the active tab after
              closure.
            - "tabs" (List[Dict]): Information about remaining tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of remaining tabs.
    """
    await self._ensure_browser()
    session = await self._get_session()

    if await session.close_tab(tab_id):
        message = f"Successfully closed tab {tab_id}"
        # Get current state after closing the tab. Deliberately
        # best-effort: if the snapshot cannot be taken, report an empty
        # one instead of failing the whole action.
        try:
            snapshot = await session.get_snapshot(
                force_refresh=True, diff_only=False
            )
        except Exception:
            snapshot = ""  # No active tab
    else:
        message = f"Failed to close tab {tab_id}. Tab may not exist."
        snapshot = ""

    # Tab information is reported for both outcomes.
    tab_info = await self._get_tab_info_for_output()
    return {"result": message, "snapshot": snapshot, **tab_info}
2009
+
2010
@action_logger
async def browser_get_tab_info(self) -> Dict[str, Any]:
    r"""Gets a list of all open browser tabs and their information.

    This includes each tab's index, title, and URL, and indicates which
    tab is currently active. Use this to manage multiple tabs.

    Returns:
        Dict[str, Any]: A dictionary with tab information:
            - "tabs" (List[Dict]): A list of open tabs, each with:
                - "index" (int): The tab's zero-based index.
                - "title" (str): The page title.
                - "url" (str): The current URL.
                - "is_current" (bool): True if the tab is active.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # Ensure the browser before asking for tab information.
    await self._ensure_browser()
    tab_info = await self._get_tab_info_for_output()
    return tab_info
2029
+
2030
def get_tools(self) -> List[FunctionTool]:
    r"""Build the function tools selected by `self.enabled_tools`.

    Names that are not known tools are skipped with a warning, and
    "browser_solve_task" is skipped with a warning when no
    `web_agent_model` was provided.

    Returns:
        List[FunctionTool]: One tool per usable enabled name, in the order
            the names appear in `self.enabled_tools`.
    """
    # Every tool is exposed under the name of the method implementing it.
    known_tool_names = (
        "browser_open",
        "browser_close",
        "browser_visit_page",
        "browser_back",
        "browser_forward",
        "browser_get_page_snapshot",
        "browser_get_som_screenshot",
        "browser_get_page_links",
        "browser_click",
        "browser_type",
        "browser_select",
        "browser_scroll",
        "browser_enter",
        "browser_wait_user",
        "browser_solve_task",
        "browser_switch_tab",
        "browser_close_tab",
        "browser_get_tab_info",
    )
    available = {name: getattr(self, name) for name in known_tool_names}

    selected: List[FunctionTool] = []
    for name in self.enabled_tools:
        # Without a web agent model this tool is left out.
        if name == "browser_solve_task" and self._web_agent_model is None:
            logger.warning(
                f"Tool '{name}' is enabled but web_agent_model "
                f"is not provided. Skipping this tool."
            )
            continue

        method = available.get(name)
        if method is None:
            logger.warning(f"Unknown tool name: {name}")
            continue

        selected.append(FunctionTool(cast(Callable[..., Any], method)))

    logger.info(f"Returning {len(selected)} enabled tools")
    return selected