camel-ai 0.2.73a1__py3-none-any.whl → 0.2.73a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

@@ -0,0 +1,1994 @@
1
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14
+
15
+ import datetime
16
+ import io
17
+ import json
18
+ import os
19
+ import time
20
+ import urllib.parse
21
+ from functools import wraps
22
+ from typing import Any, Callable, ClassVar, Dict, List, Optional, cast
23
+
24
+ from camel.logger import get_logger
25
+ from camel.models import BaseModelBackend
26
+ from camel.toolkits.base import BaseToolkit
27
+ from camel.toolkits.function_tool import FunctionTool
28
+ from camel.utils import sanitize_filename
29
+ from camel.utils.commons import dependencies_required
30
+
31
+ from .agent import PlaywrightLLMAgent
32
+ from .browser_session import HybridBrowserSession
33
+ from .config_loader import ConfigLoader
34
+
35
+ logger = get_logger(__name__)
36
+
37
+
38
+ class HybridBrowserToolkit(BaseToolkit):
39
+ r"""A hybrid browser toolkit that combines non-visual, DOM-based browser
40
+ automation with visual, screenshot-based capabilities.
41
+
42
+ This toolkit exposes a set of actions as CAMEL FunctionTools for agents
43
+ to interact with web pages. It can operate in headless mode and supports
44
+ both programmatic control of browser actions (like clicking and typing)
45
+ and visual analysis of the page layout through screenshots with marked
46
+ interactive elements.
47
+ """
48
+
49
+ # Default tool list - core browser functionality
50
+ DEFAULT_TOOLS: ClassVar[List[str]] = [
51
+ "open_browser",
52
+ "close_browser",
53
+ "visit_page",
54
+ "back",
55
+ "forward",
56
+ "click",
57
+ "type",
58
+ "switch_tab",
59
+ ]
60
+
61
+ # All available tools
62
+ ALL_TOOLS: ClassVar[List[str]] = [
63
+ "open_browser",
64
+ "close_browser",
65
+ "visit_page",
66
+ "back",
67
+ "forward",
68
+ "get_page_snapshot",
69
+ "get_som_screenshot",
70
+ "get_page_links",
71
+ "click",
72
+ "type",
73
+ "select",
74
+ "scroll",
75
+ "enter",
76
+ "wait_user",
77
+ "solve_task",
78
+ "switch_tab",
79
+ "close_tab",
80
+ "get_tab_info",
81
+ ]
82
+
83
def __init__(
    self,
    *,
    headless: bool = True,
    user_data_dir: Optional[str] = None,
    stealth: bool = False,
    web_agent_model: Optional[BaseModelBackend] = None,
    cache_dir: str = "tmp/",
    enabled_tools: Optional[List[str]] = None,
    browser_log_to_file: bool = False,
    session_id: Optional[str] = None,
    default_start_url: str = "https://google.com/",
    default_timeout: Optional[int] = None,
    short_timeout: Optional[int] = None,
    navigation_timeout: Optional[int] = None,
    network_idle_timeout: Optional[int] = None,
    screenshot_timeout: Optional[int] = None,
    page_stability_timeout: Optional[int] = None,
    dom_content_loaded_timeout: Optional[int] = None,
) -> None:
    r"""Initialize the HybridBrowserToolkit.

    Args:
        headless (bool): Whether to run the browser in headless mode.
            Defaults to `True`.
        user_data_dir (Optional[str]): Path to a directory for storing
            browser data like cookies and local storage. Useful for
            maintaining sessions across runs. Defaults to `None` (a
            temporary directory is used).
        stealth (bool): Whether to run the browser in stealth mode to
            avoid bot detection. When enabled, hides WebDriver
            characteristics, spoofs navigator properties, and implements
            various anti-detection measures. Highly recommended for
            production use and when accessing sites with bot detection.
            Defaults to `False`.
        web_agent_model (Optional[BaseModelBackend]): The language model
            backend to use for the high-level `solve_task` agent. This is
            required only if you plan to use `solve_task`.
            Defaults to `None`.
        cache_dir (str): The directory to store cached files, such as
            screenshots. Defaults to `"tmp/"`.
        enabled_tools (Optional[List[str]]): List of tool names to enable.
            If None, uses DEFAULT_TOOLS. Available tools: open_browser,
            close_browser, visit_page, back, forward, get_page_snapshot,
            get_som_screenshot, get_page_links, click, type, select,
            scroll, enter, wait_user, solve_task, switch_tab, close_tab,
            get_tab_info. Defaults to `None`.
        browser_log_to_file (bool): Whether to save detailed browser
            action logs to file. When enabled, logs action
            inputs/outputs, execution times, and page loading times.
            Logs are saved to an auto-generated timestamped file under
            the `browser_log` directory. Defaults to `False`.
        session_id (Optional[str]): A unique identifier for this browser
            session. When multiple HybridBrowserToolkit instances are used
            concurrently, different session IDs prevent them from sharing
            the same browser session and causing conflicts. If None, a
            default session will be used. Defaults to `None`.
        default_start_url (str): The default URL to navigate to when
            open_browser() is called without a start_url parameter or with
            None. Defaults to `"https://google.com/"`.
        default_timeout (Optional[int]): Default timeout in milliseconds
            for browser actions. If None, uses environment variable
            HYBRID_BROWSER_DEFAULT_TIMEOUT or defaults to 3000ms.
            Defaults to `None`.
        short_timeout (Optional[int]): Short timeout in milliseconds
            for quick browser actions. If None, uses environment variable
            HYBRID_BROWSER_SHORT_TIMEOUT or defaults to 1000ms.
            Defaults to `None`.
        navigation_timeout (Optional[int]): Custom navigation timeout in
            milliseconds. If None, uses environment variable
            HYBRID_BROWSER_NAVIGATION_TIMEOUT or defaults to 10000ms.
            Defaults to `None`.
        network_idle_timeout (Optional[int]): Custom network idle
            timeout in milliseconds. If None, uses environment variable
            HYBRID_BROWSER_NETWORK_IDLE_TIMEOUT or defaults to 5000ms.
            Defaults to `None`.
        screenshot_timeout (Optional[int]): Custom screenshot timeout in
            milliseconds. If None, uses environment variable
            HYBRID_BROWSER_SCREENSHOT_TIMEOUT or defaults to 15000ms.
            Defaults to `None`.
        page_stability_timeout (Optional[int]): Custom page stability
            timeout in milliseconds. If None, uses environment variable
            HYBRID_BROWSER_PAGE_STABILITY_TIMEOUT or defaults to 1500ms.
            Defaults to `None`.
        dom_content_loaded_timeout (Optional[int]): Custom DOM content
            loaded timeout in milliseconds. If None, uses environment
            variable HYBRID_BROWSER_DOM_CONTENT_LOADED_TIMEOUT or defaults
            to 5000ms. Defaults to `None`.

    Raises:
        ValueError: If `enabled_tools` contains a name that is not listed
            in `ALL_TOOLS`.
    """
    super().__init__()
    self._headless = headless
    self._user_data_dir = user_data_dir
    self._stealth = stealth
    self._web_agent_model = web_agent_model
    self._cache_dir = cache_dir
    self._browser_log_to_file = browser_log_to_file
    self._default_start_url = default_start_url
    self._session_id = session_id or "default"

    # Store timeout configuration. The two generic timeouts are kept as
    # given and forwarded to the session below; the specialised ones are
    # resolved through ConfigLoader.
    self._default_timeout = default_timeout
    self._short_timeout = short_timeout
    self._navigation_timeout = ConfigLoader.get_navigation_timeout(
        navigation_timeout
    )
    self._network_idle_timeout = ConfigLoader.get_network_idle_timeout(
        network_idle_timeout
    )
    self._screenshot_timeout = ConfigLoader.get_screenshot_timeout(
        screenshot_timeout
    )
    self._page_stability_timeout = ConfigLoader.get_page_stability_timeout(
        page_stability_timeout
    )
    self._dom_content_loaded_timeout = (
        ConfigLoader.get_dom_content_loaded_timeout(
            dom_content_loaded_timeout
        )
    )

    # Logging configuration - fixed values for simplicity
    self.enable_action_logging = True
    self.enable_timing_logging = True
    self.enable_page_loading_logging = True
    self.log_to_console = False  # Always disabled for cleaner output
    self.log_to_file = browser_log_to_file
    self.max_log_length: Optional[int] = None  # No truncation

    # Set up log file if needed
    if self.log_to_file:
        # Create log directory if it doesn't exist
        log_dir = "browser_log"
        os.makedirs(log_dir, exist_ok=True)

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # Use the normalised session id so the file name never contains
        # the literal text "None" when no session_id was supplied.
        self.log_file_path: Optional[str] = os.path.join(
            log_dir,
            f"hybrid_browser_toolkit_{timestamp}_{self._session_id}.log",
        )
    else:
        self.log_file_path = None

    # Initialize log buffer for in-memory storage
    self.log_buffer: List[Dict[str, Any]] = []

    # Configure enabled tools
    if enabled_tools is None:
        self.enabled_tools = self.DEFAULT_TOOLS.copy()
    else:
        # Validate enabled tools
        invalid_tools = [
            tool for tool in enabled_tools if tool not in self.ALL_TOOLS
        ]
        if invalid_tools:
            raise ValueError(
                f"Invalid tools specified: {invalid_tools}. "
                f"Available tools: {self.ALL_TOOLS}"
            )
        self.enabled_tools = enabled_tools.copy()

    logger.info(f"Enabled tools: {self.enabled_tools}")

    # Log initialization if file logging is enabled
    if self.log_to_file:
        logger.info(
            "HybridBrowserToolkit initialized with file logging enabled"
        )
        logger.info(f"Log file path: {self.log_file_path}")

    # Core components
    temp_session = HybridBrowserSession(
        headless=headless,
        user_data_dir=user_data_dir,
        stealth=stealth,
        session_id=session_id,
        default_timeout=default_timeout,
        short_timeout=short_timeout,
    )
    # Use the session directly - singleton logic is handled in
    # ensure_browser
    self._session = temp_session
    self._agent: Optional[PlaywrightLLMAgent] = None
    self._unified_script = self._load_unified_analyzer()
272
+
273
@property
def web_agent_model(self) -> Optional[BaseModelBackend]:
    r"""Model backend stored for the web agent, or `None` if unset."""
    return self._web_agent_model

@web_agent_model.setter
def web_agent_model(self, value: Optional[BaseModelBackend]) -> None:
    r"""Replace the stored web agent model backend.

    Args:
        value (Optional[BaseModelBackend]): The new backend, or `None`
            to clear it.
    """
    self._web_agent_model = value
282
+
283
@property
def cache_dir(self) -> str:
    r"""Directory path that was supplied as `cache_dir` at construction."""
    return self._cache_dir
287
+
288
def __del__(self):
    r"""Best-effort browser cleanup when the toolkit is garbage collected.

    Nothing is done while the interpreter is finalizing, or when the
    event loop is closed or already running. Otherwise `close_browser()`
    is driven to completion with a two second cap. Every error is
    swallowed so that garbage collection never raises.
    """
    try:
        import sys

        interpreter_finalizing = getattr(
            sys, "is_finalizing", lambda: False
        )
        if interpreter_finalizing():
            return

        import asyncio

        try:
            loop = asyncio.get_event_loop()
            if loop.is_closed() or loop.is_running():
                # Cannot synchronously drive a coroutine on this loop.
                return
            try:
                # Bound the wait so a stuck browser cannot hang teardown.
                loop.run_until_complete(
                    asyncio.wait_for(self.close_browser(), timeout=2.0)
                )
            except asyncio.TimeoutError:
                pass  # Skip cleanup if it takes too long
        except (RuntimeError, ImportError):
            pass  # Event loop unavailable, skip cleanup
    except Exception:
        pass  # Suppress all errors during garbage collection
312
+
313
+ def _load_unified_analyzer(self) -> str:
314
+ r"""Load the unified analyzer JavaScript script."""
315
+ script_path = os.path.join(
316
+ os.path.dirname(os.path.abspath(__file__)), "unified_analyzer.js"
317
+ )
318
+
319
+ try:
320
+ with open(
321
+ script_path, "r", encoding='utf-8', errors='replace'
322
+ ) as f:
323
+ script_content = f.read()
324
+
325
+ if not script_content.strip():
326
+ raise ValueError(f"Script is empty: {script_path}")
327
+
328
+ logger.debug(
329
+ f"Loaded unified analyzer ({len(script_content)} chars)"
330
+ )
331
+ return script_content
332
+ except FileNotFoundError:
333
+ raise FileNotFoundError(f"Script not found: {script_path}")
334
+
335
+ def _validate_ref(self, ref: str, method_name: str) -> None:
336
+ r"""Validate ref parameter."""
337
+ if not ref or not isinstance(ref, str):
338
+ raise ValueError(
339
+ f"{method_name}: 'ref' must be a non-empty string"
340
+ )
341
+
342
+ def _truncate_if_needed(self, content: Any) -> str:
343
+ r"""Truncate content if max_log_length is set."""
344
+ content_str = str(content)
345
+ if (
346
+ self.max_log_length is not None
347
+ and len(content_str) > self.max_log_length
348
+ ):
349
+ return content_str[: self.max_log_length] + "... [TRUNCATED]"
350
+ return content_str
351
+
352
+ async def _get_current_url(self) -> Optional[str]:
353
+ r"""Safely get the current URL of the active page."""
354
+ try:
355
+ page = await self._session.get_page()
356
+ if page and not page.is_closed():
357
+ return page.url
358
+ return None # Return None if page is closed
359
+ except Exception:
360
+ # This can happen if browser is not open.
361
+ return None
362
+
363
+ async def _log_action(
364
+ self,
365
+ action_name: str,
366
+ inputs: Dict[str, Any],
367
+ outputs: Any,
368
+ execution_time: float,
369
+ page_load_time: Optional[float] = None,
370
+ error: Optional[str] = None,
371
+ ) -> None:
372
+ r"""Log action details with comprehensive information."""
373
+ if not (self.enable_action_logging or self.enable_timing_logging):
374
+ return
375
+
376
+ current_url = await self._get_current_url()
377
+
378
+ log_entry: Dict[str, Any] = {
379
+ "timestamp": datetime.datetime.now().isoformat(),
380
+ "action": action_name,
381
+ "url": current_url,
382
+ "execution_time_ms": round(execution_time * 1000, 2),
383
+ }
384
+
385
+ if self.enable_action_logging:
386
+ log_entry["inputs"] = inputs
387
+ if error:
388
+ log_entry["error"] = str(error)
389
+ elif isinstance(outputs, dict):
390
+ # Unpack dictionary items into the log entry
391
+ log_entry.update(outputs)
392
+ else:
393
+ # For non-dict outputs, assign to 'outputs' key
394
+ log_entry["outputs"] = outputs
395
+
396
+ if page_load_time is not None and self.enable_page_loading_logging:
397
+ log_entry["page_load_time_ms"] = round(page_load_time * 1000, 2)
398
+
399
+ # Add to buffer
400
+ self.log_buffer.append(log_entry)
401
+
402
+ # Console logging
403
+ if self.log_to_console:
404
+ log_msg = f"[BROWSER ACTION] {action_name}"
405
+ if self.enable_timing_logging:
406
+ log_msg += f" | Execution: {log_entry['execution_time_ms']}ms"
407
+ if page_load_time is not None and self.enable_page_loading_logging:
408
+ log_msg += f" | Page Load: {log_entry['page_load_time_ms']}ms"
409
+ if error:
410
+ log_msg += f" | ERROR: {error}"
411
+
412
+ logger.info(log_msg)
413
+
414
+ if self.enable_action_logging:
415
+ logger.info(f" Inputs: {self._truncate_if_needed(inputs)}")
416
+ if not error:
417
+ if isinstance(outputs, dict):
418
+ for key, value in outputs.items():
419
+ logger.info(
420
+ f" - {key}: "
421
+ f"{self._truncate_if_needed(value)}"
422
+ )
423
+ else:
424
+ logger.info(
425
+ f" Outputs: {self._truncate_if_needed(outputs)}"
426
+ )
427
+
428
+ # File logging
429
+ if self.log_to_file and self.log_file_path:
430
+ try:
431
+ with open(self.log_file_path, 'a', encoding='utf-8') as f:
432
+ # Write full log entry to file without truncation
433
+ f.write(
434
+ json.dumps(log_entry, ensure_ascii=False, indent=2)
435
+ + '\n'
436
+ )
437
+ except Exception as e:
438
+ logger.error(f"Failed to write to log file: {e}")
439
+
440
@staticmethod
def action_logger(func: Callable[..., Any]) -> Callable[..., Any]:
    r"""Decorator that logs each call of an async action method.

    The wrapped coroutine is timed with `time.time()`. On success its
    result is passed to `self._log_action`; on failure the exception is
    logged as `"<TypeName>: <message>"` and then re-raised.

    Args:
        func (Callable[..., Any]): The async method to wrap.

    Returns:
        Callable[..., Any]: The wrapping coroutine function.
    """

    @wraps(func)
    async def wrapper(self, *args, **kwargs):
        name = func.__name__
        started_at = time.time()

        # `self` is not part of args here, so nothing needs skipping.
        call_inputs = {"args": args, "kwargs": kwargs}

        try:
            outcome = await func(self, *args, **kwargs)
            elapsed = time.time() - started_at
            await self._log_action(
                action_name=name,
                inputs=call_inputs,
                outputs=outcome,
                execution_time=elapsed,
            )
            return outcome
        except Exception as e:
            elapsed = time.time() - started_at
            failure = f"{type(e).__name__}: {e!s}"
            await self._log_action(
                action_name=name,
                inputs=call_inputs,
                outputs=None,
                execution_time=elapsed,
                error=failure,
            )
            raise

    return wrapper
486
+
487
async def _get_session(self) -> "HybridBrowserSession":
    r"""Resolve the session instance this toolkit should be using.

    Asks `HybridBrowserSession._get_or_create_instance` for the instance
    matching the currently held session and adopts it if it is a
    different object.

    Returns:
        HybridBrowserSession: The session now stored on the toolkit.
    """
    resolved = await HybridBrowserSession._get_or_create_instance(
        self._session
    )
    if resolved is self._session:
        return self._session

    logger.debug("Updating to singleton session instance")
    self._session = resolved
    return self._session
496
+
497
+ async def _ensure_browser(self):
498
+ # Get singleton instance and update self._session if needed
499
+ session = await self._get_session()
500
+ await session.ensure_browser()
501
+
502
+ async def _require_page(self):
503
+ # Get singleton instance and update self._session if needed
504
+ session = await self._get_session()
505
+ await session.ensure_browser()
506
+ return await session.get_page()
507
+
508
async def _wait_for_page_stability(self):
    r"""Wait briefly for the page to settle after an action.

    Waits for the `domcontentloaded` state (page stability timeout), then
    tries the `networkidle` state (network idle timeout), then sleeps
    0.2 seconds. A failure in any step is logged at debug level and
    ignored; only obtaining the page itself may raise.
    """
    import asyncio

    current_page = await self._require_page()

    try:
        # Wait for DOM content to be loaded (reduced timeout)
        await current_page.wait_for_load_state(
            'domcontentloaded', timeout=self._page_stability_timeout
        )
        logger.debug("DOM content loaded")

        # Network idle is optional: a timeout here is not an error.
        try:
            await current_page.wait_for_load_state(
                'networkidle', timeout=self._network_idle_timeout
            )
            logger.debug("Network idle achieved")
        except Exception:
            logger.debug("Network idle timeout - continuing anyway")

        # Short pause for JavaScript execution (reduced from 0.5s)
        await asyncio.sleep(0.2)
        logger.debug("Page stability wait completed")

    except Exception as exc:
        logger.debug(
            f"Page stability wait failed: {exc} - continuing anyway"
        )
539
+
540
+ async def _get_unified_analysis(
541
+ self, max_retries: int = 3
542
+ ) -> Dict[str, Any]:
543
+ r"""Get unified analysis data from the page with retry mechanism for
544
+ navigation issues."""
545
+ page = await self._require_page()
546
+
547
+ for attempt in range(max_retries):
548
+ try:
549
+ if not self._unified_script:
550
+ logger.error("Unified analyzer script not loaded")
551
+ return {"elements": {}, "metadata": {"elementCount": 0}}
552
+
553
+ # Wait for DOM stability before each attempt (with optimized
554
+ # timeout)
555
+ try:
556
+ await page.wait_for_load_state(
557
+ 'domcontentloaded',
558
+ timeout=self._dom_content_loaded_timeout,
559
+ )
560
+ except Exception:
561
+ # Don't fail if DOM wait times out
562
+ pass
563
+
564
+ result = await page.evaluate(self._unified_script)
565
+
566
+ if not isinstance(result, dict):
567
+ logger.warning(f"Invalid result type: {type(result)}")
568
+ return {"elements": {}, "metadata": {"elementCount": 0}}
569
+
570
+ # Success - return result
571
+ if attempt > 0:
572
+ logger.debug(
573
+ f"Unified analysis succeeded on attempt {attempt + 1}"
574
+ )
575
+ return result
576
+
577
+ except Exception as e:
578
+ error_msg = str(e)
579
+
580
+ # Check if this is a navigation-related error
581
+ is_navigation_error = (
582
+ "Execution context was destroyed" in error_msg
583
+ or "Most likely because of a navigation" in error_msg
584
+ or "Target page, context or browser has been closed"
585
+ in error_msg
586
+ )
587
+
588
+ if is_navigation_error and attempt < max_retries - 1:
589
+ logger.debug(
590
+ f"Navigation error in unified analysis (attempt "
591
+ f"{attempt + 1}/{max_retries}): {e}. Retrying..."
592
+ )
593
+
594
+ # Wait a bit for page stability before retrying (optimized)
595
+ try:
596
+ await page.wait_for_load_state(
597
+ 'domcontentloaded',
598
+ timeout=self._page_stability_timeout,
599
+ )
600
+ # Reduced delay for JS context to stabilize
601
+ import asyncio
602
+
603
+ await asyncio.sleep(0.1) # Reduced from 0.2s
604
+ except Exception:
605
+ # Continue even if wait fails
606
+ pass
607
+
608
+ continue
609
+
610
+ # Non-navigation error or final attempt - log and return
611
+ # empty result
612
+ if attempt == max_retries - 1:
613
+ logger.warning(
614
+ f"Error in unified analysis after {max_retries} "
615
+ f"attempts: {e}"
616
+ )
617
+ else:
618
+ logger.warning(
619
+ f"Non-retryable error in unified analysis: {e}"
620
+ )
621
+
622
+ return {"elements": {}, "metadata": {"elementCount": 0}}
623
+
624
+ # Should not reach here, but just in case
625
+ return {"elements": {}, "metadata": {"elementCount": 0}}
626
+
627
+ def _convert_analysis_to_rects(
628
+ self, analysis_data: Dict[str, Any]
629
+ ) -> Dict[str, Any]:
630
+ r"""Convert analysis data to rect format for visual marking."""
631
+ rects = {}
632
+ elements = analysis_data.get("elements", {})
633
+
634
+ for ref, element_data in elements.items():
635
+ coordinates = element_data.get("coordinates", [])
636
+ if coordinates:
637
+ rects[ref] = {
638
+ "role": element_data.get("role", "generic"),
639
+ "aria-name": element_data.get("name", ""),
640
+ "rects": [coordinates[0]],
641
+ }
642
+ return rects
643
+
644
def _add_set_of_mark(self, image, rects):
    r"""Draw outlined boxes and ref labels for each element on a copy.

    Args:
        image: Image to annotate; it is copied, not modified.
            (Presumably a PIL image - it must provide `copy()` and be
            accepted by `ImageDraw.Draw`.)
        rects: Mapping of ref to a dict with optional `"role"` and
            `"rects"` (a list of dicts with `x`, `y`, `width`, `height`).

    Returns:
        The annotated copy, or the original `image` unchanged when PIL
        cannot be imported.
    """
    try:
        from PIL import ImageDraw, ImageFont
    except ImportError:
        logger.warning("PIL not available, returning original image")
        return image

    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated)

    # Prefer Arial, then PIL's default font, then no font at all.
    try:
        label_font = ImageFont.truetype("arial.ttf", 16)
    except (OSError, IOError):
        try:
            label_font = ImageFont.load_default()
        except (OSError, IOError):
            label_font = None

    # One outline colour per role; unknown roles use "default".
    palette = {
        "button": "#FF6B6B",
        "link": "#4ECDC4",
        "textbox": "#45B7D1",
        "select": "#96CEB4",
        "checkbox": "#FECA57",
        "radio": "#FF9FF3",
        "default": "#DDA0DD",
    }

    for ref, entry in rects.items():
        boxes = entry.get("rects", [])
        role = entry.get("role", "generic")
        outline_color = palette.get(role, palette["default"])

        for box in boxes:
            left = box.get("x", 0)
            top = box.get("y", 0)
            box_width = box.get("width", 0)
            box_height = box.get("height", 0)

            # Element outline
            canvas.rectangle(
                [left, top, left + box_width, top + box_height],
                outline=outline_color,
                width=2,
            )

            # Measure the label, or estimate it when no font is available.
            if label_font:
                x0, y0, x1, y1 = canvas.textbbox(
                    (0, 0), ref, font=label_font
                )
                text_width, text_height = x1 - x0, y1 - y0
            else:
                text_width, text_height = len(ref) * 8, 16

            # Label sits just above the box, clamped to the image origin.
            tag_x = max(0, left - 2)
            tag_y = max(0, top - text_height - 2)

            canvas.rectangle(
                [
                    tag_x,
                    tag_y,
                    tag_x + text_width + 4,
                    tag_y + text_height + 2,
                ],
                fill=outline_color,
            )
            canvas.text(
                (tag_x + 2, tag_y + 1),
                ref,
                fill="white",
                font=label_font,
            )

    return annotated
720
+
721
+ def _format_snapshot_from_analysis(
722
+ self, analysis_data: Dict[str, Any]
723
+ ) -> str:
724
+ r"""Format analysis data into snapshot string."""
725
+ lines = []
726
+ elements = analysis_data.get("elements", {})
727
+
728
+ for ref, element_data in elements.items():
729
+ role = element_data.get("role", "generic")
730
+ name = element_data.get("name", "")
731
+
732
+ line = f"- {role}"
733
+ if name:
734
+ line += f' "{name}"'
735
+
736
+ # Add properties
737
+ props = []
738
+ for prop in ["disabled", "checked", "expanded"]:
739
+ value = element_data.get(prop)
740
+ if value is True:
741
+ props.append(prop)
742
+ elif value is not None and prop in ["checked", "expanded"]:
743
+ props.append(f"{prop}={value}")
744
+
745
+ if props:
746
+ line += f" {' '.join(props)}"
747
+
748
+ line += f" [ref={ref}]"
749
+ lines.append(line)
750
+
751
+ return "\n".join(lines)
752
+
753
async def _get_tab_info_for_output(self) -> Dict[str, Any]:
    r"""Collect tab information to attach to action outputs.

    Returns:
        Dict[str, Any]: A dict with `"tabs"`, `"current_tab"` and
            `"total_tabs"`. If the session cannot report tab info, the
            tab list is empty, the current tab is `0`, and the total is
            estimated from the session's `_pages` / `_page` attributes
            (never less than 1).
    """
    try:
        # Ensure we have the correct singleton session instance first
        session = await self._get_session()

        logger.debug("Attempting to get tab info from session...")
        tabs = await session.get_tab_info()
        active_tab = await session.get_current_tab_id()

        logger.debug(
            f"Successfully retrieved {len(tabs)} tabs, current: "
            f"{active_tab}"
        )

        return {
            "tabs": tabs,
            "current_tab": active_tab,
            "total_tabs": len(tabs),
        }
    except Exception as e:
        logger.warning(
            f"Failed to get tab info from session: {type(e).__name__}: {e}"
        )

    # Fallback: estimate the tab count from the session's own page state.
    try:
        fallback_session = await self._get_session()

        has_pages_attr = hasattr(fallback_session, '_pages')
        has_main_page = (
            hasattr(fallback_session, '_page')
            and fallback_session._page is not None
        )

        # Snapshot of the session state, used only for debug logging.
        session_state = {
            "has_session": fallback_session is not None,
            "has_pages_attr": has_pages_attr,
            "pages_count": len(fallback_session._pages)
            if has_pages_attr
            else "unknown",
            "has_page": has_main_page,
            "session_id": getattr(
                fallback_session, '_session_id', 'unknown'
            ),
        }
        logger.debug(f"Browser session state: {session_state}")

        tab_count = 0
        if has_pages_attr and fallback_session._pages:
            tab_count = len(fallback_session._pages)
            # Prefer counting only pages that are still open.
            try:
                open_count = sum(
                    1
                    for candidate in fallback_session._pages.values()
                    if not candidate.is_closed()
                )
                tab_count = open_count
                logger.debug(
                    f"Found {tab_count} open tabs out of "
                    f"{len(fallback_session._pages)} total"
                )
            except Exception:
                # Keep the original count if we can't check page status
                pass

        if tab_count == 0:
            # Either way one tab is reported; only the log line differs.
            tab_count = 1
            if has_main_page:
                logger.debug(
                    "No pages in list but main page exists, assuming "
                    "1 tab"
                )
            else:
                logger.debug("No pages found, defaulting to 1 tab")

        logger.debug(f"Using fallback tab count: {tab_count}")
        return {
            "tabs": [],
            "current_tab": 0,
            "total_tabs": tab_count,
        }

    except Exception as fallback_error:
        logger.warning(
            f"Fallback tab count also failed: "
            f"{type(fallback_error).__name__}: {fallback_error}"
        )
        return {"tabs": [], "current_tab": 0, "total_tabs": 1}
850
+
851
+ async def _exec_with_snapshot(
852
+ self,
853
+ action: Dict[str, Any],
854
+ element_details: Optional[Dict[str, Any]] = None,
855
+ ) -> Dict[str, str]:
856
+ r"""Execute action and return result with snapshot comparison."""
857
+
858
+ # Log action execution start
859
+ action_type = action.get("type", "unknown")
860
+ logger.info(f"Executing action: {action_type}")
861
+
862
+ action_start_time = time.time()
863
+ inputs: Dict[str, Any] = {"action": action}
864
+ page_load_time = None
865
+
866
+ try:
867
+ # Get before snapshot
868
+ logger.info("Capturing pre-action snapshot...")
869
+ snapshot_start_before = time.time()
870
+ before_snapshot = await self._session.get_snapshot(
871
+ force_refresh=True, diff_only=False
872
+ )
873
+ before_snapshot_time = time.time() - snapshot_start_before
874
+ logger.info(
875
+ f"Pre-action snapshot captured in {before_snapshot_time:.2f}s"
876
+ )
877
+
878
+ # Execute action
879
+ logger.info(f"Executing {action_type} action...")
880
+ exec_start = time.time()
881
+ exec_result = await self._session.exec_action(action)
882
+ exec_time = time.time() - exec_start
883
+ logger.info(f"Action {action_type} completed in {exec_time:.2f}s")
884
+
885
+ # Parse the detailed result from ActionExecutor
886
+ if isinstance(exec_result, dict):
887
+ result_message = exec_result.get("message", str(exec_result))
888
+ action_details = exec_result.get("details", {})
889
+ success = exec_result.get("success", True)
890
+ else:
891
+ result_message = str(exec_result)
892
+ action_details = {}
893
+ success = True
894
+
895
+ # Wait for page stability after action (especially important for
896
+ # click)
897
+ stability_time: float = 0.0
898
+ if action_type in ["click", "type", "select", "enter"]:
899
+ logger.info(
900
+ f"Waiting for page stability " f"after {action_type}..."
901
+ )
902
+ stability_start = time.time()
903
+ await self._wait_for_page_stability()
904
+ stability_time = time.time() - stability_start
905
+ logger.info(
906
+ f"Page stability wait "
907
+ f"completed in "
908
+ f"{stability_time:.2f}s"
909
+ )
910
+ page_load_time = stability_time
911
+
912
+ # Enhanced logging for page loading times
913
+ if self.enable_page_loading_logging and self.log_to_console:
914
+ logger.info(
915
+ f"[PAGE LOADING] Page stability for {action_type}: "
916
+ f"{round(stability_time * 1000, 2)}ms"
917
+ )
918
+
919
+ # Get after snapshot
920
+ logger.info("Capturing post-action snapshot...")
921
+ snapshot_start_after = time.time()
922
+ after_snapshot = await self._session.get_snapshot(
923
+ force_refresh=True, diff_only=False
924
+ )
925
+ after_snapshot_time = time.time() - snapshot_start_after
926
+ logger.info(
927
+ f"Post-action snapshot "
928
+ f"captured in {after_snapshot_time:.2f}s"
929
+ )
930
+
931
+ # Check for snapshot quality and log warnings
932
+ if before_snapshot == after_snapshot:
933
+ snapshot = "snapshot not changed"
934
+ logger.debug("Page snapshot unchanged after action")
935
+ else:
936
+ snapshot = after_snapshot
937
+ # Check if snapshot is empty or problematic
938
+ if "<empty>" in after_snapshot:
939
+ logger.warning(
940
+ f"Action {action_type} resulted "
941
+ f"in empty snapshot - "
942
+ f"page may still be loading"
943
+ )
944
+ elif len(after_snapshot.strip()) < 50:
945
+ logger.warning(
946
+ f"Action {action_type} resulted "
947
+ f"in very short snapshot:"
948
+ f" {len(after_snapshot)} chars"
949
+ )
950
+ else:
951
+ logger.debug(
952
+ f"Action {action_type} resulted "
953
+ f"in updated snapshot: "
954
+ f"{len(after_snapshot)} chars"
955
+ )
956
+
957
+ # Get tab information for output
958
+ tab_info = await self._get_tab_info_for_output()
959
+
960
+ # Create comprehensive output for logging
961
+ execution_time = time.time() - action_start_time
962
+ total_snapshot_time = before_snapshot_time + after_snapshot_time
963
+ outputs = {
964
+ "result": result_message,
965
+ "snapshot": snapshot,
966
+ "success": success,
967
+ "action_details": action_details,
968
+ "execution_stats": {
969
+ "exec_time_ms": round(exec_time * 1000, 2),
970
+ "stability_time_ms": round(stability_time * 1000, 2)
971
+ if stability_time > 0
972
+ else None,
973
+ "snapshot_time_ms": round(total_snapshot_time * 1000, 2),
974
+ "total_time_ms": round(execution_time * 1000, 2),
975
+ },
976
+ **tab_info, # Include tab information
977
+ }
978
+
979
+ # If snapshot is unchanged after click, add element details to log
980
+ if (
981
+ snapshot == "snapshot not changed"
982
+ and action_type == "click"
983
+ and element_details
984
+ ):
985
+ logger.debug(
986
+ "Snapshot unchanged after click. "
987
+ "Adding element details to log."
988
+ )
989
+ outputs["clicked_element_tag"] = element_details.get(
990
+ "tagName", "N/A"
991
+ )
992
+ outputs["clicked_element_content"] = element_details.get(
993
+ "name", ""
994
+ )
995
+ outputs["clicked_element_type"] = element_details.get(
996
+ "role", "generic"
997
+ )
998
+
999
+ # Log the action with all details
1000
+ await self._log_action(
1001
+ action_name=f"_exec_with_snapshot_{action_type}",
1002
+ inputs=inputs,
1003
+ outputs=outputs,
1004
+ execution_time=execution_time,
1005
+ page_load_time=page_load_time,
1006
+ )
1007
+
1008
+ return {"result": result_message, "snapshot": snapshot}
1009
+
1010
+ except Exception as e:
1011
+ execution_time = time.time() - action_start_time
1012
+ error_msg = f"{type(e).__name__}: {e!s}"
1013
+
1014
+ # Log error
1015
+ await self._log_action(
1016
+ action_name=f"_exec_with_snapshot_{action_type}",
1017
+ inputs=inputs,
1018
+ outputs=None,
1019
+ execution_time=execution_time,
1020
+ page_load_time=page_load_time,
1021
+ error=error_msg,
1022
+ )
1023
+
1024
+ raise
1025
+
1026
async def _extract_links_by_refs(
    self, snapshot: str, page, refs: List[str]
) -> List[Dict[str, str]]:
    r"""Resolve the URLs of the snapshot links whose ref was requested.

    The snapshot is scanned line by line for entries of the form
    ``- link "<text>" [ref=<id>]``. Every match whose id is in ``refs`` is
    resolved through ``_get_link_url_by_ref``.

    Args:
        snapshot (str): Textual page snapshot to scan.
        page: Page object forwarded to ``_get_link_url_by_ref``.
        refs (List[str]): Reference IDs of the links to resolve.

    Returns:
        List[Dict[str, str]]: One dict per matched link, in snapshot
            order, with "text", "ref" and "url" keys. "url" is an empty
            string when the lookup fails or yields nothing.
    """
    import re

    link_pattern = re.compile(r'- link\s+"([^"]+)"\s+\[ref=([^\]]+)\]')
    wanted = set(refs)
    results: List[Dict[str, str]] = []

    for snapshot_line in snapshot.split('\n'):
        match = link_pattern.search(snapshot_line)
        if match is None or match.group(2) not in wanted:
            continue

        link_text, link_ref = match.groups()
        # Start from the "lookup failed" shape and fill in the URL only
        # when the lookup succeeds.
        entry = {"text": link_text, "ref": link_ref, "url": ""}
        try:
            resolved = await self._get_link_url_by_ref(page, link_ref)
        except Exception as e:
            logger.warning(f"Failed to get URL for ref {link_ref}: {e}")
        else:
            entry["url"] = resolved or ""
        results.append(entry)

    return results
1056
+
1057
async def _get_link_url_by_ref(self, page, ref: str) -> str:
    r"""Return the absolute URL of the element tagged with ``ref``.

    The element is located through its ``aria-ref`` attribute and its
    ``href`` is joined against the current page URL.

    Args:
        page: Page object providing ``query_selector`` and ``url``.
        ref (str): Reference ID of the link element.

    Returns:
        str: The absolute URL, or an empty string when the element is
            missing, has no ``href``, or the lookup raises.
    """
    try:
        element = await page.query_selector(f'[aria-ref="{ref}"]')
        if not element:
            return ""

        href = await element.get_attribute('href')
        if not href:
            return ""

        # Relative hrefs are resolved against the page's own URL.
        return urllib.parse.urljoin(page.url, href)
    except Exception as e:
        logger.warning(f"Failed to get URL for ref {ref}: {e}")
        return ""
1071
+
1072
def _ensure_agent(self) -> PlaywrightLLMAgent:
    r"""Return the cached PlaywrightLLMAgent, building it on first use.

    Returns:
        PlaywrightLLMAgent: The agent stored on ``self._agent``.

    Raises:
        RuntimeError: If no ``web_agent_model`` was configured.
    """
    model = self._web_agent_model
    if model is None:
        raise RuntimeError(
            "web_agent_model required for high-level task planning"
        )

    agent = self._agent
    if agent is None:
        # Lazily create the agent and cache it for later calls.
        agent = PlaywrightLLMAgent(
            headless=self._headless,
            user_data_dir=self._user_data_dir,
            model_backend=model,
        )
        self._agent = agent
    return agent
1086
+
1087
+ # Public API Methods
1088
+
1089
async def open_browser(self) -> Dict[str, Any]:
    r"""Starts a new browser session. This must be the first browser
    action.

    This method initializes the browser and navigates to a default start
    page. To visit a specific URL, use `visit_page` after this.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of interactive elements.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.

    Raises:
        Exception: Any error raised while starting the browser or loading
            the start page is logged (when logging is enabled) and
            re-raised unchanged.
    """
    action_start = time.time()
    inputs: Dict[str, Any] = {}  # No input parameters for agents

    try:
        # The browser is started inside the try block so that a launch
        # failure is recorded by the error-logging branch below, just like
        # a navigation failure.
        logger.info("Starting browser session...")
        browser_start = time.time()
        await self._session.ensure_browser()
        browser_time = time.time() - browser_start
        logger.info(f"Browser session started in {browser_time:.2f}s")

        # Always use the configured default start URL
        start_url = self._default_start_url
        logger.info(f"Navigating to configured default page: {start_url}")

        result = await self.visit_page(start_url)

        # Log success
        if self.enable_action_logging or self.enable_timing_logging:
            execution_time = time.time() - action_start
            await self._log_action(
                action_name="open_browser",
                inputs=inputs,
                outputs={
                    "result": "Browser opened and navigated to "
                    "default page."
                },
                execution_time=execution_time,
            )

        return result

    except Exception as e:
        # Log error
        if self.enable_action_logging or self.enable_timing_logging:
            execution_time = time.time() - action_start
            await self._log_action(
                action_name="open_browser",
                inputs=inputs,
                outputs=None,
                execution_time=execution_time,
                error=f"{type(e).__name__}: {e!s}",
            )
        raise
1150
+
1151
@action_logger
async def close_browser(self) -> str:
    r"""Closes the browser session, releasing all resources.

    This should be called at the end of a task for cleanup.

    Returns:
        str: A confirmation message.
    """
    if self._agent is not None:
        try:
            await self._agent.close()
        except Exception as e:
            # Best-effort cleanup: a failing agent must not prevent the
            # session below from being closed, but the failure should
            # still leave a trace instead of vanishing silently.
            logger.warning(f"Failed to close web agent cleanly: {e}")
        self._agent = None

    await self._session.close()
    return "Browser session closed."
1169
+
1170
@action_logger
async def visit_page(self, url: str) -> Dict[str, Any]:
    r"""Opens a URL in a new browser tab and switches to it.

    If the browser only has a single "about:blank" tab, the URL is loaded
    in that tab instead of a new one.

    Args:
        url (str): The web address to load. This should be a valid and
            existing URL.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the new page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # Surrounding whitespace is never part of a URL; strip it so that
    # " example.com" does not become "https:// example.com" below.
    url = url.strip() if isinstance(url, str) else ""
    if not url:
        return {
            "result": "Error: 'url' must be a non-empty string",
            "snapshot": "",
            "tabs": [],
            "current_tab": 0,
            "total_tabs": 1,
        }

    # Default to https for bare hosts, but leave scheme-only URLs such as
    # "about:blank" untouched (they contain no "://").
    if '://' not in url and not url.startswith(("about:", "data:")):
        url = f'https://{url}'

    await self._ensure_browser()
    session = await self._get_session()
    nav_result = ""

    # By default, we want to create a new tab.
    should_create_new_tab = True
    try:
        # If the browser has just started with a single "about:blank" tab,
        # use that tab instead of creating a new one.
        tab_info_data = await self._get_tab_info_for_output()
        tabs = tab_info_data.get("tabs", [])
        if len(tabs) == 1 and tabs[0].get("url") == "about:blank":
            logger.info(
                "Found single blank tab, navigating in current tab "
                "instead of creating a new one."
            )
            should_create_new_tab = False
    except Exception as e:
        logger.warning(
            "Could not get tab info to check for blank tab, "
            f"proceeding with default behavior (new tab). Error: {e}"
        )

    if should_create_new_tab:
        logger.info(f"Creating new tab and navigating to URL: {url}")
        try:
            new_tab_id = await session.create_new_tab(url)
            switched = await session.switch_to_tab(new_tab_id)
        except Exception as e:
            logger.error(f"Failed to create new tab and navigate: {e}")
            nav_result = f"Error creating new tab: {e}"
        else:
            # switch_to_tab reports failure through its return value, so
            # only claim success when the switch actually happened.
            if switched:
                nav_result = f"Visited {url} in new tab {new_tab_id}"
            else:
                logger.error(
                    f"Opened new tab {new_tab_id} but failed to switch "
                    f"to it"
                )
                nav_result = (
                    f"Error: opened {url} in new tab {new_tab_id} "
                    f"but failed to switch to it"
                )
    else:
        logger.info(f"Navigating to URL in current tab: {url}")
        nav_result = await session.visit(url)

    # A snapshot failure is not fatal: report the navigation result with
    # an empty snapshot instead.
    snapshot = ""
    try:
        snapshot = await session.get_snapshot(
            force_refresh=True, diff_only=False
        )
    except Exception as e:
        logger.warning(f"Failed to capture snapshot: {e}")

    # Get tab information
    tab_info = await self._get_tab_info_for_output()

    return {"result": nav_result, "snapshot": snapshot, **tab_info}
1247
+
1248
async def _navigate_history(self, direction: str) -> Dict[str, Any]:
    r"""Shared implementation of the `back` and `forward` tools.

    Args:
        direction (str): Either "back" or "forward". Any value other than
            "back" is treated as "forward".

    Returns:
        Dict[str, Any]: The "result" message and page "snapshot", merged
            with the tab information from `_get_tab_info_for_output`. A
            failed navigation is reported in "result" rather than raised.
    """
    import asyncio

    label = direction.capitalize()  # "Back" / "Forward" for messages
    page = await self._require_page()

    try:
        logger.info(f"Navigating {direction} in browser history...")
        navigate = page.go_back if direction == "back" else page.go_forward
        nav_start = time.time()
        await navigate(
            wait_until="domcontentloaded", timeout=self._navigation_timeout
        )
        nav_time = time.time() - nav_start
        logger.info(f"{label} navigation completed in {nav_time:.2f}s")

        # Minimal wait for page stability (history navigation is usually
        # fast)
        await asyncio.sleep(0.2)

        # Get snapshot
        logger.info(
            f"Capturing page snapshot after {direction} navigation..."
        )
        snapshot_start = time.time()
        snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        snapshot_time = time.time() - snapshot_start
        logger.info(
            f"{label} navigation snapshot captured in {snapshot_time:.2f}s"
        )

        # Get tab information
        tab_info = await self._get_tab_info_for_output()

        return {
            "result": f"{label} navigation successful.",
            "snapshot": snapshot,
            **tab_info,
        }

    except Exception as e:
        logger.warning(f"{label} navigation failed: {e}")
        # Get current snapshot even if navigation failed
        snapshot = await self._session.get_snapshot(
            force_refresh=True, diff_only=False
        )
        tab_info = await self._get_tab_info_for_output()
        return {
            "result": f"{label} navigation failed: {e!s}",
            "snapshot": snapshot,
            **tab_info,
        }


@action_logger
async def back(self) -> Dict[str, Any]:
    r"""Goes back to the previous page in the browser history.

    This action simulates using the browser's "back" button in the
    currently active tab.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the previous page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    return await self._navigate_history("back")


@action_logger
async def forward(self) -> Dict[str, Any]:
    r"""Goes forward to the next page in the browser history.

    This action simulates using the browser's "forward" button in the
    currently active tab.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the next page.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    return await self._navigate_history("forward")
1376
+
1377
@action_logger
async def get_page_snapshot(self) -> str:
    r"""Gets a textual snapshot of the page's interactive elements.

    The snapshot lists elements like buttons, links, and inputs, each with
    a unique `ref` ID. This ID is used by other tools (e.g., `click`,
    `type`) to interact with a specific element. This tool provides no
    visual information.

    Returns:
        str: A formatted string representing the interactive elements and
            their `ref` IDs. For example:
            '- link "Sign In" [ref=1]'
            '- textbox "Username" [ref=2]'
    """
    logger.info("Capturing page snapshot")

    started_at = time.time()
    analysis = await self._get_unified_analysis()
    elapsed = time.time() - started_at
    logger.info(f"Page snapshot analysis completed in {elapsed:.2f}s")

    # Prefer the text delivered with the analysis; build it from the
    # analysis data only when that text is missing or empty.
    ready_text = analysis.get("snapshotText", "")
    if ready_text:
        return ready_text
    return self._format_snapshot_from_analysis(analysis)
1407
+
1408
@dependencies_required('PIL')
@action_logger
async def get_som_screenshot(self) -> str:
    r"""Captures a screenshot with interactive elements highlighted.

    "SoM" stands for "Set of Marks". This tool takes a screenshot and draws
    boxes around clickable elements, overlaying a `ref` ID on each. Use
    this for a visual understanding of the page, especially when the
    textual snapshot is not enough.

    Returns:
        str: A summary message including the file path of the saved
            screenshot, e.g., "Visual webpage screenshot captured with 42
            interactive elements and saved to /path/to/screenshot.png"
    """
    from PIL import Image

    os.makedirs(self._cache_dir, exist_ok=True)
    # Get screenshot and analysis
    page = await self._require_page()

    # Log screenshot timeout start
    logger.info(
        f"Starting screenshot capture "
        f"with timeout: {self._screenshot_timeout}ms"
    )

    start_time = time.time()
    image_data = await page.screenshot(timeout=self._screenshot_timeout)
    screenshot_time = time.time() - start_time

    logger.info(f"Screenshot capture completed in {screenshot_time:.2f}s")
    image = Image.open(io.BytesIO(image_data))

    # Log unified analysis start
    logger.info("Starting unified page analysis...")
    analysis_start_time = time.time()
    analysis_data = await self._get_unified_analysis()
    analysis_time = time.time() - analysis_start_time
    logger.info(f"Unified page analysis completed in {analysis_time:.2f}s")

    # Log image processing
    logger.info("Processing visual marks on screenshot...")
    mark_start_time = time.time()
    rects = self._convert_analysis_to_rects(analysis_data)
    marked_image = self._add_set_of_mark(image, rects)
    mark_time = time.time() - mark_start_time
    logger.info(f"Visual marks processing completed in {mark_time:.2f}s")

    # Save screenshot to cache directory; the file name is derived from
    # the URL path plus a month-to-second timestamp.
    parsed_url = urllib.parse.urlparse(page.url)
    url_name = sanitize_filename(str(parsed_url.path), max_length=241)
    timestamp = datetime.datetime.now().strftime("%m%d%H%M%S")
    file_path = os.path.join(
        self._cache_dir, f"{url_name}_{timestamp}_som.png"
    )
    marked_image.save(file_path, "PNG")

    text_result = (
        f"Visual webpage screenshot captured with {len(rects)} "
        f"interactive elements and saved to {file_path}"
    )

    return text_result
1472
+
1473
async def click(self, *, ref: str) -> Dict[str, Any]:
    r"""Performs a click on an element on the page.

    Args:
        ref (str): The `ref` ID of the element to click. This ID is
            obtained from a page snapshot (`get_page_snapshot` or
            `get_som_screenshot`).

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the page after the
              click.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "click")

    analysis = await self._get_unified_analysis()
    known_elements = analysis.get("elements", {})

    if ref in known_elements:
        # Element details are passed along with the click action.
        outcome = await self._exec_with_snapshot(
            {"type": "click", "ref": ref},
            element_details=known_elements.get(ref),
        )
        # Add tab information to the result
        outcome.update(await self._get_tab_info_for_output())
        return outcome

    # Unknown ref: report the error together with a snapshot for context.
    logger.error(f"Error: Element reference '{ref}' not found. ")
    context_snapshot = self._format_snapshot_from_analysis(analysis)
    tab_info = await self._get_tab_info_for_output()
    return {
        "result": f"Error: Element reference '{ref}' not found. ",
        "snapshot": context_snapshot,
        **tab_info,
    }
1516
+
1517
async def type(self, *, ref: str, text: str) -> Dict[str, Any]:
    r"""Types text into an input element on the page.

    Args:
        ref (str): The `ref` ID of the input element, from a snapshot.
        text (str): The text to type into the element.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A textual snapshot of the page after
              typing.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "type")
    # The analysis result itself is not needed here; the call is made to
    # ensure aria-ref attributes are present.
    await self._get_unified_analysis()

    outcome = await self._exec_with_snapshot(
        {"type": "type", "ref": ref, "text": text}
    )

    # Add tab information to the result
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1544
+
1545
async def select(self, *, ref: str, value: str) -> Dict[str, Any]:
    r"""Selects an option in a dropdown (`<select>`) element.

    Args:
        ref (str): The `ref` ID of the `<select>` element.
        value (str): The `value` attribute of the `<option>` to select,
            not its visible text.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the page after the
              selection.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    self._validate_ref(ref, "select")
    # Run the page analysis before acting; its return value is unused.
    await self._get_unified_analysis()

    outcome = await self._exec_with_snapshot(
        {"type": "select", "ref": ref, "value": value}
    )

    # Add tab information to the result
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1573
+
1574
async def scroll(self, *, direction: str, amount: int) -> Dict[str, Any]:
    r"""Scrolls the current page window.

    Args:
        direction (str): The direction to scroll: 'up' or 'down'.
        amount (int): The number of pixels to scroll.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the page after scrolling.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    if direction in ("up", "down"):
        outcome = await self._exec_with_snapshot(
            {"type": "scroll", "direction": direction, "amount": amount}
        )
        # Add tab information to the result
        outcome.update(await self._get_tab_info_for_output())
        return outcome

    # Invalid direction: no action is executed and the snapshot is empty.
    tab_info = await self._get_tab_info_for_output()
    return {
        "result": "Error: direction must be 'up' or 'down'",
        "snapshot": "",
        **tab_info,
    }
1605
+
1606
async def enter(self) -> Dict[str, Any]:
    r"""Simulates pressing the Enter key on the currently focused element.

    This is useful for submitting forms or search queries after using the
    `type` tool.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A new page snapshot, as this action often
              triggers navigation.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # The action carries no ref: Enter always goes to the currently
    # focused element.
    outcome = await self._exec_with_snapshot({"type": "enter"})

    # Add tab information to the result
    outcome.update(await self._get_tab_info_for_output())
    return outcome
1631
+
1632
@action_logger
async def wait_user(
    self, timeout_sec: Optional[float] = None
) -> Dict[str, Any]:
    r"""Pauses execution and waits for human input from the console.

    Use this for tasks requiring manual steps, like solving a CAPTCHA. The
    agent will resume after the user presses Enter in the console.

    Args:
        timeout_sec (Optional[float]): Max time to wait in seconds. If
            `None`, it will wait indefinitely.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): A message indicating how the wait ended.
            - "snapshot" (str): The page snapshot after the wait.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    import asyncio

    prompt = (
        "🕑 Agent waiting for human input. "
        "Complete action in browser, then press Enter..."
    )
    logger.info(f"\n{prompt}\n")

    async def _await_enter():
        # input() blocks, so it runs in a worker thread.
        await asyncio.to_thread(input, ">>> Press Enter to resume <<<\n")

    bounded = timeout_sec is not None
    try:
        if bounded:
            logger.info(
                f"Waiting for user input with timeout: {timeout_sec}s"
            )
        else:
            logger.info("Waiting for user input (no timeout)")

        started_at = time.time()
        if bounded:
            await asyncio.wait_for(_await_enter(), timeout=timeout_sec)
        else:
            await _await_enter()
        elapsed = time.time() - started_at

        logger.info(f"User input received after {elapsed:.2f}s")
        result_msg = "User resumed."
    except asyncio.TimeoutError:
        waited = timeout_sec or 0.0
        logger.info(
            f"User input timeout reached after {waited}s, auto-resuming"
        )
        result_msg = f"Timeout {timeout_sec}s reached, auto-resumed."

    snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    tab_info = await self._get_tab_info_for_output()

    return {"result": result_msg, "snapshot": snapshot, **tab_info}
1695
+
1696
@action_logger
async def get_page_links(self, *, ref: List[str]) -> Dict[str, Any]:
    r"""Gets the destination URLs for a list of link elements.

    This is useful to know where a link goes before clicking it.

    Args:
        ref (List[str]): A list of `ref` IDs for link elements, obtained
            from a page snapshot.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - "links" (List[Dict]): A list of found links, where each
              link has "text", "ref", and "url" keys.
    """
    # The input must be a non-empty list of non-empty strings; a single
    # invalid entry makes the whole request return no links.
    refs_usable = (
        bool(ref)
        and isinstance(ref, list)
        and all(item and isinstance(item, str) for item in ref)
    )
    if not refs_usable:
        return {"links": []}

    page = await self._require_page()
    current_snapshot = await self._session.get_snapshot(
        force_refresh=True, diff_only=False
    )
    return {
        "links": await self._extract_links_by_refs(
            current_snapshot, page, ref
        )
    }
1725
+
1726
@action_logger
async def solve_task(
    self, task_prompt: str, start_url: str, max_steps: int = 15
) -> str:
    r"""Delegates a complex, high-level task to a specialized web agent.

    Use this for multi-step tasks that can be described in a single prompt
    (e.g., "log into my account and check for new messages"). The agent
    will autonomously perform the necessary browser actions.

    NOTE: This is a high-level action; for simple interactions, use tools
    like `click` and `type`. `web_agent_model` must be provided during
    toolkit initialization.

    Args:
        task_prompt (str): A natural language description of the task.
        start_url (str): The URL to start the task from. This should be a
            valid and existing URL, as agents may generate non-existent
            ones.
        max_steps (int): The maximum number of steps the agent can take.

    Returns:
        str: A summary message indicating the task has finished.
    """
    # _ensure_agent raises RuntimeError when no web_agent_model is set.
    web_agent = self._ensure_agent()
    await web_agent.navigate(start_url)
    await web_agent.process_command(task_prompt, max_steps=max_steps)
    return "Task processing finished - see stdout for detailed trace."
1754
+
1755
def get_log_summary(self) -> Dict[str, Any]:
    r"""Summarize the entries currently held in the log buffer.

    Returns:
        Dict[str, Any]: For an empty buffer, "total_actions" (0) and a
            "summary" message. Otherwise the number of actions, the summed
            execution and page-load times in milliseconds, the number of
            calls per action name, the number of entries carrying an
            "error" key, and the success rate in percent.
    """
    entries = self.log_buffer
    if not entries:
        return {"total_actions": 0, "summary": "No actions logged"}

    entry_count = len(entries)
    exec_ms = sum(item.get("execution_time_ms", 0) for item in entries)
    # Only entries that recorded a page load contribute to this total.
    load_ms = sum(
        item["page_load_time_ms"]
        for item in entries
        if "page_load_time_ms" in item
    )

    per_action: Dict[str, int] = {}
    failures = 0
    for item in entries:
        name = item["action"]
        per_action[name] = per_action.get(name, 0) + 1
        failures += 1 if "error" in item else 0

    # entry_count is at least 1 here, so the division is safe.
    success_pct = round((entry_count - failures) / entry_count * 100, 2)

    return {
        "total_actions": entry_count,
        "total_execution_time_ms": round(exec_ms, 2),
        "total_page_load_time_ms": round(load_ms, 2),
        "action_counts": per_action,
        "error_count": failures,
        "success_rate": success_pct,
    }
1791
+
1792
def clear_logs(self) -> None:
    r"""Empty the log buffer in place and log that it was cleared."""
    buffer = self.log_buffer
    buffer.clear()
    logger.info("Log buffer cleared")
1796
+
1797
def get_tools(self) -> List[FunctionTool]:
    r"""Build the function tools selected by ``self.enabled_tools``.

    Unknown tool names are skipped with a warning, and so is "solve_task"
    when no ``web_agent_model`` was provided.

    Returns:
        List[FunctionTool]: One tool per usable enabled name, in the order
            the names appear in ``self.enabled_tools``.
    """
    # Every tool name this toolkit knows, mapped to its bound method.
    available = {
        "open_browser": self.open_browser,
        "close_browser": self.close_browser,
        "visit_page": self.visit_page,
        "back": self.back,
        "forward": self.forward,
        "get_page_snapshot": self.get_page_snapshot,
        "get_som_screenshot": self.get_som_screenshot,
        "get_page_links": self.get_page_links,
        "click": self.click,
        "type": self.type,
        "select": self.select,
        "scroll": self.scroll,
        "enter": self.enter,
        "wait_user": self.wait_user,
        "solve_task": self.solve_task,
        "switch_tab": self.switch_tab,
        "close_tab": self.close_tab,
        "get_tab_info": self.get_tab_info,
    }

    tools: List[FunctionTool] = []
    for name in self.enabled_tools:
        if name == "solve_task" and self._web_agent_model is None:
            logger.warning(
                f"Tool '{name}' is enabled but web_agent_model "
                f"is not provided. Skipping this tool."
            )
            continue

        method = available.get(name)
        if method is None:
            logger.warning(f"Unknown tool name: {name}")
            continue

        tools.append(FunctionTool(cast(Callable[..., Any], method)))

    logger.info(f"Returning {len(tools)} enabled tools")
    return tools
1842
+
1843
def clone_for_new_session(
    self, new_session_id: Optional[str] = None
) -> "HybridBrowserToolkit":
    r"""Create a new HybridBrowserToolkit that uses its own session.

    Args:
        new_session_id: Optional new session ID. If None, the first eight
            characters of a random UUID are used.

    Returns:
        A new HybridBrowserToolkit instance built from this toolkit's
        settings, with the new session ID and a cache directory derived
        from it.
    """
    import uuid

    session_id = (
        new_session_id
        if new_session_id is not None
        else str(uuid.uuid4())[:8]
    )

    settings = {
        "headless": self._headless,
        "user_data_dir": self._user_data_dir,
        "stealth": self._stealth,
        "web_agent_model": self._web_agent_model,
        # The clone's cache directory is the original path with a
        # session-specific suffix.
        "cache_dir": f"{self._cache_dir.rstrip('/')}_clone_{session_id}/",
        # Copy the list so the clone holds its own list object.
        "enabled_tools": self.enabled_tools.copy(),
        "browser_log_to_file": self._browser_log_to_file,
        "session_id": session_id,
        "default_start_url": self._default_start_url,
        "default_timeout": self._default_timeout,
        "short_timeout": self._short_timeout,
        "navigation_timeout": self._navigation_timeout,
        "network_idle_timeout": self._network_idle_timeout,
        "screenshot_timeout": self._screenshot_timeout,
        "page_stability_timeout": self._page_stability_timeout,
        "dom_content_loaded_timeout": self._dom_content_loaded_timeout,
    }
    return HybridBrowserToolkit(**settings)
1880
+
1881
@action_logger
async def switch_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Switches to a different browser tab using its ID.

    After switching, all actions will apply to the new tab. Use
    `get_tab_info` to find the ID of the tab you want to switch to.

    Args:
        tab_id (str): The ID of the tab to activate.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the newly active tab.
            - "tabs" (List[Dict]): Information about all open tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # NOTE(review): the docstring above is kept word-for-word because this
    # method is exposed through FunctionTool, which presumably derives the
    # tool description from it -- confirm before rewording.
    await self._ensure_browser()
    session = await self._get_session()

    if await session.switch_to_tab(tab_id):
        message = f"Successfully switched to tab {tab_id}"
        # Full, freshly captured snapshot of the tab that is now active.
        page_snapshot = await session.get_snapshot(
            force_refresh=True, diff_only=False
        )
    else:
        message = f"Failed to switch to tab {tab_id}. Tab may not exist."
        page_snapshot = ""

    # Tab details are gathered after the switch attempt in both outcomes
    # and merged last into the response.
    tab_details = await self._get_tab_info_for_output()
    return {
        "result": message,
        "snapshot": page_snapshot,
        **tab_details,
    }
1925
+
1926
@action_logger
async def close_tab(self, *, tab_id: str) -> Dict[str, Any]:
    r"""Closes a browser tab using its ID.

    Use `get_tab_info` to find the ID of the tab to close. After
    closing, the browser will switch to another tab if available.

    Args:
        tab_id (str): The ID of the tab to close.

    Returns:
        Dict[str, Any]: A dictionary with the result of the action:
            - "result" (str): Confirmation of the action.
            - "snapshot" (str): A snapshot of the active tab after closure.
            - "tabs" (List[Dict]): Information about remaining tabs.
            - "current_tab" (int): Index of the new active tab.
            - "total_tabs" (int): Total number of remaining tabs.
    """
    # NOTE(review): the docstring above is kept word-for-word because this
    # method is exposed through FunctionTool, which presumably derives the
    # tool description from it -- confirm before rewording.
    await self._ensure_browser()
    session = await self._get_session()

    success = await session.close_tab(tab_id)

    # Stays empty when the close failed or no snapshot could be captured.
    snapshot = ""
    if success:
        message = f"Successfully closed tab {tab_id}"
        try:
            # Get current state after closing the tab
            snapshot = await session.get_snapshot(
                force_refresh=True, diff_only=False
            )
        except Exception as e:
            # Best effort: a snapshot can legitimately be unavailable
            # (e.g. no active tab is left), so the close is still reported
            # as successful. The cause is logged instead of being silently
            # discarded so that real snapshot failures remain diagnosable.
            logger.debug(
                f"Could not capture snapshot after closing tab "
                f"{tab_id}: {e}"
            )
    else:
        message = f"Failed to close tab {tab_id}. Tab may not exist."

    tab_info = await self._get_tab_info_for_output()
    return {
        "result": message,
        "snapshot": snapshot,
        **tab_info,
    }
1975
+
1976
@action_logger
async def get_tab_info(self) -> Dict[str, Any]:
    r"""Gets a list of all open browser tabs and their information.

    This includes each tab's index, title, and URL, and indicates which
    tab is currently active. Use this to manage multiple tabs.

    Returns:
        Dict[str, Any]: A dictionary with tab information:
            - "tabs" (List[Dict]): A list of open tabs, each with:
                - "index" (int): The tab's zero-based index.
                - "title" (str): The page title.
                - "url" (str): The current URL.
                - "is_current" (bool): True if the tab is active.
            - "current_tab" (int): Index of the active tab.
            - "total_tabs" (int): Total number of open tabs.
    """
    # NOTE(review): the docstring above is kept word-for-word because this
    # method is exposed through FunctionTool, which presumably derives the
    # tool description from it -- confirm before rewording.
    await self._ensure_browser()
    tab_details = await self._get_tab_info_for_output()
    return tab_details